1.1 --- a/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java Mon Jul 08 23:38:22 2013 +0200
1.2 +++ b/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java Tue Jul 09 18:59:07 2013 +0200
1.3 @@ -17,13 +17,189 @@
1.4 */
1.5 package cz.frantovo.telco.dictionary;
1.6
1.7 +import static cz.frantovo.telco.dictionary.Xmlns.*;
1.8 +import static cz.frantovo.telco.dictionary.Functions.*;
1.9 +import java.io.BufferedWriter;
1.10 +import java.io.ByteArrayOutputStream;
1.11 +import java.io.DataOutputStream;
1.12 +import java.io.File;
1.13 +import java.io.FileOutputStream;
1.14 +import java.io.FileWriter;
1.15 +import java.io.IOException;
1.16 +import java.nio.ByteBuffer;
1.17 +import java.nio.charset.Charset;
1.18 +import java.util.ArrayList;
1.19 +import java.util.List;
1.20 +import java.util.logging.Level;
1.21 +import java.util.logging.Logger;
1.22 +import javax.xml.parsers.DocumentBuilder;
1.23 +import javax.xml.parsers.DocumentBuilderFactory;
1.24 +import javax.xml.parsers.ParserConfigurationException;
1.25 +import javax.xml.transform.Transformer;
1.26 +import javax.xml.transform.TransformerConfigurationException;
1.27 +import javax.xml.transform.TransformerException;
1.28 +import javax.xml.transform.TransformerFactory;
1.29 +import javax.xml.transform.dom.DOMSource;
1.30 +import javax.xml.transform.stream.StreamResult;
1.31 +import javax.xml.transform.stream.StreamSource;
1.32 +import javax.xml.xpath.XPath;
1.33 +import javax.xml.xpath.XPathConstants;
1.34 +import javax.xml.xpath.XPathExpression;
1.35 +import javax.xml.xpath.XPathExpressionException;
1.36 +import javax.xml.xpath.XPathFactory;
1.37 +import org.w3c.dom.Document;
1.38 +import org.w3c.dom.Node;
1.39 +import org.w3c.dom.NodeList;
1.40 +import org.xml.sax.SAXException;
1.41 +
1.42 /**
1.43 + * <p>
1.44 + * Generates dictionary files in StarDict format from source in our XML format.
1.45 + * </p>
1.46 + *
1.47 + * <p>
1.48 + * Number format should be: 32-bits unsigned number in network byte order
1.49 + * </p>
1.50 *
1.51 * @author Ing. František Kučera (frantovo.cz)
1.52 */
1.53 public class Generator {
1.54
1.55 + private static final Logger log = Logger.getLogger(Generator.class.getName());
1.56 + private static final String EMAIL_TOKEN = "ixumhht68";
1.57 + private final DocumentBuilderFactory documentBuilderFactory;
1.58 + private final DocumentBuilder documentBuilder;
1.59 + private final XPathFactory xpathFactory;
1.60 + private final XPath xpath;
1.61 + private final TransformerFactory xslFactory;
1.62 + private final Transformer xsl;
1.63 + private final Charset utf8;
1.64 +
1.65 + public Generator() throws ParserConfigurationException, TransformerConfigurationException {
1.66 + utf8 = Charset.forName("UTF-8");
1.67 +
1.68 + documentBuilderFactory = DocumentBuilderFactory.newInstance();
1.69 + documentBuilderFactory.setNamespaceAware(true);
1.70 + documentBuilder = documentBuilderFactory.newDocumentBuilder();
1.71 +
1.72 + xslFactory = TransformerFactory.newInstance();
1.73 + xsl = xslFactory.newTransformer(new StreamSource("concept.xsl"));
1.74 +
1.75 + xpathFactory = XPathFactory.newInstance();
1.76 + xpath = xpathFactory.newXPath();
1.77 + xpath.setNamespaceContext(getNamespaceContext());
1.78 + }
1.79 +
1.80 + private void generate(File folder, String filePrefix) {
1.81 + File infoFile = new File(folder, filePrefix + ".ifo");
1.82 + File dictFile = new File(folder, filePrefix + ".dict");
1.83 + File indexFile = new File(folder, filePrefix + ".idx");
1.84 + File synonymFile = new File(folder, filePrefix + ".syn");
1.85 +
1.86 + FileOutputStream dictOutputStream = null;
1.87 + DataOutputStream synonymOutputStream = null;
1.88 + DataOutputStream indexOutputStream = null;
1.89 + BufferedWriter infoWriter = null;
1.90 +
1.91 + try {
1.92 + dictOutputStream = new FileOutputStream(dictFile);
1.93 + synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
1.94 + indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile));
1.95 + infoWriter = new BufferedWriter(new FileWriter(infoFile));
1.96 +
1.97 + Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml");
1.98 + XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation");
1.99 + // TODO: tags - labels/descriptions
1.100 + xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
1.101 +
1.102 + /**
1.103 + * TODO: sorting
1.104 + */
1.105 + long offset = 0;
1.106 + long conceptIndex = 0;
1.107 + long synonymCount = 0;
1.108 + for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
1.109 + ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream();
1.110 + xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
1.111 + int length = conceptXhtml.size();
1.112 + dictOutputStream.write(conceptXhtml.toByteArray());
1.113 +
1.114 + NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET);
1.115 + List<String> names = new ArrayList<>();
1.116 +
1.117 + for (Node nameNode : nodeIterable(nameNodes)) {
1.118 + String name = nameNode.getTextContent().trim();
1.119 + if (!name.isEmpty()) {
1.120 + names.add(name);
1.121 + }
1.122 + }
1.123 +
1.124 + synonymCount = +writeSynonyms(synonymOutputStream, names, conceptIndex);
1.125 + writeIndex(indexOutputStream, names.get(0), offset, length);
1.126 +
1.127 + offset = offset + length;
1.128 + conceptIndex++;
1.129 + }
1.130 +
1.131 + indexOutputStream.flush();
1.132 + writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymCount, indexFile.length());
1.133 + } catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
1.134 + log.log(Level.SEVERE, "unable to generate", e);
1.135 + } finally {
1.136 + close(dictOutputStream);
1.137 + close(synonymOutputStream);
1.138 + close(indexOutputStream);
1.139 + close(infoWriter);
1.140 + }
1.141 + }
1.142 +
1.143 + private void writeIndex(DataOutputStream indexOutputStream, String name, long offset, long length) throws IOException {
1.144 + indexOutputStream.write(name.getBytes(utf8));
1.145 + indexOutputStream.write(0);
1.146 + indexOutputStream.writeInt((int) offset); // unsigned int 32
1.147 + indexOutputStream.writeInt((int) length); // unsigned int 32
1.148 + }
1.149 +
1.150 + private int writeSynonyms(DataOutputStream synonymOutputStream, List<String> names, long baseIndex) throws IOException {
1.151 + if (names.size() > 1) {
1.152 + for (int i = 1; i < names.size(); i++) {
1.153 + String name = names.get(i);
1.154 + synonymOutputStream.write(name.getBytes(utf8));
1.155 + synonymOutputStream.write(0);
1.156 + synonymOutputStream.writeInt((int) baseIndex); // unsigned int 32
1.157 + }
1.158 + return names.size() - 1;
1.159 + } else {
1.160 + return 0;
1.161 + }
1.162 + }
1.163 +
1.164 + private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException {
1.165 + // TODO: values from document metadata
1.166 + infoWriter.write("StarDict's dict ifo file\n");
1.167 + infoWriter.write("version=2.4.2\n");
1.168 + infoWriter.write("bookname=Free Telco Dictionary\n");
1.169 + infoWriter.write("wordcount=" + wordcount + "\n");
1.170 + infoWriter.write("synwordcount=" + synwourdcount + "\n");
1.171 + infoWriter.write("idxfilesize=" + idxfilesize + "\n");
1.172 + infoWriter.write("idxoffsetbits=32\n");
1.173 + infoWriter.write("author=František Kučera\n");
1.174 + infoWriter.write("email=telco-dictionary." + EMAIL_TOKEN + "@" + "frantovo.cz\n");
1.175 + infoWriter.write("website=https://telco.frantovo.cz\n");
1.176 + infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL\n");
1.177 + infoWriter.write("date=2013.07.09\n");
1.178 + infoWriter.write("sametypesequence=h\n");
1.179 + }
1.180 +
1.181 public static void main(String[] args) {
1.182 -
1.183 + File outputFolder = new File("../../delivery/free-telco-dictionary");
1.184 + outputFolder.mkdir();
1.185 +
1.186 + try {
1.187 + Generator g = new Generator();
1.188 + g.generate(outputFolder, "telco");
1.189 + } catch (ParserConfigurationException | TransformerConfigurationException e) {
1.190 + log.log(Level.SEVERE, "error during initialization", e);
1.191 + }
1.192 }
1.193 }