# HG changeset patch # User František Kučera # Date 1373389147 -7200 # Node ID 93208f791318fe11e2f941d9799ac4632b21cf3e # Parent d78b74a2abe9b288280b7d1c503561e832d78b81 generator: first version diff -r d78b74a2abe9 -r 93208f791318 .hgignore --- a/.hgignore Mon Jul 08 23:38:22 2013 +0200 +++ b/.hgignore Tue Jul 09 18:59:07 2013 +0200 @@ -1,4 +1,5 @@ temp/* +delivery/* data/schemas.xml java/dictionary-generator/nbproject/private java/dictionary-generator/build/* diff -r d78b74a2abe9 -r 93208f791318 data/dictionary.xml --- a/data/dictionary.xml Mon Jul 08 23:38:22 2013 +0200 +++ b/data/dictionary.xml Tue Jul 09 18:59:07 2013 +0200 @@ -32,7 +32,7 @@ - + @@ -1259,7 +1259,7 @@ computer - + diff -r d78b74a2abe9 -r 93208f791318 java/dictionary-generator/concept.xsl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/java/dictionary-generator/concept.xsl Tue Jul 09 18:59:07 2013 +0200 @@ -0,0 +1,72 @@ + + + + + + + +
+ + + + + + + + + + + +
+ +

+ + +

+ Tags: + + + + + , + +

+
+
+
+ +
/**
 * {@link NamespaceContext} backed by a DOM {@link Document}: every lookup is
 * delegated to {@link Document#lookupNamespaceURI(String)} or
 * {@link Document#lookupPrefix(String)}.
 *
 * @author Ing. František Kučera (frantovo.cz)
 */
public class DocumentNamespaceContext implements NamespaceContext {

    private final Document document;

    /**
     * @param document document whose namespace declarations are consulted
     */
    public DocumentNamespaceContext(Document document) {
        this.document = document;
    }

    /**
     * Resolves a prefix to its namespace URI.
     *
     * @param prefix prefix to resolve; the empty string means the default namespace
     * @return whatever the document reports for the prefix (may be {@code null})
     * @throws IllegalArgumentException if {@code prefix} is {@code null}
     */
    @Override
    public String getNamespaceURI(String prefix) {
        if (prefix == null) {
            // The NamespaceContext contract asks for IllegalArgumentException here,
            // the original code failed with a NullPointerException instead.
            throw new IllegalArgumentException("prefix must not be null");
        }
        if (XMLConstants.DEFAULT_NS_PREFIX.equals(prefix)) {
            // DOM uses null (not "") to ask for the default namespace.
            return document.lookupNamespaceURI(null);
        }
        return document.lookupNamespaceURI(prefix);
    }

    /**
     * @param xmlns namespace URI
     * @return the prefix the document reports for the URI, or {@code null} if there is none
     */
    @Override
    public String getPrefix(String xmlns) {
        return document.lookupPrefix(xmlns);
    }

    /**
     * Returns at most one prefix: the one found by {@link #getPrefix(String)}.
     * TODO: support multiple prefixes
     *
     * @param xmlns namespace URI
     * @return iterator over zero or one prefix; it never yields {@code null}
     */
    @Override
    public Iterator<String> getPrefixes(String xmlns) {
        String prefix = getPrefix(xmlns);
        if (prefix == null) {
            return java.util.Collections.<String>emptyList().iterator();
        }
        return java.util.Collections.singletonList(prefix).iterator();
    }
}
/**
 * Static helper functions shared by the dictionary generator.
 *
 * @author Ing. František Kučera (frantovo.cz)
 */
public class Functions {

    private static final Logger log = Logger.getLogger(Functions.class.getName());

    /**
     * Adapts a DOM {@link NodeList} so it can be used in a for-each loop.
     *
     * @param list list to iterate; its length is re-read on every {@code hasNext()} call
     * @return iterable whose iterators walk the list from index 0 and do not support removal
     */
    public static Iterable<Node> nodeIterable(final NodeList list) {
        return new Iterable<Node>() {
            @Override
            public Iterator<Node> iterator() {
                return new Iterator<Node>() {
                    private int position;

                    @Override
                    public boolean hasNext() {
                        return position < list.getLength();
                    }

                    @Override
                    public Node next() {
                        if (!hasNext()) {
                            // Honour the Iterator contract instead of silently returning null.
                            throw new java.util.NoSuchElementException("no more nodes, position = " + position);
                        }
                        return list.item(position++);
                    }

                    @Override
                    public void remove() {
                        throw new UnsupportedOperationException("remove not supported");
                    }
                };
            }
        };
    }

    /**
     * Closes a resource without propagating failures; a failure is logged as a warning.
     *
     * @param c resource to close; {@code null} is ignored so the method is safe in
     *          {@code finally} blocks where the resource may never have been opened
     */
    public static void close(Closeable c) {
        if (c == null) {
            return;
        }
        try {
            c.close();
        } catch (IOException | RuntimeException e) {
            log.log(Level.WARNING, "closing of " + c + " has failed", e);
        }
    }

    /**
     * Null-safe equality.
     *
     * @return {@code true} if both arguments are {@code null} or {@code a.equals(b)}
     */
    public static boolean equalz(Object a, Object b) {
        return a == null ? b == null : a.equals(b);
    }

    /**
     * Finds all keys mapped to the given value.
     *
     * @param map map to search
     * @param value value to look for (may be {@code null}); compared with {@link #equalz(Object, Object)}
     * @return new mutable set of matching keys, empty if there is none
     */
    public static <K, V> Set<K> getKeysForValue(Map<K, V> map, V value) {
        Set<K> keysFound = new HashSet<>();

        for (Entry<K, V> entry : map.entrySet()) {
            if (equalz(value, entry.getValue())) {
                keysFound.add(entry.getKey());
            }
        }

        return keysFound;
    }

    private Functions() {
    }
}
org.xml.sax.SAXException; + /** + *

+ * Generates dictionary files in StarDict format from source in our XML format. + *

+ * + *

+ * Number format should be: 32-bits unsigned number in network byte order + *

* * @author Ing. František Kučera (frantovo.cz) */ public class Generator { + private static final Logger log = Logger.getLogger(Generator.class.getName()); + private static final String EMAIL_TOKEN = "ixumhht68"; + private final DocumentBuilderFactory documentBuilderFactory; + private final DocumentBuilder documentBuilder; + private final XPathFactory xpathFactory; + private final XPath xpath; + private final TransformerFactory xslFactory; + private final Transformer xsl; + private final Charset utf8; + + public Generator() throws ParserConfigurationException, TransformerConfigurationException { + utf8 = Charset.forName("UTF-8"); + + documentBuilderFactory = DocumentBuilderFactory.newInstance(); + documentBuilderFactory.setNamespaceAware(true); + documentBuilder = documentBuilderFactory.newDocumentBuilder(); + + xslFactory = TransformerFactory.newInstance(); + xsl = xslFactory.newTransformer(new StreamSource("concept.xsl")); + + xpathFactory = XPathFactory.newInstance(); + xpath = xpathFactory.newXPath(); + xpath.setNamespaceContext(getNamespaceContext()); + } + + private void generate(File folder, String filePrefix) { + File infoFile = new File(folder, filePrefix + ".ifo"); + File dictFile = new File(folder, filePrefix + ".dict"); + File indexFile = new File(folder, filePrefix + ".idx"); + File synonymFile = new File(folder, filePrefix + ".syn"); + + FileOutputStream dictOutputStream = null; + DataOutputStream synonymOutputStream = null; + DataOutputStream indexOutputStream = null; + BufferedWriter infoWriter = null; + + try { + dictOutputStream = new FileOutputStream(dictFile); + synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile)); + indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile)); + infoWriter = new BufferedWriter(new FileWriter(infoFile)); + + Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml"); + XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation"); 
+ // TODO: tags - labels/descriptions + xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0)); + + /** + * TODO: sorting + */ + long offset = 0; + long conceptIndex = 0; + long synonymCount = 0; + for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) { + ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream(); + xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml)); + int length = conceptXhtml.size(); + dictOutputStream.write(conceptXhtml.toByteArray()); + + NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET); + List names = new ArrayList<>(); + + for (Node nameNode : nodeIterable(nameNodes)) { + String name = nameNode.getTextContent().trim(); + if (!name.isEmpty()) { + names.add(name); + } + } + + synonymCount = +writeSynonyms(synonymOutputStream, names, conceptIndex); + writeIndex(indexOutputStream, names.get(0), offset, length); + + offset = offset + length; + conceptIndex++; + } + + indexOutputStream.flush(); + writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymCount, indexFile.length()); + } catch (SAXException | IOException | TransformerException | XPathExpressionException e) { + log.log(Level.SEVERE, "unable to generate", e); + } finally { + close(dictOutputStream); + close(synonymOutputStream); + close(indexOutputStream); + close(infoWriter); + } + } + + private void writeIndex(DataOutputStream indexOutputStream, String name, long offset, long length) throws IOException { + indexOutputStream.write(name.getBytes(utf8)); + indexOutputStream.write(0); + indexOutputStream.writeInt((int) offset); // unsigned int 32 + indexOutputStream.writeInt((int) length); // unsigned int 32 + } + + private int writeSynonyms(DataOutputStream synonymOutputStream, List names, long baseIndex) throws IOException { + if (names.size() > 1) { + for (int i = 1; i < names.size(); i++) { + String name = names.get(i); + 
synonymOutputStream.write(name.getBytes(utf8)); + synonymOutputStream.write(0); + synonymOutputStream.writeInt((int) baseIndex); // unsigned int 32 + } + return names.size() - 1; + } else { + return 0; + } + } + + private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException { + // TODO: values from document metadata + infoWriter.write("StarDict's dict ifo file\n"); + infoWriter.write("version=2.4.2\n"); + infoWriter.write("bookname=Free Telco Dictionary\n"); + infoWriter.write("wordcount=" + wordcount + "\n"); + infoWriter.write("synwordcount=" + synwourdcount + "\n"); + infoWriter.write("idxfilesize=" + idxfilesize + "\n"); + infoWriter.write("idxoffsetbits=32\n"); + infoWriter.write("author=František Kučera\n"); + infoWriter.write("email=telco-dictionary." + EMAIL_TOKEN + "@" + "frantovo.cz\n"); + infoWriter.write("website=https://telco.frantovo.cz\n"); + infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL\n"); + infoWriter.write("date=2013.07.09\n"); + infoWriter.write("sametypesequence=h\n"); + } + public static void main(String[] args) { - + File outputFolder = new File("../../delivery/free-telco-dictionary"); + outputFolder.mkdir(); + + try { + Generator g = new Generator(); + g.generate(outputFolder, "telco"); + } catch (ParserConfigurationException | TransformerConfigurationException e) { + log.log(Level.SEVERE, "error during initialization", e); + } } } diff -r d78b74a2abe9 -r 93208f791318 java/dictionary-generator/src/cz/frantovo/telco/dictionary/IndexEntry.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/java/dictionary-generator/src/cz/frantovo/telco/dictionary/IndexEntry.java Tue Jul 09 18:59:07 2013 +0200 @@ -0,0 +1,30 @@ +/** + * Free Telco Dictionary + * Copyright © 2013 František Kučera (frantovo.cz) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General 
/**
 * Placeholder for one record of the index file; it has no state yet and
 * ordering is not implemented.
 *
 * @author Ing. František Kučera (frantovo.cz)
 */
public class IndexEntry implements Comparable<IndexEntry> {

    /**
     * Not implemented yet.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public int compareTo(IndexEntry o) {
        throw new UnsupportedOperationException("Not supported yet.");
    }
}
/**
 * {@link NamespaceContext} backed by a prefix → namespace URI map.
 *
 * @author Ing. František Kučera (frantovo.cz)
 */
public class MappedNamespaceContext implements NamespaceContext {

    /**
     * maps prefix to namespace URI
     */
    private final Map<String, String> prefixMap;

    /**
     * @param prefixMap maps prefix to namespace URI; the map is used directly, not copied
     */
    public MappedNamespaceContext(Map<String, String> prefixMap) {
        this.prefixMap = prefixMap;
    }

    /**
     * @param prefix prefix to resolve
     * @return mapped namespace URI, or {@code null} if the prefix is not in the map
     * @throws IllegalArgumentException if {@code prefix} is {@code null}
     */
    @Override
    public String getNamespaceURI(String prefix) {
        if (prefix == null) {
            throw new IllegalArgumentException("prefix must not be null");
        }
        return prefixMap.get(prefix);
    }

    /**
     * @param xmlns namespace URI
     * @return the first prefix (in the map's iteration order) mapped to the URI,
     *         or {@code null} if there is none
     */
    @Override
    public String getPrefix(String xmlns) {
        for (Entry<String, String> e : prefixMap.entrySet()) {
            if (sameUri(xmlns, e.getValue())) {
                return e.getKey();
            }
        }
        return null;
    }

    /**
     * @param xmlns namespace URI
     * @return read-only iterator over every prefix mapped to the URI (in the map's
     *         iteration order); empty if there is none
     */
    @Override
    public Iterator<String> getPrefixes(String xmlns) {
        java.util.List<String> prefixes = new java.util.ArrayList<>();
        for (Entry<String, String> e : prefixMap.entrySet()) {
            if (sameUri(xmlns, e.getValue())) {
                prefixes.add(e.getKey());
            }
        }
        // Unmodifiable view, so Iterator.remove() is rejected.
        return java.util.Collections.unmodifiableList(prefixes).iterator();
    }

    /** Null-safe string equality. */
    private static boolean sameUri(String expected, String actual) {
        return expected == null ? actual == null : expected.equals(actual);
    }
}
/**
 * Iterator that yields exactly one item and is then exhausted.
 *
 * @param <T> type of the item
 * @author Ing. František Kučera (frantovo.cz)
 */
public class OneItemIterator<T> implements Iterator<T> {

    private final T item;
    private boolean unused = true;

    /**
     * @param item the only item this iterator will return (may be {@code null})
     */
    public OneItemIterator(T item) {
        this.item = item;
    }

    @Override
    public boolean hasNext() {
        return unused;
    }

    /**
     * @return the item, on the first call only
     * @throws java.util.NoSuchElementException on every subsequent call
     */
    @Override
    public T next() {
        if (!unused) {
            throw new java.util.NoSuchElementException("the only item was already returned");
        }
        // Mark the item as consumed; without this hasNext() stayed true forever
        // and any loop over the iterator never terminated.
        unused = false;
        return item;
    }

    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove not supported");
    }
}
+ } +} diff -r d78b74a2abe9 -r 93208f791318 java/dictionary-generator/src/cz/frantovo/telco/dictionary/Xmlns.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Xmlns.java Tue Jul 09 18:59:07 2013 +0200 @@ -0,0 +1,41 @@ +/** + * Free Telco Dictionary + * Copyright © 2013 František Kučera (frantovo.cz) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package cz.frantovo.telco.dictionary; + +import java.util.HashMap; +import java.util.Map; +import javax.xml.namespace.NamespaceContext; + +/** + * XML namespaces used in Telco dictionary + * + * @author Ing. František Kučera (frantovo.cz) + */ +public class Xmlns { + + public static final String DICTIONARY = "https://telco.frantovo.cz/xmlns/dictionary"; + + public static NamespaceContext getNamespaceContext() { + Map prefixMap = new HashMap<>(); + prefixMap.put("d", DICTIONARY); + return new MappedNamespaceContext(prefixMap); + } + + private Xmlns() { + } +}