diff -r d78b74a2abe9 -r 93208f791318 java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java --- a/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java Mon Jul 08 23:38:22 2013 +0200 +++ b/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java Tue Jul 09 18:59:07 2013 +0200 @@ -17,13 +17,189 @@ */ package cz.frantovo.telco.dictionary; +import static cz.frantovo.telco.dictionary.Xmlns.*; +import static cz.frantovo.telco.dictionary.Functions.*; +import java.io.BufferedWriter; +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import javax.xml.transform.stream.StreamSource; +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import javax.xml.xpath.XPathExpressionException; +import javax.xml.xpath.XPathFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; + /** + *

+ * Generates dictionary files in StarDict format from source in our XML format. + *

+ * + *

+ * Number format should be: 32-bit unsigned integer in network byte order (big-endian) + *

* * @author Ing. František Kučera (frantovo.cz) */
public class Generator {

    private static final Logger log = Logger.getLogger(Generator.class.getName());
    /** Token embedded in the contact e-mail address written to the .ifo file. */
    private static final String EMAIL_TOKEN = "ixumhht68";
    /** Largest value representable as an unsigned 32-bit integer. */
    private static final long MAX_UNSIGNED_INT32 = 0xFFFFFFFFL;
    private final DocumentBuilderFactory documentBuilderFactory;
    private final DocumentBuilder documentBuilder;
    private final XPathFactory xpathFactory;
    private final XPath xpath;
    private final TransformerFactory xslFactory;
    private final Transformer xsl;
    private final Charset utf8;

    /**
     * Prepares the namespace-aware XML parser, the XSLT transformer loaded from
     * {@code concept.xsl} and the XPath evaluator bound to the project's namespace context.
     *
     * @throws ParserConfigurationException if the document builder cannot be created
     * @throws TransformerConfigurationException if the stylesheet cannot be compiled
     */
    public Generator() throws ParserConfigurationException, TransformerConfigurationException {
        utf8 = Charset.forName("UTF-8");

        documentBuilderFactory = DocumentBuilderFactory.newInstance();
        documentBuilderFactory.setNamespaceAware(true);
        documentBuilder = documentBuilderFactory.newDocumentBuilder();

        xslFactory = TransformerFactory.newInstance();
        xsl = xslFactory.newTransformer(new StreamSource("concept.xsl"));

        xpathFactory = XPathFactory.newInstance();
        xpath = xpathFactory.newXPath();
        xpath.setNamespaceContext(getNamespaceContext());
    }

    /**
     * Reads {@code ../../data/dictionary.xml} and writes the four dictionary files
     * ({@code .ifo}, {@code .dict}, {@code .idx}, {@code .syn}) into the given folder.
     * Failures are logged, not propagated.
     *
     * @param folder     output directory
     * @param filePrefix base name shared by all generated files
     */
    private void generate(File folder, String filePrefix) {
        File infoFile = new File(folder, filePrefix + ".ifo");
        File dictFile = new File(folder, filePrefix + ".dict");
        File indexFile = new File(folder, filePrefix + ".idx");
        File synonymFile = new File(folder, filePrefix + ".syn");

        // try-with-resources closes every stream even when generation fails half-way.
        // The .ifo file contains non-ASCII text, so it is written explicitly as UTF-8
        // instead of the platform default charset.
        try (FileOutputStream dictOutputStream = new FileOutputStream(dictFile);
                DataOutputStream synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
                DataOutputStream indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile));
                BufferedWriter infoWriter = new BufferedWriter(
                        new java.io.OutputStreamWriter(new FileOutputStream(infoFile), utf8))) {

            Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml");
            XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation");
            // TODO: tags - labels/descriptions
            xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));

            /**
             * TODO: sorting
             */
            long offset = 0;
            long conceptIndex = 0;
            long synonymCount = 0;
            for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
                List<String> names = collectNames(termsXPath, conceptNode);
                if (names.isEmpty()) {
                    // Without a name there is nothing to index; skip before touching the .dict
                    // file so that offsets and indexes stay consistent.
                    log.log(Level.WARNING, "skipping concept without any term name");
                    continue;
                }

                ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream();
                xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
                int length = conceptXhtml.size();
                dictOutputStream.write(conceptXhtml.toByteArray());

                // accumulate (the original "= +" overwrote the counter on every iteration)
                synonymCount += writeSynonyms(synonymOutputStream, names, conceptIndex);
                writeIndex(indexOutputStream, names.get(0), offset, length);

                offset += length;
                conceptIndex++;
            }

            // The index size is read from the file system, so everything must be flushed first.
            indexOutputStream.flush();
            // conceptIndex was incremented once per written concept, so it already is the word count.
            writeInfo(infoWriter, sourceDocument, conceptIndex, synonymCount, indexFile.length());
        } catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
            log.log(Level.SEVERE, "unable to generate", e);
        }
    }

    /**
     * Collects the trimmed, non-empty term names (complete forms and abbreviations) of one concept.
     *
     * @param termsXPath  compiled expression selecting the name attributes
     * @param conceptNode concept element to evaluate the expression against
     * @return names in the order returned by the XPath engine; never {@code null}
     * @throws XPathExpressionException if the expression cannot be evaluated
     */
    private List<String> collectNames(XPathExpression termsXPath, Node conceptNode) throws XPathExpressionException {
        NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET);
        List<String> names = new ArrayList<>();
        for (Node nameNode : nodeIterable(nameNodes)) {
            String name = nameNode.getTextContent().trim();
            if (!name.isEmpty()) {
                names.add(name);
            }
        }
        return names;
    }

    /**
     * Appends one index record: UTF-8 name, a zero byte, article offset and article length.
     *
     * @param indexOutputStream target .idx stream
     * @param name              headword of the article
     * @param offset            byte offset of the article in the .dict file
     * @param length            byte length of the article
     * @throws IOException on write failure or when a number does not fit into 32 bits
     */
    private void writeIndex(DataOutputStream indexOutputStream, String name, long offset, long length) throws IOException {
        indexOutputStream.write(name.getBytes(utf8));
        indexOutputStream.write(0);
        writeUnsignedInt32(indexOutputStream, offset);
        writeUnsignedInt32(indexOutputStream, length);
    }

    /**
     * Appends one synonym record (UTF-8 name, a zero byte, index of the main entry) for every
     * name except the first one, which is the headword stored in the index file.
     *
     * @param synonymOutputStream target .syn stream
     * @param names               all names of the concept, headword first
     * @param baseIndex           position of the concept's headword in the index file
     * @return number of synonym records written
     * @throws IOException on write failure or when the index does not fit into 32 bits
     */
    private int writeSynonyms(DataOutputStream synonymOutputStream, List<String> names, long baseIndex) throws IOException {
        int written = 0;
        for (int i = 1; i < names.size(); i++) {
            synonymOutputStream.write(names.get(i).getBytes(utf8));
            synonymOutputStream.write(0);
            writeUnsignedInt32(synonymOutputStream, baseIndex);
            written++;
        }
        return written;
    }

    /**
     * Writes a value as an unsigned 32-bit integer; {@link DataOutputStream#writeInt(int)} emits
     * the high byte first, i.e. network byte order.
     *
     * @throws IOException if the value is outside the unsigned 32-bit range (instead of being
     *                     silently truncated by the cast) or on write failure
     */
    private static void writeUnsignedInt32(DataOutputStream out, long value) throws IOException {
        if (value < 0 || value > MAX_UNSIGNED_INT32) {
            throw new IOException("value does not fit into unsigned 32-bit integer: " + value);
        }
        out.writeInt((int) value);
    }

    /**
     * Writes the .ifo metadata file.
     *
     * @param infoWriter     target writer
     * @param sourceDocument source dictionary (currently unused, see TODO)
     * @param wordcount      number of records in the index file
     * @param synwordcount   number of records in the synonym file
     * @param idxfilesize    size of the index file in bytes
     * @throws IOException on write failure
     */
    private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwordcount, long idxfilesize) throws IOException {
        // TODO: values from document metadata
        infoWriter.write("StarDict's dict ifo file\n");
        infoWriter.write("version=2.4.2\n");
        infoWriter.write("bookname=Free Telco Dictionary\n");
        infoWriter.write("wordcount=" + wordcount + "\n");
        infoWriter.write("synwordcount=" + synwordcount + "\n");
        infoWriter.write("idxfilesize=" + idxfilesize + "\n");
        infoWriter.write("idxoffsetbits=32\n");
        infoWriter.write("author=František Kučera\n");
        infoWriter.write("email=telco-dictionary." + EMAIL_TOKEN + "@" + "frantovo.cz\n");
        infoWriter.write("website=https://telco.frantovo.cz\n");
        infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL\n");
        infoWriter.write("date=2013.07.09\n");
        infoWriter.write("sametypesequence=h\n");
    }

    /**
     * Command-line entry point: creates the delivery folder and generates the dictionary into it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File outputFolder = new File("../../delivery/free-telco-dictionary");
        // mkdirs() also creates missing parent folders; report the failure instead of ignoring it.
        if (!outputFolder.isDirectory() && !outputFolder.mkdirs()) {
            log.log(Level.SEVERE, "unable to create output folder: {0}", outputFolder);
            return;
        }

        try {
            Generator g = new Generator();
            g.generate(outputFolder, "telco");
        } catch (ParserConfigurationException | TransformerConfigurationException e) {
            log.log(Level.SEVERE, "error during initialization", e);
        }
    }
}