franta-hg@13: /**
franta-hg@13: * Free Telco Dictionary
franta-hg@13: * Copyright © 2013 František Kučera (frantovo.cz)
franta-hg@13: *
franta-hg@13: * This program is free software: you can redistribute it and/or modify
franta-hg@13: * it under the terms of the GNU General Public License as published by
franta-hg@13: * the Free Software Foundation, either version 3 of the License, or
franta-hg@13: * (at your option) any later version.
franta-hg@13: *
franta-hg@13: * This program is distributed in the hope that it will be useful,
franta-hg@13: * but WITHOUT ANY WARRANTY; without even the implied warranty of
franta-hg@13: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
franta-hg@13: * GNU General Public License for more details.
franta-hg@13: *
franta-hg@13: * You should have received a copy of the GNU General Public License
franta-hg@13: * along with this program. If not, see <http://www.gnu.org/licenses/>.
franta-hg@13: */
franta-hg@13: package cz.frantovo.telco.dictionary;
franta-hg@13:
franta-hg@15: import static cz.frantovo.telco.dictionary.Xmlns.*;
franta-hg@15: import static cz.frantovo.telco.dictionary.Functions.*;
franta-hg@15: import java.io.BufferedWriter;
franta-hg@15: import java.io.ByteArrayOutputStream;
franta-hg@15: import java.io.DataOutputStream;
franta-hg@15: import java.io.File;
franta-hg@15: import java.io.FileOutputStream;
franta-hg@15: import java.io.FileWriter;
franta-hg@15: import java.io.IOException;
franta-hg@18: import java.text.SimpleDateFormat;
franta-hg@15: import java.util.ArrayList;
franta-hg@18: import java.util.Date;
franta-hg@15: import java.util.List;
franta-hg@17: import java.util.SortedSet;
franta-hg@17: import java.util.TreeSet;
franta-hg@15: import java.util.logging.Level;
franta-hg@15: import java.util.logging.Logger;
franta-hg@15: import javax.xml.parsers.DocumentBuilder;
franta-hg@15: import javax.xml.parsers.DocumentBuilderFactory;
franta-hg@15: import javax.xml.parsers.ParserConfigurationException;
franta-hg@15: import javax.xml.transform.Transformer;
franta-hg@15: import javax.xml.transform.TransformerConfigurationException;
franta-hg@15: import javax.xml.transform.TransformerException;
franta-hg@15: import javax.xml.transform.TransformerFactory;
franta-hg@15: import javax.xml.transform.dom.DOMSource;
franta-hg@15: import javax.xml.transform.stream.StreamResult;
franta-hg@15: import javax.xml.transform.stream.StreamSource;
franta-hg@15: import javax.xml.xpath.XPath;
franta-hg@15: import javax.xml.xpath.XPathConstants;
franta-hg@15: import javax.xml.xpath.XPathExpression;
franta-hg@15: import javax.xml.xpath.XPathExpressionException;
franta-hg@15: import javax.xml.xpath.XPathFactory;
franta-hg@15: import org.w3c.dom.Document;
franta-hg@15: import org.w3c.dom.Node;
franta-hg@15: import org.w3c.dom.NodeList;
franta-hg@15: import org.xml.sax.SAXException;
franta-hg@15:
franta-hg@13: /**
franta-hg@15: *
franta-hg@15: * Generates dictionary files in StarDict format from source in our XML format.
franta-hg@15: *
franta-hg@15: *
franta-hg@15: *
franta-hg@15: * Number format should be: 32-bits unsigned number in network byte order
franta-hg@15: *
franta-hg@13: *
franta-hg@13: * @author Ing. František Kučera (frantovo.cz)
franta-hg@13: */
franta-hg@13: public class Generator {
franta-hg@18:
franta-hg@15: private static final Logger log = Logger.getLogger(Generator.class.getName());
franta-hg@17: private static final String EML_TO_KEN = "ixumhht68";
franta-hg@15: private final DocumentBuilderFactory documentBuilderFactory;
franta-hg@15: private final DocumentBuilder documentBuilder;
franta-hg@15: private final XPathFactory xpathFactory;
franta-hg@15: private final XPath xpath;
franta-hg@15: private final TransformerFactory xslFactory;
franta-hg@15: private final Transformer xsl;
franta-hg@18: private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy.MM.dd");
franta-hg@18:
franta-hg@15: public Generator() throws ParserConfigurationException, TransformerConfigurationException {
franta-hg@15: documentBuilderFactory = DocumentBuilderFactory.newInstance();
franta-hg@15: documentBuilderFactory.setNamespaceAware(true);
franta-hg@15: documentBuilder = documentBuilderFactory.newDocumentBuilder();
franta-hg@18:
franta-hg@15: xslFactory = TransformerFactory.newInstance();
franta-hg@15: xsl = xslFactory.newTransformer(new StreamSource("concept.xsl"));
franta-hg@18:
franta-hg@15: xpathFactory = XPathFactory.newInstance();
franta-hg@15: xpath = xpathFactory.newXPath();
franta-hg@15: xpath.setNamespaceContext(getNamespaceContext());
franta-hg@15: }
franta-hg@18:
franta-hg@15: private void generate(File folder, String filePrefix) {
franta-hg@15: File infoFile = new File(folder, filePrefix + ".ifo");
franta-hg@15: File dictFile = new File(folder, filePrefix + ".dict");
franta-hg@15: File indexFile = new File(folder, filePrefix + ".idx");
franta-hg@15: File synonymFile = new File(folder, filePrefix + ".syn");
franta-hg@18:
franta-hg@15: FileOutputStream dictOutputStream = null;
franta-hg@15: DataOutputStream synonymOutputStream = null;
franta-hg@15: DataOutputStream indexOutputStream = null;
franta-hg@15: BufferedWriter infoWriter = null;
franta-hg@18:
franta-hg@17: SortedSet indexEntries = new TreeSet<>();
franta-hg@17: SortedSet synonymsEntries = new TreeSet<>();
franta-hg@18:
franta-hg@15: try {
franta-hg@15: dictOutputStream = new FileOutputStream(dictFile);
franta-hg@15: synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
franta-hg@15: indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile));
franta-hg@15: infoWriter = new BufferedWriter(new FileWriter(infoFile));
franta-hg@18:
franta-hg@15: Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml");
franta-hg@15: XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation");
franta-hg@15: // TODO: tags - labels/descriptions
franta-hg@15: xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
franta-hg@18:
franta-hg@15: long offset = 0;
franta-hg@15: long conceptIndex = 0;
franta-hg@15: for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
franta-hg@15: ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream();
franta-hg@15: xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
franta-hg@15: int length = conceptXhtml.size();
franta-hg@15: dictOutputStream.write(conceptXhtml.toByteArray());
franta-hg@18:
franta-hg@15: NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET);
franta-hg@15: List names = new ArrayList<>();
franta-hg@18:
franta-hg@15: for (Node nameNode : nodeIterable(nameNodes)) {
franta-hg@15: String name = nameNode.getTextContent().trim();
franta-hg@15: if (!name.isEmpty()) {
franta-hg@15: names.add(name);
franta-hg@15: }
franta-hg@15: }
franta-hg@18:
franta-hg@18: String baseName = names.get(0);
franta-hg@18: IndexEntry indexEntry = new IndexEntry(baseName, offset, length);
franta-hg@17: indexEntries.add(indexEntry);
franta-hg@18:
franta-hg@17: for (int i = 1; i < names.size(); i++) {
franta-hg@17: String name = names.get(i);
franta-hg@18: if (!baseName.equals(name)) {
franta-hg@18: synonymsEntries.add(new SynonymsEntry(indexEntry, name));
franta-hg@18: }
franta-hg@17: }
franta-hg@18:
franta-hg@15: offset = offset + length;
franta-hg@15: conceptIndex++;
franta-hg@15: }
franta-hg@18:
franta-hg@17: writeIndex(indexOutputStream, indexEntries);
franta-hg@17: writeSynonyms(synonymOutputStream, synonymsEntries);
franta-hg@18:
franta-hg@15: indexOutputStream.flush();
franta-hg@17: writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymsEntries.size(), indexFile.length());
franta-hg@15: } catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
franta-hg@15: log.log(Level.SEVERE, "unable to generate", e);
franta-hg@15: } finally {
franta-hg@15: close(dictOutputStream);
franta-hg@15: close(synonymOutputStream);
franta-hg@15: close(indexOutputStream);
franta-hg@15: close(infoWriter);
franta-hg@15: }
franta-hg@15: }
franta-hg@18:
franta-hg@17: private void writeIndex(DataOutputStream indexOutputStream, SortedSet indexEntries) throws IOException {
franta-hg@17: long ordinal = 0;
franta-hg@17: for (IndexEntry e : indexEntries) {
franta-hg@17: e.serialize(indexOutputStream);
franta-hg@17: e.setOrdinal(ordinal++);
franta-hg@17: }
franta-hg@15: }
franta-hg@18:
franta-hg@17: private void writeSynonyms(DataOutputStream synonymOutputStream, SortedSet synonymsEntries) throws IOException {
franta-hg@17: for (SynonymsEntry s : synonymsEntries) {
franta-hg@17: s.serialize(synonymOutputStream);
franta-hg@15: }
franta-hg@15: }
franta-hg@18:
franta-hg@15: private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException {
franta-hg@15: // TODO: values from document metadata
franta-hg@15: infoWriter.write("StarDict's dict ifo file\n");
franta-hg@15: infoWriter.write("version=2.4.2\n");
franta-hg@15: infoWriter.write("bookname=Free Telco Dictionary\n");
franta-hg@15: infoWriter.write("wordcount=" + wordcount + "\n");
franta-hg@15: infoWriter.write("synwordcount=" + synwourdcount + "\n");
franta-hg@15: infoWriter.write("idxfilesize=" + idxfilesize + "\n");
franta-hg@15: infoWriter.write("idxoffsetbits=32\n");
franta-hg@15: infoWriter.write("author=František Kučera\n");
franta-hg@17: infoWriter.write("em" + "ail=telco" + "-dictionary." + EML_TO_KEN + "@" + "fran" + "tovo.cz\n");
franta-hg@15: infoWriter.write("website=https://telco.frantovo.cz\n");
franta-hg@15: infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL\n");
franta-hg@18: infoWriter.write("date=" + dateFormat.format(new Date()) + "\n");
franta-hg@15: infoWriter.write("sametypesequence=h\n");
franta-hg@15: }
franta-hg@18:
franta-hg@13: public static void main(String[] args) {
franta-hg@15: File outputFolder = new File("../../delivery/free-telco-dictionary");
franta-hg@15: outputFolder.mkdir();
franta-hg@18:
franta-hg@15: try {
franta-hg@15: Generator g = new Generator();
franta-hg@15: g.generate(outputFolder, "telco");
franta-hg@15: } catch (ParserConfigurationException | TransformerConfigurationException e) {
franta-hg@15: log.log(Level.SEVERE, "error during initialization", e);
franta-hg@15: }
franta-hg@13: }
franta-hg@13: }