java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java
author František Kučera <franta-hg@frantovo.cz>
Tue, 09 Jul 2013 22:42:32 +0200
changeset 17 b188eae2c092
parent 15 93208f791318
child 18 7a2eb4cb6ff1
permissions -rw-r--r--
generator: sorted index and synonyms
     1 /**
     2  * Free Telco Dictionary
     3  * Copyright © 2013 František Kučera (frantovo.cz)
     4  *
     5  * This program is free software: you can redistribute it and/or modify
     6  * it under the terms of the GNU General Public License as published by
     7  * the Free Software Foundation, either version 3 of the License, or
     8  * (at your option) any later version.
     9  *
    10  * This program is distributed in the hope that it will be useful,
    11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
    12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    13  * GNU General Public License for more details.
    14  *
    15  * You should have received a copy of the GNU General Public License
    16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
    17  */
    18 package cz.frantovo.telco.dictionary;
    19 
import static cz.frantovo.telco.dictionary.Xmlns.*;
import static cz.frantovo.telco.dictionary.Functions.*;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
    54 
    55 /**
    56  * <p>
    57  * Generates dictionary files in StarDict format from source in our XML format.
    58  * </p>
    59  *
    60  * <p>
    61  * Number format should be: 32-bits unsigned number in network byte order
    62  * </p>
    63  *
    64  * @author Ing. František Kučera (frantovo.cz)
    65  */
    66 public class Generator {
    67 
    68 	private static final Logger log = Logger.getLogger(Generator.class.getName());
    69 	private static final String EML_TO_KEN = "ixumhht68";
    70 	private final DocumentBuilderFactory documentBuilderFactory;
    71 	private final DocumentBuilder documentBuilder;
    72 	private final XPathFactory xpathFactory;
    73 	private final XPath xpath;
    74 	private final TransformerFactory xslFactory;
    75 	private final Transformer xsl;
    76 
    77 	public Generator() throws ParserConfigurationException, TransformerConfigurationException {
    78 		documentBuilderFactory = DocumentBuilderFactory.newInstance();
    79 		documentBuilderFactory.setNamespaceAware(true);
    80 		documentBuilder = documentBuilderFactory.newDocumentBuilder();
    81 
    82 		xslFactory = TransformerFactory.newInstance();
    83 		xsl = xslFactory.newTransformer(new StreamSource("concept.xsl"));
    84 
    85 		xpathFactory = XPathFactory.newInstance();
    86 		xpath = xpathFactory.newXPath();
    87 		xpath.setNamespaceContext(getNamespaceContext());
    88 	}
    89 
    90 	private void generate(File folder, String filePrefix) {
    91 		File infoFile = new File(folder, filePrefix + ".ifo");
    92 		File dictFile = new File(folder, filePrefix + ".dict");
    93 		File indexFile = new File(folder, filePrefix + ".idx");
    94 		File synonymFile = new File(folder, filePrefix + ".syn");
    95 
    96 		FileOutputStream dictOutputStream = null;
    97 		DataOutputStream synonymOutputStream = null;
    98 		DataOutputStream indexOutputStream = null;
    99 		BufferedWriter infoWriter = null;
   100 
   101 		SortedSet<IndexEntry> indexEntries = new TreeSet<>();
   102 		SortedSet<SynonymsEntry> synonymsEntries = new TreeSet<>();
   103 
   104 		try {
   105 			dictOutputStream = new FileOutputStream(dictFile);
   106 			synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
   107 			indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile));
   108 			infoWriter = new BufferedWriter(new FileWriter(infoFile));
   109 
   110 			Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml");
   111 			XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation");
   112 			// TODO: tags - labels/descriptions
   113 			xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
   114 
   115 			long offset = 0;
   116 			long conceptIndex = 0;
   117 			for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
   118 				ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream();
   119 				xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
   120 				int length = conceptXhtml.size();
   121 				dictOutputStream.write(conceptXhtml.toByteArray());
   122 
   123 				NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET);
   124 				List<String> names = new ArrayList<>();
   125 
   126 				for (Node nameNode : nodeIterable(nameNodes)) {
   127 					String name = nameNode.getTextContent().trim();
   128 					if (!name.isEmpty()) {
   129 						names.add(name);
   130 					}
   131 				}
   132 
   133 				IndexEntry indexEntry = new IndexEntry(names.get(0), offset, length);
   134 				indexEntries.add(indexEntry);
   135 
   136 				for (int i = 1; i < names.size(); i++) {
   137 					String name = names.get(i);
   138 					synonymsEntries.add(new SynonymsEntry(indexEntry, name));
   139 				}
   140 
   141 				offset = offset + length;
   142 				conceptIndex++;
   143 			}
   144 
   145 			writeIndex(indexOutputStream, indexEntries);
   146 			writeSynonyms(synonymOutputStream, synonymsEntries);
   147 
   148 			indexOutputStream.flush();
   149 			writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymsEntries.size(), indexFile.length());
   150 		} catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
   151 			log.log(Level.SEVERE, "unable to generate", e);
   152 		} finally {
   153 			close(dictOutputStream);
   154 			close(synonymOutputStream);
   155 			close(indexOutputStream);
   156 			close(infoWriter);
   157 		}
   158 	}
   159 
   160 	private void writeIndex(DataOutputStream indexOutputStream, SortedSet<IndexEntry> indexEntries) throws IOException {
   161 		long ordinal = 0;
   162 		for (IndexEntry e : indexEntries) {
   163 			e.serialize(indexOutputStream);
   164 			e.setOrdinal(ordinal++);
   165 		}
   166 	}
   167 
   168 	private void writeSynonyms(DataOutputStream synonymOutputStream, SortedSet<SynonymsEntry> synonymsEntries) throws IOException {
   169 		for (SynonymsEntry s : synonymsEntries) {
   170 			s.serialize(synonymOutputStream);
   171 		}
   172 	}
   173 
   174 	private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException {
   175 		// TODO: values from document metadata
   176 		infoWriter.write("StarDict's dict ifo file\n");
   177 		infoWriter.write("version=2.4.2\n");
   178 		infoWriter.write("bookname=Free Telco Dictionary\n");
   179 		infoWriter.write("wordcount=" + wordcount + "\n");
   180 		infoWriter.write("synwordcount=" + synwourdcount + "\n");
   181 		infoWriter.write("idxfilesize=" + idxfilesize + "\n");
   182 		infoWriter.write("idxoffsetbits=32\n");
   183 		infoWriter.write("author=František Kučera\n");
   184 		infoWriter.write("em" + "ail=telco" + "-dictionary." + EML_TO_KEN + "@" + "fran" + "tovo.cz\n");
   185 		infoWriter.write("website=https://telco.frantovo.cz\n");
   186 		infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL\n");
   187 		infoWriter.write("date=2013.07.09\n");
   188 		infoWriter.write("sametypesequence=h\n");
   189 	}
   190 
   191 	public static void main(String[] args) {
   192 		File outputFolder = new File("../../delivery/free-telco-dictionary");
   193 		outputFolder.mkdir();
   194 
   195 		try {
   196 			Generator g = new Generator();
   197 			g.generate(outputFolder, "telco");
   198 		} catch (ParserConfigurationException | TransformerConfigurationException e) {
   199 			log.log(Level.SEVERE, "error during initialization", e);
   200 		}
   201 	}
   202 }