java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java
author František Kučera <franta-hg@frantovo.cz>
Tue, 09 Jul 2013 22:41:49 +0200
changeset 16 939fa8d8663e
parent 15 93208f791318
child 17 b188eae2c092
permissions -rw-r--r--
data: MSIN/MIN
     1 /**
     2  * Free Telco Dictionary
     3  * Copyright © 2013 František Kučera (frantovo.cz)
     4  *
     5  * This program is free software: you can redistribute it and/or modify
     6  * it under the terms of the GNU General Public License as published by
     7  * the Free Software Foundation, either version 3 of the License, or
     8  * (at your option) any later version.
     9  *
    10  * This program is distributed in the hope that it will be useful,
    11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
    12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    13  * GNU General Public License for more details.
    14  *
    15  * You should have received a copy of the GNU General Public License
    16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
    17  */
    18 package cz.frantovo.telco.dictionary;
    19 
import static cz.frantovo.telco.dictionary.Xmlns.*;
import static cz.frantovo.telco.dictionary.Functions.*;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
    54 
    55 /**
    56  * <p>
    57  * Generates dictionary files in StarDict format from source in our XML format.
    58  * </p>
    59  *
    60  * <p>
    61  * Number format should be: 32-bits unsigned number in network byte order
    62  * </p>
    63  *
    64  * @author Ing. František Kučera (frantovo.cz)
    65  */
    66 public class Generator {
    67 
    68 	private static final Logger log = Logger.getLogger(Generator.class.getName());
    69 	private static final String EMAIL_TOKEN = "ixumhht68";
    70 	private final DocumentBuilderFactory documentBuilderFactory;
    71 	private final DocumentBuilder documentBuilder;
    72 	private final XPathFactory xpathFactory;
    73 	private final XPath xpath;
    74 	private final TransformerFactory xslFactory;
    75 	private final Transformer xsl;
    76 	private final Charset utf8;
    77 
    78 	public Generator() throws ParserConfigurationException, TransformerConfigurationException {
    79 		utf8 = Charset.forName("UTF-8");
    80 
    81 		documentBuilderFactory = DocumentBuilderFactory.newInstance();
    82 		documentBuilderFactory.setNamespaceAware(true);
    83 		documentBuilder = documentBuilderFactory.newDocumentBuilder();
    84 
    85 		xslFactory = TransformerFactory.newInstance();
    86 		xsl = xslFactory.newTransformer(new StreamSource("concept.xsl"));
    87 
    88 		xpathFactory = XPathFactory.newInstance();
    89 		xpath = xpathFactory.newXPath();
    90 		xpath.setNamespaceContext(getNamespaceContext());
    91 	}
    92 
    93 	private void generate(File folder, String filePrefix) {
    94 		File infoFile = new File(folder, filePrefix + ".ifo");
    95 		File dictFile = new File(folder, filePrefix + ".dict");
    96 		File indexFile = new File(folder, filePrefix + ".idx");
    97 		File synonymFile = new File(folder, filePrefix + ".syn");
    98 
    99 		FileOutputStream dictOutputStream = null;
   100 		DataOutputStream synonymOutputStream = null;
   101 		DataOutputStream indexOutputStream = null;
   102 		BufferedWriter infoWriter = null;
   103 
   104 		try {
   105 			dictOutputStream = new FileOutputStream(dictFile);
   106 			synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
   107 			indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile));
   108 			infoWriter = new BufferedWriter(new FileWriter(infoFile));
   109 
   110 			Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml");
   111 			XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation");
   112 			// TODO: tags - labels/descriptions
   113 			xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
   114 
   115 			/**
   116 			 * TODO: sorting
   117 			 */
   118 			long offset = 0;
   119 			long conceptIndex = 0;
   120 			long synonymCount = 0;
   121 			for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
   122 				ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream();
   123 				xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
   124 				int length = conceptXhtml.size();
   125 				dictOutputStream.write(conceptXhtml.toByteArray());
   126 
   127 				NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET);
   128 				List<String> names = new ArrayList<>();
   129 
   130 				for (Node nameNode : nodeIterable(nameNodes)) {
   131 					String name = nameNode.getTextContent().trim();
   132 					if (!name.isEmpty()) {
   133 						names.add(name);
   134 					}
   135 				}
   136 
   137 				synonymCount = +writeSynonyms(synonymOutputStream, names, conceptIndex);
   138 				writeIndex(indexOutputStream, names.get(0), offset, length);
   139 
   140 				offset = offset + length;
   141 				conceptIndex++;
   142 			}
   143 
   144 			indexOutputStream.flush();
   145 			writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymCount, indexFile.length());
   146 		} catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
   147 			log.log(Level.SEVERE, "unable to generate", e);
   148 		} finally {
   149 			close(dictOutputStream);
   150 			close(synonymOutputStream);
   151 			close(indexOutputStream);
   152 			close(infoWriter);
   153 		}
   154 	}
   155 
   156 	private void writeIndex(DataOutputStream indexOutputStream, String name, long offset, long length) throws IOException {
   157 		indexOutputStream.write(name.getBytes(utf8));
   158 		indexOutputStream.write(0);
   159 		indexOutputStream.writeInt((int) offset); // unsigned int 32
   160 		indexOutputStream.writeInt((int) length); // unsigned int 32
   161 	}
   162 
   163 	private int writeSynonyms(DataOutputStream synonymOutputStream, List<String> names, long baseIndex) throws IOException {
   164 		if (names.size() > 1) {
   165 			for (int i = 1; i < names.size(); i++) {
   166 				String name = names.get(i);
   167 				synonymOutputStream.write(name.getBytes(utf8));
   168 				synonymOutputStream.write(0);
   169 				synonymOutputStream.writeInt((int) baseIndex); // unsigned int 32
   170 			}
   171 			return names.size() - 1;
   172 		} else {
   173 			return 0;
   174 		}
   175 	}
   176 
   177 	private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException {
   178 		// TODO: values from document metadata
   179 		infoWriter.write("StarDict's dict ifo file\n");
   180 		infoWriter.write("version=2.4.2\n");
   181 		infoWriter.write("bookname=Free Telco Dictionary\n");
   182 		infoWriter.write("wordcount=" + wordcount + "\n");
   183 		infoWriter.write("synwordcount=" + synwourdcount + "\n");
   184 		infoWriter.write("idxfilesize=" + idxfilesize + "\n");
   185 		infoWriter.write("idxoffsetbits=32\n");
   186 		infoWriter.write("author=František Kučera\n");
   187 		infoWriter.write("email=telco-dictionary." + EMAIL_TOKEN + "@" + "frantovo.cz\n");
   188 		infoWriter.write("website=https://telco.frantovo.cz\n");
   189 		infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL\n");
   190 		infoWriter.write("date=2013.07.09\n");
   191 		infoWriter.write("sametypesequence=h\n");
   192 	}
   193 
   194 	public static void main(String[] args) {
   195 		File outputFolder = new File("../../delivery/free-telco-dictionary");
   196 		outputFolder.mkdir();
   197 
   198 		try {
   199 			Generator g = new Generator();
   200 			g.generate(outputFolder, "telco");
   201 		} catch (ParserConfigurationException | TransformerConfigurationException e) {
   202 			log.log(Level.SEVERE, "error during initialization", e);
   203 		}
   204 	}
   205 }