/**
 * Free Telco Dictionary
 * Copyright © 2013 František Kučera (frantovo.cz)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
18 package cz.frantovo.telco.dictionary;
import static cz.frantovo.telco.dictionary.Xmlns.*;
import static cz.frantovo.telco.dictionary.Functions.*;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
57 * Generates dictionary files in StarDict format from source in our XML format.
61 * Number format should be: 32-bits unsigned number in network byte order
64 * @author Ing. František Kučera (frantovo.cz)
66 public class Generator {
68 private static final Logger log = Logger.getLogger(Generator.class.getName());
69 private static final String EMAIL_TOKEN = "ixumhht68";
70 private final DocumentBuilderFactory documentBuilderFactory;
71 private final DocumentBuilder documentBuilder;
72 private final XPathFactory xpathFactory;
73 private final XPath xpath;
74 private final TransformerFactory xslFactory;
75 private final Transformer xsl;
76 private final Charset utf8;
78 public Generator() throws ParserConfigurationException, TransformerConfigurationException {
79 utf8 = Charset.forName("UTF-8");
81 documentBuilderFactory = DocumentBuilderFactory.newInstance();
82 documentBuilderFactory.setNamespaceAware(true);
83 documentBuilder = documentBuilderFactory.newDocumentBuilder();
85 xslFactory = TransformerFactory.newInstance();
86 xsl = xslFactory.newTransformer(new StreamSource("concept.xsl"));
88 xpathFactory = XPathFactory.newInstance();
89 xpath = xpathFactory.newXPath();
90 xpath.setNamespaceContext(getNamespaceContext());
93 private void generate(File folder, String filePrefix) {
94 File infoFile = new File(folder, filePrefix + ".ifo");
95 File dictFile = new File(folder, filePrefix + ".dict");
96 File indexFile = new File(folder, filePrefix + ".idx");
97 File synonymFile = new File(folder, filePrefix + ".syn");
99 FileOutputStream dictOutputStream = null;
100 DataOutputStream synonymOutputStream = null;
101 DataOutputStream indexOutputStream = null;
102 BufferedWriter infoWriter = null;
105 dictOutputStream = new FileOutputStream(dictFile);
106 synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
107 indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile));
108 infoWriter = new BufferedWriter(new FileWriter(infoFile));
110 Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml");
111 XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation");
112 // TODO: tags - labels/descriptions
113 xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
119 long conceptIndex = 0;
120 long synonymCount = 0;
121 for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
122 ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream();
123 xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
124 int length = conceptXhtml.size();
125 dictOutputStream.write(conceptXhtml.toByteArray());
127 NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET);
128 List<String> names = new ArrayList<>();
130 for (Node nameNode : nodeIterable(nameNodes)) {
131 String name = nameNode.getTextContent().trim();
132 if (!name.isEmpty()) {
137 synonymCount = +writeSynonyms(synonymOutputStream, names, conceptIndex);
138 writeIndex(indexOutputStream, names.get(0), offset, length);
140 offset = offset + length;
144 indexOutputStream.flush();
145 writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymCount, indexFile.length());
146 } catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
147 log.log(Level.SEVERE, "unable to generate", e);
149 close(dictOutputStream);
150 close(synonymOutputStream);
151 close(indexOutputStream);
156 private void writeIndex(DataOutputStream indexOutputStream, String name, long offset, long length) throws IOException {
157 indexOutputStream.write(name.getBytes(utf8));
158 indexOutputStream.write(0);
159 indexOutputStream.writeInt((int) offset); // unsigned int 32
160 indexOutputStream.writeInt((int) length); // unsigned int 32
163 private int writeSynonyms(DataOutputStream synonymOutputStream, List<String> names, long baseIndex) throws IOException {
164 if (names.size() > 1) {
165 for (int i = 1; i < names.size(); i++) {
166 String name = names.get(i);
167 synonymOutputStream.write(name.getBytes(utf8));
168 synonymOutputStream.write(0);
169 synonymOutputStream.writeInt((int) baseIndex); // unsigned int 32
171 return names.size() - 1;
177 private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException {
178 // TODO: values from document metadata
179 infoWriter.write("StarDict's dict ifo file\n");
180 infoWriter.write("version=2.4.2\n");
181 infoWriter.write("bookname=Free Telco Dictionary\n");
182 infoWriter.write("wordcount=" + wordcount + "\n");
183 infoWriter.write("synwordcount=" + synwourdcount + "\n");
184 infoWriter.write("idxfilesize=" + idxfilesize + "\n");
185 infoWriter.write("idxoffsetbits=32\n");
186 infoWriter.write("author=František Kučera\n");
187 infoWriter.write("email=telco-dictionary." + EMAIL_TOKEN + "@" + "frantovo.cz\n");
188 infoWriter.write("website=https://telco.frantovo.cz\n");
189 infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL\n");
190 infoWriter.write("date=2013.07.09\n");
191 infoWriter.write("sametypesequence=h\n");
194 public static void main(String[] args) {
195 File outputFolder = new File("../../delivery/free-telco-dictionary");
196 outputFolder.mkdir();
199 Generator g = new Generator();
200 g.generate(outputFolder, "telco");
201 } catch (ParserConfigurationException | TransformerConfigurationException e) {
202 log.log(Level.SEVERE, "error during initialization", e);