/**
 * Free Telco Dictionary
 * Copyright © 2013 František Kučera (frantovo.cz)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
17 package cz.frantovo.telco.dictionary;
import static cz.frantovo.telco.dictionary.Xmlns.*;
import static cz.frantovo.telco.dictionary.Functions.*;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
 * Generates dictionary files in StarDict format from source in our XML format.
 *
 * Number format should be: 32-bits unsigned number in network byte order
 *
 * @author Ing. František Kučera (frantovo.cz)
 */
67 public class Generator {
private static final Logger log = Logger.getLogger(Generator.class.getName());
// Fragment inserted into the contact address that writeInfo() puts into the .ifo file.
private static final String EML_TO_KEN = "ixumhht68";

// Mode passed to the constructor: selects the XSL template ("concept.<mode>.xsl")
// and is written as the "sametypesequence" value of the .ifo file.
private final String mode;
private final DocumentBuilderFactory documentBuilderFactory;
private final DocumentBuilder documentBuilder;
private final XPathFactory xpathFactory;
private final XPath xpath;
private final TransformerFactory xslFactory;
private final Transformer xsl;
// Format of the "date" value in the .ifo file.
private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy.MM.dd");

/**
 * Prepares the XML parser, the XSL transformer and the XPath evaluator
 * used to generate the dictionary.
 *
 * @param mode selects the template file {@code concept.<mode>.xsl}, which is
 *             resolved against the current working directory
 * @throws IllegalArgumentException if the template file for the mode does not exist
 * @throws ParserConfigurationException if the XML document builder cannot be created
 * @throws TransformerConfigurationException if the XSL template cannot be loaded
 */
public Generator(String mode) throws ParserConfigurationException, TransformerConfigurationException {
    File templateFile = new File("concept." + mode + ".xsl");
    if (!templateFile.exists()) {
        // Reject an unknown mode before any factory is created.
        throw new IllegalArgumentException("Invalid mode: " + mode + ". File " + templateFile + " does not exist");
    }

    // Remember the mode; writeInfo() needs it for the "sametypesequence" line.
    this.mode = mode;

    documentBuilderFactory = DocumentBuilderFactory.newInstance();
    // Elements are looked up by namespace, so the parser must be namespace aware.
    documentBuilderFactory.setNamespaceAware(true);
    documentBuilder = documentBuilderFactory.newDocumentBuilder();

    xslFactory = TransformerFactory.newInstance();
    xsl = xslFactory.newTransformer(new StreamSource(templateFile));

    xpathFactory = XPathFactory.newInstance();
    xpath = xpathFactory.newXPath();
    // Namespace context comes from a statically imported helper.
    xpath.setNamespaceContext(getNamespaceContext());
}
101 private void generate(File folder, String filePrefix) {
102 File infoFile = new File(folder, filePrefix + ".ifo");
103 File dictFile = new File(folder, filePrefix + ".dict");
104 File indexFile = new File(folder, filePrefix + ".idx");
105 File synonymFile = new File(folder, filePrefix + ".syn");
107 FileOutputStream dictOutputStream = null;
108 DataOutputStream synonymOutputStream = null;
109 DataOutputStream indexOutputStream = null;
110 BufferedWriter infoWriter = null;
112 SortedSet<IndexEntry> indexEntries = new TreeSet<>();
113 SortedSet<SynonymsEntry> synonymsEntries = new TreeSet<>();
116 dictOutputStream = new FileOutputStream(dictFile);
117 synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
118 indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile));
119 infoWriter = new BufferedWriter(new FileWriter(infoFile));
121 Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml");
122 XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation");
123 // TODO: tags - labels/descriptions
124 xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
127 long conceptIndex = 0;
128 for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
129 ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream();
130 xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
131 int length = conceptXhtml.size();
132 dictOutputStream.write(conceptXhtml.toByteArray());
134 NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET);
135 List<String> names = new ArrayList<>();
137 for (Node nameNode : nodeIterable(nameNodes)) {
138 String name = nameNode.getTextContent().trim();
139 if (!name.isEmpty()) {
144 String baseName = names.get(0);
145 IndexEntry indexEntry = new IndexEntry(baseName, offset, length);
146 indexEntries.add(indexEntry);
148 for (int i = 1; i < names.size(); i++) {
149 String name = names.get(i);
150 if (!baseName.equals(name)) {
151 synonymsEntries.add(new SynonymsEntry(indexEntry, name));
155 offset = offset + length;
159 writeIndex(indexOutputStream, indexEntries);
160 writeSynonyms(synonymOutputStream, synonymsEntries);
162 indexOutputStream.flush();
163 writeInfo(infoWriter, sourceDocument, conceptIndex, synonymsEntries.size(), indexFile.length());
164 } catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
165 log.log(Level.SEVERE, "unable to generate", e);
167 close(dictOutputStream);
168 close(synonymOutputStream);
169 close(indexOutputStream);
174 private void writeIndex(DataOutputStream indexOutputStream, SortedSet<IndexEntry> indexEntries) throws IOException {
176 for (IndexEntry e : indexEntries) {
177 e.serialize(indexOutputStream);
178 e.setOrdinal(ordinal++);
182 private void writeSynonyms(DataOutputStream synonymOutputStream, SortedSet<SynonymsEntry> synonymsEntries) throws IOException {
183 for (SynonymsEntry s : synonymsEntries) {
184 s.serialize(synonymOutputStream);
188 private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException {
189 // TODO: values from document metadata
190 infoWriter.write("StarDict's dict ifo file\n");
191 infoWriter.write("version=2.4.2\n");
192 infoWriter.write("bookname=Free Telco Dictionary\n");
193 infoWriter.write("wordcount=" + wordcount + "\n");
194 infoWriter.write("synwordcount=" + synwourdcount + "\n");
195 infoWriter.write("idxfilesize=" + idxfilesize + "\n");
196 infoWriter.write("idxoffsetbits=32\n");
197 infoWriter.write("author=František Kučera\n");
198 infoWriter.write("em" + "ail=telco" + "-dictionary." + EML_TO_KEN + "@" + "fran" + "tovo.cz\n");
199 infoWriter.write("website=https://telco.frantovo.cz\n");
200 infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL. Check new versions at https://telco.frantovo.cz\n");
201 infoWriter.write("date=" + dateFormat.format(new Date()) + "\n");
202 infoWriter.write("sametypesequence=" + mode + "\n");
205 public static void main(String[] args) {
206 File outputFolder = new File("../../delivery/free-telco-dictionary");
207 outputFolder.mkdirs();
210 Generator g = new Generator(parseMode(args));
211 g.generate(outputFolder, "telco");
212 } catch (ParserConfigurationException | TransformerConfigurationException e) {
213 log.log(Level.SEVERE, "error during initialization", e);
217 private static String parseMode(String[] args) {
218 if (args.length == 1) {