java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java
author František Kučera <franta-hg@frantovo.cz>
Mon, 22 Jun 2020 23:11:14 +0200
changeset 152 5c878a53d3ed
parent 151 a9f1ba451247
permissions -rw-r--r--
add dictionary and concept IDs + some documentation
     1 /**
     2  * Free Telco Dictionary
     3  * Copyright © 2013 František Kučera (frantovo.cz)
     4  *
     5  * This program is free software: you can redistribute it and/or modify
     6  * it under the terms of the GNU General Public License as published by
     7  * the Free Software Foundation, version 3 of the License.
     8  *
     9  * This program is distributed in the hope that it will be useful,
    10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
    11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    12  * GNU General Public License for more details.
    13  *
    14  * You should have received a copy of the GNU General Public License
    15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
    16  */
    17 package cz.frantovo.telco.dictionary;
    18 
    19 import static cz.frantovo.telco.dictionary.Xmlns.*;
    20 import static cz.frantovo.telco.dictionary.Functions.*;
    21 import java.io.BufferedWriter;
    22 import java.io.ByteArrayOutputStream;
    23 import java.io.DataOutputStream;
    24 import java.io.File;
    25 import java.io.FileOutputStream;
    26 import java.io.FileWriter;
    27 import java.io.IOException;
    28 import java.text.SimpleDateFormat;
    29 import java.util.ArrayList;
    30 import java.util.Date;
    31 import java.util.List;
    32 import java.util.SortedSet;
    33 import java.util.TreeSet;
    34 import java.util.logging.Level;
    35 import java.util.logging.Logger;
    36 import javax.xml.parsers.DocumentBuilder;
    37 import javax.xml.parsers.DocumentBuilderFactory;
    38 import javax.xml.parsers.ParserConfigurationException;
    39 import javax.xml.transform.Transformer;
    40 import javax.xml.transform.TransformerConfigurationException;
    41 import javax.xml.transform.TransformerException;
    42 import javax.xml.transform.TransformerFactory;
    43 import javax.xml.transform.dom.DOMSource;
    44 import javax.xml.transform.stream.StreamResult;
    45 import javax.xml.transform.stream.StreamSource;
    46 import javax.xml.xpath.XPath;
    47 import javax.xml.xpath.XPathConstants;
    48 import javax.xml.xpath.XPathExpression;
    49 import javax.xml.xpath.XPathExpressionException;
    50 import javax.xml.xpath.XPathFactory;
    51 import org.w3c.dom.Document;
    52 import org.w3c.dom.Node;
    53 import org.w3c.dom.NodeList;
    54 import org.xml.sax.SAXException;
    55 
    56 /**
    57  * <p>
    58  * Generates dictionary files in StarDict format from source in our XML format.
    59  * </p>
    60  *
    61  * <p>
    62  * Number format should be: 32-bits unsigned number in network byte order
    63  * </p>
    64  *
    65  * @author Ing. František Kučera (frantovo.cz)
    66  */
    67 public class Generator {
    68 
    69 	private static final Logger log = Logger.getLogger(Generator.class.getName());
    70 	private static final String EML_TO_KEN = "ixumhht68";
    71 	private String mode;
    72 	private final DocumentBuilderFactory documentBuilderFactory;
    73 	private final DocumentBuilder documentBuilder;
    74 	private final XPathFactory xpathFactory;
    75 	private final XPath xpath;
    76 	private final TransformerFactory xslFactory;
    77 	private final Transformer xsl;
    78 	private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy.MM.dd");
    79 
    80 	public Generator(String mode) throws ParserConfigurationException, TransformerConfigurationException {
    81 		this.mode = mode;
    82 
    83 		File templateFile = new File("concept." + mode + ".xsl");
    84 		if (templateFile.exists()) {
    85 
    86 			documentBuilderFactory = DocumentBuilderFactory.newInstance();
    87 			documentBuilderFactory.setNamespaceAware(true);
    88 			documentBuilder = documentBuilderFactory.newDocumentBuilder();
    89 
    90 			xslFactory = TransformerFactory.newInstance();
    91 			xsl = xslFactory.newTransformer(new StreamSource(templateFile));
    92 
    93 			xpathFactory = XPathFactory.newInstance();
    94 			xpath = xpathFactory.newXPath();
    95 			xpath.setNamespaceContext(getNamespaceContext());
    96 		} else {
    97 			throw new IllegalArgumentException("Invalid mode: " + mode + ". File " + templateFile + " does not exist");
    98 		}
    99 	}
   100 
   101 	private void generate(File folder, String filePrefix) {
   102 		File infoFile = new File(folder, filePrefix + ".ifo");
   103 		File dictFile = new File(folder, filePrefix + ".dict");
   104 		File indexFile = new File(folder, filePrefix + ".idx");
   105 		File synonymFile = new File(folder, filePrefix + ".syn");
   106 
   107 		FileOutputStream dictOutputStream = null;
   108 		DataOutputStream synonymOutputStream = null;
   109 		DataOutputStream indexOutputStream = null;
   110 		BufferedWriter infoWriter = null;
   111 
   112 		SortedSet<IndexEntry> indexEntries = new TreeSet<>();
   113 		SortedSet<SynonymsEntry> synonymsEntries = new TreeSet<>();
   114 
   115 		try {
   116 			dictOutputStream = new FileOutputStream(dictFile);
   117 			synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
   118 			indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile));
   119 			infoWriter = new BufferedWriter(new FileWriter(infoFile));
   120 
   121 			Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml");
   122 			XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation");
   123 			// TODO: tags - labels/descriptions
   124 			xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
   125 
   126 			long offset = 0;
   127 			long conceptIndex = 0;
   128 			for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
   129 				ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream();
   130 				xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
   131 				int length = conceptXhtml.size();
   132 				dictOutputStream.write(conceptXhtml.toByteArray());
   133 
   134 				NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET);
   135 				List<String> names = new ArrayList<>();
   136 
   137 				for (Node nameNode : nodeIterable(nameNodes)) {
   138 					String name = nameNode.getTextContent().trim();
   139 					if (!name.isEmpty()) {
   140 						names.add(name);
   141 					}
   142 				}
   143 
   144 				String baseName = names.get(0);
   145 				IndexEntry indexEntry = new IndexEntry(baseName, offset, length);
   146 				indexEntries.add(indexEntry);
   147 
   148 				for (int i = 1; i < names.size(); i++) {
   149 					String name = names.get(i);
   150 					if (!baseName.equals(name)) {
   151 						synonymsEntries.add(new SynonymsEntry(indexEntry, name));
   152 					}
   153 				}
   154 
   155 				offset = offset + length;
   156 				conceptIndex++;
   157 			}
   158 
   159 			writeIndex(indexOutputStream, indexEntries);
   160 			writeSynonyms(synonymOutputStream, synonymsEntries);
   161 
   162 			indexOutputStream.flush();
   163 			writeInfo(infoWriter, sourceDocument, conceptIndex, synonymsEntries.size(), indexFile.length());
   164 		} catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
   165 			log.log(Level.SEVERE, "unable to generate", e);
   166 		} finally {
   167 			close(dictOutputStream);
   168 			close(synonymOutputStream);
   169 			close(indexOutputStream);
   170 			close(infoWriter);
   171 		}
   172 	}
   173 
   174 	private void writeIndex(DataOutputStream indexOutputStream, SortedSet<IndexEntry> indexEntries) throws IOException {
   175 		long ordinal = 0;
   176 		for (IndexEntry e : indexEntries) {
   177 			e.serialize(indexOutputStream);
   178 			e.setOrdinal(ordinal++);
   179 		}
   180 	}
   181 
   182 	private void writeSynonyms(DataOutputStream synonymOutputStream, SortedSet<SynonymsEntry> synonymsEntries) throws IOException {
   183 		for (SynonymsEntry s : synonymsEntries) {
   184 			s.serialize(synonymOutputStream);
   185 		}
   186 	}
   187 
   188 	private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException {
   189 		// TODO: values from document metadata
   190 		infoWriter.write("StarDict's dict ifo file\n");
   191 		infoWriter.write("version=2.4.2\n");
   192 		infoWriter.write("bookname=Free Telco Dictionary\n");
   193 		infoWriter.write("wordcount=" + wordcount + "\n");
   194 		infoWriter.write("synwordcount=" + synwourdcount + "\n");
   195 		infoWriter.write("idxfilesize=" + idxfilesize + "\n");
   196 		infoWriter.write("idxoffsetbits=32\n");
   197 		infoWriter.write("author=František Kučera\n");
   198 		infoWriter.write("em" + "ail=telco" + "-dictionary." + EML_TO_KEN + "@" + "fran" + "tovo.cz\n");
   199 		infoWriter.write("website=https://telco.frantovo.cz\n");
   200 		infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL. Check new versions at https://telco.frantovo.cz\n");
   201 		infoWriter.write("date=" + dateFormat.format(new Date()) + "\n");
   202 		infoWriter.write("sametypesequence=" + mode + "\n");
   203 	}
   204 
   205 	public static void main(String[] args) {
   206 		File outputFolder = new File("../../delivery/free-telco-dictionary");
   207 		outputFolder.mkdirs();
   208 
   209 		try {
   210 			Generator g = new Generator(parseMode(args));
   211 			g.generate(outputFolder, "telco");
   212 		} catch (ParserConfigurationException | TransformerConfigurationException e) {
   213 			log.log(Level.SEVERE, "error during initialization", e);
   214 		}
   215 	}
   216 
   217 	private static String parseMode(String[] args) {
   218 		if (args.length == 1) {
   219 			return args[0];
   220 		} else {
   221 			return "h";
   222 		}
   223 	}
   224 }