java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java
author František Kučera <franta-hg@frantovo.cz>
Fri, 06 Sep 2013 00:22:37 +0200
changeset 144 9435f578284f
parent 111 5338fdbf2bb8
child 151 a9f1ba451247
permissions -rw-r--r--
data: white pages, yellow pages (directory lookup methods)
     1 /**
     2  * Free Telco Dictionary
     3  * Copyright © 2013 František Kučera (frantovo.cz)
     4  *
     5  * This program is free software: you can redistribute it and/or modify
     6  * it under the terms of the GNU General Public License as published by
     7  * the Free Software Foundation, either version 3 of the License, or
     8  * (at your option) any later version.
     9  *
    10  * This program is distributed in the hope that it will be useful,
    11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
    12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    13  * GNU General Public License for more details.
    14  *
    15  * You should have received a copy of the GNU General Public License
    16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
    17  */
    18 package cz.frantovo.telco.dictionary;
    19 
import static cz.frantovo.telco.dictionary.Xmlns.*;
import static cz.frantovo.telco.dictionary.Functions.*;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
    56 
    57 /**
    58  * <p>
    59  * Generates dictionary files in StarDict format from source in our XML format.
    60  * </p>
    61  *
    62  * <p>
    63  * Number format should be: 32-bits unsigned number in network byte order
    64  * </p>
    65  *
    66  * @author Ing. František Kučera (frantovo.cz)
    67  */
    68 public class Generator {
    69 
    70 	private static final Logger log = Logger.getLogger(Generator.class.getName());
    71 	private static final String EML_TO_KEN = "ixumhht68";
    72 	private String mode;
    73 	private final DocumentBuilderFactory documentBuilderFactory;
    74 	private final DocumentBuilder documentBuilder;
    75 	private final XPathFactory xpathFactory;
    76 	private final XPath xpath;
    77 	private final TransformerFactory xslFactory;
    78 	private final Transformer xsl;
    79 	private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy.MM.dd");
    80 
    81 	public Generator(String mode) throws ParserConfigurationException, TransformerConfigurationException {
    82 		this.mode = mode;
    83 
    84 		File templateFile = new File("concept." + mode + ".xsl");
    85 		if (templateFile.exists()) {
    86 
    87 			documentBuilderFactory = DocumentBuilderFactory.newInstance();
    88 			documentBuilderFactory.setNamespaceAware(true);
    89 			documentBuilder = documentBuilderFactory.newDocumentBuilder();
    90 
    91 			xslFactory = TransformerFactory.newInstance();
    92 			xsl = xslFactory.newTransformer(new StreamSource(templateFile));
    93 
    94 			xpathFactory = XPathFactory.newInstance();
    95 			xpath = xpathFactory.newXPath();
    96 			xpath.setNamespaceContext(getNamespaceContext());
    97 		} else {
    98 			throw new IllegalArgumentException("Invalid mode: " + mode + ". File " + templateFile + " does not exist");
    99 		}
   100 	}
   101 
   102 	private void generate(File folder, String filePrefix) {
   103 		File infoFile = new File(folder, filePrefix + ".ifo");
   104 		File dictFile = new File(folder, filePrefix + ".dict");
   105 		File indexFile = new File(folder, filePrefix + ".idx");
   106 		File synonymFile = new File(folder, filePrefix + ".syn");
   107 
   108 		FileOutputStream dictOutputStream = null;
   109 		DataOutputStream synonymOutputStream = null;
   110 		DataOutputStream indexOutputStream = null;
   111 		BufferedWriter infoWriter = null;
   112 
   113 		SortedSet<IndexEntry> indexEntries = new TreeSet<>();
   114 		SortedSet<SynonymsEntry> synonymsEntries = new TreeSet<>();
   115 
   116 		try {
   117 			dictOutputStream = new FileOutputStream(dictFile);
   118 			synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
   119 			indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile));
   120 			infoWriter = new BufferedWriter(new FileWriter(infoFile));
   121 
   122 			Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml");
   123 			XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation");
   124 			// TODO: tags - labels/descriptions
   125 			xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
   126 
   127 			long offset = 0;
   128 			long conceptIndex = 0;
   129 			for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
   130 				ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream();
   131 				xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
   132 				int length = conceptXhtml.size();
   133 				dictOutputStream.write(conceptXhtml.toByteArray());
   134 
   135 				NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET);
   136 				List<String> names = new ArrayList<>();
   137 
   138 				for (Node nameNode : nodeIterable(nameNodes)) {
   139 					String name = nameNode.getTextContent().trim();
   140 					if (!name.isEmpty()) {
   141 						names.add(name);
   142 					}
   143 				}
   144 
   145 				String baseName = names.get(0);
   146 				IndexEntry indexEntry = new IndexEntry(baseName, offset, length);
   147 				indexEntries.add(indexEntry);
   148 
   149 				for (int i = 1; i < names.size(); i++) {
   150 					String name = names.get(i);
   151 					if (!baseName.equals(name)) {
   152 						synonymsEntries.add(new SynonymsEntry(indexEntry, name));
   153 					}
   154 				}
   155 
   156 				offset = offset + length;
   157 				conceptIndex++;
   158 			}
   159 
   160 			writeIndex(indexOutputStream, indexEntries);
   161 			writeSynonyms(synonymOutputStream, synonymsEntries);
   162 
   163 			indexOutputStream.flush();
   164 			writeInfo(infoWriter, sourceDocument, conceptIndex, synonymsEntries.size(), indexFile.length());
   165 		} catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
   166 			log.log(Level.SEVERE, "unable to generate", e);
   167 		} finally {
   168 			close(dictOutputStream);
   169 			close(synonymOutputStream);
   170 			close(indexOutputStream);
   171 			close(infoWriter);
   172 		}
   173 	}
   174 
   175 	private void writeIndex(DataOutputStream indexOutputStream, SortedSet<IndexEntry> indexEntries) throws IOException {
   176 		long ordinal = 0;
   177 		for (IndexEntry e : indexEntries) {
   178 			e.serialize(indexOutputStream);
   179 			e.setOrdinal(ordinal++);
   180 		}
   181 	}
   182 
   183 	private void writeSynonyms(DataOutputStream synonymOutputStream, SortedSet<SynonymsEntry> synonymsEntries) throws IOException {
   184 		for (SynonymsEntry s : synonymsEntries) {
   185 			s.serialize(synonymOutputStream);
   186 		}
   187 	}
   188 
   189 	private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException {
   190 		// TODO: values from document metadata
   191 		infoWriter.write("StarDict's dict ifo file\n");
   192 		infoWriter.write("version=2.4.2\n");
   193 		infoWriter.write("bookname=Free Telco Dictionary\n");
   194 		infoWriter.write("wordcount=" + wordcount + "\n");
   195 		infoWriter.write("synwordcount=" + synwourdcount + "\n");
   196 		infoWriter.write("idxfilesize=" + idxfilesize + "\n");
   197 		infoWriter.write("idxoffsetbits=32\n");
   198 		infoWriter.write("author=František Kučera\n");
   199 		infoWriter.write("em" + "ail=telco" + "-dictionary." + EML_TO_KEN + "@" + "fran" + "tovo.cz\n");
   200 		infoWriter.write("website=https://telco.frantovo.cz\n");
   201 		infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL. Check new versions at https://telco.frantovo.cz\n");
   202 		infoWriter.write("date=" + dateFormat.format(new Date()) + "\n");
   203 		infoWriter.write("sametypesequence=" + mode + "\n");
   204 	}
   205 
   206 	public static void main(String[] args) {
   207 		File outputFolder = new File("../../delivery/free-telco-dictionary");
   208 		outputFolder.mkdirs();
   209 
   210 		try {
   211 			Generator g = new Generator(parseMode(args));
   212 			g.generate(outputFolder, "telco");
   213 		} catch (ParserConfigurationException | TransformerConfigurationException e) {
   214 			log.log(Level.SEVERE, "error during initialization", e);
   215 		}
   216 	}
   217 
   218 	private static String parseMode(String[] args) {
   219 		if (args.length == 1) {
   220 			return args[0];
   221 		} else {
   222 			return "h";
   223 		}
   224 	}
   225 }