java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java
author František Kučera <franta-hg@frantovo.cz>
Wed, 10 Jul 2013 14:32:45 +0200
changeset 20 aecdfc3b1950
parent 18 7a2eb4cb6ff1
child 21 e7c9a8722f76
permissions -rw-r--r--
generator: word count fix
     1 /**
     2  * Free Telco Dictionary
     3  * Copyright © 2013 František Kučera (frantovo.cz)
     4  *
     5  * This program is free software: you can redistribute it and/or modify
     6  * it under the terms of the GNU General Public License as published by
     7  * the Free Software Foundation, either version 3 of the License, or
     8  * (at your option) any later version.
     9  *
    10  * This program is distributed in the hope that it will be useful,
    11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
    12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    13  * GNU General Public License for more details.
    14  *
    15  * You should have received a copy of the GNU General Public License
    16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
    17  */
    18 package cz.frantovo.telco.dictionary;
    19 
    import static cz.frantovo.telco.dictionary.Xmlns.*;
    import static cz.frantovo.telco.dictionary.Functions.*;
    import java.io.BufferedWriter;
    import java.io.ByteArrayOutputStream;
    import java.io.DataOutputStream;
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.io.OutputStreamWriter;
    import java.nio.charset.StandardCharsets;
    import java.text.SimpleDateFormat;
    import java.util.ArrayList;
    import java.util.Date;
    import java.util.List;
    import java.util.SortedSet;
    import java.util.TreeSet;
    import java.util.logging.Level;
    import java.util.logging.Logger;
    import javax.xml.parsers.DocumentBuilder;
    import javax.xml.parsers.DocumentBuilderFactory;
    import javax.xml.parsers.ParserConfigurationException;
    import javax.xml.transform.Transformer;
    import javax.xml.transform.TransformerConfigurationException;
    import javax.xml.transform.TransformerException;
    import javax.xml.transform.TransformerFactory;
    import javax.xml.transform.dom.DOMSource;
    import javax.xml.transform.stream.StreamResult;
    import javax.xml.transform.stream.StreamSource;
    import javax.xml.xpath.XPath;
    import javax.xml.xpath.XPathConstants;
    import javax.xml.xpath.XPathExpression;
    import javax.xml.xpath.XPathExpressionException;
    import javax.xml.xpath.XPathFactory;
    import org.w3c.dom.Document;
    import org.w3c.dom.Node;
    import org.w3c.dom.NodeList;
    import org.xml.sax.SAXException;
    56 
    57 /**
    58  * <p>
    59  * Generates dictionary files in StarDict format from source in our XML format.
    60  * </p>
    61  *
    62  * <p>
    63  * Number format should be: 32-bits unsigned number in network byte order
    64  * </p>
    65  *
    66  * @author Ing. František Kučera (frantovo.cz)
    67  */
    68 public class Generator {
    69 	
    70 	private static final Logger log = Logger.getLogger(Generator.class.getName());
    71 	private static final String EML_TO_KEN = "ixumhht68";
    72 	private final DocumentBuilderFactory documentBuilderFactory;
    73 	private final DocumentBuilder documentBuilder;
    74 	private final XPathFactory xpathFactory;
    75 	private final XPath xpath;
    76 	private final TransformerFactory xslFactory;
    77 	private final Transformer xsl;
    78 	private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy.MM.dd");
    79 	
    80 	public Generator() throws ParserConfigurationException, TransformerConfigurationException {
    81 		documentBuilderFactory = DocumentBuilderFactory.newInstance();
    82 		documentBuilderFactory.setNamespaceAware(true);
    83 		documentBuilder = documentBuilderFactory.newDocumentBuilder();
    84 		
    85 		xslFactory = TransformerFactory.newInstance();
    86 		xsl = xslFactory.newTransformer(new StreamSource("concept.xsl"));
    87 		
    88 		xpathFactory = XPathFactory.newInstance();
    89 		xpath = xpathFactory.newXPath();
    90 		xpath.setNamespaceContext(getNamespaceContext());
    91 	}
    92 	
    93 	private void generate(File folder, String filePrefix) {
    94 		File infoFile = new File(folder, filePrefix + ".ifo");
    95 		File dictFile = new File(folder, filePrefix + ".dict");
    96 		File indexFile = new File(folder, filePrefix + ".idx");
    97 		File synonymFile = new File(folder, filePrefix + ".syn");
    98 		
    99 		FileOutputStream dictOutputStream = null;
   100 		DataOutputStream synonymOutputStream = null;
   101 		DataOutputStream indexOutputStream = null;
   102 		BufferedWriter infoWriter = null;
   103 		
   104 		SortedSet<IndexEntry> indexEntries = new TreeSet<>();
   105 		SortedSet<SynonymsEntry> synonymsEntries = new TreeSet<>();
   106 		
   107 		try {
   108 			dictOutputStream = new FileOutputStream(dictFile);
   109 			synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
   110 			indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile));
   111 			infoWriter = new BufferedWriter(new FileWriter(infoFile));
   112 			
   113 			Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml");
   114 			XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation");
   115 			// TODO: tags - labels/descriptions
   116 			xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
   117 			
   118 			long offset = 0;
   119 			long conceptIndex = 0;
   120 			for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
   121 				ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream();
   122 				xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
   123 				int length = conceptXhtml.size();
   124 				dictOutputStream.write(conceptXhtml.toByteArray());
   125 				
   126 				NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET);
   127 				List<String> names = new ArrayList<>();
   128 				
   129 				for (Node nameNode : nodeIterable(nameNodes)) {
   130 					String name = nameNode.getTextContent().trim();
   131 					if (!name.isEmpty()) {
   132 						names.add(name);
   133 					}
   134 				}
   135 				
   136 				String baseName = names.get(0);
   137 				IndexEntry indexEntry = new IndexEntry(baseName, offset, length);
   138 				indexEntries.add(indexEntry);
   139 				
   140 				for (int i = 1; i < names.size(); i++) {
   141 					String name = names.get(i);
   142 					if (!baseName.equals(name)) {
   143 						synonymsEntries.add(new SynonymsEntry(indexEntry, name));
   144 					}
   145 				}
   146 				
   147 				offset = offset + length;
   148 				conceptIndex++;
   149 			}
   150 			
   151 			writeIndex(indexOutputStream, indexEntries);
   152 			writeSynonyms(synonymOutputStream, synonymsEntries);
   153 			
   154 			indexOutputStream.flush();
   155 			writeInfo(infoWriter, sourceDocument, conceptIndex, synonymsEntries.size(), indexFile.length());
   156 		} catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
   157 			log.log(Level.SEVERE, "unable to generate", e);
   158 		} finally {
   159 			close(dictOutputStream);
   160 			close(synonymOutputStream);
   161 			close(indexOutputStream);
   162 			close(infoWriter);
   163 		}
   164 	}
   165 	
   166 	private void writeIndex(DataOutputStream indexOutputStream, SortedSet<IndexEntry> indexEntries) throws IOException {
   167 		long ordinal = 0;
   168 		for (IndexEntry e : indexEntries) {
   169 			e.serialize(indexOutputStream);
   170 			e.setOrdinal(ordinal++);
   171 		}
   172 	}
   173 	
   174 	private void writeSynonyms(DataOutputStream synonymOutputStream, SortedSet<SynonymsEntry> synonymsEntries) throws IOException {
   175 		for (SynonymsEntry s : synonymsEntries) {
   176 			s.serialize(synonymOutputStream);
   177 		}
   178 	}
   179 	
   180 	private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException {
   181 		// TODO: values from document metadata
   182 		infoWriter.write("StarDict's dict ifo file\n");
   183 		infoWriter.write("version=2.4.2\n");
   184 		infoWriter.write("bookname=Free Telco Dictionary\n");
   185 		infoWriter.write("wordcount=" + wordcount + "\n");
   186 		infoWriter.write("synwordcount=" + synwourdcount + "\n");
   187 		infoWriter.write("idxfilesize=" + idxfilesize + "\n");
   188 		infoWriter.write("idxoffsetbits=32\n");
   189 		infoWriter.write("author=František Kučera\n");
   190 		infoWriter.write("em" + "ail=telco" + "-dictionary." + EML_TO_KEN + "@" + "fran" + "tovo.cz\n");
   191 		infoWriter.write("website=https://telco.frantovo.cz\n");
   192 		infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL\n");
   193 		infoWriter.write("date=" + dateFormat.format(new Date()) + "\n");
   194 		infoWriter.write("sametypesequence=h\n");
   195 	}
   196 	
   197 	public static void main(String[] args) {
   198 		File outputFolder = new File("../../delivery/free-telco-dictionary");
   199 		outputFolder.mkdir();
   200 		
   201 		try {
   202 			Generator g = new Generator();
   203 			g.generate(outputFolder, "telco");
   204 		} catch (ParserConfigurationException | TransformerConfigurationException e) {
   205 			log.log(Level.SEVERE, "error during initialization", e);
   206 		}
   207 	}
   208 }