java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java
author František Kučera <franta-hg@frantovo.cz>
Tue, 09 Jul 2013 22:42:32 +0200
changeset 17 b188eae2c092
parent 15 93208f791318
child 18 7a2eb4cb6ff1
permissions -rw-r--r--
generator: sorted index and synonyms
franta-hg@13
     1
/**
franta-hg@13
     2
 * Free Telco Dictionary
franta-hg@13
     3
 * Copyright © 2013 František Kučera (frantovo.cz)
franta-hg@13
     4
 *
franta-hg@13
     5
 * This program is free software: you can redistribute it and/or modify
franta-hg@13
     6
 * it under the terms of the GNU General Public License as published by
franta-hg@13
     7
 * the Free Software Foundation, either version 3 of the License, or
franta-hg@13
     8
 * (at your option) any later version.
franta-hg@13
     9
 *
franta-hg@13
    10
 * This program is distributed in the hope that it will be useful,
franta-hg@13
    11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
franta-hg@13
    12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
franta-hg@13
    13
 * GNU General Public License for more details.
franta-hg@13
    14
 *
franta-hg@13
    15
 * You should have received a copy of the GNU General Public License
franta-hg@13
    16
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
franta-hg@13
    17
 */
franta-hg@13
    18
package cz.frantovo.telco.dictionary;
franta-hg@13
    19
franta-hg@15
    20
import static cz.frantovo.telco.dictionary.Xmlns.*;
franta-hg@15
    21
import static cz.frantovo.telco.dictionary.Functions.*;
franta-hg@15
    22
import java.io.BufferedWriter;
franta-hg@15
    23
import java.io.ByteArrayOutputStream;
franta-hg@15
    24
import java.io.DataOutputStream;
franta-hg@15
    25
import java.io.File;
franta-hg@15
    26
import java.io.FileOutputStream;
franta-hg@15
    27
import java.io.FileWriter;
franta-hg@15
    28
import java.io.IOException;
franta-hg@15
    29
import java.util.ArrayList;
franta-hg@15
    30
import java.util.List;
franta-hg@17
    31
import java.util.SortedSet;
franta-hg@17
    32
import java.util.TreeSet;
franta-hg@15
    33
import java.util.logging.Level;
franta-hg@15
    34
import java.util.logging.Logger;
franta-hg@15
    35
import javax.xml.parsers.DocumentBuilder;
franta-hg@15
    36
import javax.xml.parsers.DocumentBuilderFactory;
franta-hg@15
    37
import javax.xml.parsers.ParserConfigurationException;
franta-hg@15
    38
import javax.xml.transform.Transformer;
franta-hg@15
    39
import javax.xml.transform.TransformerConfigurationException;
franta-hg@15
    40
import javax.xml.transform.TransformerException;
franta-hg@15
    41
import javax.xml.transform.TransformerFactory;
franta-hg@15
    42
import javax.xml.transform.dom.DOMSource;
franta-hg@15
    43
import javax.xml.transform.stream.StreamResult;
franta-hg@15
    44
import javax.xml.transform.stream.StreamSource;
franta-hg@15
    45
import javax.xml.xpath.XPath;
franta-hg@15
    46
import javax.xml.xpath.XPathConstants;
franta-hg@15
    47
import javax.xml.xpath.XPathExpression;
franta-hg@15
    48
import javax.xml.xpath.XPathExpressionException;
franta-hg@15
    49
import javax.xml.xpath.XPathFactory;
franta-hg@15
    50
import org.w3c.dom.Document;
franta-hg@15
    51
import org.w3c.dom.Node;
franta-hg@15
    52
import org.w3c.dom.NodeList;
franta-hg@15
    53
import org.xml.sax.SAXException;
franta-hg@15
    54
franta-hg@13
    55
/**
franta-hg@15
    56
 * <p>
franta-hg@15
    57
 * Generates dictionary files in StarDict format from source in our XML format.
franta-hg@15
    58
 * </p>
franta-hg@15
    59
 *
franta-hg@15
    60
 * <p>
franta-hg@15
    61
 * Number format should be: 32-bits unsigned number in network byte order
franta-hg@15
    62
 * </p>
franta-hg@13
    63
 *
franta-hg@13
    64
 * @author Ing. František Kučera (frantovo.cz)
franta-hg@13
    65
 */
franta-hg@13
    66
public class Generator {
franta-hg@13
    67
franta-hg@15
    68
	private static final Logger log = Logger.getLogger(Generator.class.getName());
franta-hg@17
    69
	private static final String EML_TO_KEN = "ixumhht68";
franta-hg@15
    70
	private final DocumentBuilderFactory documentBuilderFactory;
franta-hg@15
    71
	private final DocumentBuilder documentBuilder;
franta-hg@15
    72
	private final XPathFactory xpathFactory;
franta-hg@15
    73
	private final XPath xpath;
franta-hg@15
    74
	private final TransformerFactory xslFactory;
franta-hg@15
    75
	private final Transformer xsl;
franta-hg@15
    76
franta-hg@15
    77
	public Generator() throws ParserConfigurationException, TransformerConfigurationException {
franta-hg@15
    78
		documentBuilderFactory = DocumentBuilderFactory.newInstance();
franta-hg@15
    79
		documentBuilderFactory.setNamespaceAware(true);
franta-hg@15
    80
		documentBuilder = documentBuilderFactory.newDocumentBuilder();
franta-hg@15
    81
franta-hg@15
    82
		xslFactory = TransformerFactory.newInstance();
franta-hg@15
    83
		xsl = xslFactory.newTransformer(new StreamSource("concept.xsl"));
franta-hg@15
    84
franta-hg@15
    85
		xpathFactory = XPathFactory.newInstance();
franta-hg@15
    86
		xpath = xpathFactory.newXPath();
franta-hg@15
    87
		xpath.setNamespaceContext(getNamespaceContext());
franta-hg@15
    88
	}
franta-hg@15
    89
franta-hg@15
    90
	private void generate(File folder, String filePrefix) {
franta-hg@15
    91
		File infoFile = new File(folder, filePrefix + ".ifo");
franta-hg@15
    92
		File dictFile = new File(folder, filePrefix + ".dict");
franta-hg@15
    93
		File indexFile = new File(folder, filePrefix + ".idx");
franta-hg@15
    94
		File synonymFile = new File(folder, filePrefix + ".syn");
franta-hg@15
    95
franta-hg@15
    96
		FileOutputStream dictOutputStream = null;
franta-hg@15
    97
		DataOutputStream synonymOutputStream = null;
franta-hg@15
    98
		DataOutputStream indexOutputStream = null;
franta-hg@15
    99
		BufferedWriter infoWriter = null;
franta-hg@15
   100
franta-hg@17
   101
		SortedSet<IndexEntry> indexEntries = new TreeSet<>();
franta-hg@17
   102
		SortedSet<SynonymsEntry> synonymsEntries = new TreeSet<>();
franta-hg@17
   103
franta-hg@15
   104
		try {
franta-hg@15
   105
			dictOutputStream = new FileOutputStream(dictFile);
franta-hg@15
   106
			synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
franta-hg@15
   107
			indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile));
franta-hg@15
   108
			infoWriter = new BufferedWriter(new FileWriter(infoFile));
franta-hg@15
   109
franta-hg@15
   110
			Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml");
franta-hg@15
   111
			XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation");
franta-hg@15
   112
			// TODO: tags - labels/descriptions
franta-hg@15
   113
			xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
franta-hg@15
   114
franta-hg@15
   115
			long offset = 0;
franta-hg@15
   116
			long conceptIndex = 0;
franta-hg@15
   117
			for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
franta-hg@15
   118
				ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream();
franta-hg@15
   119
				xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
franta-hg@15
   120
				int length = conceptXhtml.size();
franta-hg@15
   121
				dictOutputStream.write(conceptXhtml.toByteArray());
franta-hg@15
   122
franta-hg@15
   123
				NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET);
franta-hg@15
   124
				List<String> names = new ArrayList<>();
franta-hg@15
   125
franta-hg@15
   126
				for (Node nameNode : nodeIterable(nameNodes)) {
franta-hg@15
   127
					String name = nameNode.getTextContent().trim();
franta-hg@15
   128
					if (!name.isEmpty()) {
franta-hg@15
   129
						names.add(name);
franta-hg@15
   130
					}
franta-hg@15
   131
				}
franta-hg@15
   132
franta-hg@17
   133
				IndexEntry indexEntry = new IndexEntry(names.get(0), offset, length);
franta-hg@17
   134
				indexEntries.add(indexEntry);
franta-hg@17
   135
franta-hg@17
   136
				for (int i = 1; i < names.size(); i++) {
franta-hg@17
   137
					String name = names.get(i);
franta-hg@17
   138
					synonymsEntries.add(new SynonymsEntry(indexEntry, name));
franta-hg@17
   139
				}
franta-hg@15
   140
franta-hg@15
   141
				offset = offset + length;
franta-hg@15
   142
				conceptIndex++;
franta-hg@15
   143
			}
franta-hg@15
   144
franta-hg@17
   145
			writeIndex(indexOutputStream, indexEntries);
franta-hg@17
   146
			writeSynonyms(synonymOutputStream, synonymsEntries);
franta-hg@17
   147
franta-hg@15
   148
			indexOutputStream.flush();
franta-hg@17
   149
			writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymsEntries.size(), indexFile.length());
franta-hg@15
   150
		} catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
franta-hg@15
   151
			log.log(Level.SEVERE, "unable to generate", e);
franta-hg@15
   152
		} finally {
franta-hg@15
   153
			close(dictOutputStream);
franta-hg@15
   154
			close(synonymOutputStream);
franta-hg@15
   155
			close(indexOutputStream);
franta-hg@15
   156
			close(infoWriter);
franta-hg@15
   157
		}
franta-hg@15
   158
	}
franta-hg@15
   159
franta-hg@17
   160
	private void writeIndex(DataOutputStream indexOutputStream, SortedSet<IndexEntry> indexEntries) throws IOException {
franta-hg@17
   161
		long ordinal = 0;
franta-hg@17
   162
		for (IndexEntry e : indexEntries) {
franta-hg@17
   163
			e.serialize(indexOutputStream);
franta-hg@17
   164
			e.setOrdinal(ordinal++);
franta-hg@17
   165
		}
franta-hg@15
   166
	}
franta-hg@15
   167
franta-hg@17
   168
	private void writeSynonyms(DataOutputStream synonymOutputStream, SortedSet<SynonymsEntry> synonymsEntries) throws IOException {
franta-hg@17
   169
		for (SynonymsEntry s : synonymsEntries) {
franta-hg@17
   170
			s.serialize(synonymOutputStream);
franta-hg@15
   171
		}
franta-hg@15
   172
	}
franta-hg@15
   173
franta-hg@15
   174
	private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException {
franta-hg@15
   175
		// TODO: values from document metadata
franta-hg@15
   176
		infoWriter.write("StarDict's dict ifo file\n");
franta-hg@15
   177
		infoWriter.write("version=2.4.2\n");
franta-hg@15
   178
		infoWriter.write("bookname=Free Telco Dictionary\n");
franta-hg@15
   179
		infoWriter.write("wordcount=" + wordcount + "\n");
franta-hg@15
   180
		infoWriter.write("synwordcount=" + synwourdcount + "\n");
franta-hg@15
   181
		infoWriter.write("idxfilesize=" + idxfilesize + "\n");
franta-hg@15
   182
		infoWriter.write("idxoffsetbits=32\n");
franta-hg@15
   183
		infoWriter.write("author=František Kučera\n");
franta-hg@17
   184
		infoWriter.write("em" + "ail=telco" + "-dictionary." + EML_TO_KEN + "@" + "fran" + "tovo.cz\n");
franta-hg@15
   185
		infoWriter.write("website=https://telco.frantovo.cz\n");
franta-hg@15
   186
		infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL\n");
franta-hg@15
   187
		infoWriter.write("date=2013.07.09\n");
franta-hg@15
   188
		infoWriter.write("sametypesequence=h\n");
franta-hg@15
   189
	}
franta-hg@15
   190
franta-hg@13
   191
	public static void main(String[] args) {
franta-hg@15
   192
		File outputFolder = new File("../../delivery/free-telco-dictionary");
franta-hg@15
   193
		outputFolder.mkdir();
franta-hg@15
   194
franta-hg@15
   195
		try {
franta-hg@15
   196
			Generator g = new Generator();
franta-hg@15
   197
			g.generate(outputFolder, "telco");
franta-hg@15
   198
		} catch (ParserConfigurationException | TransformerConfigurationException e) {
franta-hg@15
   199
			log.log(Level.SEVERE, "error during initialization", e);
franta-hg@15
   200
		}
franta-hg@13
   201
	}
franta-hg@13
   202
}