java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java
author František Kučera <franta-hg@frantovo.cz>
Tue, 09 Jul 2013 22:41:49 +0200
changeset 16 939fa8d8663e
parent 15 93208f791318
child 17 b188eae2c092
permissions -rw-r--r--
data: MSIN/MIN
franta-hg@13
     1
/**
franta-hg@13
     2
 * Free Telco Dictionary
franta-hg@13
     3
 * Copyright © 2013 František Kučera (frantovo.cz)
franta-hg@13
     4
 *
franta-hg@13
     5
 * This program is free software: you can redistribute it and/or modify
franta-hg@13
     6
 * it under the terms of the GNU General Public License as published by
franta-hg@13
     7
 * the Free Software Foundation, either version 3 of the License, or
franta-hg@13
     8
 * (at your option) any later version.
franta-hg@13
     9
 *
franta-hg@13
    10
 * This program is distributed in the hope that it will be useful,
franta-hg@13
    11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
franta-hg@13
    12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
franta-hg@13
    13
 * GNU General Public License for more details.
franta-hg@13
    14
 *
franta-hg@13
    15
 * You should have received a copy of the GNU General Public License
franta-hg@13
    16
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
franta-hg@13
    17
 */
franta-hg@13
    18
package cz.frantovo.telco.dictionary;
franta-hg@13
    19
franta-hg@15
    20
import static cz.frantovo.telco.dictionary.Xmlns.*;
franta-hg@15
    21
import static cz.frantovo.telco.dictionary.Functions.*;
franta-hg@15
    22
import java.io.BufferedWriter;
franta-hg@15
    23
import java.io.ByteArrayOutputStream;
franta-hg@15
    24
import java.io.DataOutputStream;
franta-hg@15
    25
import java.io.File;
franta-hg@15
    26
import java.io.FileOutputStream;
franta-hg@15
    27
import java.io.FileWriter;
franta-hg@15
    28
import java.io.IOException;
franta-hg@15
    29
import java.nio.ByteBuffer;
franta-hg@15
    30
import java.nio.charset.Charset;
franta-hg@15
    31
import java.util.ArrayList;
franta-hg@15
    32
import java.util.List;
franta-hg@15
    33
import java.util.logging.Level;
franta-hg@15
    34
import java.util.logging.Logger;
franta-hg@15
    35
import javax.xml.parsers.DocumentBuilder;
franta-hg@15
    36
import javax.xml.parsers.DocumentBuilderFactory;
franta-hg@15
    37
import javax.xml.parsers.ParserConfigurationException;
franta-hg@15
    38
import javax.xml.transform.Transformer;
franta-hg@15
    39
import javax.xml.transform.TransformerConfigurationException;
franta-hg@15
    40
import javax.xml.transform.TransformerException;
franta-hg@15
    41
import javax.xml.transform.TransformerFactory;
franta-hg@15
    42
import javax.xml.transform.dom.DOMSource;
franta-hg@15
    43
import javax.xml.transform.stream.StreamResult;
franta-hg@15
    44
import javax.xml.transform.stream.StreamSource;
franta-hg@15
    45
import javax.xml.xpath.XPath;
franta-hg@15
    46
import javax.xml.xpath.XPathConstants;
franta-hg@15
    47
import javax.xml.xpath.XPathExpression;
franta-hg@15
    48
import javax.xml.xpath.XPathExpressionException;
franta-hg@15
    49
import javax.xml.xpath.XPathFactory;
franta-hg@15
    50
import org.w3c.dom.Document;
franta-hg@15
    51
import org.w3c.dom.Node;
franta-hg@15
    52
import org.w3c.dom.NodeList;
franta-hg@15
    53
import org.xml.sax.SAXException;
franta-hg@15
    54
franta-hg@13
    55
/**
franta-hg@15
    56
 * <p>
franta-hg@15
    57
 * Generates dictionary files in StarDict format from source in our XML format.
franta-hg@15
    58
 * </p>
franta-hg@15
    59
 *
franta-hg@15
    60
 * <p>
franta-hg@15
    61
 * Number format should be: 32-bits unsigned number in network byte order
franta-hg@15
    62
 * </p>
franta-hg@13
    63
 *
franta-hg@13
    64
 * @author Ing. František Kučera (frantovo.cz)
franta-hg@13
    65
 */
franta-hg@13
    66
public class Generator {
franta-hg@13
    67
franta-hg@15
    68
	private static final Logger log = Logger.getLogger(Generator.class.getName());
franta-hg@15
    69
	private static final String EMAIL_TOKEN = "ixumhht68";
franta-hg@15
    70
	private final DocumentBuilderFactory documentBuilderFactory;
franta-hg@15
    71
	private final DocumentBuilder documentBuilder;
franta-hg@15
    72
	private final XPathFactory xpathFactory;
franta-hg@15
    73
	private final XPath xpath;
franta-hg@15
    74
	private final TransformerFactory xslFactory;
franta-hg@15
    75
	private final Transformer xsl;
franta-hg@15
    76
	private final Charset utf8;
franta-hg@15
    77
franta-hg@15
    78
	public Generator() throws ParserConfigurationException, TransformerConfigurationException {
franta-hg@15
    79
		utf8 = Charset.forName("UTF-8");
franta-hg@15
    80
franta-hg@15
    81
		documentBuilderFactory = DocumentBuilderFactory.newInstance();
franta-hg@15
    82
		documentBuilderFactory.setNamespaceAware(true);
franta-hg@15
    83
		documentBuilder = documentBuilderFactory.newDocumentBuilder();
franta-hg@15
    84
franta-hg@15
    85
		xslFactory = TransformerFactory.newInstance();
franta-hg@15
    86
		xsl = xslFactory.newTransformer(new StreamSource("concept.xsl"));
franta-hg@15
    87
franta-hg@15
    88
		xpathFactory = XPathFactory.newInstance();
franta-hg@15
    89
		xpath = xpathFactory.newXPath();
franta-hg@15
    90
		xpath.setNamespaceContext(getNamespaceContext());
franta-hg@15
    91
	}
franta-hg@15
    92
franta-hg@15
    93
	private void generate(File folder, String filePrefix) {
franta-hg@15
    94
		File infoFile = new File(folder, filePrefix + ".ifo");
franta-hg@15
    95
		File dictFile = new File(folder, filePrefix + ".dict");
franta-hg@15
    96
		File indexFile = new File(folder, filePrefix + ".idx");
franta-hg@15
    97
		File synonymFile = new File(folder, filePrefix + ".syn");
franta-hg@15
    98
franta-hg@15
    99
		FileOutputStream dictOutputStream = null;
franta-hg@15
   100
		DataOutputStream synonymOutputStream = null;
franta-hg@15
   101
		DataOutputStream indexOutputStream = null;
franta-hg@15
   102
		BufferedWriter infoWriter = null;
franta-hg@15
   103
franta-hg@15
   104
		try {
franta-hg@15
   105
			dictOutputStream = new FileOutputStream(dictFile);
franta-hg@15
   106
			synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
franta-hg@15
   107
			indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile));
franta-hg@15
   108
			infoWriter = new BufferedWriter(new FileWriter(infoFile));
franta-hg@15
   109
franta-hg@15
   110
			Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml");
franta-hg@15
   111
			XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation");
franta-hg@15
   112
			// TODO: tags - labels/descriptions
franta-hg@15
   113
			xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
franta-hg@15
   114
franta-hg@15
   115
			/**
franta-hg@15
   116
			 * TODO: sorting
franta-hg@15
   117
			 */
franta-hg@15
   118
			long offset = 0;
franta-hg@15
   119
			long conceptIndex = 0;
franta-hg@15
   120
			long synonymCount = 0;
franta-hg@15
   121
			for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
franta-hg@15
   122
				ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream();
franta-hg@15
   123
				xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
franta-hg@15
   124
				int length = conceptXhtml.size();
franta-hg@15
   125
				dictOutputStream.write(conceptXhtml.toByteArray());
franta-hg@15
   126
franta-hg@15
   127
				NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET);
franta-hg@15
   128
				List<String> names = new ArrayList<>();
franta-hg@15
   129
franta-hg@15
   130
				for (Node nameNode : nodeIterable(nameNodes)) {
franta-hg@15
   131
					String name = nameNode.getTextContent().trim();
franta-hg@15
   132
					if (!name.isEmpty()) {
franta-hg@15
   133
						names.add(name);
franta-hg@15
   134
					}
franta-hg@15
   135
				}
franta-hg@15
   136
franta-hg@15
   137
				synonymCount = +writeSynonyms(synonymOutputStream, names, conceptIndex);
franta-hg@15
   138
				writeIndex(indexOutputStream, names.get(0), offset, length);
franta-hg@15
   139
franta-hg@15
   140
				offset = offset + length;
franta-hg@15
   141
				conceptIndex++;
franta-hg@15
   142
			}
franta-hg@15
   143
franta-hg@15
   144
			indexOutputStream.flush();
franta-hg@15
   145
			writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymCount, indexFile.length());
franta-hg@15
   146
		} catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
franta-hg@15
   147
			log.log(Level.SEVERE, "unable to generate", e);
franta-hg@15
   148
		} finally {
franta-hg@15
   149
			close(dictOutputStream);
franta-hg@15
   150
			close(synonymOutputStream);
franta-hg@15
   151
			close(indexOutputStream);
franta-hg@15
   152
			close(infoWriter);
franta-hg@15
   153
		}
franta-hg@15
   154
	}
franta-hg@15
   155
franta-hg@15
   156
	private void writeIndex(DataOutputStream indexOutputStream, String name, long offset, long length) throws IOException {
franta-hg@15
   157
		indexOutputStream.write(name.getBytes(utf8));
franta-hg@15
   158
		indexOutputStream.write(0);
franta-hg@15
   159
		indexOutputStream.writeInt((int) offset); // unsigned int 32
franta-hg@15
   160
		indexOutputStream.writeInt((int) length); // unsigned int 32
franta-hg@15
   161
	}
franta-hg@15
   162
franta-hg@15
   163
	private int writeSynonyms(DataOutputStream synonymOutputStream, List<String> names, long baseIndex) throws IOException {
franta-hg@15
   164
		if (names.size() > 1) {
franta-hg@15
   165
			for (int i = 1; i < names.size(); i++) {
franta-hg@15
   166
				String name = names.get(i);
franta-hg@15
   167
				synonymOutputStream.write(name.getBytes(utf8));
franta-hg@15
   168
				synonymOutputStream.write(0);
franta-hg@15
   169
				synonymOutputStream.writeInt((int) baseIndex); // unsigned int 32
franta-hg@15
   170
			}
franta-hg@15
   171
			return names.size() - 1;
franta-hg@15
   172
		} else {
franta-hg@15
   173
			return 0;
franta-hg@15
   174
		}
franta-hg@15
   175
	}
franta-hg@15
   176
franta-hg@15
   177
	private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException {
franta-hg@15
   178
		// TODO: values from document metadata
franta-hg@15
   179
		infoWriter.write("StarDict's dict ifo file\n");
franta-hg@15
   180
		infoWriter.write("version=2.4.2\n");
franta-hg@15
   181
		infoWriter.write("bookname=Free Telco Dictionary\n");
franta-hg@15
   182
		infoWriter.write("wordcount=" + wordcount + "\n");
franta-hg@15
   183
		infoWriter.write("synwordcount=" + synwourdcount + "\n");
franta-hg@15
   184
		infoWriter.write("idxfilesize=" + idxfilesize + "\n");
franta-hg@15
   185
		infoWriter.write("idxoffsetbits=32\n");
franta-hg@15
   186
		infoWriter.write("author=František Kučera\n");
franta-hg@15
   187
		infoWriter.write("email=telco-dictionary." + EMAIL_TOKEN + "@" + "frantovo.cz\n");
franta-hg@15
   188
		infoWriter.write("website=https://telco.frantovo.cz\n");
franta-hg@15
   189
		infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL\n");
franta-hg@15
   190
		infoWriter.write("date=2013.07.09\n");
franta-hg@15
   191
		infoWriter.write("sametypesequence=h\n");
franta-hg@15
   192
	}
franta-hg@15
   193
franta-hg@13
   194
	public static void main(String[] args) {
franta-hg@15
   195
		File outputFolder = new File("../../delivery/free-telco-dictionary");
franta-hg@15
   196
		outputFolder.mkdir();
franta-hg@15
   197
franta-hg@15
   198
		try {
franta-hg@15
   199
			Generator g = new Generator();
franta-hg@15
   200
			g.generate(outputFolder, "telco");
franta-hg@15
   201
		} catch (ParserConfigurationException | TransformerConfigurationException e) {
franta-hg@15
   202
			log.log(Level.SEVERE, "error during initialization", e);
franta-hg@15
   203
		}
franta-hg@13
   204
	}
franta-hg@13
   205
}