java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java
author František Kučera <franta-hg@frantovo.cz>
Fri, 06 Sep 2013 00:22:37 +0200
changeset 144 9435f578284f
parent 111 5338fdbf2bb8
child 151 a9f1ba451247
permissions -rw-r--r--
data: white pages, yellow pages (directory lookup methods)
franta-hg@13
     1
/**
franta-hg@13
     2
 * Free Telco Dictionary
franta-hg@13
     3
 * Copyright © 2013 František Kučera (frantovo.cz)
franta-hg@13
     4
 *
franta-hg@13
     5
 * This program is free software: you can redistribute it and/or modify
franta-hg@13
     6
 * it under the terms of the GNU General Public License as published by
franta-hg@13
     7
 * the Free Software Foundation, either version 3 of the License, or
franta-hg@13
     8
 * (at your option) any later version.
franta-hg@13
     9
 *
franta-hg@13
    10
 * This program is distributed in the hope that it will be useful,
franta-hg@13
    11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
franta-hg@13
    12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
franta-hg@13
    13
 * GNU General Public License for more details.
franta-hg@13
    14
 *
franta-hg@13
    15
 * You should have received a copy of the GNU General Public License
franta-hg@13
    16
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
franta-hg@13
    17
 */
franta-hg@13
    18
package cz.frantovo.telco.dictionary;
franta-hg@13
    19
franta-hg@15
    20
import static cz.frantovo.telco.dictionary.Xmlns.*;
franta-hg@15
    21
import static cz.frantovo.telco.dictionary.Functions.*;
franta-hg@15
    22
import java.io.BufferedWriter;
franta-hg@15
    23
import java.io.ByteArrayOutputStream;
franta-hg@15
    24
import java.io.DataOutputStream;
franta-hg@15
    25
import java.io.File;
franta-hg@15
    26
import java.io.FileOutputStream;
franta-hg@15
    27
import java.io.FileWriter;
franta-hg@15
    28
import java.io.IOException;
franta-hg@18
    29
import java.text.SimpleDateFormat;
franta-hg@15
    30
import java.util.ArrayList;
franta-hg@18
    31
import java.util.Date;
franta-hg@15
    32
import java.util.List;
franta-hg@17
    33
import java.util.SortedSet;
franta-hg@17
    34
import java.util.TreeSet;
franta-hg@15
    35
import java.util.logging.Level;
franta-hg@15
    36
import java.util.logging.Logger;
franta-hg@15
    37
import javax.xml.parsers.DocumentBuilder;
franta-hg@15
    38
import javax.xml.parsers.DocumentBuilderFactory;
franta-hg@15
    39
import javax.xml.parsers.ParserConfigurationException;
franta-hg@15
    40
import javax.xml.transform.Transformer;
franta-hg@15
    41
import javax.xml.transform.TransformerConfigurationException;
franta-hg@15
    42
import javax.xml.transform.TransformerException;
franta-hg@15
    43
import javax.xml.transform.TransformerFactory;
franta-hg@15
    44
import javax.xml.transform.dom.DOMSource;
franta-hg@15
    45
import javax.xml.transform.stream.StreamResult;
franta-hg@15
    46
import javax.xml.transform.stream.StreamSource;
franta-hg@15
    47
import javax.xml.xpath.XPath;
franta-hg@15
    48
import javax.xml.xpath.XPathConstants;
franta-hg@15
    49
import javax.xml.xpath.XPathExpression;
franta-hg@15
    50
import javax.xml.xpath.XPathExpressionException;
franta-hg@15
    51
import javax.xml.xpath.XPathFactory;
franta-hg@15
    52
import org.w3c.dom.Document;
franta-hg@15
    53
import org.w3c.dom.Node;
franta-hg@15
    54
import org.w3c.dom.NodeList;
franta-hg@15
    55
import org.xml.sax.SAXException;
franta-hg@15
    56
franta-hg@13
    57
/**
franta-hg@15
    58
 * <p>
franta-hg@15
    59
 * Generates dictionary files in StarDict format from source in our XML format.
franta-hg@15
    60
 * </p>
franta-hg@15
    61
 *
franta-hg@15
    62
 * <p>
franta-hg@15
    63
 * Number format should be: 32-bits unsigned number in network byte order
franta-hg@15
    64
 * </p>
franta-hg@13
    65
 *
franta-hg@13
    66
 * @author Ing. František Kučera (frantovo.cz)
franta-hg@13
    67
 */
franta-hg@13
    68
public class Generator {
franta-hg@21
    69
franta-hg@15
    70
	private static final Logger log = Logger.getLogger(Generator.class.getName());
franta-hg@17
    71
	private static final String EML_TO_KEN = "ixumhht68";
franta-hg@21
    72
	private String mode;
franta-hg@15
    73
	private final DocumentBuilderFactory documentBuilderFactory;
franta-hg@15
    74
	private final DocumentBuilder documentBuilder;
franta-hg@15
    75
	private final XPathFactory xpathFactory;
franta-hg@15
    76
	private final XPath xpath;
franta-hg@15
    77
	private final TransformerFactory xslFactory;
franta-hg@15
    78
	private final Transformer xsl;
franta-hg@18
    79
	private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy.MM.dd");
franta-hg@21
    80
franta-hg@21
    81
	public Generator(String mode) throws ParserConfigurationException, TransformerConfigurationException {
franta-hg@21
    82
		this.mode = mode;
franta-hg@21
    83
franta-hg@21
    84
		File templateFile = new File("concept." + mode + ".xsl");
franta-hg@21
    85
		if (templateFile.exists()) {
franta-hg@21
    86
franta-hg@21
    87
			documentBuilderFactory = DocumentBuilderFactory.newInstance();
franta-hg@21
    88
			documentBuilderFactory.setNamespaceAware(true);
franta-hg@21
    89
			documentBuilder = documentBuilderFactory.newDocumentBuilder();
franta-hg@21
    90
franta-hg@21
    91
			xslFactory = TransformerFactory.newInstance();
franta-hg@21
    92
			xsl = xslFactory.newTransformer(new StreamSource(templateFile));
franta-hg@21
    93
franta-hg@21
    94
			xpathFactory = XPathFactory.newInstance();
franta-hg@21
    95
			xpath = xpathFactory.newXPath();
franta-hg@21
    96
			xpath.setNamespaceContext(getNamespaceContext());
franta-hg@21
    97
		} else {
franta-hg@21
    98
			throw new IllegalArgumentException("Invalid mode: " + mode + ". File " + templateFile + " does not exist");
franta-hg@21
    99
		}
franta-hg@15
   100
	}
franta-hg@21
   101
franta-hg@15
   102
	private void generate(File folder, String filePrefix) {
franta-hg@15
   103
		File infoFile = new File(folder, filePrefix + ".ifo");
franta-hg@15
   104
		File dictFile = new File(folder, filePrefix + ".dict");
franta-hg@15
   105
		File indexFile = new File(folder, filePrefix + ".idx");
franta-hg@15
   106
		File synonymFile = new File(folder, filePrefix + ".syn");
franta-hg@21
   107
franta-hg@15
   108
		FileOutputStream dictOutputStream = null;
franta-hg@15
   109
		DataOutputStream synonymOutputStream = null;
franta-hg@15
   110
		DataOutputStream indexOutputStream = null;
franta-hg@15
   111
		BufferedWriter infoWriter = null;
franta-hg@21
   112
franta-hg@17
   113
		SortedSet<IndexEntry> indexEntries = new TreeSet<>();
franta-hg@17
   114
		SortedSet<SynonymsEntry> synonymsEntries = new TreeSet<>();
franta-hg@21
   115
franta-hg@15
   116
		try {
franta-hg@15
   117
			dictOutputStream = new FileOutputStream(dictFile);
franta-hg@15
   118
			synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
franta-hg@15
   119
			indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile));
franta-hg@15
   120
			infoWriter = new BufferedWriter(new FileWriter(infoFile));
franta-hg@21
   121
franta-hg@15
   122
			Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml");
franta-hg@15
   123
			XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation");
franta-hg@15
   124
			// TODO: tags - labels/descriptions
franta-hg@15
   125
			xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
franta-hg@21
   126
franta-hg@15
   127
			long offset = 0;
franta-hg@15
   128
			long conceptIndex = 0;
franta-hg@15
   129
			for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
franta-hg@15
   130
				ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream();
franta-hg@15
   131
				xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
franta-hg@15
   132
				int length = conceptXhtml.size();
franta-hg@15
   133
				dictOutputStream.write(conceptXhtml.toByteArray());
franta-hg@21
   134
franta-hg@15
   135
				NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET);
franta-hg@15
   136
				List<String> names = new ArrayList<>();
franta-hg@21
   137
franta-hg@15
   138
				for (Node nameNode : nodeIterable(nameNodes)) {
franta-hg@15
   139
					String name = nameNode.getTextContent().trim();
franta-hg@15
   140
					if (!name.isEmpty()) {
franta-hg@15
   141
						names.add(name);
franta-hg@15
   142
					}
franta-hg@15
   143
				}
franta-hg@21
   144
franta-hg@18
   145
				String baseName = names.get(0);
franta-hg@18
   146
				IndexEntry indexEntry = new IndexEntry(baseName, offset, length);
franta-hg@17
   147
				indexEntries.add(indexEntry);
franta-hg@21
   148
franta-hg@17
   149
				for (int i = 1; i < names.size(); i++) {
franta-hg@17
   150
					String name = names.get(i);
franta-hg@18
   151
					if (!baseName.equals(name)) {
franta-hg@18
   152
						synonymsEntries.add(new SynonymsEntry(indexEntry, name));
franta-hg@18
   153
					}
franta-hg@17
   154
				}
franta-hg@21
   155
franta-hg@15
   156
				offset = offset + length;
franta-hg@15
   157
				conceptIndex++;
franta-hg@15
   158
			}
franta-hg@21
   159
franta-hg@17
   160
			writeIndex(indexOutputStream, indexEntries);
franta-hg@17
   161
			writeSynonyms(synonymOutputStream, synonymsEntries);
franta-hg@21
   162
franta-hg@15
   163
			indexOutputStream.flush();
franta-hg@20
   164
			writeInfo(infoWriter, sourceDocument, conceptIndex, synonymsEntries.size(), indexFile.length());
franta-hg@15
   165
		} catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
franta-hg@15
   166
			log.log(Level.SEVERE, "unable to generate", e);
franta-hg@15
   167
		} finally {
franta-hg@15
   168
			close(dictOutputStream);
franta-hg@15
   169
			close(synonymOutputStream);
franta-hg@15
   170
			close(indexOutputStream);
franta-hg@15
   171
			close(infoWriter);
franta-hg@15
   172
		}
franta-hg@15
   173
	}
franta-hg@21
   174
franta-hg@17
   175
	private void writeIndex(DataOutputStream indexOutputStream, SortedSet<IndexEntry> indexEntries) throws IOException {
franta-hg@17
   176
		long ordinal = 0;
franta-hg@17
   177
		for (IndexEntry e : indexEntries) {
franta-hg@17
   178
			e.serialize(indexOutputStream);
franta-hg@17
   179
			e.setOrdinal(ordinal++);
franta-hg@17
   180
		}
franta-hg@15
   181
	}
franta-hg@21
   182
franta-hg@17
   183
	private void writeSynonyms(DataOutputStream synonymOutputStream, SortedSet<SynonymsEntry> synonymsEntries) throws IOException {
franta-hg@17
   184
		for (SynonymsEntry s : synonymsEntries) {
franta-hg@17
   185
			s.serialize(synonymOutputStream);
franta-hg@15
   186
		}
franta-hg@15
   187
	}
franta-hg@21
   188
franta-hg@15
   189
	private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException {
franta-hg@15
   190
		// TODO: values from document metadata
franta-hg@15
   191
		infoWriter.write("StarDict's dict ifo file\n");
franta-hg@15
   192
		infoWriter.write("version=2.4.2\n");
franta-hg@15
   193
		infoWriter.write("bookname=Free Telco Dictionary\n");
franta-hg@15
   194
		infoWriter.write("wordcount=" + wordcount + "\n");
franta-hg@15
   195
		infoWriter.write("synwordcount=" + synwourdcount + "\n");
franta-hg@15
   196
		infoWriter.write("idxfilesize=" + idxfilesize + "\n");
franta-hg@15
   197
		infoWriter.write("idxoffsetbits=32\n");
franta-hg@15
   198
		infoWriter.write("author=František Kučera\n");
franta-hg@17
   199
		infoWriter.write("em" + "ail=telco" + "-dictionary." + EML_TO_KEN + "@" + "fran" + "tovo.cz\n");
franta-hg@15
   200
		infoWriter.write("website=https://telco.frantovo.cz\n");
franta-hg@111
   201
		infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL. Check new versions at https://telco.frantovo.cz\n");
franta-hg@18
   202
		infoWriter.write("date=" + dateFormat.format(new Date()) + "\n");
franta-hg@21
   203
		infoWriter.write("sametypesequence=" + mode + "\n");
franta-hg@15
   204
	}
franta-hg@21
   205
franta-hg@13
   206
	public static void main(String[] args) {
franta-hg@15
   207
		File outputFolder = new File("../../delivery/free-telco-dictionary");
franta-hg@23
   208
		outputFolder.mkdirs();
franta-hg@21
   209
franta-hg@15
   210
		try {
franta-hg@21
   211
			Generator g = new Generator(parseMode(args));
franta-hg@15
   212
			g.generate(outputFolder, "telco");
franta-hg@15
   213
		} catch (ParserConfigurationException | TransformerConfigurationException e) {
franta-hg@15
   214
			log.log(Level.SEVERE, "error during initialization", e);
franta-hg@15
   215
		}
franta-hg@13
   216
	}
franta-hg@21
   217
franta-hg@21
   218
	private static String parseMode(String[] args) {
franta-hg@21
   219
		if (args.length == 1) {
franta-hg@21
   220
			return args[0];
franta-hg@21
   221
		} else {
franta-hg@21
   222
			return "h";
franta-hg@21
   223
		}
franta-hg@21
   224
	}
franta-hg@13
   225
}