java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java
author František Kučera <franta-hg@frantovo.cz>
Mon, 22 Jun 2020 23:11:14 +0200
changeset 152 5c878a53d3ed
parent 151 a9f1ba451247
permissions -rw-r--r--
add dictionary and concept IDs + some documentation
franta-hg@13
     1
/**
franta-hg@13
     2
 * Free Telco Dictionary
franta-hg@13
     3
 * Copyright © 2013 František Kučera (frantovo.cz)
franta-hg@13
     4
 *
franta-hg@13
     5
 * This program is free software: you can redistribute it and/or modify
franta-hg@13
     6
 * it under the terms of the GNU General Public License as published by
franta-hg@151
     7
 * the Free Software Foundation, version 3 of the License.
franta-hg@13
     8
 *
franta-hg@13
     9
 * This program is distributed in the hope that it will be useful,
franta-hg@13
    10
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
franta-hg@13
    11
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
franta-hg@13
    12
 * GNU General Public License for more details.
franta-hg@13
    13
 *
franta-hg@13
    14
 * You should have received a copy of the GNU General Public License
franta-hg@13
    15
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
franta-hg@13
    16
 */
franta-hg@13
    17
package cz.frantovo.telco.dictionary;
franta-hg@13
    18
franta-hg@15
    19
import static cz.frantovo.telco.dictionary.Xmlns.*;
franta-hg@15
    20
import static cz.frantovo.telco.dictionary.Functions.*;
franta-hg@15
    21
import java.io.BufferedWriter;
franta-hg@15
    22
import java.io.ByteArrayOutputStream;
franta-hg@15
    23
import java.io.DataOutputStream;
franta-hg@15
    24
import java.io.File;
franta-hg@15
    25
import java.io.FileOutputStream;
franta-hg@15
    26
import java.io.FileWriter;
franta-hg@15
    27
import java.io.IOException;
franta-hg@18
    28
import java.text.SimpleDateFormat;
franta-hg@15
    29
import java.util.ArrayList;
franta-hg@18
    30
import java.util.Date;
franta-hg@15
    31
import java.util.List;
franta-hg@17
    32
import java.util.SortedSet;
franta-hg@17
    33
import java.util.TreeSet;
franta-hg@15
    34
import java.util.logging.Level;
franta-hg@15
    35
import java.util.logging.Logger;
franta-hg@15
    36
import javax.xml.parsers.DocumentBuilder;
franta-hg@15
    37
import javax.xml.parsers.DocumentBuilderFactory;
franta-hg@15
    38
import javax.xml.parsers.ParserConfigurationException;
franta-hg@15
    39
import javax.xml.transform.Transformer;
franta-hg@15
    40
import javax.xml.transform.TransformerConfigurationException;
franta-hg@15
    41
import javax.xml.transform.TransformerException;
franta-hg@15
    42
import javax.xml.transform.TransformerFactory;
franta-hg@15
    43
import javax.xml.transform.dom.DOMSource;
franta-hg@15
    44
import javax.xml.transform.stream.StreamResult;
franta-hg@15
    45
import javax.xml.transform.stream.StreamSource;
franta-hg@15
    46
import javax.xml.xpath.XPath;
franta-hg@15
    47
import javax.xml.xpath.XPathConstants;
franta-hg@15
    48
import javax.xml.xpath.XPathExpression;
franta-hg@15
    49
import javax.xml.xpath.XPathExpressionException;
franta-hg@15
    50
import javax.xml.xpath.XPathFactory;
franta-hg@15
    51
import org.w3c.dom.Document;
franta-hg@15
    52
import org.w3c.dom.Node;
franta-hg@15
    53
import org.w3c.dom.NodeList;
franta-hg@15
    54
import org.xml.sax.SAXException;
franta-hg@15
    55
franta-hg@13
    56
/**
franta-hg@15
    57
 * <p>
franta-hg@15
    58
 * Generates dictionary files in StarDict format from source in our XML format.
franta-hg@15
    59
 * </p>
franta-hg@15
    60
 *
franta-hg@15
    61
 * <p>
franta-hg@15
    62
 * Number format should be: 32-bits unsigned number in network byte order
franta-hg@15
    63
 * </p>
franta-hg@13
    64
 *
franta-hg@13
    65
 * @author Ing. František Kučera (frantovo.cz)
franta-hg@13
    66
 */
franta-hg@13
    67
public class Generator {
franta-hg@21
    68
franta-hg@15
    69
	private static final Logger log = Logger.getLogger(Generator.class.getName());
franta-hg@17
    70
	private static final String EML_TO_KEN = "ixumhht68";
franta-hg@21
    71
	private String mode;
franta-hg@15
    72
	private final DocumentBuilderFactory documentBuilderFactory;
franta-hg@15
    73
	private final DocumentBuilder documentBuilder;
franta-hg@15
    74
	private final XPathFactory xpathFactory;
franta-hg@15
    75
	private final XPath xpath;
franta-hg@15
    76
	private final TransformerFactory xslFactory;
franta-hg@15
    77
	private final Transformer xsl;
franta-hg@18
    78
	private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy.MM.dd");
franta-hg@21
    79
franta-hg@21
    80
	public Generator(String mode) throws ParserConfigurationException, TransformerConfigurationException {
franta-hg@21
    81
		this.mode = mode;
franta-hg@21
    82
franta-hg@21
    83
		File templateFile = new File("concept." + mode + ".xsl");
franta-hg@21
    84
		if (templateFile.exists()) {
franta-hg@21
    85
franta-hg@21
    86
			documentBuilderFactory = DocumentBuilderFactory.newInstance();
franta-hg@21
    87
			documentBuilderFactory.setNamespaceAware(true);
franta-hg@21
    88
			documentBuilder = documentBuilderFactory.newDocumentBuilder();
franta-hg@21
    89
franta-hg@21
    90
			xslFactory = TransformerFactory.newInstance();
franta-hg@21
    91
			xsl = xslFactory.newTransformer(new StreamSource(templateFile));
franta-hg@21
    92
franta-hg@21
    93
			xpathFactory = XPathFactory.newInstance();
franta-hg@21
    94
			xpath = xpathFactory.newXPath();
franta-hg@21
    95
			xpath.setNamespaceContext(getNamespaceContext());
franta-hg@21
    96
		} else {
franta-hg@21
    97
			throw new IllegalArgumentException("Invalid mode: " + mode + ". File " + templateFile + " does not exist");
franta-hg@21
    98
		}
franta-hg@15
    99
	}
franta-hg@21
   100
franta-hg@15
   101
	private void generate(File folder, String filePrefix) {
franta-hg@15
   102
		File infoFile = new File(folder, filePrefix + ".ifo");
franta-hg@15
   103
		File dictFile = new File(folder, filePrefix + ".dict");
franta-hg@15
   104
		File indexFile = new File(folder, filePrefix + ".idx");
franta-hg@15
   105
		File synonymFile = new File(folder, filePrefix + ".syn");
franta-hg@21
   106
franta-hg@15
   107
		FileOutputStream dictOutputStream = null;
franta-hg@15
   108
		DataOutputStream synonymOutputStream = null;
franta-hg@15
   109
		DataOutputStream indexOutputStream = null;
franta-hg@15
   110
		BufferedWriter infoWriter = null;
franta-hg@21
   111
franta-hg@17
   112
		SortedSet<IndexEntry> indexEntries = new TreeSet<>();
franta-hg@17
   113
		SortedSet<SynonymsEntry> synonymsEntries = new TreeSet<>();
franta-hg@21
   114
franta-hg@15
   115
		try {
franta-hg@15
   116
			dictOutputStream = new FileOutputStream(dictFile);
franta-hg@15
   117
			synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
franta-hg@15
   118
			indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile));
franta-hg@15
   119
			infoWriter = new BufferedWriter(new FileWriter(infoFile));
franta-hg@21
   120
franta-hg@15
   121
			Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml");
franta-hg@15
   122
			XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation");
franta-hg@15
   123
			// TODO: tags - labels/descriptions
franta-hg@15
   124
			xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
franta-hg@21
   125
franta-hg@15
   126
			long offset = 0;
franta-hg@15
   127
			long conceptIndex = 0;
franta-hg@15
   128
			for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
franta-hg@15
   129
				ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream();
franta-hg@15
   130
				xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
franta-hg@15
   131
				int length = conceptXhtml.size();
franta-hg@15
   132
				dictOutputStream.write(conceptXhtml.toByteArray());
franta-hg@21
   133
franta-hg@15
   134
				NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET);
franta-hg@15
   135
				List<String> names = new ArrayList<>();
franta-hg@21
   136
franta-hg@15
   137
				for (Node nameNode : nodeIterable(nameNodes)) {
franta-hg@15
   138
					String name = nameNode.getTextContent().trim();
franta-hg@15
   139
					if (!name.isEmpty()) {
franta-hg@15
   140
						names.add(name);
franta-hg@15
   141
					}
franta-hg@15
   142
				}
franta-hg@21
   143
franta-hg@18
   144
				String baseName = names.get(0);
franta-hg@18
   145
				IndexEntry indexEntry = new IndexEntry(baseName, offset, length);
franta-hg@17
   146
				indexEntries.add(indexEntry);
franta-hg@21
   147
franta-hg@17
   148
				for (int i = 1; i < names.size(); i++) {
franta-hg@17
   149
					String name = names.get(i);
franta-hg@18
   150
					if (!baseName.equals(name)) {
franta-hg@18
   151
						synonymsEntries.add(new SynonymsEntry(indexEntry, name));
franta-hg@18
   152
					}
franta-hg@17
   153
				}
franta-hg@21
   154
franta-hg@15
   155
				offset = offset + length;
franta-hg@15
   156
				conceptIndex++;
franta-hg@15
   157
			}
franta-hg@21
   158
franta-hg@17
   159
			writeIndex(indexOutputStream, indexEntries);
franta-hg@17
   160
			writeSynonyms(synonymOutputStream, synonymsEntries);
franta-hg@21
   161
franta-hg@15
   162
			indexOutputStream.flush();
franta-hg@20
   163
			writeInfo(infoWriter, sourceDocument, conceptIndex, synonymsEntries.size(), indexFile.length());
franta-hg@15
   164
		} catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
franta-hg@15
   165
			log.log(Level.SEVERE, "unable to generate", e);
franta-hg@15
   166
		} finally {
franta-hg@15
   167
			close(dictOutputStream);
franta-hg@15
   168
			close(synonymOutputStream);
franta-hg@15
   169
			close(indexOutputStream);
franta-hg@15
   170
			close(infoWriter);
franta-hg@15
   171
		}
franta-hg@15
   172
	}
franta-hg@21
   173
franta-hg@17
   174
	private void writeIndex(DataOutputStream indexOutputStream, SortedSet<IndexEntry> indexEntries) throws IOException {
franta-hg@17
   175
		long ordinal = 0;
franta-hg@17
   176
		for (IndexEntry e : indexEntries) {
franta-hg@17
   177
			e.serialize(indexOutputStream);
franta-hg@17
   178
			e.setOrdinal(ordinal++);
franta-hg@17
   179
		}
franta-hg@15
   180
	}
franta-hg@21
   181
franta-hg@17
   182
	private void writeSynonyms(DataOutputStream synonymOutputStream, SortedSet<SynonymsEntry> synonymsEntries) throws IOException {
franta-hg@17
   183
		for (SynonymsEntry s : synonymsEntries) {
franta-hg@17
   184
			s.serialize(synonymOutputStream);
franta-hg@15
   185
		}
franta-hg@15
   186
	}
franta-hg@21
   187
franta-hg@15
   188
	private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException {
franta-hg@15
   189
		// TODO: values from document metadata
franta-hg@15
   190
		infoWriter.write("StarDict's dict ifo file\n");
franta-hg@15
   191
		infoWriter.write("version=2.4.2\n");
franta-hg@15
   192
		infoWriter.write("bookname=Free Telco Dictionary\n");
franta-hg@15
   193
		infoWriter.write("wordcount=" + wordcount + "\n");
franta-hg@15
   194
		infoWriter.write("synwordcount=" + synwourdcount + "\n");
franta-hg@15
   195
		infoWriter.write("idxfilesize=" + idxfilesize + "\n");
franta-hg@15
   196
		infoWriter.write("idxoffsetbits=32\n");
franta-hg@15
   197
		infoWriter.write("author=František Kučera\n");
franta-hg@17
   198
		infoWriter.write("em" + "ail=telco" + "-dictionary." + EML_TO_KEN + "@" + "fran" + "tovo.cz\n");
franta-hg@15
   199
		infoWriter.write("website=https://telco.frantovo.cz\n");
franta-hg@111
   200
		infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL. Check new versions at https://telco.frantovo.cz\n");
franta-hg@18
   201
		infoWriter.write("date=" + dateFormat.format(new Date()) + "\n");
franta-hg@21
   202
		infoWriter.write("sametypesequence=" + mode + "\n");
franta-hg@15
   203
	}
franta-hg@21
   204
franta-hg@13
   205
	public static void main(String[] args) {
franta-hg@15
   206
		File outputFolder = new File("../../delivery/free-telco-dictionary");
franta-hg@23
   207
		outputFolder.mkdirs();
franta-hg@21
   208
franta-hg@15
   209
		try {
franta-hg@21
   210
			Generator g = new Generator(parseMode(args));
franta-hg@15
   211
			g.generate(outputFolder, "telco");
franta-hg@15
   212
		} catch (ParserConfigurationException | TransformerConfigurationException e) {
franta-hg@15
   213
			log.log(Level.SEVERE, "error during initialization", e);
franta-hg@15
   214
		}
franta-hg@13
   215
	}
franta-hg@21
   216
franta-hg@21
   217
	private static String parseMode(String[] args) {
franta-hg@21
   218
		if (args.length == 1) {
franta-hg@21
   219
			return args[0];
franta-hg@21
   220
		} else {
franta-hg@21
   221
			return "h";
franta-hg@21
   222
		}
franta-hg@21
   223
	}
franta-hg@13
   224
}