java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java
author František Kučera <franta-hg@frantovo.cz>
Wed, 10 Jul 2013 01:27:41 +0200
changeset 19 89f9c8307dee
parent 18 7a2eb4cb6ff1
child 20 aecdfc3b1950
permissions -rw-r--r--
data: xmlns, xsl, javadoc
franta-hg@13
     1
/**
franta-hg@13
     2
 * Free Telco Dictionary
franta-hg@13
     3
 * Copyright © 2013 František Kučera (frantovo.cz)
franta-hg@13
     4
 *
franta-hg@13
     5
 * This program is free software: you can redistribute it and/or modify
franta-hg@13
     6
 * it under the terms of the GNU General Public License as published by
franta-hg@13
     7
 * the Free Software Foundation, either version 3 of the License, or
franta-hg@13
     8
 * (at your option) any later version.
franta-hg@13
     9
 *
franta-hg@13
    10
 * This program is distributed in the hope that it will be useful,
franta-hg@13
    11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
franta-hg@13
    12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
franta-hg@13
    13
 * GNU General Public License for more details.
franta-hg@13
    14
 *
franta-hg@13
    15
 * You should have received a copy of the GNU General Public License
franta-hg@13
    16
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
franta-hg@13
    17
 */
franta-hg@13
    18
package cz.frantovo.telco.dictionary;
franta-hg@13
    19
franta-hg@15
    20
import static cz.frantovo.telco.dictionary.Functions.*;
import static cz.frantovo.telco.dictionary.Xmlns.*;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
franta-hg@15
    56
franta-hg@13
    57
/**
franta-hg@15
    58
 * <p>
franta-hg@15
    59
 * Generates dictionary files in StarDict format from source in our XML format.
franta-hg@15
    60
 * </p>
franta-hg@15
    61
 *
franta-hg@15
    62
 * <p>
franta-hg@15
    63
 * Number format should be: 32-bits unsigned number in network byte order
franta-hg@15
    64
 * </p>
franta-hg@13
    65
 *
franta-hg@13
    66
 * @author Ing. František Kučera (frantovo.cz)
franta-hg@13
    67
 */
franta-hg@13
    68
public class Generator {
franta-hg@18
    69
	
franta-hg@15
    70
	private static final Logger log = Logger.getLogger(Generator.class.getName());
franta-hg@17
    71
	private static final String EML_TO_KEN = "ixumhht68";
franta-hg@15
    72
	private final DocumentBuilderFactory documentBuilderFactory;
franta-hg@15
    73
	private final DocumentBuilder documentBuilder;
franta-hg@15
    74
	private final XPathFactory xpathFactory;
franta-hg@15
    75
	private final XPath xpath;
franta-hg@15
    76
	private final TransformerFactory xslFactory;
franta-hg@15
    77
	private final Transformer xsl;
franta-hg@18
    78
	private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy.MM.dd");
franta-hg@18
    79
	
franta-hg@15
    80
	public Generator() throws ParserConfigurationException, TransformerConfigurationException {
franta-hg@15
    81
		documentBuilderFactory = DocumentBuilderFactory.newInstance();
franta-hg@15
    82
		documentBuilderFactory.setNamespaceAware(true);
franta-hg@15
    83
		documentBuilder = documentBuilderFactory.newDocumentBuilder();
franta-hg@18
    84
		
franta-hg@15
    85
		xslFactory = TransformerFactory.newInstance();
franta-hg@15
    86
		xsl = xslFactory.newTransformer(new StreamSource("concept.xsl"));
franta-hg@18
    87
		
franta-hg@15
    88
		xpathFactory = XPathFactory.newInstance();
franta-hg@15
    89
		xpath = xpathFactory.newXPath();
franta-hg@15
    90
		xpath.setNamespaceContext(getNamespaceContext());
franta-hg@15
    91
	}
franta-hg@18
    92
	
franta-hg@15
    93
	private void generate(File folder, String filePrefix) {
franta-hg@15
    94
		File infoFile = new File(folder, filePrefix + ".ifo");
franta-hg@15
    95
		File dictFile = new File(folder, filePrefix + ".dict");
franta-hg@15
    96
		File indexFile = new File(folder, filePrefix + ".idx");
franta-hg@15
    97
		File synonymFile = new File(folder, filePrefix + ".syn");
franta-hg@18
    98
		
franta-hg@15
    99
		FileOutputStream dictOutputStream = null;
franta-hg@15
   100
		DataOutputStream synonymOutputStream = null;
franta-hg@15
   101
		DataOutputStream indexOutputStream = null;
franta-hg@15
   102
		BufferedWriter infoWriter = null;
franta-hg@18
   103
		
franta-hg@17
   104
		SortedSet<IndexEntry> indexEntries = new TreeSet<>();
franta-hg@17
   105
		SortedSet<SynonymsEntry> synonymsEntries = new TreeSet<>();
franta-hg@18
   106
		
franta-hg@15
   107
		try {
franta-hg@15
   108
			dictOutputStream = new FileOutputStream(dictFile);
franta-hg@15
   109
			synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
franta-hg@15
   110
			indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile));
franta-hg@15
   111
			infoWriter = new BufferedWriter(new FileWriter(infoFile));
franta-hg@18
   112
			
franta-hg@15
   113
			Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml");
franta-hg@15
   114
			XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation");
franta-hg@15
   115
			// TODO: tags - labels/descriptions
franta-hg@15
   116
			xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
franta-hg@18
   117
			
franta-hg@15
   118
			long offset = 0;
franta-hg@15
   119
			long conceptIndex = 0;
franta-hg@15
   120
			for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
franta-hg@15
   121
				ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream();
franta-hg@15
   122
				xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
franta-hg@15
   123
				int length = conceptXhtml.size();
franta-hg@15
   124
				dictOutputStream.write(conceptXhtml.toByteArray());
franta-hg@18
   125
				
franta-hg@15
   126
				NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET);
franta-hg@15
   127
				List<String> names = new ArrayList<>();
franta-hg@18
   128
				
franta-hg@15
   129
				for (Node nameNode : nodeIterable(nameNodes)) {
franta-hg@15
   130
					String name = nameNode.getTextContent().trim();
franta-hg@15
   131
					if (!name.isEmpty()) {
franta-hg@15
   132
						names.add(name);
franta-hg@15
   133
					}
franta-hg@15
   134
				}
franta-hg@18
   135
				
franta-hg@18
   136
				String baseName = names.get(0);
franta-hg@18
   137
				IndexEntry indexEntry = new IndexEntry(baseName, offset, length);
franta-hg@17
   138
				indexEntries.add(indexEntry);
franta-hg@18
   139
				
franta-hg@17
   140
				for (int i = 1; i < names.size(); i++) {
franta-hg@17
   141
					String name = names.get(i);
franta-hg@18
   142
					if (!baseName.equals(name)) {
franta-hg@18
   143
						synonymsEntries.add(new SynonymsEntry(indexEntry, name));
franta-hg@18
   144
					}
franta-hg@17
   145
				}
franta-hg@18
   146
				
franta-hg@15
   147
				offset = offset + length;
franta-hg@15
   148
				conceptIndex++;
franta-hg@15
   149
			}
franta-hg@18
   150
			
franta-hg@17
   151
			writeIndex(indexOutputStream, indexEntries);
franta-hg@17
   152
			writeSynonyms(synonymOutputStream, synonymsEntries);
franta-hg@18
   153
			
franta-hg@15
   154
			indexOutputStream.flush();
franta-hg@17
   155
			writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymsEntries.size(), indexFile.length());
franta-hg@15
   156
		} catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
franta-hg@15
   157
			log.log(Level.SEVERE, "unable to generate", e);
franta-hg@15
   158
		} finally {
franta-hg@15
   159
			close(dictOutputStream);
franta-hg@15
   160
			close(synonymOutputStream);
franta-hg@15
   161
			close(indexOutputStream);
franta-hg@15
   162
			close(infoWriter);
franta-hg@15
   163
		}
franta-hg@15
   164
	}
franta-hg@18
   165
	
franta-hg@17
   166
	private void writeIndex(DataOutputStream indexOutputStream, SortedSet<IndexEntry> indexEntries) throws IOException {
franta-hg@17
   167
		long ordinal = 0;
franta-hg@17
   168
		for (IndexEntry e : indexEntries) {
franta-hg@17
   169
			e.serialize(indexOutputStream);
franta-hg@17
   170
			e.setOrdinal(ordinal++);
franta-hg@17
   171
		}
franta-hg@15
   172
	}
franta-hg@18
   173
	
franta-hg@17
   174
	private void writeSynonyms(DataOutputStream synonymOutputStream, SortedSet<SynonymsEntry> synonymsEntries) throws IOException {
franta-hg@17
   175
		for (SynonymsEntry s : synonymsEntries) {
franta-hg@17
   176
			s.serialize(synonymOutputStream);
franta-hg@15
   177
		}
franta-hg@15
   178
	}
franta-hg@18
   179
	
franta-hg@15
   180
	private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException {
franta-hg@15
   181
		// TODO: values from document metadata
franta-hg@15
   182
		infoWriter.write("StarDict's dict ifo file\n");
franta-hg@15
   183
		infoWriter.write("version=2.4.2\n");
franta-hg@15
   184
		infoWriter.write("bookname=Free Telco Dictionary\n");
franta-hg@15
   185
		infoWriter.write("wordcount=" + wordcount + "\n");
franta-hg@15
   186
		infoWriter.write("synwordcount=" + synwourdcount + "\n");
franta-hg@15
   187
		infoWriter.write("idxfilesize=" + idxfilesize + "\n");
franta-hg@15
   188
		infoWriter.write("idxoffsetbits=32\n");
franta-hg@15
   189
		infoWriter.write("author=František Kučera\n");
franta-hg@17
   190
		infoWriter.write("em" + "ail=telco" + "-dictionary." + EML_TO_KEN + "@" + "fran" + "tovo.cz\n");
franta-hg@15
   191
		infoWriter.write("website=https://telco.frantovo.cz\n");
franta-hg@15
   192
		infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL\n");
franta-hg@18
   193
		infoWriter.write("date=" + dateFormat.format(new Date()) + "\n");
franta-hg@15
   194
		infoWriter.write("sametypesequence=h\n");
franta-hg@15
   195
	}
franta-hg@18
   196
	
franta-hg@13
   197
	public static void main(String[] args) {
franta-hg@15
   198
		File outputFolder = new File("../../delivery/free-telco-dictionary");
franta-hg@15
   199
		outputFolder.mkdir();
franta-hg@18
   200
		
franta-hg@15
   201
		try {
franta-hg@15
   202
			Generator g = new Generator();
franta-hg@15
   203
			g.generate(outputFolder, "telco");
franta-hg@15
   204
		} catch (ParserConfigurationException | TransformerConfigurationException e) {
franta-hg@15
   205
			log.log(Level.SEVERE, "error during initialization", e);
franta-hg@15
   206
		}
franta-hg@13
   207
	}
franta-hg@13
   208
}