java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java
changeset 15 93208f791318
parent 13 a5d7afd1b93a
child 17 b188eae2c092
     1.1 --- a/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java	Mon Jul 08 23:38:22 2013 +0200
     1.2 +++ b/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java	Tue Jul 09 18:59:07 2013 +0200
     1.3 @@ -17,13 +17,189 @@
     1.4   */
     1.5  package cz.frantovo.telco.dictionary;
     1.6  
     1.7 +import static cz.frantovo.telco.dictionary.Xmlns.*;
     1.8 +import static cz.frantovo.telco.dictionary.Functions.*;
     1.9 +import java.io.BufferedWriter;
    1.10 +import java.io.ByteArrayOutputStream;
    1.11 +import java.io.DataOutputStream;
    1.12 +import java.io.File;
    1.13 +import java.io.FileOutputStream;
    1.14 +import java.io.FileWriter;
    1.15 +import java.io.IOException;
    1.16 +import java.nio.ByteBuffer;
    1.17 +import java.nio.charset.Charset;
    1.18 +import java.util.ArrayList;
    1.19 +import java.util.List;
    1.20 +import java.util.logging.Level;
    1.21 +import java.util.logging.Logger;
    1.22 +import javax.xml.parsers.DocumentBuilder;
    1.23 +import javax.xml.parsers.DocumentBuilderFactory;
    1.24 +import javax.xml.parsers.ParserConfigurationException;
    1.25 +import javax.xml.transform.Transformer;
    1.26 +import javax.xml.transform.TransformerConfigurationException;
    1.27 +import javax.xml.transform.TransformerException;
    1.28 +import javax.xml.transform.TransformerFactory;
    1.29 +import javax.xml.transform.dom.DOMSource;
    1.30 +import javax.xml.transform.stream.StreamResult;
    1.31 +import javax.xml.transform.stream.StreamSource;
    1.32 +import javax.xml.xpath.XPath;
    1.33 +import javax.xml.xpath.XPathConstants;
    1.34 +import javax.xml.xpath.XPathExpression;
    1.35 +import javax.xml.xpath.XPathExpressionException;
    1.36 +import javax.xml.xpath.XPathFactory;
    1.37 +import org.w3c.dom.Document;
    1.38 +import org.w3c.dom.Node;
    1.39 +import org.w3c.dom.NodeList;
    1.40 +import org.xml.sax.SAXException;
    1.41 +
    1.42  /**
    1.43 + * <p>
    1.44 + * Generates dictionary files in StarDict format from source in our XML format.
    1.45 + * </p>
    1.46 + *
    1.47 + * <p>
    1.48 + * Number format should be: 32-bits unsigned number in network byte order
    1.49 + * </p>
    1.50   *
    1.51   * @author Ing. František Kučera (frantovo.cz)
    1.52   */
    1.53  public class Generator {
    1.54  
    1.55 +	private static final Logger log = Logger.getLogger(Generator.class.getName());
    1.56 +	private static final String EMAIL_TOKEN = "ixumhht68";
    1.57 +	private final DocumentBuilderFactory documentBuilderFactory;
    1.58 +	private final DocumentBuilder documentBuilder;
    1.59 +	private final XPathFactory xpathFactory;
    1.60 +	private final XPath xpath;
    1.61 +	private final TransformerFactory xslFactory;
    1.62 +	private final Transformer xsl;
    1.63 +	private final Charset utf8;
    1.64 +
    1.65 +	public Generator() throws ParserConfigurationException, TransformerConfigurationException {
    1.66 +		utf8 = Charset.forName("UTF-8");
    1.67 +
    1.68 +		documentBuilderFactory = DocumentBuilderFactory.newInstance();
    1.69 +		documentBuilderFactory.setNamespaceAware(true);
    1.70 +		documentBuilder = documentBuilderFactory.newDocumentBuilder();
    1.71 +
    1.72 +		xslFactory = TransformerFactory.newInstance();
    1.73 +		xsl = xslFactory.newTransformer(new StreamSource("concept.xsl"));
    1.74 +
    1.75 +		xpathFactory = XPathFactory.newInstance();
    1.76 +		xpath = xpathFactory.newXPath();
    1.77 +		xpath.setNamespaceContext(getNamespaceContext());
    1.78 +	}
    1.79 +
    1.80 +	private void generate(File folder, String filePrefix) {
    1.81 +		File infoFile = new File(folder, filePrefix + ".ifo");
    1.82 +		File dictFile = new File(folder, filePrefix + ".dict");
    1.83 +		File indexFile = new File(folder, filePrefix + ".idx");
    1.84 +		File synonymFile = new File(folder, filePrefix + ".syn");
    1.85 +
    1.86 +		FileOutputStream dictOutputStream = null;
    1.87 +		DataOutputStream synonymOutputStream = null;
    1.88 +		DataOutputStream indexOutputStream = null;
    1.89 +		BufferedWriter infoWriter = null;
    1.90 +
    1.91 +		try {
    1.92 +			dictOutputStream = new FileOutputStream(dictFile);
    1.93 +			synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
    1.94 +			indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile));
    1.95 +			infoWriter = new BufferedWriter(new FileWriter(infoFile));
    1.96 +
    1.97 +			Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml");
    1.98 +			XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation");
    1.99 +			// TODO: tags - labels/descriptions
   1.100 +			xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
   1.101 +
   1.102 +			/**
   1.103 +			 * TODO: sorting
   1.104 +			 */
   1.105 +			long offset = 0;
   1.106 +			long conceptIndex = 0;
   1.107 +			long synonymCount = 0;
   1.108 +			for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
   1.109 +				ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream();
   1.110 +				xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
   1.111 +				int length = conceptXhtml.size();
   1.112 +				dictOutputStream.write(conceptXhtml.toByteArray());
   1.113 +
   1.114 +				NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET);
   1.115 +				List<String> names = new ArrayList<>();
   1.116 +
   1.117 +				for (Node nameNode : nodeIterable(nameNodes)) {
   1.118 +					String name = nameNode.getTextContent().trim();
   1.119 +					if (!name.isEmpty()) {
   1.120 +						names.add(name);
   1.121 +					}
   1.122 +				}
   1.123 +
   1.124 +				synonymCount = +writeSynonyms(synonymOutputStream, names, conceptIndex);
   1.125 +				writeIndex(indexOutputStream, names.get(0), offset, length);
   1.126 +
   1.127 +				offset = offset + length;
   1.128 +				conceptIndex++;
   1.129 +			}
   1.130 +
   1.131 +			indexOutputStream.flush();
   1.132 +			writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymCount, indexFile.length());
   1.133 +		} catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
   1.134 +			log.log(Level.SEVERE, "unable to generate", e);
   1.135 +		} finally {
   1.136 +			close(dictOutputStream);
   1.137 +			close(synonymOutputStream);
   1.138 +			close(indexOutputStream);
   1.139 +			close(infoWriter);
   1.140 +		}
   1.141 +	}
   1.142 +
   1.143 +	private void writeIndex(DataOutputStream indexOutputStream, String name, long offset, long length) throws IOException {
   1.144 +		indexOutputStream.write(name.getBytes(utf8));
   1.145 +		indexOutputStream.write(0);
   1.146 +		indexOutputStream.writeInt((int) offset); // unsigned int 32
   1.147 +		indexOutputStream.writeInt((int) length); // unsigned int 32
   1.148 +	}
   1.149 +
   1.150 +	private int writeSynonyms(DataOutputStream synonymOutputStream, List<String> names, long baseIndex) throws IOException {
   1.151 +		if (names.size() > 1) {
   1.152 +			for (int i = 1; i < names.size(); i++) {
   1.153 +				String name = names.get(i);
   1.154 +				synonymOutputStream.write(name.getBytes(utf8));
   1.155 +				synonymOutputStream.write(0);
   1.156 +				synonymOutputStream.writeInt((int) baseIndex); // unsigned int 32
   1.157 +			}
   1.158 +			return names.size() - 1;
   1.159 +		} else {
   1.160 +			return 0;
   1.161 +		}
   1.162 +	}
   1.163 +
   1.164 +	private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException {
   1.165 +		// TODO: values from document metadata
   1.166 +		infoWriter.write("StarDict's dict ifo file\n");
   1.167 +		infoWriter.write("version=2.4.2\n");
   1.168 +		infoWriter.write("bookname=Free Telco Dictionary\n");
   1.169 +		infoWriter.write("wordcount=" + wordcount + "\n");
   1.170 +		infoWriter.write("synwordcount=" + synwourdcount + "\n");
   1.171 +		infoWriter.write("idxfilesize=" + idxfilesize + "\n");
   1.172 +		infoWriter.write("idxoffsetbits=32\n");
   1.173 +		infoWriter.write("author=František Kučera\n");
   1.174 +		infoWriter.write("email=telco-dictionary." + EMAIL_TOKEN + "@" + "frantovo.cz\n");
   1.175 +		infoWriter.write("website=https://telco.frantovo.cz\n");
   1.176 +		infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL\n");
   1.177 +		infoWriter.write("date=2013.07.09\n");
   1.178 +		infoWriter.write("sametypesequence=h\n");
   1.179 +	}
   1.180 +
   1.181  	public static void main(String[] args) {
   1.182 -		
   1.183 +		File outputFolder = new File("../../delivery/free-telco-dictionary");
   1.184 +		outputFolder.mkdir();
   1.185 +
   1.186 +		try {
   1.187 +			Generator g = new Generator();
   1.188 +			g.generate(outputFolder, "telco");
   1.189 +		} catch (ParserConfigurationException | TransformerConfigurationException e) {
   1.190 +			log.log(Level.SEVERE, "error during initialization", e);
   1.191 +		}
   1.192  	}
   1.193  }