franta-hg@13: /** franta-hg@13: * Free Telco Dictionary franta-hg@13: * Copyright © 2013 František Kučera (frantovo.cz) franta-hg@13: * franta-hg@13: * This program is free software: you can redistribute it and/or modify franta-hg@13: * it under the terms of the GNU General Public License as published by franta-hg@13: * the Free Software Foundation, either version 3 of the License, or franta-hg@13: * (at your option) any later version. franta-hg@13: * franta-hg@13: * This program is distributed in the hope that it will be useful, franta-hg@13: * but WITHOUT ANY WARRANTY; without even the implied warranty of franta-hg@13: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the franta-hg@13: * GNU General Public License for more details. franta-hg@13: * franta-hg@13: * You should have received a copy of the GNU General Public License franta-hg@13: * along with this program. If not, see . franta-hg@13: */ franta-hg@13: package cz.frantovo.telco.dictionary; franta-hg@13: franta-hg@15: import static cz.frantovo.telco.dictionary.Xmlns.*; franta-hg@15: import static cz.frantovo.telco.dictionary.Functions.*; franta-hg@15: import java.io.BufferedWriter; franta-hg@15: import java.io.ByteArrayOutputStream; franta-hg@15: import java.io.DataOutputStream; franta-hg@15: import java.io.File; franta-hg@15: import java.io.FileOutputStream; franta-hg@15: import java.io.FileWriter; franta-hg@15: import java.io.IOException; franta-hg@18: import java.text.SimpleDateFormat; franta-hg@15: import java.util.ArrayList; franta-hg@18: import java.util.Date; franta-hg@15: import java.util.List; franta-hg@17: import java.util.SortedSet; franta-hg@17: import java.util.TreeSet; franta-hg@15: import java.util.logging.Level; franta-hg@15: import java.util.logging.Logger; franta-hg@15: import javax.xml.parsers.DocumentBuilder; franta-hg@15: import javax.xml.parsers.DocumentBuilderFactory; franta-hg@15: import javax.xml.parsers.ParserConfigurationException; franta-hg@15: import javax.xml.transform.Transformer; franta-hg@15: import javax.xml.transform.TransformerConfigurationException; franta-hg@15: import javax.xml.transform.TransformerException; franta-hg@15: import javax.xml.transform.TransformerFactory; franta-hg@15: import javax.xml.transform.dom.DOMSource; franta-hg@15: import javax.xml.transform.stream.StreamResult; franta-hg@15: import javax.xml.transform.stream.StreamSource; franta-hg@15: import javax.xml.xpath.XPath; franta-hg@15: import javax.xml.xpath.XPathConstants; franta-hg@15: import javax.xml.xpath.XPathExpression; franta-hg@15: import javax.xml.xpath.XPathExpressionException; franta-hg@15: import javax.xml.xpath.XPathFactory; franta-hg@15: import org.w3c.dom.Document; franta-hg@15: import org.w3c.dom.Node; franta-hg@15: import org.w3c.dom.NodeList; franta-hg@15: import org.xml.sax.SAXException; franta-hg@15: franta-hg@13: /** franta-hg@15: *

franta-hg@15: * Generates dictionary files in StarDict format from source in our XML format. franta-hg@15: *

franta-hg@15: * franta-hg@15: *

franta-hg@15: * Number format should be: 32-bits unsigned number in network byte order franta-hg@15: *

franta-hg@13: * franta-hg@13: * @author Ing. František Kučera (frantovo.cz) franta-hg@13: */ franta-hg@13: public class Generator { franta-hg@21: franta-hg@15: private static final Logger log = Logger.getLogger(Generator.class.getName()); franta-hg@17: private static final String EML_TO_KEN = "ixumhht68"; franta-hg@21: private String mode; franta-hg@15: private final DocumentBuilderFactory documentBuilderFactory; franta-hg@15: private final DocumentBuilder documentBuilder; franta-hg@15: private final XPathFactory xpathFactory; franta-hg@15: private final XPath xpath; franta-hg@15: private final TransformerFactory xslFactory; franta-hg@15: private final Transformer xsl; franta-hg@18: private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy.MM.dd"); franta-hg@21: franta-hg@21: public Generator(String mode) throws ParserConfigurationException, TransformerConfigurationException { franta-hg@21: this.mode = mode; franta-hg@21: franta-hg@21: File templateFile = new File("concept." + mode + ".xsl"); franta-hg@21: if (templateFile.exists()) { franta-hg@21: franta-hg@21: documentBuilderFactory = DocumentBuilderFactory.newInstance(); franta-hg@21: documentBuilderFactory.setNamespaceAware(true); franta-hg@21: documentBuilder = documentBuilderFactory.newDocumentBuilder(); franta-hg@21: franta-hg@21: xslFactory = TransformerFactory.newInstance(); franta-hg@21: xsl = xslFactory.newTransformer(new StreamSource(templateFile)); franta-hg@21: franta-hg@21: xpathFactory = XPathFactory.newInstance(); franta-hg@21: xpath = xpathFactory.newXPath(); franta-hg@21: xpath.setNamespaceContext(getNamespaceContext()); franta-hg@21: } else { franta-hg@21: throw new IllegalArgumentException("Invalid mode: " + mode + ". File " + templateFile + " does not exist"); franta-hg@21: } franta-hg@15: } franta-hg@21: franta-hg@15: private void generate(File folder, String filePrefix) { franta-hg@15: File infoFile = new File(folder, filePrefix + ".ifo"); franta-hg@15: File dictFile = new File(folder, filePrefix + ".dict"); franta-hg@15: File indexFile = new File(folder, filePrefix + ".idx"); franta-hg@15: File synonymFile = new File(folder, filePrefix + ".syn"); franta-hg@21: franta-hg@15: FileOutputStream dictOutputStream = null; franta-hg@15: DataOutputStream synonymOutputStream = null; franta-hg@15: DataOutputStream indexOutputStream = null; franta-hg@15: BufferedWriter infoWriter = null; franta-hg@21: franta-hg@17: SortedSet indexEntries = new TreeSet<>(); franta-hg@17: SortedSet synonymsEntries = new TreeSet<>(); franta-hg@21: franta-hg@15: try { franta-hg@15: dictOutputStream = new FileOutputStream(dictFile); franta-hg@15: synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile)); franta-hg@15: indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile)); franta-hg@15: infoWriter = new BufferedWriter(new FileWriter(infoFile)); franta-hg@21: franta-hg@15: Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml"); franta-hg@15: XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation"); franta-hg@15: // TODO: tags - labels/descriptions franta-hg@15: xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0)); franta-hg@21: franta-hg@15: long offset = 0; franta-hg@15: long conceptIndex = 0; franta-hg@15: for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) { franta-hg@15: ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream(); franta-hg@15: xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml)); franta-hg@15: int length = conceptXhtml.size(); franta-hg@15: dictOutputStream.write(conceptXhtml.toByteArray()); franta-hg@21: franta-hg@15: NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET); franta-hg@15: List names = new ArrayList<>(); franta-hg@21: franta-hg@15: for (Node nameNode : nodeIterable(nameNodes)) { franta-hg@15: String name = nameNode.getTextContent().trim(); franta-hg@15: if (!name.isEmpty()) { franta-hg@15: names.add(name); franta-hg@15: } franta-hg@15: } franta-hg@21: franta-hg@18: String baseName = names.get(0); franta-hg@18: IndexEntry indexEntry = new IndexEntry(baseName, offset, length); franta-hg@17: indexEntries.add(indexEntry); franta-hg@21: franta-hg@17: for (int i = 1; i < names.size(); i++) { franta-hg@17: String name = names.get(i); franta-hg@18: if (!baseName.equals(name)) { franta-hg@18: synonymsEntries.add(new SynonymsEntry(indexEntry, name)); franta-hg@18: } franta-hg@17: } franta-hg@21: franta-hg@15: offset = offset + length; franta-hg@15: conceptIndex++; franta-hg@15: } franta-hg@21: franta-hg@17: writeIndex(indexOutputStream, indexEntries); franta-hg@17: writeSynonyms(synonymOutputStream, synonymsEntries); franta-hg@21: franta-hg@15: indexOutputStream.flush(); franta-hg@20: writeInfo(infoWriter, sourceDocument, conceptIndex, synonymsEntries.size(), indexFile.length()); franta-hg@15: } catch (SAXException | IOException | TransformerException | XPathExpressionException e) { franta-hg@15: log.log(Level.SEVERE, "unable to generate", e); franta-hg@15: } finally { franta-hg@15: close(dictOutputStream); franta-hg@15: close(synonymOutputStream); franta-hg@15: close(indexOutputStream); franta-hg@15: close(infoWriter); franta-hg@15: } franta-hg@15: } franta-hg@21: franta-hg@17: private void writeIndex(DataOutputStream indexOutputStream, SortedSet indexEntries) throws IOException { franta-hg@17: long ordinal = 0; franta-hg@17: for (IndexEntry e : indexEntries) { franta-hg@17: e.serialize(indexOutputStream); franta-hg@17: e.setOrdinal(ordinal++); franta-hg@17: } franta-hg@15: } franta-hg@21: franta-hg@17: private void writeSynonyms(DataOutputStream synonymOutputStream, SortedSet synonymsEntries) throws IOException { franta-hg@17: for (SynonymsEntry s : synonymsEntries) { franta-hg@17: s.serialize(synonymOutputStream); franta-hg@15: } franta-hg@15: } franta-hg@21: franta-hg@15: private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException { franta-hg@15: // TODO: values from document metadata franta-hg@15: infoWriter.write("StarDict's dict ifo file\n"); franta-hg@15: infoWriter.write("version=2.4.2\n"); franta-hg@15: infoWriter.write("bookname=Free Telco Dictionary\n"); franta-hg@15: infoWriter.write("wordcount=" + wordcount + "\n"); franta-hg@15: infoWriter.write("synwordcount=" + synwourdcount + "\n"); franta-hg@15: infoWriter.write("idxfilesize=" + idxfilesize + "\n"); franta-hg@15: infoWriter.write("idxoffsetbits=32\n"); franta-hg@15: infoWriter.write("author=František Kučera\n"); franta-hg@17: infoWriter.write("em" + "ail=telco" + "-dictionary." + EML_TO_KEN + "@" + "fran" + "tovo.cz\n"); franta-hg@15: infoWriter.write("website=https://telco.frantovo.cz\n"); franta-hg@111: infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL. Check new versions at https://telco.frantovo.cz\n"); franta-hg@18: infoWriter.write("date=" + dateFormat.format(new Date()) + "\n"); franta-hg@21: infoWriter.write("sametypesequence=" + mode + "\n"); franta-hg@15: } franta-hg@21: franta-hg@13: public static void main(String[] args) { franta-hg@15: File outputFolder = new File("../../delivery/free-telco-dictionary"); franta-hg@23: outputFolder.mkdirs(); franta-hg@21: franta-hg@15: try { franta-hg@21: Generator g = new Generator(parseMode(args)); franta-hg@15: g.generate(outputFolder, "telco"); franta-hg@15: } catch (ParserConfigurationException | TransformerConfigurationException e) { franta-hg@15: log.log(Level.SEVERE, "error during initialization", e); franta-hg@15: } franta-hg@13: } franta-hg@21: franta-hg@21: private static String parseMode(String[] args) { franta-hg@21: if (args.length == 1) { franta-hg@21: return args[0]; franta-hg@21: } else { franta-hg@21: return "h"; franta-hg@21: } franta-hg@21: } franta-hg@13: }