diff -r b188eae2c092 -r 7a2eb4cb6ff1 java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java --- a/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java Tue Jul 09 22:42:32 2013 +0200 +++ b/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java Wed Jul 10 01:27:13 2013 +0200 @@ -26,7 +26,9 @@ import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; +import java.text.SimpleDateFormat; import java.util.ArrayList; +import java.util.Date; import java.util.List; import java.util.SortedSet; import java.util.TreeSet; @@ -64,7 +66,7 @@ * @author Ing. František Kučera (frantovo.cz) */ public class Generator { - + private static final Logger log = Logger.getLogger(Generator.class.getName()); private static final String EML_TO_KEN = "ixumhht68"; private final DocumentBuilderFactory documentBuilderFactory; @@ -73,45 +75,46 @@ private final XPath xpath; private final TransformerFactory xslFactory; private final Transformer xsl; - + private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy.MM.dd"); + public Generator() throws ParserConfigurationException, TransformerConfigurationException { documentBuilderFactory = DocumentBuilderFactory.newInstance(); documentBuilderFactory.setNamespaceAware(true); documentBuilder = documentBuilderFactory.newDocumentBuilder(); - + xslFactory = TransformerFactory.newInstance(); xsl = xslFactory.newTransformer(new StreamSource("concept.xsl")); - + xpathFactory = XPathFactory.newInstance(); xpath = xpathFactory.newXPath(); xpath.setNamespaceContext(getNamespaceContext()); } - + private void generate(File folder, String filePrefix) { File infoFile = new File(folder, filePrefix + ".ifo"); File dictFile = new File(folder, filePrefix + ".dict"); File indexFile = new File(folder, filePrefix + ".idx"); File synonymFile = new File(folder, filePrefix + ".syn"); - + FileOutputStream dictOutputStream = null; DataOutputStream synonymOutputStream = null; DataOutputStream indexOutputStream = null; BufferedWriter infoWriter = null; - + SortedSet indexEntries = new TreeSet<>(); SortedSet synonymsEntries = new TreeSet<>(); - + try { dictOutputStream = new FileOutputStream(dictFile); synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile)); indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile)); infoWriter = new BufferedWriter(new FileWriter(infoFile)); - + Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml"); XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation"); // TODO: tags - labels/descriptions xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0)); - + long offset = 0; long conceptIndex = 0; for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) { @@ -119,32 +122,35 @@ xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml)); int length = conceptXhtml.size(); dictOutputStream.write(conceptXhtml.toByteArray()); - + NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET); List names = new ArrayList<>(); - + for (Node nameNode : nodeIterable(nameNodes)) { String name = nameNode.getTextContent().trim(); if (!name.isEmpty()) { names.add(name); } } - - IndexEntry indexEntry = new IndexEntry(names.get(0), offset, length); + + String baseName = names.get(0); + IndexEntry indexEntry = new IndexEntry(baseName, offset, length); indexEntries.add(indexEntry); - + for (int i = 1; i < names.size(); i++) { String name = names.get(i); - synonymsEntries.add(new SynonymsEntry(indexEntry, name)); + if (!baseName.equals(name)) { + synonymsEntries.add(new SynonymsEntry(indexEntry, name)); + } } - + offset = offset + length; conceptIndex++; } - + writeIndex(indexOutputStream, indexEntries); writeSynonyms(synonymOutputStream, synonymsEntries); - + indexOutputStream.flush(); writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymsEntries.size(), indexFile.length()); } catch (SAXException | IOException | TransformerException | XPathExpressionException e) { @@ -156,7 +162,7 @@ close(infoWriter); } } - + private void writeIndex(DataOutputStream indexOutputStream, SortedSet indexEntries) throws IOException { long ordinal = 0; for (IndexEntry e : indexEntries) { @@ -164,13 +170,13 @@ e.setOrdinal(ordinal++); } } - + private void writeSynonyms(DataOutputStream synonymOutputStream, SortedSet synonymsEntries) throws IOException { for (SynonymsEntry s : synonymsEntries) { s.serialize(synonymOutputStream); } } - + private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException { // TODO: values from document metadata infoWriter.write("StarDict's dict ifo file\n"); @@ -184,14 +190,14 @@ infoWriter.write("em" + "ail=telco" + "-dictionary." + EML_TO_KEN + "@" + "fran" + "tovo.cz\n"); infoWriter.write("website=https://telco.frantovo.cz\n"); infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL\n"); - infoWriter.write("date=2013.07.09\n"); + infoWriter.write("date=" + dateFormat.format(new Date()) + "\n"); infoWriter.write("sametypesequence=h\n"); } - + public static void main(String[] args) { File outputFolder = new File("../../delivery/free-telco-dictionary"); outputFolder.mkdir(); - + try { Generator g = new Generator(); g.generate(outputFolder, "telco");