# HG changeset patch # User František Kučera # Date 1373402552 -7200 # Node ID b188eae2c092c95d1343f0c240d9f8ede7570513 # Parent 939fa8d8663ef5ebad2235a17abeeae944373898 generator: sorted index and synonyms diff -r 939fa8d8663e -r b188eae2c092 java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java --- a/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java Tue Jul 09 22:41:49 2013 +0200 +++ b/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java Tue Jul 09 22:42:32 2013 +0200 @@ -26,10 +26,10 @@ import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.charset.Charset; import java.util.ArrayList; import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; import java.util.logging.Level; import java.util.logging.Logger; import javax.xml.parsers.DocumentBuilder; @@ -66,18 +66,15 @@ public class Generator { private static final Logger log = Logger.getLogger(Generator.class.getName()); - private static final String EMAIL_TOKEN = "ixumhht68"; + private static final String EML_TO_KEN = "ixumhht68"; private final DocumentBuilderFactory documentBuilderFactory; private final DocumentBuilder documentBuilder; private final XPathFactory xpathFactory; private final XPath xpath; private final TransformerFactory xslFactory; private final Transformer xsl; - private final Charset utf8; public Generator() throws ParserConfigurationException, TransformerConfigurationException { - utf8 = Charset.forName("UTF-8"); - documentBuilderFactory = DocumentBuilderFactory.newInstance(); documentBuilderFactory.setNamespaceAware(true); documentBuilder = documentBuilderFactory.newDocumentBuilder(); @@ -101,6 +98,9 @@ DataOutputStream indexOutputStream = null; BufferedWriter infoWriter = null; + SortedSet indexEntries = new TreeSet<>(); + SortedSet synonymsEntries = new TreeSet<>(); + try { dictOutputStream = new FileOutputStream(dictFile); synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile)); @@ -112,12 +112,8 @@ // TODO: tags - labels/descriptions xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0)); - /** - * TODO: sorting - */ long offset = 0; long conceptIndex = 0; - long synonymCount = 0; for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) { ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream(); xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml)); @@ -134,15 +130,23 @@ } } - synonymCount = +writeSynonyms(synonymOutputStream, names, conceptIndex); - writeIndex(indexOutputStream, names.get(0), offset, length); + IndexEntry indexEntry = new IndexEntry(names.get(0), offset, length); + indexEntries.add(indexEntry); + + for (int i = 1; i < names.size(); i++) { + String name = names.get(i); + synonymsEntries.add(new SynonymsEntry(indexEntry, name)); + } offset = offset + length; conceptIndex++; } + writeIndex(indexOutputStream, indexEntries); + writeSynonyms(synonymOutputStream, synonymsEntries); + indexOutputStream.flush(); - writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymCount, indexFile.length()); + writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymsEntries.size(), indexFile.length()); } catch (SAXException | IOException | TransformerException | XPathExpressionException e) { log.log(Level.SEVERE, "unable to generate", e); } finally { @@ -153,24 +157,17 @@ } } - private void writeIndex(DataOutputStream indexOutputStream, String name, long offset, long length) throws IOException { - indexOutputStream.write(name.getBytes(utf8)); - indexOutputStream.write(0); - indexOutputStream.writeInt((int) offset); // unsigned int 32 - indexOutputStream.writeInt((int) length); // unsigned int 32 + private void writeIndex(DataOutputStream indexOutputStream, SortedSet indexEntries) throws IOException { + long ordinal = 0; + for (IndexEntry e : indexEntries) { + e.serialize(indexOutputStream); + e.setOrdinal(ordinal++); + } } - private int writeSynonyms(DataOutputStream synonymOutputStream, List names, long baseIndex) throws IOException { - if (names.size() > 1) { - for (int i = 1; i < names.size(); i++) { - String name = names.get(i); - synonymOutputStream.write(name.getBytes(utf8)); - synonymOutputStream.write(0); - synonymOutputStream.writeInt((int) baseIndex); // unsigned int 32 - } - return names.size() - 1; - } else { - return 0; + private void writeSynonyms(DataOutputStream synonymOutputStream, SortedSet synonymsEntries) throws IOException { + for (SynonymsEntry s : synonymsEntries) { + s.serialize(synonymOutputStream); } } @@ -184,7 +181,7 @@ infoWriter.write("idxfilesize=" + idxfilesize + "\n"); infoWriter.write("idxoffsetbits=32\n"); infoWriter.write("author=František Kučera\n"); - infoWriter.write("email=telco-dictionary." + EMAIL_TOKEN + "@" + "frantovo.cz\n"); + infoWriter.write("em" + "ail=telco" + "-dictionary." + EML_TO_KEN + "@" + "fran" + "tovo.cz\n"); infoWriter.write("website=https://telco.frantovo.cz\n"); infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL\n"); infoWriter.write("date=2013.07.09\n"); diff -r 939fa8d8663e -r b188eae2c092 java/dictionary-generator/src/cz/frantovo/telco/dictionary/IndexEntry.java --- a/java/dictionary-generator/src/cz/frantovo/telco/dictionary/IndexEntry.java Tue Jul 09 22:41:49 2013 +0200 +++ b/java/dictionary-generator/src/cz/frantovo/telco/dictionary/IndexEntry.java Tue Jul 09 22:42:32 2013 +0200 @@ -17,14 +17,69 @@ */ package cz.frantovo.telco.dictionary; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.Objects; + /** * * @author Ing. František Kučera (frantovo.cz) */ public class IndexEntry implements Comparable { + private String name; + private long offset; + private long length; + private long ordinal; + + public IndexEntry(String name, long offset, long length) { + this.name = name; + this.offset = offset; + this.length = length; + } + + public void serialize(DataOutputStream indexOutputStream) throws IOException { + indexOutputStream.write(name.getBytes(StandardCharsets.UTF_8)); + indexOutputStream.write(0); + indexOutputStream.writeInt((int) offset); // unsigned int 32 + indexOutputStream.writeInt((int) length); // unsigned int 32 + } + + public void setOrdinal(long ordinal) { + this.ordinal = ordinal; + } + + public long getOrdinal() { + return ordinal; + } + @Override public int compareTo(IndexEntry o) { - throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates. + int nameDiff = name.compareTo(o.name); + if (nameDiff == 0) { + int offsetDiff = ((Long) offset).compareTo(o.offset); + if (offsetDiff == 0) { + return ((Long) length).compareTo(o.length); + } else { + return offsetDiff; + } + } else { + return nameDiff; + } + } + + @Override + public boolean equals(Object o) { + return o instanceof IndexEntry && compareTo((IndexEntry) o) == 0; + } + + @Override + public int hashCode() { + int hash = 5; + hash = 53 * hash + Objects.hashCode(this.name); + hash = 53 * hash + (int) (this.offset ^ (this.offset >>> 32)); + hash = 53 * hash + (int) (this.length ^ (this.length >>> 32)); + return hash; } } diff -r 939fa8d8663e -r b188eae2c092 java/dictionary-generator/src/cz/frantovo/telco/dictionary/SynonymsEntry.java --- a/java/dictionary-generator/src/cz/frantovo/telco/dictionary/SynonymsEntry.java Tue Jul 09 22:41:49 2013 +0200 +++ b/java/dictionary-generator/src/cz/frantovo/telco/dictionary/SynonymsEntry.java Tue Jul 09 22:42:32 2013 +0200 @@ -17,14 +17,51 @@ */ package cz.frantovo.telco.dictionary; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.Objects; + /** * * @author Ing. František Kučera (frantovo.cz) */ public class SynonymsEntry implements Comparable { + private IndexEntry base; + private String name; + + public SynonymsEntry(IndexEntry base, String name) { + this.base = base; + this.name = name; + } + + public void serialize(DataOutputStream synonymOutputStream) throws IOException { + synonymOutputStream.write(name.getBytes(StandardCharsets.UTF_8)); + synonymOutputStream.write(0); + synonymOutputStream.writeInt((int) base.getOrdinal()); // unsigned int 32 + } + @Override public int compareTo(SynonymsEntry o) { - throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates. + int nameDiff = name.compareTo(o.name); + if (nameDiff == 0) { + return base.compareTo(o.base); + } else { + return nameDiff; + } + } + + @Override + public boolean equals(Object o) { + return o instanceof IndexEntry && compareTo((SynonymsEntry) o) == 0; + } + + @Override + public int hashCode() { + int hash = 3; + hash = 47 * hash + Objects.hashCode(this.base); + hash = 47 * hash + Objects.hashCode(this.name); + return hash; } }