1.1 --- a/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java Tue Jul 09 22:41:49 2013 +0200
1.2 +++ b/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java Tue Jul 09 22:42:32 2013 +0200
1.3 @@ -26,10 +26,10 @@
1.4 import java.io.FileOutputStream;
1.5 import java.io.FileWriter;
1.6 import java.io.IOException;
1.7 -import java.nio.ByteBuffer;
1.8 -import java.nio.charset.Charset;
1.9 import java.util.ArrayList;
1.10 import java.util.List;
1.11 +import java.util.SortedSet;
1.12 +import java.util.TreeSet;
1.13 import java.util.logging.Level;
1.14 import java.util.logging.Logger;
1.15 import javax.xml.parsers.DocumentBuilder;
1.16 @@ -66,18 +66,15 @@
1.17 public class Generator {
1.18
1.19 private static final Logger log = Logger.getLogger(Generator.class.getName());
1.20 - private static final String EMAIL_TOKEN = "ixumhht68";
1.21 + private static final String EML_TO_KEN = "ixumhht68";
1.22 private final DocumentBuilderFactory documentBuilderFactory;
1.23 private final DocumentBuilder documentBuilder;
1.24 private final XPathFactory xpathFactory;
1.25 private final XPath xpath;
1.26 private final TransformerFactory xslFactory;
1.27 private final Transformer xsl;
1.28 - private final Charset utf8;
1.29
1.30 public Generator() throws ParserConfigurationException, TransformerConfigurationException {
1.31 - utf8 = Charset.forName("UTF-8");
1.32 -
1.33 documentBuilderFactory = DocumentBuilderFactory.newInstance();
1.34 documentBuilderFactory.setNamespaceAware(true);
1.35 documentBuilder = documentBuilderFactory.newDocumentBuilder();
1.36 @@ -101,6 +98,9 @@
1.37 DataOutputStream indexOutputStream = null;
1.38 BufferedWriter infoWriter = null;
1.39
1.40 + SortedSet<IndexEntry> indexEntries = new TreeSet<>();
1.41 + SortedSet<SynonymsEntry> synonymsEntries = new TreeSet<>();
1.42 +
1.43 try {
1.44 dictOutputStream = new FileOutputStream(dictFile);
1.45 synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
1.46 @@ -112,12 +112,8 @@
1.47 // TODO: tags - labels/descriptions
1.48 xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
1.49
1.50 - /**
1.51 - * TODO: sorting
1.52 - */
1.53 long offset = 0;
1.54 long conceptIndex = 0;
1.55 - long synonymCount = 0;
1.56 for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
1.57 ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream();
1.58 xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
1.59 @@ -134,15 +130,23 @@
1.60 }
1.61 }
1.62
1.63 - synonymCount = +writeSynonyms(synonymOutputStream, names, conceptIndex);
1.64 - writeIndex(indexOutputStream, names.get(0), offset, length);
1.65 + IndexEntry indexEntry = new IndexEntry(names.get(0), offset, length);
1.66 + indexEntries.add(indexEntry);
1.67 +
1.68 + for (int i = 1; i < names.size(); i++) {
1.69 + String name = names.get(i);
1.70 + synonymsEntries.add(new SynonymsEntry(indexEntry, name));
1.71 + }
1.72
1.73 offset = offset + length;
1.74 conceptIndex++;
1.75 }
1.76
1.77 + writeIndex(indexOutputStream, indexEntries);
1.78 + writeSynonyms(synonymOutputStream, synonymsEntries);
1.79 +
1.80 indexOutputStream.flush();
1.81 - writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymCount, indexFile.length());
1.82 + writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymsEntries.size(), indexFile.length());
1.83 } catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
1.84 log.log(Level.SEVERE, "unable to generate", e);
1.85 } finally {
1.86 @@ -153,24 +157,17 @@
1.87 }
1.88 }
1.89
1.90 - private void writeIndex(DataOutputStream indexOutputStream, String name, long offset, long length) throws IOException {
1.91 - indexOutputStream.write(name.getBytes(utf8));
1.92 - indexOutputStream.write(0);
1.93 - indexOutputStream.writeInt((int) offset); // unsigned int 32
1.94 - indexOutputStream.writeInt((int) length); // unsigned int 32
1.95 + private void writeIndex(DataOutputStream indexOutputStream, SortedSet<IndexEntry> indexEntries) throws IOException {
1.96 + long ordinal = 0;
1.97 + for (IndexEntry e : indexEntries) {
1.98 + e.serialize(indexOutputStream);
1.99 + e.setOrdinal(ordinal++);
1.100 + }
1.101 }
1.102
1.103 - private int writeSynonyms(DataOutputStream synonymOutputStream, List<String> names, long baseIndex) throws IOException {
1.104 - if (names.size() > 1) {
1.105 - for (int i = 1; i < names.size(); i++) {
1.106 - String name = names.get(i);
1.107 - synonymOutputStream.write(name.getBytes(utf8));
1.108 - synonymOutputStream.write(0);
1.109 - synonymOutputStream.writeInt((int) baseIndex); // unsigned int 32
1.110 - }
1.111 - return names.size() - 1;
1.112 - } else {
1.113 - return 0;
1.114 + private void writeSynonyms(DataOutputStream synonymOutputStream, SortedSet<SynonymsEntry> synonymsEntries) throws IOException {
1.115 + for (SynonymsEntry s : synonymsEntries) {
1.116 + s.serialize(synonymOutputStream);
1.117 }
1.118 }
1.119
1.120 @@ -184,7 +181,7 @@
1.121 infoWriter.write("idxfilesize=" + idxfilesize + "\n");
1.122 infoWriter.write("idxoffsetbits=32\n");
1.123 infoWriter.write("author=František Kučera\n");
1.124 - infoWriter.write("email=telco-dictionary." + EMAIL_TOKEN + "@" + "frantovo.cz\n");
1.125 + infoWriter.write("em" + "ail=telco" + "-dictionary." + EML_TO_KEN + "@" + "fran" + "tovo.cz\n");
1.126 infoWriter.write("website=https://telco.frantovo.cz\n");
1.127 infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL\n");
1.128 infoWriter.write("date=2013.07.09\n");