generator: sorted index and synonyms
author František Kučera <franta-hg@frantovo.cz>
Tue, 09 Jul 2013 22:42:32 +0200
changeset 17 b188eae2c092
parent 16 939fa8d8663e
child 18 7a2eb4cb6ff1
generator: sorted index and synonyms
java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java
java/dictionary-generator/src/cz/frantovo/telco/dictionary/IndexEntry.java
java/dictionary-generator/src/cz/frantovo/telco/dictionary/SynonymsEntry.java
     1.1 --- a/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java	Tue Jul 09 22:41:49 2013 +0200
     1.2 +++ b/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java	Tue Jul 09 22:42:32 2013 +0200
     1.3 @@ -26,10 +26,10 @@
     1.4  import java.io.FileOutputStream;
     1.5  import java.io.FileWriter;
     1.6  import java.io.IOException;
     1.7 -import java.nio.ByteBuffer;
     1.8 -import java.nio.charset.Charset;
     1.9  import java.util.ArrayList;
    1.10  import java.util.List;
    1.11 +import java.util.SortedSet;
    1.12 +import java.util.TreeSet;
    1.13  import java.util.logging.Level;
    1.14  import java.util.logging.Logger;
    1.15  import javax.xml.parsers.DocumentBuilder;
    1.16 @@ -66,18 +66,15 @@
    1.17  public class Generator {
    1.18  
    1.19  	private static final Logger log = Logger.getLogger(Generator.class.getName());
    1.20 -	private static final String EMAIL_TOKEN = "ixumhht68";
    1.21 +	private static final String EML_TO_KEN = "ixumhht68";
    1.22  	private final DocumentBuilderFactory documentBuilderFactory;
    1.23  	private final DocumentBuilder documentBuilder;
    1.24  	private final XPathFactory xpathFactory;
    1.25  	private final XPath xpath;
    1.26  	private final TransformerFactory xslFactory;
    1.27  	private final Transformer xsl;
    1.28 -	private final Charset utf8;
    1.29  
    1.30  	public Generator() throws ParserConfigurationException, TransformerConfigurationException {
    1.31 -		utf8 = Charset.forName("UTF-8");
    1.32 -
    1.33  		documentBuilderFactory = DocumentBuilderFactory.newInstance();
    1.34  		documentBuilderFactory.setNamespaceAware(true);
    1.35  		documentBuilder = documentBuilderFactory.newDocumentBuilder();
    1.36 @@ -101,6 +98,9 @@
    1.37  		DataOutputStream indexOutputStream = null;
    1.38  		BufferedWriter infoWriter = null;
    1.39  
    1.40 +		SortedSet<IndexEntry> indexEntries = new TreeSet<>();
    1.41 +		SortedSet<SynonymsEntry> synonymsEntries = new TreeSet<>();
    1.42 +
    1.43  		try {
    1.44  			dictOutputStream = new FileOutputStream(dictFile);
    1.45  			synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
    1.46 @@ -112,12 +112,8 @@
    1.47  			// TODO: tags - labels/descriptions
    1.48  			xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
    1.49  
    1.50 -			/**
    1.51 -			 * TODO: sorting
    1.52 -			 */
    1.53  			long offset = 0;
    1.54  			long conceptIndex = 0;
    1.55 -			long synonymCount = 0;
    1.56  			for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
    1.57  				ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream();
    1.58  				xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
    1.59 @@ -134,15 +130,23 @@
    1.60  					}
    1.61  				}
    1.62  
    1.63 -				synonymCount = +writeSynonyms(synonymOutputStream, names, conceptIndex);
    1.64 -				writeIndex(indexOutputStream, names.get(0), offset, length);
    1.65 +				IndexEntry indexEntry = new IndexEntry(names.get(0), offset, length);
    1.66 +				indexEntries.add(indexEntry);
    1.67 +
    1.68 +				for (int i = 1; i < names.size(); i++) {
    1.69 +					String name = names.get(i);
    1.70 +					synonymsEntries.add(new SynonymsEntry(indexEntry, name));
    1.71 +				}
    1.72  
    1.73  				offset = offset + length;
    1.74  				conceptIndex++;
    1.75  			}
    1.76  
    1.77 +			writeIndex(indexOutputStream, indexEntries);
    1.78 +			writeSynonyms(synonymOutputStream, synonymsEntries);
    1.79 +
    1.80  			indexOutputStream.flush();
    1.81 -			writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymCount, indexFile.length());
    1.82 +			writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymsEntries.size(), indexFile.length());
    1.83  		} catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
    1.84  			log.log(Level.SEVERE, "unable to generate", e);
    1.85  		} finally {
    1.86 @@ -153,24 +157,17 @@
    1.87  		}
    1.88  	}
    1.89  
    1.90 -	private void writeIndex(DataOutputStream indexOutputStream, String name, long offset, long length) throws IOException {
    1.91 -		indexOutputStream.write(name.getBytes(utf8));
    1.92 -		indexOutputStream.write(0);
    1.93 -		indexOutputStream.writeInt((int) offset); // unsigned int 32
    1.94 -		indexOutputStream.writeInt((int) length); // unsigned int 32
     1.95 +	private void writeIndex(DataOutputStream indexOutputStream, SortedSet<IndexEntry> indexEntries) throws IOException { // serializes every entry in the iteration order of the sorted set
     1.96 +		long ordinal = 0; // zero-based position of the current entry in the written index
     1.97 +		for (IndexEntry e : indexEntries) {
     1.98 +			e.serialize(indexOutputStream);
     1.99 +			e.setOrdinal(ordinal++); // stored on the entry; SynonymsEntry.serialize() reads it back through getOrdinal()
    1.100 +		}
    1.101  	}
   1.102  
   1.103 -	private int writeSynonyms(DataOutputStream synonymOutputStream, List<String> names, long baseIndex) throws IOException {
   1.104 -		if (names.size() > 1) {
   1.105 -			for (int i = 1; i < names.size(); i++) {
   1.106 -				String name = names.get(i);
   1.107 -				synonymOutputStream.write(name.getBytes(utf8));
   1.108 -				synonymOutputStream.write(0);
   1.109 -				synonymOutputStream.writeInt((int) baseIndex); // unsigned int 32
   1.110 -			}
   1.111 -			return names.size() - 1;
   1.112 -		} else {
   1.113 -			return 0;
    1.114 +	private void writeSynonyms(DataOutputStream synonymOutputStream, SortedSet<SynonymsEntry> synonymsEntries) throws IOException { // serializes every synonym in the iteration order of the sorted set
    1.115 +		for (SynonymsEntry s : synonymsEntries) {
    1.116 +			s.serialize(synonymOutputStream); // writes the ordinal of the base IndexEntry, so writeIndex() has to run before this method
    1.117  		}
    1.118  	}
   1.119  
   1.120 @@ -184,7 +181,7 @@
   1.121  		infoWriter.write("idxfilesize=" + idxfilesize + "\n");
   1.122  		infoWriter.write("idxoffsetbits=32\n");
   1.123  		infoWriter.write("author=František Kučera\n");
   1.124 -		infoWriter.write("email=telco-dictionary." + EMAIL_TOKEN + "@" + "frantovo.cz\n");
   1.125 +		infoWriter.write("em" + "ail=telco" + "-dictionary." + EML_TO_KEN + "@" + "fran" + "tovo.cz\n");
   1.126  		infoWriter.write("website=https://telco.frantovo.cz\n");
   1.127  		infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL\n");
   1.128  		infoWriter.write("date=2013.07.09\n");
     2.1 --- a/java/dictionary-generator/src/cz/frantovo/telco/dictionary/IndexEntry.java	Tue Jul 09 22:41:49 2013 +0200
     2.2 +++ b/java/dictionary-generator/src/cz/frantovo/telco/dictionary/IndexEntry.java	Tue Jul 09 22:42:32 2013 +0200
     2.3 @@ -17,14 +17,69 @@
     2.4   */
     2.5  package cz.frantovo.telco.dictionary;
     2.6  
     2.7 +import java.io.DataOutputStream;
     2.8 +import java.io.IOException;
     2.9 +import java.nio.charset.StandardCharsets;
    2.10 +import java.util.Objects;
    2.11 +
    2.12  /**
    2.13   *
    2.14   * @author Ing. František Kučera (frantovo.cz)
    2.15   */
    2.16  public class IndexEntry implements Comparable<IndexEntry> {
    2.17  
    2.18 +	private String name;
    2.19 +	private long offset;
    2.20 +	private long length;
    2.21 +	private long ordinal;
    2.22 +
    2.23 +	public IndexEntry(String name, long offset, long length) {
    2.24 +		this.name = name;
    2.25 +		this.offset = offset;
    2.26 +		this.length = length;
    2.27 +	}
    2.28 +
    2.29 +	public void serialize(DataOutputStream indexOutputStream) throws IOException {
    2.30 +		indexOutputStream.write(name.getBytes(StandardCharsets.UTF_8));
    2.31 +		indexOutputStream.write(0);
    2.32 +		indexOutputStream.writeInt((int) offset); // unsigned int 32
    2.33 +		indexOutputStream.writeInt((int) length); // unsigned int 32
    2.34 +	}
    2.35 +
    2.36 +	public void setOrdinal(long ordinal) {
    2.37 +		this.ordinal = ordinal;
    2.38 +	}
    2.39 +
    2.40 +	public long getOrdinal() {
    2.41 +		return ordinal;
    2.42 +	}
    2.43 +
    2.44  	@Override
    2.45  	public int compareTo(IndexEntry o) {
    2.46 -		throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
    2.47 +		int nameDiff = name.compareTo(o.name);
    2.48 +		if (nameDiff == 0) {
    2.49 +			int offsetDiff = ((Long) offset).compareTo(o.offset);
    2.50 +			if (offsetDiff == 0) {
    2.51 +				return ((Long) length).compareTo(o.length);
    2.52 +			} else {
    2.53 +				return offsetDiff;
    2.54 +			}
    2.55 +		} else {
    2.56 +			return nameDiff;
    2.57 +		}
    2.58 +	}
    2.59 +
    2.60 +	@Override
    2.61 +	public boolean equals(Object o) {
    2.62 +		return o instanceof IndexEntry && compareTo((IndexEntry) o) == 0;
    2.63 +	}
    2.64 +
    2.65 +	@Override
    2.66 +	public int hashCode() {
    2.67 +		int hash = 5;
    2.68 +		hash = 53 * hash + Objects.hashCode(this.name);
    2.69 +		hash = 53 * hash + (int) (this.offset ^ (this.offset >>> 32));
    2.70 +		hash = 53 * hash + (int) (this.length ^ (this.length >>> 32));
    2.71 +		return hash;
    2.72  	}
    2.73  }
     3.1 --- a/java/dictionary-generator/src/cz/frantovo/telco/dictionary/SynonymsEntry.java	Tue Jul 09 22:41:49 2013 +0200
     3.2 +++ b/java/dictionary-generator/src/cz/frantovo/telco/dictionary/SynonymsEntry.java	Tue Jul 09 22:42:32 2013 +0200
     3.3 @@ -17,14 +17,51 @@
     3.4   */
     3.5  package cz.frantovo.telco.dictionary;
     3.6  
     3.7 +import java.io.DataOutputStream;
     3.8 +import java.io.IOException;
     3.9 +import java.nio.charset.StandardCharsets;
    3.10 +import java.util.Objects;
    3.11 +
    3.12  /**
    3.13   *
    3.14   * @author Ing. František Kučera (frantovo.cz)
    3.15   */
    3.16  public class SynonymsEntry implements Comparable<SynonymsEntry> {
    3.17  
    3.18 +	private IndexEntry base;
    3.19 +	private String name;
    3.20 +
    3.21 +	public SynonymsEntry(IndexEntry base, String name) {
    3.22 +		this.base = base;
    3.23 +		this.name = name;
    3.24 +	}
    3.25 +
    3.26 +	public void serialize(DataOutputStream synonymOutputStream) throws IOException {
    3.27 +		synonymOutputStream.write(name.getBytes(StandardCharsets.UTF_8));
    3.28 +		synonymOutputStream.write(0);
    3.29 +		synonymOutputStream.writeInt((int) base.getOrdinal()); // unsigned int 32
    3.30 +	}
    3.31 +
    3.32  	@Override
    3.33  	public int compareTo(SynonymsEntry o) {
    3.34 -		throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
    3.35 +		int nameDiff = name.compareTo(o.name);
    3.36 +		if (nameDiff == 0) {
    3.37 +			return base.compareTo(o.base);
    3.38 +		} else {
    3.39 +			return nameDiff;
    3.40 +		}
    3.41 +	}
    3.42 +
    3.43 +	@Override
    3.44 +	public boolean equals(Object o) {
    3.45 +		return o instanceof IndexEntry && compareTo((SynonymsEntry) o) == 0;
    3.46 +	}
    3.47 +
    3.48 +	@Override
    3.49 +	public int hashCode() {
    3.50 +		int hash = 3;
    3.51 +		hash = 47 * hash + Objects.hashCode(this.base);
    3.52 +		hash = 47 * hash + Objects.hashCode(this.name);
    3.53 +		return hash;
    3.54  	}
    3.55  }