java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java
changeset 17 b188eae2c092
parent 15 93208f791318
child 18 7a2eb4cb6ff1
     1.1 --- a/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java	Tue Jul 09 22:41:49 2013 +0200
     1.2 +++ b/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java	Tue Jul 09 22:42:32 2013 +0200
     1.3 @@ -26,10 +26,10 @@
     1.4  import java.io.FileOutputStream;
     1.5  import java.io.FileWriter;
     1.6  import java.io.IOException;
     1.7 -import java.nio.ByteBuffer;
     1.8 -import java.nio.charset.Charset;
     1.9  import java.util.ArrayList;
    1.10  import java.util.List;
    1.11 +import java.util.SortedSet;
    1.12 +import java.util.TreeSet;
    1.13  import java.util.logging.Level;
    1.14  import java.util.logging.Logger;
    1.15  import javax.xml.parsers.DocumentBuilder;
    1.16 @@ -66,18 +66,15 @@
    1.17  public class Generator {
    1.18  
    1.19  	private static final Logger log = Logger.getLogger(Generator.class.getName());
    1.20 -	private static final String EMAIL_TOKEN = "ixumhht68";
    1.21 +	private static final String EML_TO_KEN = "ixumhht68";
    1.22  	private final DocumentBuilderFactory documentBuilderFactory;
    1.23  	private final DocumentBuilder documentBuilder;
    1.24  	private final XPathFactory xpathFactory;
    1.25  	private final XPath xpath;
    1.26  	private final TransformerFactory xslFactory;
    1.27  	private final Transformer xsl;
    1.28 -	private final Charset utf8;
    1.29  
    1.30  	public Generator() throws ParserConfigurationException, TransformerConfigurationException {
    1.31 -		utf8 = Charset.forName("UTF-8");
    1.32 -
    1.33  		documentBuilderFactory = DocumentBuilderFactory.newInstance();
    1.34  		documentBuilderFactory.setNamespaceAware(true);
    1.35  		documentBuilder = documentBuilderFactory.newDocumentBuilder();
    1.36 @@ -101,6 +98,9 @@
    1.37  		DataOutputStream indexOutputStream = null;
    1.38  		BufferedWriter infoWriter = null;
    1.39  
    1.40 +		SortedSet<IndexEntry> indexEntries = new TreeSet<>();
    1.41 +		SortedSet<SynonymsEntry> synonymsEntries = new TreeSet<>();
    1.42 +
    1.43  		try {
    1.44  			dictOutputStream = new FileOutputStream(dictFile);
    1.45  			synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
    1.46 @@ -112,12 +112,8 @@
    1.47  			// TODO: tags - labels/descriptions
    1.48  			xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
    1.49  
    1.50 -			/**
    1.51 -			 * TODO: sorting
    1.52 -			 */
    1.53  			long offset = 0;
    1.54  			long conceptIndex = 0;
    1.55 -			long synonymCount = 0;
    1.56  			for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
    1.57  				ByteArrayOutputStream conceptXhtml = new ByteArrayOutputStream();
    1.58  				xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
    1.59 @@ -134,15 +130,23 @@
    1.60  					}
    1.61  				}
    1.62  
    1.63 -				synonymCount = +writeSynonyms(synonymOutputStream, names, conceptIndex);
    1.64 -				writeIndex(indexOutputStream, names.get(0), offset, length);
    1.65 +				IndexEntry indexEntry = new IndexEntry(names.get(0), offset, length);
    1.66 +				indexEntries.add(indexEntry);
    1.67 +
    1.68 +				for (int i = 1; i < names.size(); i++) {
    1.69 +					String name = names.get(i);
    1.70 +					synonymsEntries.add(new SynonymsEntry(indexEntry, name));
    1.71 +				}
    1.72  
    1.73  				offset = offset + length;
    1.74  				conceptIndex++;
    1.75  			}
    1.76  
    1.77 +			writeIndex(indexOutputStream, indexEntries);
    1.78 +			writeSynonyms(synonymOutputStream, synonymsEntries);
    1.79 +
    1.80  			indexOutputStream.flush();
    1.81 -			writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymCount, indexFile.length());
    1.82 +			writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymsEntries.size(), indexFile.length());
    1.83  		} catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
    1.84  			log.log(Level.SEVERE, "unable to generate", e);
    1.85  		} finally {
    1.86 @@ -153,24 +157,17 @@
    1.87  		}
    1.88  	}
    1.89  
    1.90 -	private void writeIndex(DataOutputStream indexOutputStream, String name, long offset, long length) throws IOException {
    1.91 -		indexOutputStream.write(name.getBytes(utf8));
    1.92 -		indexOutputStream.write(0);
    1.93 -		indexOutputStream.writeInt((int) offset); // unsigned int 32
    1.94 -		indexOutputStream.writeInt((int) length); // unsigned int 32
    1.95 +	private void writeIndex(DataOutputStream indexOutputStream, SortedSet<IndexEntry> indexEntries) throws IOException {
    1.96 +		long ordinal = 0;
    1.97 +		for (IndexEntry e : indexEntries) {
    1.98 +			e.serialize(indexOutputStream);
    1.99 +			e.setOrdinal(ordinal++);
   1.100 +		}
   1.101  	}
   1.102  
   1.103 -	private int writeSynonyms(DataOutputStream synonymOutputStream, List<String> names, long baseIndex) throws IOException {
   1.104 -		if (names.size() > 1) {
   1.105 -			for (int i = 1; i < names.size(); i++) {
   1.106 -				String name = names.get(i);
   1.107 -				synonymOutputStream.write(name.getBytes(utf8));
   1.108 -				synonymOutputStream.write(0);
   1.109 -				synonymOutputStream.writeInt((int) baseIndex); // unsigned int 32
   1.110 -			}
   1.111 -			return names.size() - 1;
   1.112 -		} else {
   1.113 -			return 0;
   1.114 +	private void writeSynonyms(DataOutputStream synonymOutputStream, SortedSet<SynonymsEntry> synonymsEntries) throws IOException {
   1.115 +		for (SynonymsEntry s : synonymsEntries) {
   1.116 +			s.serialize(synonymOutputStream);
   1.117  		}
   1.118  	}
   1.119  
   1.120 @@ -184,7 +181,7 @@
   1.121  		infoWriter.write("idxfilesize=" + idxfilesize + "\n");
   1.122  		infoWriter.write("idxoffsetbits=32\n");
   1.123  		infoWriter.write("author=František Kučera\n");
   1.124 -		infoWriter.write("email=telco-dictionary." + EMAIL_TOKEN + "@" + "frantovo.cz\n");
   1.125 +		infoWriter.write("em" + "ail=telco" + "-dictionary." + EML_TO_KEN + "@" + "fran" + "tovo.cz\n");
   1.126  		infoWriter.write("website=https://telco.frantovo.cz\n");
   1.127  		infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL\n");
   1.128  		infoWriter.write("date=2013.07.09\n");