java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java
changeset 18 7a2eb4cb6ff1
parent 17 b188eae2c092
child 20 aecdfc3b1950
     1.1 --- a/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java	Tue Jul 09 22:42:32 2013 +0200
     1.2 +++ b/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java	Wed Jul 10 01:27:13 2013 +0200
     1.3 @@ -26,7 +26,9 @@
     1.4  import java.io.FileOutputStream;
     1.5  import java.io.FileWriter;
     1.6  import java.io.IOException;
     1.7 +import java.text.SimpleDateFormat;
     1.8  import java.util.ArrayList;
     1.9 +import java.util.Date;
    1.10  import java.util.List;
    1.11  import java.util.SortedSet;
    1.12  import java.util.TreeSet;
    1.13 @@ -64,7 +66,7 @@
    1.14   * @author Ing. František Kučera (frantovo.cz)
    1.15   */
    1.16  public class Generator {
    1.17 -
    1.18 +	
    1.19  	private static final Logger log = Logger.getLogger(Generator.class.getName());
    1.20  	private static final String EML_TO_KEN = "ixumhht68";
    1.21  	private final DocumentBuilderFactory documentBuilderFactory;
    1.22 @@ -73,45 +75,46 @@
    1.23  	private final XPath xpath;
    1.24  	private final TransformerFactory xslFactory;
    1.25  	private final Transformer xsl;
    1.26 -
    1.27 +	private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy.MM.dd");
    1.28 +	
    1.29  	public Generator() throws ParserConfigurationException, TransformerConfigurationException {
    1.30  		documentBuilderFactory = DocumentBuilderFactory.newInstance();
    1.31  		documentBuilderFactory.setNamespaceAware(true);
    1.32  		documentBuilder = documentBuilderFactory.newDocumentBuilder();
    1.33 -
    1.34 +		
    1.35  		xslFactory = TransformerFactory.newInstance();
    1.36  		xsl = xslFactory.newTransformer(new StreamSource("concept.xsl"));
    1.37 -
    1.38 +		
    1.39  		xpathFactory = XPathFactory.newInstance();
    1.40  		xpath = xpathFactory.newXPath();
    1.41  		xpath.setNamespaceContext(getNamespaceContext());
    1.42  	}
    1.43 -
    1.44 +	
    1.45  	private void generate(File folder, String filePrefix) {
    1.46  		File infoFile = new File(folder, filePrefix + ".ifo");
    1.47  		File dictFile = new File(folder, filePrefix + ".dict");
    1.48  		File indexFile = new File(folder, filePrefix + ".idx");
    1.49  		File synonymFile = new File(folder, filePrefix + ".syn");
    1.50 -
    1.51 +		
    1.52  		FileOutputStream dictOutputStream = null;
    1.53  		DataOutputStream synonymOutputStream = null;
    1.54  		DataOutputStream indexOutputStream = null;
    1.55  		BufferedWriter infoWriter = null;
    1.56 -
    1.57 +		
    1.58  		SortedSet<IndexEntry> indexEntries = new TreeSet<>();
    1.59  		SortedSet<SynonymsEntry> synonymsEntries = new TreeSet<>();
    1.60 -
    1.61 +		
    1.62  		try {
    1.63  			dictOutputStream = new FileOutputStream(dictFile);
    1.64  			synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
    1.65  			indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile));
    1.66  			infoWriter = new BufferedWriter(new FileWriter(infoFile));
    1.67 -
    1.68 +			
    1.69  			Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml");
    1.70  			XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation");
    1.71  			// TODO: tags - labels/descriptions
    1.72  			xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
    1.73 -
    1.74 +			
    1.75  			long offset = 0;
    1.76  			long conceptIndex = 0;
    1.77  			for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
    1.78 @@ -119,32 +122,35 @@
    1.79  				xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
    1.80  				int length = conceptXhtml.size();
    1.81  				dictOutputStream.write(conceptXhtml.toByteArray());
    1.82 -
    1.83 +				
    1.84  				NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET);
    1.85  				List<String> names = new ArrayList<>();
    1.86 -
    1.87 +				
    1.88  				for (Node nameNode : nodeIterable(nameNodes)) {
    1.89  					String name = nameNode.getTextContent().trim();
    1.90  					if (!name.isEmpty()) {
    1.91  						names.add(name);
    1.92  					}
    1.93  				}
    1.94 -
    1.95 -				IndexEntry indexEntry = new IndexEntry(names.get(0), offset, length);
    1.96 +				
    1.97 +				String baseName = names.get(0);
    1.98 +				IndexEntry indexEntry = new IndexEntry(baseName, offset, length);
    1.99  				indexEntries.add(indexEntry);
   1.100 -
   1.101 +				
   1.102  				for (int i = 1; i < names.size(); i++) {
   1.103  					String name = names.get(i);
   1.104 -					synonymsEntries.add(new SynonymsEntry(indexEntry, name));
   1.105 +					if (!baseName.equals(name)) {
   1.106 +						synonymsEntries.add(new SynonymsEntry(indexEntry, name));
   1.107 +					}
   1.108  				}
   1.109 -
   1.110 +				
   1.111  				offset = offset + length;
   1.112  				conceptIndex++;
   1.113  			}
   1.114 -
   1.115 +			
   1.116  			writeIndex(indexOutputStream, indexEntries);
   1.117  			writeSynonyms(synonymOutputStream, synonymsEntries);
   1.118 -
   1.119 +			
   1.120  			indexOutputStream.flush();
   1.121  			writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymsEntries.size(), indexFile.length());
   1.122  		} catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
   1.123 @@ -156,7 +162,7 @@
   1.124  			close(infoWriter);
   1.125  		}
   1.126  	}
   1.127 -
   1.128 +	
   1.129  	private void writeIndex(DataOutputStream indexOutputStream, SortedSet<IndexEntry> indexEntries) throws IOException {
   1.130  		long ordinal = 0;
   1.131  		for (IndexEntry e : indexEntries) {
   1.132 @@ -164,13 +170,13 @@
   1.133  			e.setOrdinal(ordinal++);
   1.134  		}
   1.135  	}
   1.136 -
   1.137 +	
   1.138  	private void writeSynonyms(DataOutputStream synonymOutputStream, SortedSet<SynonymsEntry> synonymsEntries) throws IOException {
   1.139  		for (SynonymsEntry s : synonymsEntries) {
   1.140  			s.serialize(synonymOutputStream);
   1.141  		}
   1.142  	}
   1.143 -
   1.144 +	
   1.145  	private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException {
   1.146  		// TODO: values from document metadata
   1.147  		infoWriter.write("StarDict's dict ifo file\n");
   1.148 @@ -184,14 +190,14 @@
   1.149  		infoWriter.write("em" + "ail=telco" + "-dictionary." + EML_TO_KEN + "@" + "fran" + "tovo.cz\n");
   1.150  		infoWriter.write("website=https://telco.frantovo.cz\n");
   1.151  		infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL\n");
   1.152 -		infoWriter.write("date=2013.07.09\n");
   1.153 +		infoWriter.write("date=" + dateFormat.format(new Date()) + "\n");
   1.154  		infoWriter.write("sametypesequence=h\n");
   1.155  	}
   1.156 -
   1.157 +	
   1.158  	public static void main(String[] args) {
   1.159  		File outputFolder = new File("../../delivery/free-telco-dictionary");
   1.160  		outputFolder.mkdir();
   1.161 -
   1.162 +		
   1.163  		try {
   1.164  			Generator g = new Generator();
   1.165  			g.generate(outputFolder, "telco");