1.1 --- a/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java Tue Jul 09 22:42:32 2013 +0200
1.2 +++ b/java/dictionary-generator/src/cz/frantovo/telco/dictionary/Generator.java Wed Jul 10 01:27:13 2013 +0200
1.3 @@ -26,7 +26,9 @@
1.4 import java.io.FileOutputStream;
1.5 import java.io.FileWriter;
1.6 import java.io.IOException;
1.7 +import java.text.SimpleDateFormat;
1.8 import java.util.ArrayList;
1.9 +import java.util.Date;
1.10 import java.util.List;
1.11 import java.util.SortedSet;
1.12 import java.util.TreeSet;
1.13 @@ -64,7 +66,7 @@
1.14 * @author Ing. František Kučera (frantovo.cz)
1.15 */
1.16 public class Generator {
1.17 -
1.18 +
1.19 private static final Logger log = Logger.getLogger(Generator.class.getName());
1.20 private static final String EML_TO_KEN = "ixumhht68";
1.21 private final DocumentBuilderFactory documentBuilderFactory;
1.22 @@ -73,45 +75,46 @@
1.23 private final XPath xpath;
1.24 private final TransformerFactory xslFactory;
1.25 private final Transformer xsl;
1.26 -
1.27 + private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy.MM.dd");
1.28 +
1.29 public Generator() throws ParserConfigurationException, TransformerConfigurationException {
1.30 documentBuilderFactory = DocumentBuilderFactory.newInstance();
1.31 documentBuilderFactory.setNamespaceAware(true);
1.32 documentBuilder = documentBuilderFactory.newDocumentBuilder();
1.33 -
1.34 +
1.35 xslFactory = TransformerFactory.newInstance();
1.36 xsl = xslFactory.newTransformer(new StreamSource("concept.xsl"));
1.37 -
1.38 +
1.39 xpathFactory = XPathFactory.newInstance();
1.40 xpath = xpathFactory.newXPath();
1.41 xpath.setNamespaceContext(getNamespaceContext());
1.42 }
1.43 -
1.44 +
1.45 private void generate(File folder, String filePrefix) {
1.46 File infoFile = new File(folder, filePrefix + ".ifo");
1.47 File dictFile = new File(folder, filePrefix + ".dict");
1.48 File indexFile = new File(folder, filePrefix + ".idx");
1.49 File synonymFile = new File(folder, filePrefix + ".syn");
1.50 -
1.51 +
1.52 FileOutputStream dictOutputStream = null;
1.53 DataOutputStream synonymOutputStream = null;
1.54 DataOutputStream indexOutputStream = null;
1.55 BufferedWriter infoWriter = null;
1.56 -
1.57 +
1.58 SortedSet<IndexEntry> indexEntries = new TreeSet<>();
1.59 SortedSet<SynonymsEntry> synonymsEntries = new TreeSet<>();
1.60 -
1.61 +
1.62 try {
1.63 dictOutputStream = new FileOutputStream(dictFile);
1.64 synonymOutputStream = new DataOutputStream(new FileOutputStream(synonymFile));
1.65 indexOutputStream = new DataOutputStream(new FileOutputStream(indexFile));
1.66 infoWriter = new BufferedWriter(new FileWriter(infoFile));
1.67 -
1.68 +
1.69 Document sourceDocument = documentBuilder.parse("../../data/dictionary.xml");
1.70 XPathExpression termsXPath = xpath.compile("d:term/@completeForm|d:term/@abbreviation");
1.71 // TODO: tags - labels/descriptions
1.72 xsl.setParameter("tags", sourceDocument.getElementsByTagNameNS(DICTIONARY, "tags").item(0));
1.73 -
1.74 +
1.75 long offset = 0;
1.76 long conceptIndex = 0;
1.77 for (Node conceptNode : nodeIterable(sourceDocument.getElementsByTagNameNS(DICTIONARY, "concept"))) {
1.78 @@ -119,32 +122,35 @@
1.79 xsl.transform(new DOMSource(conceptNode), new StreamResult(conceptXhtml));
1.80 int length = conceptXhtml.size();
1.81 dictOutputStream.write(conceptXhtml.toByteArray());
1.82 -
1.83 +
1.84 NodeList nameNodes = (NodeList) termsXPath.evaluate(conceptNode, XPathConstants.NODESET);
1.85 List<String> names = new ArrayList<>();
1.86 -
1.87 +
1.88 for (Node nameNode : nodeIterable(nameNodes)) {
1.89 String name = nameNode.getTextContent().trim();
1.90 if (!name.isEmpty()) {
1.91 names.add(name);
1.92 }
1.93 }
1.94 -
1.95 - IndexEntry indexEntry = new IndexEntry(names.get(0), offset, length);
1.96 +
1.97 + String baseName = names.get(0);
1.98 + IndexEntry indexEntry = new IndexEntry(baseName, offset, length);
1.99 indexEntries.add(indexEntry);
1.100 -
1.101 +
1.102 for (int i = 1; i < names.size(); i++) {
1.103 String name = names.get(i);
1.104 - synonymsEntries.add(new SynonymsEntry(indexEntry, name));
1.105 + if (!baseName.equals(name)) {
1.106 + synonymsEntries.add(new SynonymsEntry(indexEntry, name));
1.107 + }
1.108 }
1.109 -
1.110 +
1.111 offset = offset + length;
1.112 conceptIndex++;
1.113 }
1.114 -
1.115 +
1.116 writeIndex(indexOutputStream, indexEntries);
1.117 writeSynonyms(synonymOutputStream, synonymsEntries);
1.118 -
1.119 +
1.120 indexOutputStream.flush();
1.121 writeInfo(infoWriter, sourceDocument, conceptIndex + 1, synonymsEntries.size(), indexFile.length());
1.122 } catch (SAXException | IOException | TransformerException | XPathExpressionException e) {
1.123 @@ -156,7 +162,7 @@
1.124 close(infoWriter);
1.125 }
1.126 }
1.127 -
1.128 +
1.129 private void writeIndex(DataOutputStream indexOutputStream, SortedSet<IndexEntry> indexEntries) throws IOException {
1.130 long ordinal = 0;
1.131 for (IndexEntry e : indexEntries) {
1.132 @@ -164,13 +170,13 @@
1.133 e.setOrdinal(ordinal++);
1.134 }
1.135 }
1.136 -
1.137 +
1.138 private void writeSynonyms(DataOutputStream synonymOutputStream, SortedSet<SynonymsEntry> synonymsEntries) throws IOException {
1.139 for (SynonymsEntry s : synonymsEntries) {
1.140 s.serialize(synonymOutputStream);
1.141 }
1.142 }
1.143 -
1.144 +
1.145 private void writeInfo(BufferedWriter infoWriter, Document sourceDocument, long wordcount, long synwourdcount, long idxfilesize) throws IOException {
1.146 // TODO: values from document metadata
1.147 infoWriter.write("StarDict's dict ifo file\n");
1.148 @@ -184,14 +190,14 @@
1.149 infoWriter.write("em" + "ail=telco" + "-dictionary." + EML_TO_KEN + "@" + "fran" + "tovo.cz\n");
1.150 infoWriter.write("website=https://telco.frantovo.cz\n");
1.151 infoWriter.write("description=A dictionary for telecommunications licensed under GNU FDL\n");
1.152 - infoWriter.write("date=2013.07.09\n");
1.153 + infoWriter.write("date=" + dateFormat.format(new Date()) + "\n");
1.154 infoWriter.write("sametypesequence=h\n");
1.155 }
1.156 -
1.157 +
1.158 public static void main(String[] args) {
1.159 File outputFolder = new File("../../delivery/free-telco-dictionary");
1.160 outputFolder.mkdir();
1.161 -
1.162 +
1.163 try {
1.164 Generator g = new Generator();
1.165 g.generate(outputFolder, "telco");