Drupal: čištění HTML – Tidy.
author František Kučera <franta-hg@frantovo.cz>
Thu, 13 Oct 2011 03:09:22 +0200
changeset 75 41d6c0cac8b3
parent 74 e1244384cc6f
child 76 b5690fc25af6
Drupal: čištění HTML – Tidy.
src/org/sonews/storage/DrupalMessage.java
     1.1 --- a/src/org/sonews/storage/DrupalMessage.java	Wed Oct 12 23:10:31 2011 +0200
     1.2 +++ b/src/org/sonews/storage/DrupalMessage.java	Thu Oct 13 03:09:22 2011 +0200
     1.3 @@ -17,8 +17,12 @@
     1.4   */
     1.5  package org.sonews.storage;
     1.6  
     1.7 +import java.io.BufferedReader;
     1.8  import java.io.ByteArrayOutputStream;
     1.9  import java.io.IOException;
    1.10 +import java.io.InputStream;
    1.11 +import java.io.InputStreamReader;
    1.12 +import java.io.PrintStream;
    1.13  import java.io.StringReader;
    1.14  import java.io.StringWriter;
    1.15  import java.io.UnsupportedEncodingException;
    1.16 @@ -114,8 +118,22 @@
    1.17  		 */
    1.18  		try {
    1.19  			String originalText = rs.getString("text");
    1.20 -			StringReader input = new StringReader("<body>" + originalText + "</body>");
    1.21 -			StringWriter output = new StringWriter();
    1.22 +
    1.23 +			/**
    1.24 +			 * TODO: používat cache, ukládat si vygenerované články
    1.25 +			 * 
    1.26 +			 * 
    1.27 +			 * Místo markdownu jen ošetřit:
    1.28 +			 *		- odstavce
    1.29 +			 *		- nesmyslné entity v odkazech
    1.30 +			 *		- neuzavřené značky: br, hr, img
    1.31 +			 */
    1.32 +			String tidyTexy = tidyXhtml("<html><body>" + originalText + "</body></html>");
    1.33 +
    1.34 +
    1.35 +
    1.36 +			StringReader input = new StringReader(tidyTexy);
    1.37 +			StringWriter output = new StringWriter(2 * tidyTexy.length());
    1.38  			TransformerFactory tf = TransformerFactory.newInstance();
    1.39  			Transformer t = tf.newTransformer(new StreamSource(Resource.getAsStream("helpers/mimeXhtmlPart.xsl")));
    1.40  			t.setParameter("isRoot", (rs.getInt("parent_id") == 0));
    1.41 @@ -124,7 +142,7 @@
    1.42  			t.setParameter("wwwRead", rs.getString("wwwRead"));
    1.43  			t.setParameter("wwwPost", rs.getString("wwwPost"));
    1.44  			t.transform(new StreamSource(input), new StreamResult(output));
    1.45 -			
    1.46 +
    1.47  			return output.toString();
    1.48  		} catch (Exception e) {
    1.49  			/**
    1.50 @@ -135,6 +153,47 @@
    1.51  		}
    1.52  	}
    1.53  
    1.54 +	/**
    1.55 +	 * Pipes inputText through the external "tidy" command and returns what the command prints on its standard output. TODO: refactor, move elsewhere.
    1.56 +	 */
    1.57 +	private static String tidyXhtml(String inputText) throws IOException {
    1.58 +		Runtime r = Runtime.getRuntime();
    1.59 +		Process p = r.exec(new String[]{"tidy", // argument array form, so no shell parses the options
    1.60 +					"-asxml",
    1.61 +					"-numeric",
    1.62 +					"-utf8",
    1.63 +					"-quiet",
    1.64 +					"--doctype", "omit",
    1.65 +					"--logical-emphasis", "true",
    1.66 +					"--show-errors", "0"});
    1.67 +
    1.68 +		PrintStream vstupProcesu = new PrintStream(p.getOutputStream()); // the child's stdin; NOTE(review): this PrintStream encodes with the platform default charset although tidy is given -utf8 — confirm
    1.69 +		vstupProcesu.print(inputText);
    1.70 +		vstupProcesu.close(); // the whole input is written and stdin closed before any output is read
    1.71 +
    1.72 +		String outputText = streamToString(p.getInputStream()); // the child's stdout; its stderr and exit status are not read here
    1.73 +
    1.74 +		return outputText;
    1.75 +	}
    1.76 +
    1.77 +	/**
    1.78 +	 * Reads the given stream line by line until its end and returns all lines as one string, each followed by a line feed. TODO: refactor, move elsewhere.
    1.79 +	 */
    1.80 +	private static String streamToString(InputStream proud) throws IOException {
    1.81 +		StringBuilder výsledek = new StringBuilder();
    1.82 +		BufferedReader buf = new BufferedReader(new InputStreamReader(proud)); // NOTE(review): no charset given, so bytes are decoded with the platform default — confirm this matches the producer
    1.83 +		while (true) {
    1.84 +			String radek = buf.readLine();
    1.85 +			if (radek == null) { // readLine() returns null at end of stream
    1.86 +				break;
    1.87 +			} else {
    1.88 +				výsledek.append(radek);
    1.89 +				výsledek.append("\n"); // readLine() strips the terminator, so one is always appended, also after the last line
    1.90 +			}
    1.91 +		}
    1.92 +		return výsledek.toString();
    1.93 +	}
    1.94 +
    1.95  	private static String constructMessageId(int articleID, int groupID, String groupName, String domainName) {
    1.96  		StringBuilder sb = new StringBuilder();
    1.97  		sb.append("<");