HtmlUnitをJRubyから使ってみる
*[scrape][jruby]HtmlUnitをJRubyから使ってみる
以前に見つけた Crowbar(http://d.hatena.ne.jp/cartooh/20090616#1245164108)だと日本語が化けるので、
別の方法を探しました。
はじめからCelerityを使ってスクレイピングしてればよかったんだけど、
mechanizeで作り込んじゃったからね。。。乗り換えるのもちょっと。
なので、単一ページのみ渡してJavaScriptを実行した結果を貰えるようにしてみました。
com.gargoylesoftware.htmlunit.protocol.data.DataUrlDecoderのパッチ
dataスキーム(data: URL)の処理が文字コード(charset)の指定に対応していなかったので、適当に対応させてみました。
*** DataUrlDecoder_org.java Fri Aug 28 09:12:18 2009
--- DataUrlDecoder.java Tue Dec 22 12:19:42 2009
***************
*** 90,96 ****
      }
  
      private static String extractCharset(final String beforeData) {
!         // TODO
          return DEFAULT_CHARSET;
      }
  
--- 90,102 ----
      }
  
      private static String extractCharset(final String beforeData) {
!         if (beforeData.contains("charset=")) {
!             final String charset = StringUtils.substringAfter(beforeData, "charset=");
!             if (charset.contains(";")) {
!                 return StringUtils.substringBefore(charset, ";");
!             }
!             return charset;
!         }
          return DEFAULT_CHARSET;
      }
  
適当なラッパー
JRubyで書いてもよかったんだけどね。
import com.gargoylesoftware.htmlunit.*; import com.gargoylesoftware.htmlunit.html.*; import java.io.*; import java.util.*; import org.apache.commons.lang.StringEscapeUtils; import org.apache.commons.codec.binary.Base64; import org.apache.commons.codec.net.URLCodec; public class WebClientRb { static { System.setProperty("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.SimpleLog"); System.setProperty("org.apache.commons.logging.simplelog.showdatetime", "true"); System.setProperty("org.apache.commons.logging.simplelog.log.httpclient.wire.header", "debug"); System.setProperty("org.apache.commons.logging.simplelog.log.org.apache.commons.httpclient", "debug"); }; private static final String DEFAULT_CHARSET = "US-ASCII"; protected WebClient web_client_ = null; protected XmlSerializer xml_serializer_ = null; public WebClientRb() { this(BrowserVersion.FIREFOX_3); } public WebClientRb(final BrowserVersion browserVersion) { this(new WebClient(browserVersion)); } public WebClientRb(final BrowserVersion browserVersion, final String proxyHost, final int proxyPort) { this(new WebClient(browserVersion, proxyHost, proxyPort)); } public WebClientRb(final String proxyHost, final int proxyPort) { this(new WebClient(BrowserVersion.FIREFOX_3, proxyHost, proxyPort)); } public WebClientRb(WebClient web_client) { WebAssert.notNull("web_client", web_client); web_client_ = web_client; xml_serializer_ = new XmlSerializer(); } public String getDocument(String url) throws IOException { WebAssert.notNull("url", url); final HtmlPage page = web_client_.getPage(url); return xml_serializer_.asXml(page); } public static String generateDataURL(final String mediaType, final byte[] data) throws UnsupportedEncodingException { WebAssert.notNull("mediaType", mediaType); WebAssert.notNull("data", data); return "data:" + mediaType + ";base64," + new String(URLCodec.encodeUrl(null, Base64.encodeBase64(data)), "US-ASCII"); } public static String generateHTMLDataURL(String html) throws 
UnsupportedEncodingException { return generateHTMLDataURL(html, "utf-8"); } public static String generateHTMLDataURL(String html, String charset) throws UnsupportedEncodingException { WebAssert.notNull("html", html); WebAssert.notNull("charset", charset); return generateDataURL("text/html;charset=utf-8", html.getBytes(charset)); } public static String generateHTMLDataURL(final byte[] data, String charset) throws UnsupportedEncodingException { if(null == charset) { charset = DEFAULT_CHARSET; } return generateDataURL("text/html;charset=" + charset, data); } // テストコード public static void main(String args[]) throws Exception { final WebClientRb html_unit_rb = new WebClientRb("localhost", 8000); File f = new File("input.html"); /* byte[] data = new byte[(int)f.length()]; BufferedInputStream bis = new BufferedInputStream(new FileInputStream(f)); bis.read(data); bis.close(); */ StringBuffer html = new StringBuffer(); BufferedReader fr = new BufferedReader(new InputStreamReader(new FileInputStream(f), "x-JISAutoDetect")); String line = null; while((line = fr.readLine()) != null) { html.append(line); } fr.close(); Writer fw = new BufferedWriter(new FileWriter("output.html")); try { //fw.write(html_unit_rb.getDocument(generateHTMLDataURL("EUC-JP", data))); fw.write(html_unit_rb.getDocument(generateHTMLDataURL(html.toString()))); } finally { fw.close(); } } // com.gargoylesoftware.htmlunit.html.XmlSerializer からの流用 // (いらん処理を消したもの) protected class XmlSerializer { final StringBuilder buffer_ = new StringBuilder(); final StringBuilder indent_ = new StringBuilder(); public String asXml(final HtmlPage page) { return asXml(page.getDocumentElement()); } public String asXml(final HtmlElement node) { buffer_.setLength(0); indent_.setLength(0); String charsetName = null; if(node.getPage() instanceof HtmlPage) { charsetName = node.getPage().getPageEncoding(); } if(charsetName != null && node instanceof HtmlHtml) { buffer_.append("<?xml version=\"1.0\" 
encoding=\"").append(charsetName).append("\"?>").append('\n'); } printXml(node); final String response = buffer_.toString(); buffer_.setLength(0); return response; } protected boolean isEmptyXmlTagExpanded(final DomNode node) { return node instanceof HtmlDivision || node instanceof HtmlInlineFrame || node instanceof HtmlOrderedList || node instanceof HtmlScript || node instanceof HtmlSpan || node instanceof HtmlStyle || node instanceof HtmlTable || node instanceof HtmlTitle || node instanceof HtmlUnorderedList; } /** * Prints the content between "<" and ">" (or "/>") in the output of the tag name * and its attributes in XML format. * @param node the node whose opening tag is to be printed */ protected void printOpeningTag(final DomElement node) { buffer_.append(node.getTagName()); final Map<String, DomAttr> attributes = node.getAttributesMap(); for(final String name : attributes.keySet()) { buffer_.append(" "); buffer_.append(name); buffer_.append("=\""); buffer_.append(StringEscapeUtils.escapeXml(attributes.get(name).getNodeValue())); buffer_.append("\""); } } protected void printXml(final DomElement node) { if(isExcluded(node)) return; final boolean hasChildren = node.getFirstChild() != null; buffer_.append(indent_).append('<'); printOpeningTag(node); if(!hasChildren && !isEmptyXmlTagExpanded(node)) { buffer_.append("/>").append('\n'); return; } buffer_.append(">").append('\n'); for(DomNode child = node.getFirstChild(); child != null; child = child.getNextSibling()) { indent_.append(" "); if (child instanceof DomElement) { printXml((DomElement) child); } else { buffer_.append(child); } indent_.setLength(indent_.length() - 2); } buffer_.append(indent_).append("</").append(node.getTagName()).append('>').append('\n'); } protected boolean isExcluded(final DomElement element) { return element instanceof HtmlScript; } } }
JRubyのテストコード
CLASSPATHにHtmlUnitのライブラリを設定した上で実行
#!ruby -Ku
# Smoke test for the WebClientRb wrapper. Run under JRuby with the HtmlUnit
# libraries on CLASSPATH.
require 'java'
require 'benchmark'
require 'kconv'
require 'web_client_rb.jar'

# Bind the Java class to a Ruby constant and use that constant consistently;
# referring to a bare WebClientRb constant would raise NameError.
HtmlUnitRb = Java::WebClientRb

# Client going through the proxy at localhost:8000.
html_unit = HtmlUnitRb.new("localhost", 8000)

# Hand the page over as UTF-8, matching the charset of the generated data URL.
html = IO.read("input.html")
url = HtmlUnitRb.generateHTMLDataURL(html.toutf8)
puts html_unit.getDocument(url)
重いけどまあできた。
あとはmechanizeと一緒に使ってちゃんと動くか確認しないとね。。。