HtmlUnitをJRubyから使ってみる

＊[scrape][jruby]HtmlUnitをJRubyから使ってみる

以前に見つけた Crowbar(http://d.hatena.ne.jp/cartooh/20090616#1245164108)だと日本語が化けるので、
別の方法を探しました。

HtmlUnit: http://htmlunit.sourceforge.net/
Celerity: http://celerity.rubyforge.org/

はじめからCelerityを使ってスクレイピングしてればよかったんだけど、
mechanizeで作り込んじゃったからね。。。乗り換えるのもちょっと。

なので、単一ページのみ渡してJavaScriptを実行した結果を貰えるようにしてみました。

com.gargoylesoftware.htmlunit.protocol.data.DataUrlDecoderのパッチ

dataスキーム（data: URL）のデコード処理が文字コード（charset指定）に対応していなかったので、適当に対応させてみました。

*** DataUrlDecoder_org.java	Fri Aug 28 09:12:18 2009
--- DataUrlDecoder.java	Tue Dec 22 12:19:42 2009
***************
*** 90,96 ****
      }
  
      private static String extractCharset(final String beforeData) {
!         // TODO
          return DEFAULT_CHARSET;
      }
  
--- 90,102 ----
      }
  
      private static String extractCharset(final String beforeData) {
!         if (beforeData.contains("charset=")) {
!           final String charset = StringUtils.substringAfter(beforeData, "charset=");
!           if (charset.contains(";")) {
!                 return StringUtils.substringBefore(charset, ";");
!             }
!             return charset;          
!         }
          return DEFAULT_CHARSET;
      }

適当なラッパー

jrubyで書いてもよかったんだけどね。

import com.gargoylesoftware.htmlunit.*;
import com.gargoylesoftware.htmlunit.html.*;
import java.io.*;
import java.util.*;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.codec.net.URLCodec;

/**
 * Small wrapper around HtmlUnit's {@link WebClient} that loads a single page and returns its
 * DOM serialized as XML text.
 *
 * <p>The static {@code generate...DataURL} helpers turn an in-memory HTML document into a
 * {@code data:} URL so that it can be handed to {@link #getDocument(String)} without being
 * served from anywhere.
 */
public class WebClientRb
{
  static
  {
    // Route commons-logging to SimpleLog and enable debug output for the HTTP client loggers.
    System.setProperty("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.SimpleLog");
    System.setProperty("org.apache.commons.logging.simplelog.showdatetime", "true");
    System.setProperty("org.apache.commons.logging.simplelog.log.httpclient.wire.header", "debug");
    System.setProperty("org.apache.commons.logging.simplelog.log.org.apache.commons.httpclient", "debug");
  }

  /** Charset written into the data URL when the caller passes {@code null}. */
  private static final String DEFAULT_CHARSET = "US-ASCII";

  /** Underlying HtmlUnit client used to load pages. */
  protected WebClient web_client_ = null;

  /** Serializer used to turn the loaded page into XML text. */
  protected XmlSerializer xml_serializer_ = null;

  /** Creates a wrapper around a {@code WebClient} emulating {@code BrowserVersion.FIREFOX_3}. */
  public WebClientRb()
  {
    this(BrowserVersion.FIREFOX_3);
  }

  /**
   * Creates a wrapper around a new {@code WebClient} for the given browser version.
   * @param browserVersion the browser version passed to the {@code WebClient} constructor
   */
  public WebClientRb(final BrowserVersion browserVersion)
  {
    this(new WebClient(browserVersion));
  }

  /**
   * Creates a wrapper around a new {@code WebClient} for the given browser version and proxy.
   * @param browserVersion the browser version passed to the {@code WebClient} constructor
   * @param proxyHost the proxy host passed to the {@code WebClient} constructor
   * @param proxyPort the proxy port passed to the {@code WebClient} constructor
   */
  public WebClientRb(final BrowserVersion browserVersion, final String proxyHost, final int proxyPort)
  {
    this(new WebClient(browserVersion, proxyHost, proxyPort));
  }

  /**
   * Creates a wrapper around a new {@code WebClient} emulating {@code BrowserVersion.FIREFOX_3}
   * with the given proxy.
   * @param proxyHost the proxy host passed to the {@code WebClient} constructor
   * @param proxyPort the proxy port passed to the {@code WebClient} constructor
   */
  public WebClientRb(final String proxyHost, final int proxyPort)
  {
    this(new WebClient(BrowserVersion.FIREFOX_3, proxyHost, proxyPort));
  }

  /**
   * Wraps an existing client.
   * @param web_client the client to use; checked with {@code WebAssert.notNull}
   */
  public WebClientRb(WebClient web_client)
  {
    WebAssert.notNull("web_client", web_client);
    web_client_ = web_client;
    xml_serializer_ = new XmlSerializer();
  }

  /**
   * Loads the page at {@code url} through the wrapped client and returns it serialized as XML.
   * Script elements are left out of the output (see {@link XmlSerializer#isExcluded}).
   * @param url the URL to load (may be a {@code data:} URL built by the helpers below);
   *        checked with {@code WebAssert.notNull}
   * @return the XML text of the loaded page
   * @throws IOException declared because {@code WebClient.getPage} may throw it
   */
  public String getDocument(String url) throws IOException
  {
    WebAssert.notNull("url", url);
    final HtmlPage page = web_client_.getPage(url);
    return xml_serializer_.asXml(page);
  }

  /**
   * Builds a Base64 {@code data:} URL of the form {@code data:<mediaType>;base64,<payload>}.
   * The Base64 payload is additionally URL-encoded.
   * @param mediaType the media type (optionally with parameters such as {@code ;charset=...})
   * @param data the raw bytes to embed
   * @return the data URL
   * @throws UnsupportedEncodingException declared by the {@code String} constructor used here
   */
  public static String generateDataURL(final String mediaType, final byte[] data) throws UnsupportedEncodingException
  {
    WebAssert.notNull("mediaType", mediaType);
    WebAssert.notNull("data", data);
    final byte[] payload = URLCodec.encodeUrl(null, Base64.encodeBase64(data));
    return "data:" + mediaType + ";base64," + new String(payload, "US-ASCII");
  }

  /**
   * Builds a {@code text/html} data URL for {@code html}, encoded as utf-8.
   * @param html the HTML source
   * @return the data URL
   * @throws UnsupportedEncodingException if the charset name is not supported
   */
  public static String generateHTMLDataURL(String html) throws UnsupportedEncodingException
  {
    return generateHTMLDataURL(html, "utf-8");
  }

  /**
   * Builds a {@code text/html} data URL for {@code html}, encoded with {@code charset}.
   * The same charset name is used both to encode the bytes and in the URL's
   * {@code charset=} parameter, so the two always agree.
   * @param html the HTML source
   * @param charset the charset name used to encode {@code html}
   * @return the data URL
   * @throws UnsupportedEncodingException if {@code charset} is not supported
   */
  public static String generateHTMLDataURL(String html, String charset) throws UnsupportedEncodingException
  {
    WebAssert.notNull("html", html);
    WebAssert.notNull("charset", charset);
    // The charset parameter must describe the bytes actually embedded; it used to be
    // hard-coded to utf-8 regardless of the charset used for encoding.
    return generateDataURL("text/html;charset=" + charset, html.getBytes(charset));
  }

  /**
   * Builds a {@code text/html} data URL from already-encoded bytes.
   * @param data the encoded HTML bytes
   * @param charset the charset name to declare for {@code data}; {@code null} selects
   *        {@code US-ASCII}
   * @return the data URL
   * @throws UnsupportedEncodingException declared by {@link #generateDataURL(String, byte[])}
   */
  public static String generateHTMLDataURL(final byte[] data, String charset) throws UnsupportedEncodingException
  {
    final String declared = (null == charset) ? DEFAULT_CHARSET : charset;
    return generateDataURL("text/html;charset=" + declared, data);
  }

  /**
   * Test driver: reads {@code input.html}, loads it through a client that uses the proxy
   * {@code localhost:8000}, and writes the serialized result to {@code output.html}.
   * To pass raw bytes with their own charset instead of a decoded string, use
   * {@link #generateHTMLDataURL(byte[], String)}.
   */
  public static void main(String args[]) throws Exception
  {
    final WebClientRb html_unit_rb = new WebClientRb("localhost", 8000);

    final File f = new File("input.html");

    final StringBuilder html = new StringBuilder();
    final InputStream in = new FileInputStream(f);
    try
    {
      final BufferedReader fr = new BufferedReader(new InputStreamReader(in, "x-JISAutoDetect"));
      String line;
      while((line = fr.readLine()) != null)
      {
        // readLine() strips the line terminator; put one back so that adjacent lines
        // (inline scripts with // comments, preformatted text) are not merged.
        html.append(line).append('\n');
      }
    }
    finally
    {
      in.close();
    }

    // Fetch first so that a failure does not leave an empty output.html behind.
    final String document = html_unit_rb.getDocument(generateHTMLDataURL(html.toString()));

    // The data URL above is generated as utf-8, so write the result as UTF-8 rather than in
    // the platform default charset.
    // NOTE(review): assumes the encoding reported in the serialized XML declaration is the
    // one declared in the data URL - confirm.
    final Writer fw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("output.html"), "UTF-8"));
    try
    {
      fw.write(document);
    }
    finally
    {
      fw.close();
    }
  }

  /**
   * Serializes an HTML DOM as indented XML text.
   * Adapted from com.gargoylesoftware.htmlunit.html.XmlSerializer, with the processing that
   * is not needed here removed.
   *
   * <p>An instance keeps its working buffers in fields and resets them on every
   * {@code asXml} call; nothing here synchronizes access to them.
   */
  protected class XmlSerializer
  {
    /** Accumulates the output of the current {@code asXml} call. */
    final StringBuilder buffer_ = new StringBuilder();

    /** Current indentation prefix; grows and shrinks by two spaces per nesting level. */
    final StringBuilder indent_ = new StringBuilder();

    /**
     * Serializes a whole page, starting at its document element.
     * @param page the page to serialize
     * @return the XML text
     */
    public String asXml(final HtmlPage page)
    {
      return asXml(page.getDocumentElement());
    }

    /**
     * Serializes {@code node} and its descendants. When the node is the {@code html} element
     * of an {@code HtmlPage} whose encoding is known, an XML declaration naming that
     * encoding is written first.
     * @param node the root element to serialize
     * @return the XML text
     */
    public String asXml(final HtmlElement node)
    {
      buffer_.setLength(0);
      indent_.setLength(0);
      String charsetName = null;
      if(node.getPage() instanceof HtmlPage)
      {
        charsetName = node.getPage().getPageEncoding();
      }
      if(charsetName != null && node instanceof HtmlHtml)
      {
        buffer_.append("<?xml version=\"1.0\" encoding=\"").append(charsetName).append("\"?>").append('\n');
      }
      printXml(node);
      final String response = buffer_.toString();
      buffer_.setLength(0);
      return response;
    }

    /**
     * Tells whether an element without children must still be written as an explicit
     * open/close tag pair instead of the self-closing {@code <tag/>} form.
     * @param node the node to test
     * @return {@code true} for div, iframe, ol, script, span, style, table, title and ul
     */
    protected boolean isEmptyXmlTagExpanded(final DomNode node)
    {
        return node instanceof HtmlDivision || node instanceof HtmlInlineFrame || node instanceof HtmlOrderedList
            || node instanceof HtmlScript || node instanceof HtmlSpan || node instanceof HtmlStyle
            || node instanceof HtmlTable || node instanceof HtmlTitle || node instanceof HtmlUnorderedList;
    }

    /**
     * Prints the content between "&lt;" and "&gt;" (or "/&gt;") in the output of the tag name
     * and its attributes in XML format. Attribute values are XML-escaped.
     * @param node the node whose opening tag is to be printed
     */
    protected void printOpeningTag(final DomElement node)
    {
      buffer_.append(node.getTagName());
      final Map<String, DomAttr> attributes = node.getAttributesMap();
      for(final Map.Entry<String, DomAttr> attribute : attributes.entrySet())
      {
        buffer_.append(" ");
        buffer_.append(attribute.getKey());
        buffer_.append("=\"");
        buffer_.append(StringEscapeUtils.escapeXml(attribute.getValue().getNodeValue()));
        buffer_.append("\"");
      }
    }

    /**
     * Recursively prints {@code node}: the opening tag, every child (elements recursively,
     * any other node through its string form), then the closing tag. Excluded elements
     * produce no output at all.
     * @param node the element to print
     */
    protected void printXml(final DomElement node)
    {
      if(isExcluded(node)) return;
      final boolean hasChildren = node.getFirstChild() != null;
      buffer_.append(indent_).append('<');
      printOpeningTag(node);

      if(!hasChildren && !isEmptyXmlTagExpanded(node))
      {
        buffer_.append("/>").append('\n');
        return;
      }

      buffer_.append(">").append('\n');
      for(DomNode child = node.getFirstChild(); child != null; child = child.getNextSibling())
      {
        indent_.append("  ");
        if (child instanceof DomElement)
        {
          printXml((DomElement) child);
        }
        else
        {
          buffer_.append(child);
        }
        indent_.setLength(indent_.length() - 2);
      }
      buffer_.append(indent_).append("</").append(node.getTagName()).append('>').append('\n');
    }

    /**
     * Tells whether an element is dropped from the output.
     * @param element the element to test
     * @return {@code true} for script elements
     */
    protected boolean isExcluded(final DomElement element)
    {
      return element instanceof HtmlScript;
    }
  }

}

jrubyのテストコード

CLASSPATHにHtmlUnitのライブラリを設定した上で実行

#!ruby -Ku

# JRuby test script: loads input.html, converts it to a utf-8 data URL and prints the
# document returned by WebClientRb#getDocument.
# Run it with the HtmlUnit libraries on the CLASSPATH.

# Explicitly enable JRuby's Java integration before using the Java:: namespace.
require 'java'
require 'benchmark'
require 'kconv'
require 'web_client_rb.jar'

# WebClientRb is declared without a package, so it is reached through Java:: and bound to
# a Ruby constant here. Every use below goes through this constant; the bare name
# WebClientRb is not defined on the Ruby side.
HtmlUnitRb = Java::WebClientRb

html_unit = HtmlUnitRb.new("localhost", 8000)
html = IO.read("input.html")
url = HtmlUnitRb.generateHTMLDataURL(html.toutf8)
puts html_unit.getDocument(url)

重いけどまあできた。
あとはmechanizeと一緒に使ってちゃんと動くか確認しないとね。。。