Skip to content

Instantly share code, notes, and snippets.

@teramako
Created August 14, 2009 21:32
Show Gist options
  • Save teramako/168118 to your computer and use it in GitHub Desktop.
Save teramako/168118 to your computer and use it in GitHub Desktop.
#!/usr/lib/xulrunner/xpcshell
// Usage text printed by usage(). In English: "JavaScript that runs in the xpcshell
// bundled with XULRunner. Fetches the HTML string at the URL given as argument,
// parses it, then serializes it again and prints it."
// The text is an E4X CDATA literal converted to a plain string.
const USAGE = <><![CDATA[
XULRunner付属のxpcshellで動くJavaScript
引数のURLのHTML文字列を取得しパースして、再度文字列化して出力するもの
Usage: xpcshell curl.js URL
]]></>.toString();
// Shorthands for the XPCOM class and interface registries.
const Cc = Components.classes;
const Ci = Components.interfaces;
// IO service: used below to open channels (httpGet) and build URIs (createURI).
var ios = Cc["@mozilla.org/network/io-service;1"].getService(Ci.nsIIOService);
// HTML fragment parser service: used by parseHTML via parseFragment.
var suh = Cc["@mozilla.org/feed-unescapehtml;1"].getService(Ci.nsIScriptableUnescapeHTML);
// DOM parser: used by createHTMLDocument to parse the XSLT stylesheet and its input document.
var parser = Cc["@mozilla.org/xmlextras/domparser;1"].createInstance(Ci.nsIDOMParser);
// DOM serializer: used by main to turn the resulting document back into a string.
var ser = Cc["@mozilla.org/xmlextras/xmlserializer;1"].createInstance(Ci.nsIDOMSerializer);
// XSLT processor: used by createHTMLDocument to produce the output document.
var xslt = Cc["@mozilla.org/document-transformer;1?type=xslt"].createInstance(Ci.nsIXSLTProcessor);
function main(args){
var url = args[0];
if (!url){
usage();
quit();
}
var htmlstr = httpGet(url);
// UTF8octetに変換
htmlstr = unescape(encodeURIComponent(htmlstr));
var title = getTitleFromHTMLString(htmlstr);
var doc = createHTMLDocument(<title>{title}</title>);
var html = parseHTML(htmlstr, doc, url);
doc.body.appendChild(html);
var xhtmlstr = tag2LowerCase(ser.serializeToString(doc));
print(xhtmlstr);
}
/**
 * Prints the usage text stored in USAGE.
 */
function usage(){
  var text = USAGE;
  print(text);
}
/**
 * Fetches a URL synchronously and returns the response body decoded to a string.
 *
 * A channel from ios.newChannel is used instead of XMLHttpRequest because the
 * channel exposes the charset of the Content-Type header (contentCharset);
 * "UTF-8" is assumed when the channel reports none.
 *
 * The underlying stream is always closed, including on the non-200 early
 * return and when setting up the converter throws.
 *
 * @param url URL to fetch
 * @return the decoded body; "" when an HTTP channel answers with a status
 *         other than 200. If reading fails part-way, the error is printed
 *         and the text read so far is returned.
 */
function httpGet(url){
  var channel = ios.newChannel(url, 0, null);
  var stream = channel.open();
  var iconv = null;
  var htmlstr = "";
  try {
    if (channel instanceof Ci.nsIHttpChannel && channel.responseStatus !== 200){
      return "";
    }
    var charset = channel.contentCharset || "UTF-8";
    iconv = Cc["@mozilla.org/intl/converter-input-stream;1"].createInstance(Ci.nsIConverterInputStream);
    iconv.init(stream, charset, 1024, Ci.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
    // readString fills str.value and returns 0 once the stream is exhausted.
    var str = {};
    try {
      while (iconv.readString(4096, str) !== 0){
        htmlstr += str.value;
      }
    } catch(e){
      // Best effort: report the read error and fall through with what was read.
      print(e);
    }
  } finally {
    if (iconv){
      iconv.close();
    }
    stream.close();
  }
  return htmlstr;
}
/**
 * Creates an HTML document by running an XSLT transform with nsIXSLTProcessor.
 *
 * The stylesheet declares <xsl:output method="html"/> and has a single template
 * matching "/" that emits <html><head>{header}</head><body/></html>; it does not
 * read anything from the input document, which is just an empty XHTML <html> element.
 *
 * An HTMLDocument can also be created with
 * Components.classesByID['{5d0fcdd0-4daa-11d2-b328-00805f8a3859}'].createInstance(Ci.nsIDOMHTMLDocument)
 * (@see http://twitter.com/nanto_vi/status/3314628619 )
 *
 * @param header E4X value interpolated into the <head> element of the stylesheet
 *               template (main passes a <title> element)
 * @return the document produced by the transform, after QueryInterface to nsIDOMHTMLDocument
 */
function createHTMLDocument(header){
// Stylesheet whose only template outputs the fixed html/head/body skeleton.
var xsl = <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<xsl:output method="html"/>
<xsl:template match="/">
<html>
<head>{header}</head>
<body/>
</html>
</xsl:template>
</xsl:stylesheet>;
// Minimal input document for the transform.
var xml = <html xmlns="http://www.w3.org/1999/xhtml"></html>;
// Both E4X values are serialized and re-parsed as XML to obtain DOM documents.
var doc = parser.parseFromString(xml.toXMLString(), "application/xml");
var styleNode = parser.parseFromString(xsl.toXMLString(), "application/xml");
xslt.importStylesheet(styleNode);
var htmldoc = xslt.transformToDocument(doc);
// NOTE(review): the QueryInterface calls presumably expose the HTML-specific
// interfaces (e.g. the `body` property) on the document, root and body — confirm.
htmldoc.QueryInterface(Ci.nsIDOMHTMLDocument);
htmldoc.documentElement.QueryInterface(Ci.nsIDOMHTMLElement).QueryInterface(Ci.nsIDOMHTMLHtmlElement);
htmldoc.body.QueryInterface(Ci.nsIDOMHTMLElement).QueryInterface(Ci.nsIDOMHTMLBodyElement);
return htmldoc;
}
/**
 * Parses an HTML string into a fragment after making its href values absolute.
 *
 * @param htmlstr HTML source text
 * @param doc     document used to create the context element ("xml") for parsing
 * @param baseURL URL against which href values are resolved
 * @return whatever suh.parseFragment returns for the rewritten HTML
 */
function parseHTML(htmlstr, doc, baseURL){
  var absolutized = relateURL2ABS(htmlstr, baseURL);
  var contextElement = doc.createElement("xml");
  return suh.parseFragment(absolutized, false, null, contextElement);
}
/**
 * Builds a URI object from a URL string via ios.newURI, passing null for the
 * second and third arguments.
 *
 * @param url URL string
 * @return the object returned by ios.newURI
 */
function createURI(url){
  var uri = ios.newURI(url, null, null);
  return uri;
}
/**
 * Converts relative href attribute values in an HTML string to absolute URLs.
 *
 * Every quoted href value is passed through resolve() of the URI built from
 * baseURL. The attribute name is matched case-insensitively and written back
 * as found, and the value may contain the quote character that is not used as
 * its delimiter (e.g. href="it's.html").
 *
 * @param htmlstr HTML source text
 * @param baseURL URL against which the href values are resolved
 * @return the HTML text with rewritten href values
 */
function relateURL2ABS(htmlstr, baseURL){
  var uri = createURI(baseURL);
  function replacer(all, attr, quote, url){
    return attr + quote + uri.resolve(url) + quote;
  }
  // The value runs lazily up to the next occurrence of the opening quote.
  return htmlstr.replace(/(href=)(["'])([\s\S]*?)\2/gi, replacer);
}
/**
 * Lower-cases the tag names in a markup string.
 *
 * Only the name directly after "<" or "</" is changed; attributes and text are
 * copied through unchanged.
 *
 * @param htmlstr markup text
 * @return the markup with lower-cased tag names
 */
function tag2LowerCase(htmlstr){
  return htmlstr.replace(/<(\/)?(\w+)([\s\S]*?)>/g, function(all, close, tag, attr){
    // `close` is undefined for opening tags (the optional group did not
    // participate), so it must not be concatenated as-is.
    return "<" + (close || "") + tag.toLowerCase() + attr + ">";
  });
}
/**
 * Extracts the contents of the first <title> element from an HTML string.
 *
 * The tag is matched case-insensitively and may carry attributes; an empty
 * title element yields "".
 *
 * @param htmlstr HTML source text
 * @return the raw text between the title tags, or "" when there is no title element
 */
function getTitleFromHTMLString(htmlstr){
  var match = /<title\b[^>]*>([\s\S]*?)<\/title\s*>/i.exec(htmlstr);
  return match ? match[1] : "";
}
/**
 * Debug helper: prints a separator line, the object itself, and then one
 * "name: value" line per property enumerated by for-in.
 *
 * @param obj value to inspect
 */
function dump(obj){
  var separator = "============================================";
  print(separator);
  print(obj);
  for (var key in obj){
    var line = key + ": " + obj[key];
    print(line);
  }
}
// Script entry point: run main with the global `arguments`
// (presumably the command-line arguments xpcshell hands to the script — confirm).
main(arguments);
// vim: sw=2 ts=2 et
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment