Skip to content

Instantly share code, notes, and snippets.

@fuba
Created February 16, 2011 13:19
Show Gist options
  • Save fuba/829350 to your computer and use it in GitHub Desktop.
Save fuba/829350 to your computer and use it in GitHub Desktop.
Scrape Google realtime search using phantomjs
// phantomjs realtime.js keyword
// XPath expressions used to pick apart a realtime-search result page.
var xpathes = {
    // One node per search result; the other expressions are evaluated relative to it.
    item: '//li[@class="g s"]',

    // Text of the link with class "l"; emitted as `screen_name`.
    screen_name: './/div/a[@class="l"]/text()',

    // Anchors that have a <span> child; the first match's href is the result URL.
    url: './/div//a[./span]',

    // <div> that directly contains the class "l" link; its innerHTML is the result text.
    text: './/div[./a[@class="l"]]'
};

// URLs that have already been printed, used as a set (href -> 1) so that
// repeated polling emits each result only once.
var log = {};
/**
 * Print every not-yet-seen search result on the current page as one JSON
 * line ({screen_name, url, text}) and schedule itself to run again in 20 s.
 *
 * Entries whose link or text node cannot be found are skipped: dereferencing
 * a missing match would throw and abort the function before the next poll is
 * scheduled, silently stopping the scraper.
 */
function extract () {
    var items = $X(xpathes.item, document);
    for (var i = 0; i < items.length; i++) {
        var item = items[i];
        var urls = $X(xpathes.url, item);
        var texts = $X(xpathes.text, item);

        // Incomplete entry: nothing usable to report, move on to the next one.
        if (!urls || urls.length === 0 || !texts || texts.length === 0) {
            continue;
        }

        var href = urls[0].href;
        // `log` remembers the URLs printed so far; each result is emitted once.
        if (log[href]) {
            continue;
        }

        console.log(
            JSON.stringify({
                screen_name: $X(xpathes.screen_name, item, String),
                url: href,
                text: texts[0].innerHTML
            })
        );
        log[href] = 1;
    }
    // Poll again for newly arrived results.
    window.setTimeout(extract, 20000);
}

// Entry point. An empty phantom.state means no page has been requested yet;
// once the state is set, this script is presumably being run again against
// the loaded page (legacy PhantomJS API -- confirm against the version in use).
if (phantom.state.length === 0) {
    if (phantom.args.length === 0) {
        // No keyword on the command line: nothing to search for.
        phantom.exit();
    } else {
        var keyword = phantom.args[0];
        // The keyword is deliberately NOT passed through encodeURIComponent():
        // per the original author's note, phantomjs escapes the URL itself.
        var url = 'http://www.google.co.jp/search?q=' +
            keyword +
            '&hl=ja&safe=off&tbs=mbl:1';
        phantom.state = 'realtime';
        phantom.open(url);
    }
} else if (phantom.loadStatus === 'success') {
    // NOTE(review): when the load did not succeed nothing happens and the
    // process is never told to exit -- confirm whether that is intended.
    extract();
}
// $X is from https://gist.github.com/3238
// extended version of $X
/**
 * Evaluate an XPath expression and return the result as a plain JS value.
 *
 * Accepted call forms:
 *   $X(exp);
 *   $X(exp, context);
 *   $X(exp, type);
 *   $X(exp, context, type);
 *
 * @param {string} exp        XPath expression.
 * @param {Node} [context]    Node the expression is evaluated against;
 *                            defaults to the global `document`.
 * @param {Function} [type]   Wanted result type: String, Number, Boolean or
 *                            Array (ordered node snapshot). When omitted the
 *                            type is taken from the evaluation result.
 * @returns {string|number|boolean|Array|null} The value for the requested
 *          type. Without `type`: the string/number/boolean value, or an array
 *          of nodes (order not guaranteed) for a node-set; null for any other
 *          result type.
 * @throws {TypeError} If `type` is given but is not one of the accepted
 *         constructors.
 */
function $X (exp, context, type /* want type */) {
    // Two-argument form $X(exp, type): the second argument is the type.
    if (typeof context === "function") {
        type = context;
        context = null;
    }
    if (!context) context = document;

    var compiled = (context.ownerDocument || context).createExpression(exp, function (prefix) {
        var resolver = document.createNSResolver(context);
        // createNSResolver() returns an object exposing lookupNamespaceURI();
        // it is not callable in standard DOM implementations. Keep the plain
        // call only as a fallback for engines where the resolver is a function.
        var uri = (typeof resolver.lookupNamespaceURI === "function")
            ? resolver.lookupNamespaceURI(prefix)
            : resolver(prefix);
        if (uri) return uri;
        // Unknown prefix: fall back to the XHTML namespace in XHTML documents.
        return (document.contentType == "application/xhtml+xml") ? "http://www.w3.org/1999/xhtml" : "";
    });

    var result;
    var nodes;
    switch (type) {
        case String:
            return compiled.evaluate(context, XPathResult.STRING_TYPE, null).stringValue;
        case Number:
            return compiled.evaluate(context, XPathResult.NUMBER_TYPE, null).numberValue;
        case Boolean:
            return compiled.evaluate(context, XPathResult.BOOLEAN_TYPE, null).booleanValue;
        case Array:
            result = compiled.evaluate(context, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
            nodes = [];
            for (var i = 0, len = result.snapshotLength; i < len; i++) {
                nodes.push(result.snapshotItem(i));
            }
            return nodes;
        case undefined:
            result = compiled.evaluate(context, XPathResult.ANY_TYPE, null);
            switch (result.resultType) {
                case XPathResult.STRING_TYPE:
                    return result.stringValue;
                case XPathResult.NUMBER_TYPE:
                    return result.numberValue;
                case XPathResult.BOOLEAN_TYPE:
                    return result.booleanValue;
                case XPathResult.UNORDERED_NODE_ITERATOR_TYPE:
                    // Iterator results do not guarantee document order.
                    nodes = [];
                    var node = null;
                    while ((node = result.iterateNext())) nodes.push(node);
                    return nodes;
            }
            return null;
        default:
            throw new TypeError("$X: specified type is not valid type.");
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment