-
-
Save kanzure/80badcf6c66c7a3d8d8e to your computer and use it in GitHub Desktop.
daily coffee
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://raw.github.com/jashkenas/coffee-script/master/examples/underscore.coffee | |
# http://coffeescript.org/documentation/docs/underscore.html | |
phantom.injectJs("underscore.js") | |
# Minimal Redis client for PhantomJS that talks to a Webdis HTTP endpoint.
#
# A dedicated WebPage is opened on the webdis host and jQuery AJAX requests are
# issued from inside that page. Each response travels back to this context as
# an `alert()` whose text starts with `msgtoken`, followed by a JSON payload
# that names the callback waiting for it.
class Webdis
  # prefix that marks an alert as a serialized webdis response
  msgtoken: "SPLAT: "

  # host, port       - location of the webdis server
  # startup_callback - called with `resque` once the control page has loaded
  #                    and jquery has been injected
  # resque           - value handed back to startup_callback unchanged
  constructor: (host, port, startup_callback, resque) ->
    @host = host
    @port = port

    # stores callbacks for onAlert to access later, keyed by callback id
    @callbacks = {}

    # next id handed out by store_callback; ids are never reused, so entries
    # can be deleted without risking a collision
    @next_callback_id = 0

    # triggered when the webdis/redis connection is ready
    @startup_callback = startup_callback
    @resque = resque

    # used to represent whether or not jquery needs to be loaded
    @ready = false

    # phantomjs WebPage/tab to do AJAX-communication with webdis
    @page = require("webpage").create()

    # "@" changes inside the page callbacks below, so keep a reference
    self = @

    # triggers startup_callback after page and jquery are both done loading
    onLoadFinished = (status) ->
      # http://code.jquery.com/jquery-latest.min.js
      self.page.injectJs("jquery.js")

      jqueryconflictresolver = ->
        window.$ = jQuery.noConflict(true)
      self.page.evaluate(jqueryconflictresolver)

      # jquery is done loading
      self.ready = true

      # webdis/redis connection is ready
      if typeof self.startup_callback is "function"
        self.startup_callback(self.resque)

    # parses redis responses and selects appropriate (stored) callback
    onAlert = (message) ->
      if message.substring(0, self.msgtoken.length) isnt self.msgtoken
        console.log(message) # no reason for this page to be alerting
        return

      # @page's GET callback triggers something like this:
      #   alert(msgtoken + '{"callbackid": 1, "webdismsg": {"RPOP": "..."}}')
      jsonmsg = message.substring(self.msgtoken.length, message.length)
      msg = JSON.parse(jsonmsg)
      callbackid = msg["callbackid"]
      webdismsg = msg["webdismsg"]

      # the "rediscmd" key indicates which part of webdismsg is relevant
      rediscmd = if msg["rediscmd"]? then msg["rediscmd"] else null

      # no callbackid is a sorta big problem
      return unless callbackid?

      # grab the callback only after callbackid is vetted
      callback = self.callbacks[callbackid]

      # a callbackid w/o callback is also pretty bad
      return unless callback?

      # callback only wants the relevant response from webdis
      if rediscmd isnt null
        webdisresp = JSON.parse(webdismsg[rediscmd])
      else
        webdisresp = webdismsg

      # so that memory can be freed
      self.remove_callback_by_id(callbackid)

      # finally send the result back to whatever handler
      callback(webdisresp)

    @page.onAlert = onAlert
    @page.onConsoleMessage = (x) -> console.log(x)

    # open some working page, even if it shows a webdis error response
    @page.open("http://" + @host + ":" + @port + "/lpush", onLoadFinished)

  # Issues a GET for `url` from inside the control page.
  #
  # url      - full request url (see construct_request)
  # callback - optional function receiving the response; when omitted the
  #            response is discarded inside the page and nothing is stored
  # rediscmd - optional key of the response object to JSON-decode and pass to
  #            `callback` instead of the whole response
  #
  # Returns whatever @page.evaluate returns.
  execute: (url, callback, rediscmd) ->
    # only reserve a slot when somebody is actually waiting for the answer
    storedid = if typeof callback is "function" then @store_callback(callback) else null

    evilargs =
      msgtoken: @msgtoken
      url: url
      callbackid: storedid
      rediscmd: rediscmd

    # runs inside the page: it may only use its argument, never outer variables
    evil = (args) ->
      msgtoken = args.msgtoken
      url = args.url
      callbackid = args.callbackid
      rediscmd = args.rediscmd

      supercallback = (webdis_response) ->
        payload =
          rediscmd: rediscmd
          callbackid: callbackid
          webdismsg: webdis_response
        jsonified = JSON.stringify(payload)
        if callbackid != undefined and callbackid != null
          alert msgtoken + jsonified

      window.$.get(url, supercallback)

    @page.evaluate(evil, evilargs)

  # Builds "http://host:port/PART/PART/..." from an array of command parts.
  # Every part is percent-encoded (including "."), so that keys and values
  # containing "/", spaces or quotes stay a single path segment.
  construct_request: (components) ->
    encoded = (encodeURIComponent(String(part)).replace(/\./g, "%2E") for part in components)
    url = "http://" + @host + ":" + @port + "/" + encoded.join("/")
    return url

  # Remembers `callback` and returns the id under which it was stored.
  store_callback: (callback) ->
    id = @next_callback_id
    @next_callback_id += 1
    @callbacks[id] = callback
    return id

  # Returns the id (object key) under which `callback` is stored, or false.
  find_callback_id: (callback) ->
    # @callbacks is an object, so it has to be walked with "of", not "in"
    for own id, pcallback of @callbacks
      if pcallback == callback
        return id
    return false

  # Forgets `callback`; returns true when it was stored, false otherwise.
  remove_callback: (callback) ->
    id = @find_callback_id(callback)
    if id != false
      delete @callbacks[id]
      return true
    return false

  # Forgets whatever callback is stored under `id`; always returns true.
  remove_callback_by_id: (id) ->
    delete @callbacks[id]
    return true

  # Releases the control page and marks the client as not ready.
  release: ->
    @page.release()
    @ready = false

  # push an item to the end of the queue with LPUSH
  push: (key, value) ->
    cmd = ["LPUSH", key, value]
    url = @construct_request(cmd)
    r = @execute(url)

  # pop an item from the head of the queue with RPOP
  pop: (key, callback) ->
    cmd = ["RPOP", key]
    url = @construct_request(cmd)
    r = @execute(url, callback, "RPOP")
# Queue facade over Webdis: items are JSON-encoded and kept in redis lists
# named "resque:queue:<queue>".
class Resque
  # redis key holding the list for `queue`
  key_for = (queue) -> "resque:queue:" + queue

  # host, port       - location of the webdis server
  # startup_callback - forwarded to Webdis together with this instance
  constructor: (@host, @port, startup_callback) ->
    @webdis = new Webdis(@host, @port, startup_callback, @)

  # serializes `object` as JSON and pushes it onto `queue`
  push: (queue, object) ->
    @webdis.push(key_for(queue), JSON.stringify(object))

  # pops one item from `queue`; `callback` is handed to Webdis.pop
  pop: (queue, callback) ->
    @webdis.pop(key_for(queue), callback)
# Demo run once the webdis connection is ready: pushes four movies, pops three
# of them and exits PhantomJS after the third pop has been handled.
#
# resque - the queue object to use; it needs push(queue, object) and
#          pop(queue, callback)
startup_callback = (resque) ->
  # use the queue that was handed in instead of relying on a global `queue`
  resque.push("movies", {"name": "short circuit"})
  resque.push("movies", {"name": "star wars episode iv"})
  resque.push("movies", {"name": "star wars episode v"})
  resque.push("movies", {"name": "star wars episode vi"})

  # prints one popped movie; an empty queue yields null instead of an object
  moviemonster = (movie) ->
    if movie?
      console.log("movie name is: " + movie.name)
    else
      console.log("movie queue was empty")

  resque.pop("movies", moviemonster)
  resque.pop("movies", moviemonster)

  moviemonsterandexit = (movie) ->
    moviemonster(movie)
    phantom.exit()

  resque.pop("movies", moviemonsterandexit)
# queue = new Resque("localhost", "7379", startup_callback) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Spatula.coffee v1.0 | |
# (c) 2012 Bryan Bishop <kanzure@gmail.com> | |
# | |
# **Spatula** is a tiny, queue-based scraping framework that gives structure to | |
# PhantomJS scraping projects. It provides extendable models for representing | |
# the state and structure of target pages, like the ever-popular "categories | |
# and category-members" website pattern. Spatula manages the PhantomJS instance | |
# by reading from a task queue in a control tab, and it publishes results back | |
# to other queues for assimilation by whatever downstream workers you create. | |
# Because PhantomJS lacks other forms of I/O, it is up to you to do application | |
# integration and write the downstream workers (perhaps in ruby or python) for | |
# storing the scraped data. | |
# | |
# Spatula requires Redis and Webdis (an HTTP API for Redis). | |
#### Includes | |
# underscore would be nice to have in here.. | |
# https://raw.github.com/jashkenas/coffee-script/master/examples/underscore.coffee | |
# http://coffeescript.org/documentation/docs/underscore.html | |
phantom.injectJs("underscore.js") | |
# for webdis/redis | |
phantom.injectJs("resque.js") | |
#### Convenience utility belt | |
# for removing an element from an array | |
#Array::remove = (e) -> @splice(t,1)[0] if (t = @indexOf(e)) > -1 | |
# Returns a NEW array holding every element that is not strictly equal to `e`.
# The receiver itself is left untouched, so callers must use the return value.
Array::remove = (e) ->
  _.reject(this, (candidate) -> candidate is e)
# Writes `message` to the PhantomJS console.
log = (message) -> console.log(message)
# Logs `message` with an "ALERT: " prefix.
alert = (message) ->
  prefixed = "ALERT: " + message
  log(prefixed)
# Announces the shutdown in the log, then asks PhantomJS to exit.
exit = ->
  log("exiting because exit was called")
  phantom.exit()
# Logs `error` and terminates PhantomJS with exit code 1.
#
# error - an Error-like object (its `message`, and `lineNumber` / `fileName`
#         when present as own properties, are reported) or any other value,
#         which is reported as its string form; callers in this file pass both
#         Error objects and plain strings
throwexit = (error) ->
  is_object = error? and typeof error is "object"

  # a plain string has no .message, so fall back to the value itself
  text = if is_object then error.message else String(error)

  # assemble the final error message
  message = "\n\n\n"
  message += "ERROR: <" + text + ">"

  if is_object
    if error.hasOwnProperty("lineNumber") and error.lineNumber isnt null
      message += " on line " + error.lineNumber
    if error.hasOwnProperty("fileName") and error.fileName isnt null
      message += " in file " + error.fileName

  # show the error message
  log message

  # explain why the program is done
  log "exiting because caught and escalated error/exception"

  # die a horrible death
  phantom.exit(1)
#### Spatula classes | |
# Base class for one target page: wraps a PhantomJS WebPage, loads `url`,
# runs `analyzer` inside the page to describe its state and lets subclasses
# react to that state in `transition`.
class Page
  # instead of base_url just use the parent's url
  url: null

  # per-page settings (phantomjs)
  settings:
    loadImages: true
    loadPlugins: true
    javascriptEnabled: true
    userAgent: "Spatula"

    # custom settings - doesn't seem to negatively impact phantomjs
    jquery: true

  # prefixes `mexo` with "Page: " and prints it
  log: (mexo) ->
    console.log "Page" + ": " + mexo

  constructor: ->
    @current_analysis = {}
    # analysis that was current before the latest analyze() call
    @previous_analysis = null
    @transition_history = []

    # for whether or not this constructor has been called
    @has_initialized = true

    # for whether or not the start method has been called
    @has_started = false

    # for whether or not onLoadFinished has been called
    @loaded = false

    # contains an error state
    @error = null

    # for whether or not jquery has been inserted into the page
    @jquery_injected = false

    @page = require("webpage").create()

    # these are attributes on any WebPage that can be customized
    overrides = [
      "settings"
      "onAlert"
      "onConsoleMessage"
      "onError"
      "onInitialized"
      "onLoadStarted"
      "onLoadFinished"
      "onResourceRequested"
      "onResourceReceived"
    ]

    # each attribute will be replaced unless there's no override
    for override in overrides
      if @[override]?
        @page[override] = @[override]

  # starts to load the page
  start: ->
    @log "start called, url is: " + @url
    @has_started = true
    self = @

    # is called when the page is finished loading
    onLoadFinished = (status) ->
      self.log "onLoadFinished called"

      # anything but "success" means there is no usable document to analyze;
      # record the failure in @error instead
      if status isnt "success"
        self.loaded = false
        self.error = new Error("failed to load " + self.url + " (status: " + status + ")")
        return

      self.loaded = true

      if self.settings.jquery && !self.jquery_injected
        self.log "injecting jquery into the page"
        # TODO: nothing is injected yet, only the flag is set
        #self.page.includeJs "http://code.jquery.com/jquery-latest.min.js"
        self.jquery_injected = true

      self.log "about to call analyze"
      analysis = self.analyze()

    # a new load makes the previous document stale
    onLoadStarted = ->
      self.loaded = false

    # attach the handler unless the class supplies its own onLoadStarted,
    # which the constructor has already installed
    @page.onLoadStarted = onLoadStarted unless @onLoadStarted?

    @page.open(@url, onLoadFinished)
    @log "start done... callback is onLoadFinished"

  # Runs `method` inside the page.
  #
  # method           - function evaluated in the page context
  # callback_handler - optional function; when given, it receives the result
  #                    and its return value becomes the result of run
  # passed_args      - optional single argument forwarded to `method`
  run: (method, callback_handler, passed_args) ->
    @log "run called (with method and passed_args)"

    if passed_args is undefined
      result = @page.evaluate(method)
    else
      result = @page.evaluate(method, passed_args)

    if typeof(callback_handler) == "function"
      @log "run calling callback_handler(result)"
      result = callback_handler(result)

    @log "run done"
    return result

  # analyzes html and returns an analysis; runs inside the page, so it can
  # not use anything from this file
  analyzer: ->
    console.log "analyzer running inside the page"
    analysis =
      state: null
    console.log "analyzer done running inside the page"
    return analysis

  # injects the actual deduction code into the page context
  analyze: ->
    @log "analyze: injecting @analyzer"
    analysis = @page.evaluate(@analyzer)

    # historical: moving from @current_analysis to this new analysis
    @transition_history.push(analysis)
    @previous_analysis = @current_analysis
    @current_analysis = analysis

    @log "analyze: returning analysis"
    return analysis

  # chooses a manipulation to perform based on previous_analysis and current_analysis
  transition: ->
    console.log "transition: nothing to do, exiting..."
    exit()
# Common base of the demo pages: only fixes the address that gets loaded.
class ExamplePage extends Page
  # address opened by start()
  url: "http://news.ycombinator.com"
# Demo page that keeps flipping the document title: it sets one when the title
# is missing and clears it when present. One analysis in four (at random)
# reports a simulated disaster instead.
class ExampleHomePage extends ExamplePage
  # Runs inside the page. Reports title, url and a `state` of "HAS_NO_TITLE",
  # "HAS_TITLE" or "SIMULATED_DISASTER".
  analyzer: ->
    console.log "analyzer running inside the page"

    title = document.title
    report =
      title: title
      url: document.location.href

    report["state"] = if title is null or title is "" then "HAS_NO_TITLE" else "HAS_TITLE"

    # a roll of 3 out of 0..3 overrides whatever was decided above
    report["state"] = "SIMULATED_DISASTER" if Math.floor(Math.random() * 4) is 3

    console.log "analyzer done running inside the page"
    return report

  # Reacts to the state in @current_analysis; any state it does not know is
  # recorded in @error and thrown.
  transition: ->
    @log "transition looking at the analysis"

    state = @current_analysis["state"]
    title_missing = state is "HAS_NO_TITLE"
    title_present = state is "HAS_TITLE" or @previous_analysis is null

    if title_missing
      @run(@set_title)
    else if title_present
      @run(@unset_title)
    else
      @error = new Error("unknown state")
      @loaded = false
      throw @error

    @log "transition done looking at the analysis"

  # runs inside the page: gives the document a title
  set_title: ->
    document.title = "hello world!"

  # runs inside the page: blanks the document title
  unset_title: ->
    document.title = ""
# Manage a list of Page objects. | |
# Manages a list of Page objects and drives them from a timer: unstarted
# pages are started, pages with an error are dropped, loaded pages get
# transition() followed by analyze().
class Spatula
  # length of time between checking all pages (in milliseconds)
  interval: 1000

  # resque - optional queue object, stored as @resque and not used by this class
  constructor: (resque = null) ->
    @resque = resque
    @has_started = false
    @pages = []

  # convenience method.. pass either a class or an instance
  add_page: (page) ->
    # covers both null and undefined
    unless page?
      alert "page can't be null in add_page"
      throwexit "page can't be null in add_page"
      # don't fall through to the property checks below with no page
      return

    # can pass either a class or an instance
    if not (page.hasOwnProperty("has_initialized") and page.has_initialized)
      page = new page

    @pages.push(page)

  # one pass over all pages; normally driven by the timer set up in start()
  main_loop: ->
    if not @has_started
      alert "consider calling Spatula.start() instead of directly calling main_loop()"

    # pages to drop once the pass is over
    failed = []

    for page, index in @pages
      initialized = page.hasOwnProperty("has_initialized") and page.has_initialized
      started = page.hasOwnProperty("has_started") and page.has_started

      # classes should have their constructor and then their start() called
      if not initialized and not started
        page = new page
        # keep the instance, otherwise the class would be built again next pass
        @pages[index] = page
        page.start()

      # some pages might need to be started first
      else if not started
        log "page hasn't been started yet.. starting the page"
        page.start()

      # after those checks all remaining pages should have these attributes
      else if not (page.hasOwnProperty("has_initialized") and page.hasOwnProperty("has_started"))
        throwexit "page is missing some attributes (this shouldn't happen)"

      # page might have had an error
      else if page.error
        log "removing a page from spatula.pages because of an error"
        failed.push(page)

      # means that onLoadFinished has been called at least once (like after start)
      else if page.loaded
        try
          console.log "main_loop: calling transition on page"
          page.transition()
          console.log "main_loop: calling analyze on page"
          page.analyze()
        catch error
          throwexit error

    # drop the failed pages only now, so the loop above never sees the list
    # change underneath it
    if failed.length > 0
      @pages = (kept for kept in @pages when kept not in failed)

    # main_loop might be called on an interval prior to adding pages
    if @pages.length == 0 and @has_started
      throwexit new Error("main_loop called prior to adding any pages in spatula")

  # marks spatula as started and schedules main_loop every @interval ms;
  # returns the timer id
  start: ->
    @has_started = true
    window.setInterval((=> @main_loop()), @interval)
# Demo run once the webdis connection is ready: pushes four movies, pops two
# of them and starts spatula on the example home page.
#
# resque - the queue object to use; it needs push(queue, object) and
#          pop(queue, callback)
startup_callback = (resque) ->
  console.log("starting a tiny spatula demo...")

  # use the queue that was handed in rather than the global assigned below
  resque.push("movies", {"name": "short circuit"})
  resque.push("movies", {"name": "star wars episode iv"})
  resque.push("movies", {"name": "star wars episode v"})
  resque.push("movies", {"name": "star wars episode vi"})

  # prints one popped movie; an empty queue yields null instead of an object
  moviemonster = (movie) ->
    if movie?
      console.log("movie name is: " + movie.name)
    else
      console.log("movie queue was empty")

  resque.pop("movies", moviemonster)
  resque.pop("movies", moviemonster)

  # also start spatula in the mean time
  spatula = new Spatula(resque)
  spatula.add_page(ExampleHomePage)
  spatula.start()
  console.log("spatula initiated")

queue = new Resque("localhost", "7379", startup_callback)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment