Skip to content

Instantly share code, notes, and snippets.

@kanzure
Created June 3, 2012 19:23
Show Gist options
  • Save kanzure/80badcf6c66c7a3d8d8e to your computer and use it in GitHub Desktop.
Save kanzure/80badcf6c66c7a3d8d8e to your computer and use it in GitHub Desktop.
daily coffee
# https://raw.github.com/jashkenas/coffee-script/master/examples/underscore.coffee
# http://coffeescript.org/documentation/docs/underscore.html
phantom.injectJs("underscore.js")
# Client for Webdis (an HTTP API for Redis) that performs its requests from
# inside a PhantomJS WebPage using jQuery AJAX. Responses travel back from the
# page to this object as alert() messages prefixed with `msgtoken`.
class Webdis
  # prefix that marks an alert() as one of our own responses
  msgtoken: "SPLAT: "

  # host, port       - where webdis is listening
  # startup_callback - called with `resque` once the page and jquery are loaded
  # resque           - value handed back to startup_callback
  constructor: (host, port, startup_callback, resque) ->
    @host = host
    @port = port
    # stores callbacks for onAlert to access later
    @callbacks = {}
    # id given to the next stored callback; ids are never reused
    @next_callback_id = 0
    # triggered when the webdis/redis connection is ready
    @startup_callback = startup_callback
    @resque = resque
    # used to represent whether or not jquery needs to be loaded
    @ready = false
    # phantomjs WebPage/tab to do AJAX-communication with webdis
    @page = require("webpage").create()

    # fat arrows keep "@" bound to this instance inside the page handlers

    # triggers startup_callback after page and jquery are both done loading
    onLoadFinished = (status) =>
      # http://code.jquery.com/jquery-latest.min.js
      @page.injectJs("jquery.js")
      # runs inside the page, so it must not reference anything from here
      jqueryconflictresolver = ->
        window.$ = jQuery.noConflict(true)
      @page.evaluate(jqueryconflictresolver)
      # jquery is done loading
      @ready = true
      # webdis/redis connection is ready
      @startup_callback(@resque)

    # parses redis responses and selects appropriate (stored) callback
    onAlert = (message) =>
      if message.substring(0, @msgtoken.length) != @msgtoken
        console.log(message) # no reason for this page to be alerting
        return
      # @page's GET callback triggers something like this:
      # alert({"callbackid": 1, "webdismsg": "{\"key\": \"value\"}"})
      jsonmsg = message.substring(@msgtoken.length, message.length)
      msg = JSON.parse(jsonmsg)
      callbackid = msg["callbackid"]
      webdismsg = msg["webdismsg"]
      # the "rediscmd" key indicates which part of webdismsg is relevant
      rediscmd = null
      if msg["rediscmd"] != null and msg["rediscmd"] != undefined
        rediscmd = msg["rediscmd"]
      # no callbackid is a sorta big problem
      if callbackid == null or callbackid == undefined
        return
      # grab the callback only after callbackid is vetted
      callback = @callbacks[callbackid]
      # release the slot now, even for requests stored without a callback
      # (e.g. push), so that memory can be freed
      @remove_callback_by_id(callbackid)
      # a callbackid w/o callback has nobody to notify
      if callback == null or callback == undefined
        return
      # callback only wants the relevant response from webdis
      if rediscmd != null
        webdisresp = JSON.parse(webdismsg[rediscmd])
      else
        webdisresp = webdismsg
      # finally send the result back to whatever handler
      callback(webdisresp)

    @page.onAlert = onAlert
    @page.onConsoleMessage = (x) -> console.log(x)
    # open some working page, even if it shows a webdis error response
    @page.open("http://"+@host+":"+@port+"/lpush", onLoadFinished)

  # Issues a GET for `url` from inside the page. When the response arrives it
  # is routed through alert()/onAlert to `callback`. If `rediscmd` is given,
  # only that key of the response is parsed and passed to `callback`.
  execute: (url, callback, rediscmd) ->
    storedid = @store_callback(callback)
    evilargs =
      msgtoken: @msgtoken
      url: url
      callbackid: storedid
      rediscmd: rediscmd
    # This function is handed to @page.evaluate and runs in the page. It reads
    # everything through `args` on purpose: assigning to names such as `url`
    # or `rediscmd` here would bind to execute's own parameters, which turn
    # into shared page globals once the function runs inside the page, so
    # overlapping requests would overwrite each other's values.
    evil = (args) ->
      supercallback = (webdis_response) ->
        payload =
          rediscmd: args.rediscmd
          callbackid: args.callbackid
          webdismsg: webdis_response
        if args.callbackid != undefined and args.callbackid != null
          alert args.msgtoken + JSON.stringify(payload)
      window.$.get(args.url, supercallback)
    @page.evaluate(evil, evilargs)

  # Builds "http://host:port/a/b/c" from ["a", "b", "c"]. Each component is
  # percent-encoded so that keys and values may contain "/", "?", "#", "%", ...
  construct_request: (components) ->
    encoded = (encodeURIComponent(component) for component in components)
    url = "http://" + @host + ":" + @port + "/" + encoded.join("/")
    return url

  # Remembers `callback` and returns the id under which it was stored.
  store_callback: (callback) ->
    id = @next_callback_id
    @next_callback_id += 1
    @callbacks[id] = callback
    return id

  # Returns the id (an object key, so a string) under which `callback` is
  # stored, or false when it is not stored.
  find_callback_id: (callback) ->
    # "of" walks the keys of an object; "in" is for arrays only
    for own id, pcallback of @callbacks
      if pcallback == callback
        return id
    return false

  # Forgets `callback`; returns true if it was stored, false otherwise.
  remove_callback: (callback) ->
    id = @find_callback_id(callback)
    if id != false
      delete @callbacks[id]
      return true
    return false

  # Forgets whatever is stored under `id`; always returns true.
  remove_callback_by_id: (id) ->
    delete @callbacks[id]
    return true

  # Releases the WebPage and marks this client as not ready.
  release: ->
    @page.release()
    @ready = false

  # push an item to the end of the queue with LPUSH
  push: (key, value) ->
    cmd = ["LPUSH", key, value]
    url = @construct_request(cmd)
    @execute(url)

  # pop an item from the head of the queue with RPOP
  pop: (key, callback) ->
    cmd = ["RPOP", key]
    url = @construct_request(cmd)
    @execute(url, callback, "RPOP")
# Small Resque-flavoured queue facade: every queue lives under the redis key
# "resque:queue:<name>" and all traffic is delegated to a Webdis client.
class Resque
  # maps a queue name to the redis key that backs it
  queue_key = (name) -> "resque:queue:" + name

  # startup_callback is forwarded to Webdis together with this instance
  constructor: (host, port, startup_callback) ->
    [@host, @port] = [host, port]
    @webdis = new Webdis(host, port, startup_callback, this)

  # serialises `object` as JSON and pushes it onto `queue`
  push: (queue, object) ->
    @webdis.push(queue_key(queue), JSON.stringify(object))

  # pops one item from `queue` and hands it to `callback`
  pop: (queue, callback) ->
    @webdis.pop(queue_key(queue), callback)
# Demo: pushes four movies onto the "movies" queue, pops three of them back
# and exits PhantomJS after the third pop has been handled.
#
# resque - the queue object handed in by the caller; it is used directly
#          instead of relying on a global `queue` variable being defined.
startup_callback = (resque) ->
  resque.push("movies", {"name": "short circuit"})
  resque.push("movies", {"name": "star wars episode iv"})
  resque.push("movies", {"name": "star wars episode v"})
  resque.push("movies", {"name": "star wars episode vi"})

  # prints the popped movie; a pop can deliver null instead of a movie
  moviemonster = (movie) ->
    if movie?
      console.log("movie name is: " + movie.name)
    else
      console.log("movie queue was empty")

  resque.pop("movies", moviemonster)
  resque.pop("movies", moviemonster)

  # same as moviemonster, but ends the phantomjs process afterwards
  moviemonsterandexit = (movie) ->
    moviemonster(movie)
    phantom.exit()

  resque.pop("movies", moviemonsterandexit)
# queue = new Resque("localhost", "7379", startup_callback)
# Spatula.coffee v1.0
# (c) 2012 Bryan Bishop <kanzure@gmail.com>
#
# **Spatula** is a tiny, queue-based scraping framework that gives structure to
# PhantomJS scraping projects. It provides extendable models for representing
# the state and structure of target pages, like the ever-popular "categories
# and category-members" website pattern. Spatula manages the PhantomJS instance
# by reading from a task queue in a control tab, and it publishes results back
# to other queues for assimilation by whatever downstream workers you create.
# Because PhantomJS lacks other forms of I/O, it is up to you to do application
# integration and write the downstream workers (perhaps in ruby or python) for
# storing the scraped data.
#
# Spatula requires Redis and Webdis (an HTTP API for Redis).
#### Includes
# underscore would be nice to have in here..
# https://raw.github.com/jashkenas/coffee-script/master/examples/underscore.coffee
# http://coffeescript.org/documentation/docs/underscore.html
phantom.injectJs("underscore.js")
# for webdis/redis
phantom.injectJs("resque.js")
#### Convenience utility belt
# Returns a copy of the array without the elements identical to `e`.
# The array itself is NOT modified, so callers must use the return value.
# (in-place variant, kept for reference:)
#Array::remove = (e) -> @splice(t,1)[0] if (t = @indexOf(e)) > -1
Array::remove = (e) ->
  _.reject this, (candidate) -> candidate is e
# writes `message` to the phantomjs console
log = (message) -> console.log(message)
# logs `message` with an "ALERT: " prefix
alert = (message) ->
  prefixed = "ALERT: " + message
  log(prefixed)
# announces the shutdown in the log, then asks phantomjs to exit
exit = ->
  log("exiting because exit was called")
  phantom.exit()
# Logs a fatal error and exits phantomjs with status 1.
#
# error - an Error-like object (its `message`, and `lineNumber`/`fileName`
#         when present as own properties, are reported) or a plain value such
#         as a string, which is reported as-is.
throwexit = (error) ->
  is_object = error? and typeof error is "object"
  # callers pass bare strings as well as Error objects
  detail = if is_object then error.message else error
  # assemble the final error message
  message = "\n\n\n"
  message += "ERROR: <" + detail + ">"
  if is_object
    if error.hasOwnProperty("lineNumber") and error.lineNumber?
      message += " on line " + error.lineNumber
    if error.hasOwnProperty("fileName") and error.fileName?
      message += " in file " + error.fileName
  # show the error message
  log message
  # explain why the program is done
  log "exiting because caught and escalated error/exception"
  # die a horrible death
  phantom.exit(1)
#### Spatula classes
# Base class for one scraped page. It owns a PhantomJS WebPage, loads `url`,
# runs `analyzer` inside the page to produce an analysis object, and lets
# subclasses react to the latest analysis in `transition`.
class Page
  # instead of base_url just use the parent's url
  url: null

  # per-page settings (phantomjs)
  settings:
    loadImages: true
    loadPlugins: true
    javascriptEnabled: true
    userAgent: "Spatula"
    # custom settings - doesn't seem to negatively impact phantomjs
    jquery: true

  log: (mexo) ->
    console.log "Page" + ": " + mexo

  constructor: ->
    @current_analysis = {}
    # set by analyze(); null until the first analysis has been made
    @previous_analysis = null
    @transition_history = []
    # for whether or not this constructor has been called
    @has_initialized = true
    # for whether or not the start method has been called
    @has_started = false
    # for whether or not onLoadFinished has been called
    @loaded = false
    # contains an error state
    @error = null
    # for whether or not jquery has been inserted into the page
    @jquery_injected = false
    @page = require("webpage").create()
    # these are attributes on any WebPage that can be customized
    overrides = [
      "settings",
      "onAlert",
      "onConsoleMessage",
      "onError",
      "onInitialized",
      "onLoadStarted",
      "onLoadFinished",
      "onResourceRequested",
      "onResourceReceived",
    ]
    # each attribute will be replaced unless there's no override
    for override in overrides
      if @[override] != null and @[override] != undefined
        @page[override] = @[override]

  # starts to load the page
  start: ->
    @log "start called, url is: " + @url
    @has_started = true
    # is called when the page is finished loading; the fat arrow keeps "@"
    # pointing at this Page
    onLoadFinished = (status) =>
      @log "onLoadFinished called"
      @loaded = true
      if @settings.jquery && !@jquery_injected
        @log "injecting jquery into the page"
        # TODO: fix this ? (nothing is injected yet; only the flag is set)
        #@page.includeJs "http://code.jquery.com/jquery-latest.min.js"
        @jquery_injected = true
      # NOTE: `status` is not inspected; per the PhantomJS docs it is 'fail'
      # rather than 'success' when the load did not work
      @log "about to call analyze"
      @analyze()
    # NOTE: @loaded is never reset to false on later navigations; supply an
    # onLoadStarted override if that is needed
    @page.open(@url, onLoadFinished)
    @log "start done... callback is onLoadFinished"

  # Runs `method` inside the page and returns its result.
  #
  # method           - function evaluated in the page context
  # callback_handler - optional function; when given, it receives the result
  #                    and its return value is returned instead
  # passed_args      - optional value forwarded to `method` as its argument
  run: (method, callback_handler, passed_args) ->
    @log "run called (with method and passed_args)"
    if passed_args?
      result = @page.evaluate(method, passed_args)
    else
      result = @page.evaluate(method)
    if typeof(callback_handler) == "function"
      @log "run calling callback_handler(result)"
      result = callback_handler(result)
    @log "run done"
    return result

  # analyzes html and returns a analysis (runs inside the page context)
  analyzer: ->
    console.log "analyzer running inside the page"
    analysis =
      state: null
    console.log "analyzer done running inside the page"
    return analysis

  # injects the actual deduction code into the page context
  analyze: ->
    @log "analyze: injecting @analyzer"
    analysis = @page.evaluate(@analyzer)
    # historical: moving from @current_analysis to this new analysis
    @transition_history.push(analysis)
    @previous_analysis = @current_analysis
    @current_analysis = analysis
    @log "analyze: returning analysis"
    return analysis

  # chooses a manipulation to perform based on previous_analysis and current_analysis
  transition: ->
    console.log "transition: nothing to do, exiting..."
    exit()
# Demo page: a Page whose target url is fixed; subclasses add the analysis
# and transition logic.
class ExamplePage extends Page
  #super: @constructor.__super__
  url: "http://news.ycombinator.com"
# Demo page that classifies the loaded document by its title and toggles the
# title back and forth on every transition.
class ExampleHomePage extends ExamplePage
  #super: @constructor.__super__

  # Runs inside the page. Reports title, url and a state that is one of
  # "HAS_NO_TITLE", "HAS_TITLE" or (randomly) "SIMULATED_DISASTER".
  analyzer: ->
    console.log "analyzer running inside the page"
    report =
      title: document.title
      url: document.location.href
    untitled = report.title == null || report.title == ""
    report["state"] = if untitled then "HAS_NO_TITLE" else "HAS_TITLE"
    # one time in four, pretend something went wrong
    if Math.floor(Math.random()*4) == 3
      report["state"] = "SIMULATED_DISASTER"
    console.log "analyzer done running inside the page"
    return report

  # Sets the title when there is none and clears it when there is one; any
  # other state is recorded in @error and thrown.
  transition: ->
    @log "transition looking at the analysis"
    state = @current_analysis["state"]
    if state == "HAS_NO_TITLE"
      @run(@set_title)
    else unless state == "HAS_TITLE" || @previous_analysis == null
      @error = new Error("unknown state")
      @loaded = false
      throw @error
    else
      @run(@unset_title)
    @log "transition done looking at the analysis"

  # runs inside the page
  set_title: ->
    document.title = "hello world!"

  # runs inside the page
  unset_title: ->
    document.title = ""
# Manage a list of Page objects: start them, call transition/analyze on the
# loaded ones at a fixed interval, and drop the ones that report an error.
class Spatula
  # resque - optional queue handle; it is only stored on the instance
  constructor: (resque = null) ->
    @resque = resque
    @has_started = false
    @pages = []

  # length of time between checking all pages (in milliseconds)
  interval: 1000

  # convenience method.. pass either a class or an instance
  add_page: (page) ->
    if not page?
      alert "page can't be null in add_page"
      throwexit "page can't be null in add_page"
      # never touch the missing page, even if execution gets this far
      return
    # can pass either a class or an instance
    if not (page.hasOwnProperty("has_initialized") and page.has_initialized)
      page = new page
    @pages.push(page)

  # One pass over all pages; normally invoked on an interval by start().
  main_loop: ->
    if not @has_started
      alert "consider calling Spatula.start() instead of directly calling main_loop()"
    # walk a snapshot so that @pages can be changed while looping
    for entry in @pages.slice()
      page = entry
      initialized = page.hasOwnProperty("has_initialized") and page.has_initialized
      started = page.hasOwnProperty("has_started") and page.has_started
      # classes should have their constructor and then their start() called
      if not initialized and not started
        page = new entry()
        # keep the instance so that it is not constructed again next time
        index = @pages.indexOf(entry)
        @pages[index] = page if index >= 0
        page.start()
      # some pages might need to be started first
      else if not started
        log "page hasn't been started yet.. starting the page"
        page.start()
      # after those checks all remaining pages should have these attributes
      else if not (page.hasOwnProperty("has_initialized") and page.hasOwnProperty("has_started"))
        throwexit "page is missing some attributes (this shouldn't happen)"
      # page might have had an error
      else if page.error
        log "removing a page from spatula.pages because of an error"
        @pages = (kept for kept in @pages when kept isnt page)
      # means that onLoadFinished has been called at least once (like after start)
      else if page.loaded
        try
          console.log "main_loop: calling transition on page"
          page.transition()
          console.log "main_loop: calling analyze on page"
          page.analyze()
        catch error
          throwexit error
    # main_loop might be called on an interval prior to adding pages
    if @pages.length == 0 and @has_started
      throwexit new Error("main_loop called prior to adding any pages in spatula")

  # Marks spatula as started and schedules main_loop every @interval ms.
  start: ->
    @has_started = true
    window.setInterval((=> @main_loop()), @interval)
# Demo: exercises the queue with a few movies and starts a Spatula that
# scrapes ExampleHomePage in the mean time.
#
# resque - the queue object handed in by the caller; it is used directly
#          instead of relying on the global `queue` variable.
startup_callback = (resque) ->
  console.log("starting a tiny spatula demo...")
  resque.push("movies", {"name": "short circuit"})
  resque.push("movies", {"name": "star wars episode iv"})
  resque.push("movies", {"name": "star wars episode v"})
  resque.push("movies", {"name": "star wars episode vi"})

  # prints the popped movie; a pop can deliver null instead of a movie
  moviemonster = (movie) ->
    if movie?
      console.log("movie name is: " + movie.name)
    else
      console.log("movie queue was empty")

  resque.pop("movies", moviemonster)
  resque.pop("movies", moviemonster)

  # also start spatula in the mean time
  spatula = new Spatula(resque)
  spatula.add_page(ExampleHomePage)
  spatula.start()
  console.log("spatula initiated")
# entry point: creates the Resque client; startup_callback is handed to it and
# is invoked with this Resque instance once its webdis page has finished loading
queue = new Resque("localhost", "7379", startup_callback)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment