Skip to content

Instantly share code, notes, and snippets.

@kasperlanger
Created May 2, 2011 18:39
Show Gist options
  • Save kasperlanger/952114 to your computer and use it in GitHub Desktop.
Save kasperlanger/952114 to your computer and use it in GitHub Desktop.
#!/usr/bin/env coffee
# `crawler.coffee` - a simple HTTP crawler that records the timing of each GET request.
# Require http, jquery and underscore
http = require('http')
$ = require('jquery')
_ = require('underscore')
# Set options for the http client: requests built from `options` target
# localhost:8080 and carry an HTTP Basic `Authorization` header.
# NOTE(review): the credentials are hard-coded in this file.
# `Buffer.from` is used because the `new Buffer(string)` constructor is deprecated.
auth = 'Basic ' + Buffer.from('demo2:demo').toString('base64')
options =
  host: 'localhost'
  port: 8080
  headers:
    Authorization: auth
# Crawl bookkeeping.
# `visited` - paths that must not be requested (again); it starts out with the
#             logout path so the crawler never requests it.
# `results` - one `{time, path}` record per completed request.
# `queue`   - discovered links waiting to be visited, in discovery order.
visited =
  '/view/auth/logout': 'blacklist'
results = []
queue = []
# `padl` - converts `str` to a string and prefixes it with spaces until it is at
# least `length` characters long (default 5). Longer input is returned unchanged.
padl = (str, length = 5) ->
  padded = "" + str
  padded = " " + padded while padded.length < length
  padded
# `get` - HTTP GET `path` and call `handler` exactly once with the response
# body as a string.
#   path    - request path, sent with the host/port/headers from `options`.
#   handler - function(content); receives the full body, or "" when the request
#             fails (the failure is logged to stderr) so the caller can carry on.
# Returns the request object created by `http.get`.
get = (path, handler) ->
  chunks = []
  finished = false
  # Guard so `handler` cannot fire for both an error and a normal end.
  finish = (body) ->
    return if finished
    finished = true
    handler body
  # Work on a copy instead of rewriting the shared `options` on every request.
  requestOptions = _.extend {}, options, {path: path}
  request = http.get requestOptions, (res) ->
    # Let the stream decode UTF-8 so a multi-byte character split across two
    # chunks is not corrupted when the chunks are joined.
    res.setEncoding 'utf8'
    res.on 'data', (chunk) ->
      chunks.push chunk
    res.on 'end', ->
      finish chunks.join("")
  # Without an 'error' listener a refused or reset connection would throw and
  # abort the whole crawl.
  request.on 'error', (err) ->
    console.error "GET #{path} failed: #{err.message}"
    finish ""
  request
# `visit` - Crawl pages breadth first, one request at a time, starting at `path`.
# Each fetched page is timed, logged and recorded in `results`; the `href` of
# every anchor found in it is appended to `queue`, and the next queued path is
# visited once the current response has been handled.
# Already visited paths and `mailto:` links are skipped. The crawl stops
# quietly when there is nothing left to visit.
visit = (path) ->
  # Skip unusable candidates with a loop rather than recursion so a long run of
  # already-visited links cannot grow the call stack. `path` is undefined when
  # the queue ran dry, which ends the crawl instead of throwing on `.match`.
  while not path? or visited[path] or path.match(/^mailto:/)
    return if queue.length is 0
    path = queue.shift()
  visited[path] = true
  start = (new Date).getTime()
  get path, (content) ->
    diff = (new Date).getTime() - start
    console.log "#{padl diff} ms #{path}"
    results.push {time: diff, path: path}
    $(content).find('a').each ->
      href = $(this).attr('href')
      # Anchors without an `href` yield no value; never queue those.
      queue.push(href) if href?
    visit(queue.shift())
# Kick off the crawl at the site root.
visit('/')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment