Skip to content

Instantly share code, notes, and snippets.

@gudmundur
Created June 6, 2012 11:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gudmundur/2881474 to your computer and use it in GitHub Desktop.
Save gudmundur/2881474 to your computer and use it in GitHub Desktop.
Server-Side scraping with jQuery, jsdom and node.js
fs = require 'fs'
jsdom = require 'jsdom'
{ zip } = require 'underscore'
jquery = fs.readFileSync 'lib/jquery-1.7.2.min.js'
# Build one story record from a [title, subtext] pair of DOM nodes.
#
# $        - jQuery-style function, called as $(selector, context).
# title    - context node searched for 'td.title a' and '.comhead'.
# subtext  - context node searched for the user, score and comment links.
#
# Returns an object with the keys title, url, source, user, points, comments.
story = ($, [title, subtext]) ->
  # Selector helpers scoped to the title node and the subtext node.
  $t = (selector) -> $ selector, title
  $s = (selector) -> $ selector, subtext

  # Text between the outermost parentheses, or '' when there is none.
  source = (s) -> s.match(/\((.*)\)/)?[1] or ''

  # First space-separated token of the element's text, as a number.
  # Non-numeric text (a link whose label carries no count) yields 0 rather
  # than NaN, so points/comments are always usable numbers.
  number = (el) ->
    value = Number el.text().split(' ')[0]
    if isNaN(value) then 0 else value

  link = $t 'td.title a'
  {
    title: link.text()
    url: link.attr 'href'
    source: source $t('.comhead').text()
    user: $s('a[href^="user"]').text()
    points: number $s('span[id^="score"]')
    comments: number $s('a[href^="item"]')
  }
# Extract every story on the page.
#
# $ - jQuery-style function bound to the parsed document.
#
# Each '.subtext' node is paired with the element preceding its parent (the
# title node), and every pair is passed to `story`.
#
# NOTE: locals must not be named `stories`. CoffeeScript assignment to a name
# that already exists in an outer scope reassigns the outer variable, which
# would replace this function with an array after its first call.
stories = ($) ->
  subtexts = $ '.subtext'
  titles = subtexts.parent().prev()
  pairs = zip titles, subtexts
  (story($, pair) for pair in pairs)

# Build the jsdom `done` handler for a scrape.
#
# callback - node-style function (error, stories).
#
# Returns a function (error, window) that runs `stories` against window.$,
# closes the window, and hands the result to `callback`. When jsdom supplies
# no window, the error is forwarded instead of dereferencing undefined.
domReady = (callback) -> (error, window) ->
  unless window?
    return callback(error ? new Error('jsdom did not provide a window'))
  try
    results = stories window.$
  finally
    # Close the window even when extraction throws.
    window.close()
  callback null, results
# Public entry point: parse `html` with jsdom, inject the module-level
# `jquery` source, and report the extracted stories through `callback`.
#
# html     - markup handed to jsdom.env as the `html` option.
# callback - passed to domReady, whose result becomes jsdom's `done` handler.
#
# Returns nothing; the explicit `return` keeps the jsdom.env result from
# being returned implicitly.
@scrape = (html, callback) ->
  options =
    html: html
    src: [jquery]
    done: domReady callback
  jsdom.env options
  return
request = require 'request'
hn = require './lib/hn'
# Fetch the front page and print the scraped stories.
# Request and scrape failures are reported on stderr instead of being ignored,
# so an undefined body is never passed to the scraper.
request 'http://news.ycombinator.com/', (error, response, body) ->
  if error
    console.error error
    return
  hn.scrape body, (err, stories) ->
    if err
      console.error err
      return
    console.log stories
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment