Skip to content

Instantly share code, notes, and snippets.

@yurivictor
Created June 1, 2015 21:55
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save yurivictor/4984ef873efb5146de08 to your computer and use it in GitHub Desktop.
Save yurivictor/4984ef873efb5146de08 to your computer and use it in GitHub Desktop.
Example scraper in Node.js (scraping Rand Paul's site for issues)
var cheerio = require('cheerio');
var request = require('request');
var fs = require('fs');
// Module-level alias for Scrape.settings; assigned in Scrape.init().
var s,
/**
 * Scraper for the issue index at settings.domain.
 *
 * Flow: init() requests the index page -> scrapePage() records one entry
 * per '.short-article a' link and requests each linked page -> getText()
 * stores that page's '.article-text' -> outputJSON() writes output.json.
 */
Scrape = {
    settings: {
        // Object, the json to be output, keyed by link position on the index page
        json: {},
        // String, the url to scrape
        domain: 'https://randpaul.com/issues',
        // Int, number of issue-page responses handled so far
        iterator: 0,
        // Int, number of issue pages that have not been handled yet
        pending: 0,
        // Handle of the fallback timer started in init(), or null
        timer: null
    },

    /**
     * Starts the scrape and arms a 10 second fallback that writes whatever
     * has been collected if some requests never complete.
     */
    init: function() {
        // Globalize settings
        s = this.settings;
        // Start scraper
        request( s.domain, this.scrapePage );
        // Fallback only: outputJSON normally runs as soon as every page is handled
        s.timer = setTimeout( Scrape.outputJSON, 10000 );
    },

    /**
     * Callback for the index page request.
     *
     * @param {Error|null} error - Request failure, if any.
     * @param {Object} resp - Response object (unused).
     * @param {string} html - Body of the index page.
     */
    scrapePage: function( error, resp, html ) {
        if ( error || !html ) {
            console.error( 'Could not fetch ' + s.domain + ': ' +
                ( error ? error.message : 'empty response' ) );
            // Nothing more will arrive, so write the (empty) result now
            Scrape.outputJSON();
            return;
        }
        // Init cheerio
        var $ = cheerio.load( html );
        var links = $( '.short-article a' );
        // Count every link before issuing requests so that a callback which
        // fires early can never observe a premature zero.
        s.pending = links.length;
        if ( s.pending === 0 ) {
            Scrape.outputJSON();
            return;
        }
        // Get each issue
        links.each( function ( i, elem ) {
            var link = $( this );
            var href = link.attr( 'href' );
            // Up the json and set the issue
            s.json[i] = { 'issue': link.text(), 'text': '' };
            if ( !href ) {
                console.error( 'Issue ' + i + ' has no href; skipping its text' );
                Scrape.pageDone();
                return;
            }
            // Get the text. The closure pins this link's index so the
            // response is stored with the right issue regardless of the
            // order in which responses arrive.
            request( href, function ( pageError, pageResp, pageHtml ) {
                Scrape.getText( pageError, pageResp, pageHtml, i );
                Scrape.pageDone();
            } );
        } );
    },

    /**
     * Stores the article text of one issue page.
     *
     * @param {Error|null} error - Request failure, if any.
     * @param {Object} resp - Response object (unused).
     * @param {string} html - Body of the issue page.
     * @param {number} [index] - Key in settings.json to fill. When omitted,
     *     the running settings.iterator is used (the original behaviour).
     */
    getText: function( error, resp, html, index ) {
        var slot = ( index === undefined ) ? s.iterator : index;
        // Increase the iterator for every handled response, failed or not,
        // so positional callers stay aligned.
        s.iterator++;
        if ( error || !html ) {
            console.error( 'Could not fetch issue ' + slot + ': ' +
                ( error ? error.message : 'empty response' ) );
            return;
        }
        // Init cheerio
        var $ = cheerio.load( html );
        // Make sure there is an entry to write into
        s.json[slot] = s.json[slot] || { 'issue': '', 'text': '' };
        // Set the text
        s.json[slot]['text'] = $( '.article-text' ).text();
    },

    /**
     * Marks one issue page as handled and writes the output once none remain.
     */
    pageDone: function() {
        s.pending--;
        if ( s.pending === 0 ) {
            Scrape.outputJSON();
        }
    },

    /**
     * Writes settings.json to output.json (4-space indented) and cancels the
     * fallback timer if it is still armed. May run twice when the fallback
     * fires before the last page is handled; the later write then replaces
     * the partial file.
     */
    outputJSON: function() {
        if ( s.timer ) {
            clearTimeout( s.timer );
            s.timer = null;
        }
        fs.writeFile( 'output.json', JSON.stringify( s.json, null, 4 ), function( error ) {
            if ( error ) {
                console.error( 'Could not write output.json: ' + error.message );
                return;
            }
            console.log( 'File successfully written' );
        } );
    }
};
// Entry point: start the scraper as soon as this script is loaded.
( function bootstrap() {
    Scrape.init();
}() );
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment