Skip to content

Instantly share code, notes, and snippets.

@kennylugo
Last active December 29, 2015 23:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kennylugo/db54adcdf7891fe432ce to your computer and use it in GitHub Desktop.
Save kennylugo/db54adcdf7891fe432ce to your computer and use it in GitHub Desktop.
Basic Site Crawling
var request = require("request"); //We are asking to include these npm modules in our project.
var cheerio = require("cheerio");
// request('https://news.ycombinator.com', function (error, response, html) {
// if (!error && response.statusCode == 200){
// console.log(html);
// }
// });
//If we run node scrape.js we should see the website's html.
//------------------------------------------------------------------------------------------------------------------
// Fetch the Hacker News front page and log one metadata object per story.
// Each story is located through its 'span.comhead' element: the title link is
// the element immediately before it, and the points / username / comments are
// read from the '.subtext' cell of the following table row.
request('https://news.ycombinator.com', function (error, response, html) {
  // Report failures instead of silently printing nothing.
  if (error) {
    console.error('Request failed:', error);
    return;
  }
  if (response.statusCode !== 200) {
    console.error('Unexpected status code:', response.statusCode);
    return;
  }

  var $ = cheerio.load(html);
  $('span.comhead').each(function (i, element) {
    var a = $(this).prev(); // The previous sibling is the title link.
    var row = a.parent().parent(); // The element two levels above the link.
    var rank = row.text(); // parseInt below reads the leading number of this text.
    var title = a.text(); // The link text is the story title.
    var url = a.attr('href'); // The link target is the story URL.
    // The subtext children come from the next row in the HTML table.
    var subtext = row.next().children('.subtext').children();
    var points = subtext.eq(0).text();
    var username = subtext.eq(1).text();
    var comments = subtext.eq(2).text();

    // Our parsed metadata object. Numeric fields are parsed as base 10 and
    // become NaN when the text does not start with a number.
    var metadata = {
      rank: parseInt(rank, 10),
      title: title,
      url: url,
      points: parseInt(points, 10),
      username: username,
      comments: parseInt(comments, 10)
    };
    console.log(metadata);
  });
});
//Running the code above will output an array of objects like this:
// [ { rank: 1,
// title: 'The Meteoric Rise of DigitalOcean ',
// url: 'http://news.netcraft.com/archives/2013/06/13/the-meteoric-rise-of-digitalocean.html',
// points: 240,
// username: 'beigeotter',
// comments: 163 },
// { rank: 2,
// title: 'Introducing Private Networking',
// url: 'https://www.digitalocean.com/blog_posts/introducing-private-networking',
// points: 172,
// username: 'Goranek',
// comments: 75 },
// ...
//We could store the extracted data in a database like MongoDB or Redis to further process it.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment