Skip to content

Instantly share code, notes, and snippets.

@astur
Created May 22, 2016 03:30
Show Gist options
  • Star 11 You must be signed in to star a gist
  • Fork 9 You must be signed in to fork a gist
  • Save astur/2b3258a7991d2bc83d07670f27036fb0 to your computer and use it in GitHub Desktop.
Save astur/2b3258a7991d2bc83d07670f27036fb0 to your computer and use it in GitHub Desktop.
Ferra scraping (for Habr)
var tress = require('tress');
var needle = require('needle');
var cheerio = require('cheerio');
var resolve = require('url').resolve;
var fs = require('fs');
// Entry point of the crawl: the first page of the news listing. Also used as
// the base against which hrefs found on crawled pages are resolved.
var URL = 'http://www.ferra.ru/ru/techlife/news/';

// Accumulates one record per matching article; written out when the queue drains.
var results = [];

/**
 * Crawl queue. Each job is a URL; the worker downloads it with needle, parses
 * the HTML with cheerio, records the page in `results` if it matches the
 * author filter, and enqueues every article link and "next page" link found.
 *
 * The second argument to tress (10) is the queue's concurrency setting
 * (see the tress documentation).
 *
 * @param {string} url - Address of the page to fetch.
 * @param {function(Error=)} callback - Must be called exactly once when the
 *     job is finished; receives the request error if the download failed.
 */
var q = tress(function(url, callback){
    needle.get(url, function(err, res){
        // Do not throw here: this runs inside needle's asynchronous callback,
        // so nothing could catch it and a single failed request would kill the
        // process before the drain handler saves the results collected so far.
        // Hand the error to the queue instead so the remaining jobs still run.
        if (err) {
            console.error('Failed to fetch ' + url + ':', err);
            return callback(err);
        }

        var $ = cheerio.load(res.body);

        // Author filter: compares the third child node of `.b_infopost`
        // (trimmed, last character dropped) with the expected name.
        // NOTE(review): presumably that node is "<author>" followed by one
        // separator character - confirm against the live markup.
        if($('.b_infopost').contents().eq(2).text().trim().slice(0, -1) === 'Алексей Козлов'){
            results.push({
                title: $('h1').text(),
                date: $('.b_infopost>.date').text(),
                href: url,
                size: $('.newsbody').text().length
            });
        }

        // Links to individual articles. Resolved against URL (like the
        // pagination links below) so relative hrefs also work; absolute
        // hrefs still point to the same address after resolving.
        $('.b_rewiev p>a').each(function() {
            var href = $(this).attr('href');
            // Anchors without an href would otherwise enqueue `undefined`.
            if (href) {
                q.push(resolve(URL, href));
            }
        });

        // Link(s) to the next listing page.
        $('.bpr_next>a').each(function() {
            var href = $(this).attr('href');
            if (href) {
                q.push(resolve(URL, href));
            }
        });

        callback();
    });
}, 10);
// When the queue has no more work, serialize everything collected in
// `results` (pretty-printed with 4-space indentation) and write it to
// ./data.json synchronously.
q.drain = function(){
    var serialized = JSON.stringify(results, null, 4);
    fs.writeFileSync('./data.json', serialized);
};

// Start the crawl from the first listing page.
q.push(URL);
{
"private": true,
"name": "ferra-scraper",
"version": "0.0.1",
"description": "Web scraping example for habrahabr",
"main": "index.js",
"author": "astur <astur@yandex.ru> (http://kozlov.am/)",
"license": "WTFPL",
"dependencies": {
"cheerio": "^0.20.0",
"needle": "^1.0.0",
"tress": "^1.0.0"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment