Skip to content

Instantly share code, notes, and snippets.

@astro
Last active August 29, 2015 14:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save astro/6347ba2022e4d83a56ef to your computer and use it in GitHub Desktop.
Save astro/6347ba2022e4d83a56ef to your computer and use it in GitHub Desktop.
// npm i crawler && node scrape.js && less scraped.json
var fs = require('fs');
var resolve = require('url').resolve;
var Crawler = require("crawler").Crawler;
/**
 * Strips whitespace runs that start at the beginning of a line or finish at
 * the end of a line.
 *
 * Both patterns carry the `m` flag, so `^` and `$` anchor at every line
 * boundary rather than only at the ends of the string; because `\s` also
 * matches newlines, a single run may swallow blank lines.
 *
 * @param {string} s - text to clean up
 * @returns {string} `s` with leading and trailing per-line whitespace removed
 */
function trim(s) {
  var withoutLeading = s.replace(/^\s+/gm, "");
  var withoutTrailing = withoutLeading.replace(/\s+$/gm, "");
  return withoutTrailing;
}
// Every record scraped so far (table rows and contact boxes); written to
// scraped.json by the crawler's onDrain handler below.
var items = [];

// Base URL against which followed (possibly relative) hrefs are resolved.
var LISTING_URL = "http://www.dresden.de/sportvereine/listing";

/**
 * Collects the href of every <a> found beneath `node`.
 *
 * @param {Function} $ - the jQuery-style function handed to the crawler callback
 * @param {Object} node - wrapped element to search
 * @returns {Array|null} the href values, or null when `node` contains no <a>
 */
function linkTargets($, node) {
  var anchors = node.find('a');
  if (anchors.length === 0) {
    return null;
  }
  return anchors.map(function() {
    return $(this).attr('href');
  }).toArray();
}

/**
 * Turns every <table> on the page into one record per body row, keyed by the
 * table's header texts, and appends those records to `items`.
 *
 * A cell's value is its trimmed text, unless the cell contains links, in
 * which case the value is the array of their hrefs.
 *
 * @param {Function} $ - the jQuery-style function handed to the crawler callback
 */
function scrapeTables($) {
  $('table').each(function() {
    var table = $(this);
    var headers = table.find('thead th').map(function() {
      return $(this).text();
    }).toArray();
    var rows = table.find('tbody tr').map(function() {
      var record = {};
      $(this).find('td').each(function(index) {
        var cell = $(this);
        // Fallback key for columns without a header; the spelling (sic) is
        // preserved so the keys written to scraped.json stay unchanged.
        var key = headers[index] || ("unknkown" + index);
        record[key] = linkTargets($, cell) || trim(cell.text());
      });
      return record;
    }).toArray();
    console.log(rows.length + " records in table");
    items = items.concat(rows);
  });
}

/**
 * Turns every `.contact` box into a record with a `title` (the box's <h3>
 * text) plus one entry per <dd>, keyed by the <dd>'s class attribute, and
 * appends those records to `items`.
 *
 * A <dd>'s value is its (untrimmed) text, unless it contains links, in which
 * case the value is the array of their hrefs.
 *
 * @param {Function} $ - the jQuery-style function handed to the crawler callback
 */
function scrapeContacts($) {
  var contacts = $('.contact').map(function() {
    var box = $(this);
    var contact = {
      title: box.find('h3').text()
    };
    box.find('dd').each(function() {
      var dd = $(this);
      contact[dd.attr('class')] = linkTargets($, dd) || dd.text();
    });
    return contact;
  }).toArray();
  console.log(contacts.length + " contacts");
  items = items.concat(contacts);
}

/**
 * Queues the target of every anchor matched by `selector` for crawling.
 *
 * @param {Function} $ - the jQuery-style function handed to the crawler callback
 * @param {string} selector - selector matching the <a> elements to follow
 */
function queueLinks($, selector) {
  $(selector).each(function() {
    var href = $(this).attr('href');
    // url.resolve() throws on a non-string argument, which would abort the
    // remaining links of this page; skip anchors that carry no href.
    if (typeof href !== 'string') {
      return;
    }
    c.queue(resolve(LISTING_URL, href));
  });
}

var c = new Crawler({
  maxConnections: 4,
  skipDuplicates: true,
  // Per-page handler: scrape records, then queue follow-up pages.
  callback: function(error, result, $) {
    // Report a failed fetch as itself instead of letting it surface as a
    // TypeError from calling an undefined `$` further down.
    if (error) {
      console.error(error);
      return;
    }
    try {
      scrapeTables($);
      scrapeContacts($);
      // Open search results
      queueLinks($, '.detaillist li a');
      // Follow pages
      queueLinks($, '.searchpager li a');
      // NOTE(review): `_URL` looks like an internal field of the result's
      // document object — confirm it is still provided by the crawler version in use.
      console.log("Done with " + result.window.document._URL);
    } catch (e) {
      // A page that fails to scrape is logged and skipped; the crawl goes on.
      console.error(e);
    }
  },
  // Presumably invoked by the crawler once its queue has emptied (confirm
  // against the crawler docs): dumps all collected items as a JSON array,
  // one item per line.
  onDrain: function() {
    console.log("Writing " + items.length + " items");
    var out = fs.createWriteStream("scraped.json");
    out.write("[\n");
    for (var i = 0; i < items.length; i++) {
      if (i > 0) {
        out.write(",\n");
      }
      out.write(JSON.stringify(items[i]));
    }
    // end() writes the closing bracket, then flushes and closes the file.
    out.end("\n]\n");
  }
});
// Seed the crawl with a fixed set of www.dresden.de pages...
var seedPages = [
  "http://www.dresden.de/de/05/02/c_01.php",
  "http://www.dresden.de/de/05/02/10/c_02.php",
  "http://www.dresden.de/de/05/02/09/adressen.php",
  "http://www.dresden.de/de/05/02/06/c_02.php",
  "http://www.dresden.de/de/05/02/05/c_02.php",
  "http://www.dresden.de/de/03/070/c_03.php"
];
c.queue(seedPages);
// ...and with the sports-club listing, whose result and pager links the
// crawler callback follows.
c.queue("http://www.dresden.de/sportvereine/listing");
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment