Last active
August 29, 2015 14:05
-
-
Save astro/6347ba2022e4d83a56ef to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Usage: npm i crawler && node scrape.js && less scraped.json
var fs = require('fs'); | |
var resolve = require('url').resolve; | |
var Crawler = require("crawler").Crawler; | |
/**
 * Strips leading and trailing whitespace from every line of a string.
 *
 * Both patterns carry the multiline flag, so the anchors apply at each line
 * boundary instead of only at the two ends of the whole string. Because the
 * whitespace class also matches line breaks, lines consisting solely of
 * whitespace are swallowed as well.
 *
 * @param {string} s - Text to clean up.
 * @returns {string} The text with per-line surrounding whitespace removed.
 */
function trim(s) {
  var leadingWhitespace = /^\s+/gm;
  var trailingWhitespace = /\s+$/gm;
  var withoutLeading = s.replace(leadingWhitespace, "");
  return withoutLeading.replace(trailingWhitespace, "");
}
// Every record scraped so far (table rows and contact boxes), across all pages.
var items = [];

/**
 * Collects the href attribute of every <a> found beneath an element.
 *
 * @param {Function} $ - jQuery-like function the crawler passes to the page callback.
 * @param {Object} element - Element whose descendants are searched.
 * @returns {?Array} The hrefs of the contained links, or null when the element
 *     contains no <a> at all (so the caller can fall back to the text content).
 */
function linkTargets($, element) {
  var anchors = $(element).find('a');
  if (anchors.length === 0) {
    return null;
  }
  return anchors.map(function() {
    return $(this).attr('href');
  }).toArray();
}

/**
 * Turns every <table> on the current page into one record per body row and
 * appends those records to `items`.
 *
 * Record keys are the texts of the `thead th` cells; a cell without a matching
 * header is stored under "unknown<columnIndex>". A cell value is the list of
 * link targets when the cell contains links, otherwise its trimmed text.
 *
 * @param {Function} $ - jQuery-like function for the current page.
 */
function scrapeTables($) {
  $('table').each(function() {
    var table = $(this);
    var headers = table.find('thead th').map(function() {
      return $(this).text();
    }).toArray();
    var rows = table.find('tbody tr').map(function() {
      var record = {};
      $(this).find('td').each(function(columnIndex) {
        var key = headers[columnIndex] || ("unknown" + columnIndex);
        var links = linkTargets($, this);
        record[key] = links === null ? trim($(this).text()) : links;
      });
      return record;
    }).toArray();
    console.log(rows.length + " records in table");
    // Appended per table so rows already collected survive a failure further down the page.
    items = items.concat(rows);
  });
}

/**
 * Turns every `.contact` box on the current page into a record and appends
 * those records to `items`.
 *
 * Each record gets a `title` from the box's <h3>, plus one property per <dd>
 * keyed by that element's class attribute. A value is the list of link targets
 * when the <dd> contains links, otherwise its (untrimmed) text.
 *
 * @param {Function} $ - jQuery-like function for the current page.
 */
function scrapeContacts($) {
  var contacts = $('.contact').map(function() {
    var box = $(this);
    var record = {
      title: box.find('h3').text()
    };
    box.find('dd').each(function() {
      var entry = $(this);
      var links = linkTargets($, this);
      record[entry.attr('class')] = links === null ? entry.text() : links;
    });
    return record;
  }).toArray();
  console.log(contacts.length + " contacts");
  items = items.concat(contacts);
}

/**
 * Queues the detail-list and pager links of the current page for crawling.
 *
 * @param {Function} $ - jQuery-like function for the current page.
 */
function queueLinks($) {
  function followLink() {
    var href = $(this).attr('href');
    // An anchor without href would make resolve() throw and abort the
    // remaining links of this page, so skip it instead.
    if (!href) {
      return;
    }
    // NOTE(review): links are always resolved against the listing URL, not
    // against the URL of the page they were found on - confirm this is intended.
    c.queue(resolve("http://www.dresden.de/sportvereine/listing", href));
  }
  // Open search results
  $('.detaillist li a').each(followLink);
  // Follow pages
  $('.searchpager li a').each(followLink);
}

var c = new Crawler({
  maxConnections: 4,
  skipDuplicates: true,
  /**
   * Per-page handler: scrapes tables and contact boxes, then queues follow-up links.
   *
   * @param {*} error - Fetch error reported by the crawler, if any.
   * @param {Object} result - Crawler result; only `result.window.document._URL` is read, for logging.
   * @param {Function} $ - jQuery-like function bound to the fetched page.
   */
  callback: function(error, result, $) {
    // Report a failed or unparsed page directly instead of tripping over an undefined `$`.
    if (error || !$) {
      console.error(error || new Error("Page could not be parsed; skipping"));
      return;
    }
    try {
      scrapeTables($);
      scrapeContacts($);
      queueLinks($);
      console.log("Done with " + result.window.document._URL);
    } catch (e) {
      // One malformed page must not stop the rest of the crawl.
      console.error(e);
    }
  },
  /**
   * Writes all collected records to scraped.json as a JSON array with one
   * record per line. Presumably invoked by the crawler once its queue has
   * emptied - confirm against the crawler documentation.
   */
  onDrain: function() {
    console.log("Writing " + items.length + " items");
    var out = fs.createWriteStream("scraped.json");
    out.write("[\n");
    for (var i = 0; i < items.length; i++) {
      out.write(i === 0 ? "" : ",\n");
      out.write(JSON.stringify(items[i]));
    }
    // end() flushes the final chunk and closes the file; write() alone leaves the stream open.
    out.end("\n]\n");
  }
});
// Seed the crawl: first the fixed set of dresden.de pages, queued together as one list ...
c.queue([
  "http://www.dresden.de/de/05/02/c_01.php",
  "http://www.dresden.de/de/05/02/10/c_02.php",
  "http://www.dresden.de/de/05/02/09/adressen.php",
  "http://www.dresden.de/de/05/02/06/c_02.php",
  "http://www.dresden.de/de/05/02/05/c_02.php",
  "http://www.dresden.de/de/03/070/c_03.php"
]);
// ... then the "sportvereine" listing, queued on its own.
c.queue("http://www.dresden.de/sportvereine/listing");
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment