Skip to content

Instantly share code, notes, and snippets.

@coleww
Created November 10, 2015 16:34
Show Gist options
  • Save coleww/b3228ad22a29144ad146 to your computer and use it in GitHub Desktop.
Save coleww/b3228ad22a29144ad146 to your computer and use it in GitHub Desktop.
top one hundo itunes
// Bootstrap: download the LyricWiki Top 100 index, queue every song link found
// under `li b a` onto the shared `pages` list, then start the throttled crawl.
request('http://lyrics.wikia.com/LyricWiki:Top_100', function (error, response, html) {
  // Report failures instead of silently doing nothing.
  if (error || response.statusCode !== 200) {
    console.error('could not load the Top 100 index:', error || response.statusCode)
    return
  }
  var $ = cheerio.load(html);
  //handle the 'page not found' page?
  $('li b a').each(function(i, element){
    var href = $(element).attr('href')
    // An anchor without an href would otherwise be queued as '...comundefined'.
    if (href) {
      pages.push('http://lyrics.wikia.com' + href)
    }
  });
  console.log(pages)
  // Nothing queued means there is nothing to scrape; don't start the timer loop.
  if (!pages.length) {
    console.error('no song links found on the Top 100 index')
    return
  }
  //start scraping
  doit()
});
/**
 * Throttled crawl loop: waits `delay` milliseconds, scrapes one page via
 * `scrape()`, and re-schedules itself while the shared `pages` list still
 * has entries.
 *
 * @param {number} [delay=25000] Milliseconds to wait before each scrape.
 */
function doit(delay){
  if (delay === undefined) delay = 25000
  setTimeout(function(){
    // Guard against an empty queue: scrape() pops from `pages`, and popping an
    // empty array yields undefined.
    if (!pages.length) return
    scrape()
    if (pages.length) doit(delay)
  }, delay)
}
/**
 * Scrapes a single lyrics page.
 *
 * Pops the most recently queued URL off the shared `pages` list, downloads it,
 * reads the HTML of the `.lyricbox` element, strips script tags and everything
 * from the first `<!` marker onward, splits the rest on `<br>`, passes each
 * line to `examine`, and writes the unescaped lines to `<page-slug>.txt`.
 *
 * Does nothing when the queue is empty or when the URL contains 'redlink'
 * (an un-created wiki page). Download failures and pages without a
 * `.lyricbox` element are logged and skipped.
 */
function scrape(){
  // grab a page off the stack (queue? whatever....)
  var page = pages.pop()
  // empty queue: pop() returned undefined, nothing to do
  if (typeof page !== 'string') return
  console.log(page, 'SPIDER')
  // handle un-created pages
  if (page.indexOf('redlink') !== -1) return

  request(page, function (error, response, html) {
    if (error || response.statusCode !== 200) {
      console.error('FAILED', page, error || response.statusCode)
      return
    }
    var $ = cheerio.load(html);
    // grab those sweet lyrics
    var text = $('.lyricbox').html()
    // no .lyricbox on the page: html() gives back nothing to work with
    if (text == null) {
      console.error('NO LYRICS', page)
      return
    }
    // remove script tags for tracking junk: non-greedy so lyrics between two
    // scripts survive, spans newlines, and tolerates attributes on the tag
    text = text.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, '')
    // remove weird comment thing; only cut when the marker is actually there,
    // otherwise slice(0, -1) would chop the last character
    var commentStart = text.indexOf('<!')
    if (commentStart !== -1) {
      text = text.slice(0, commentStart)
    }
    // split on break
    // NOTE(review): `lines` is not declared in this block; presumably it is a
    // file-level variable. Confirm, or declare it locally with `var`.
    lines = text.split('<br>')
    lines.forEach(function(line){
      examine(line)
    })
    // File name: everything after the FIRST 'com-' of the slugged URL, so a
    // title that itself contains 'com-' (e.g. "Welcome-") is not truncated.
    var slug = page.replace(/\W/g, "-")
    var marker = 'com-'
    var markerAt = slug.indexOf(marker)
    var fileName = markerAt === -1 ? slug : slug.slice(markerAt + marker.length)
    fs.writeFileSync(fileName + '.txt', escaper.unescape(lines.join("\n")).replace(/&apos;/g, "'"))
    console.log("SUCCESS")
  });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment