Skip to content

Instantly share code, notes, and snippets.

@pluswave
Created March 29, 2016 13:30
Show Gist options
  • Save pluswave/43e3077513307e6315e9 to your computer and use it in GitHub Desktop.
Save pluswave/43e3077513307e6315e9 to your computer and use it in GitHub Desktop.
youban.com 专用MP3爬虫
#!/usr/bin/env node
var jsdom = require('jsdom');
/**
 * Collects the anchors found under "#topicListern" in the given page.
 *
 * Anchors whose text is empty are skipped.
 *
 * @param {Object} window - window object whose document.body is queried.
 * @returns {Array<{title: string, url: string}>} one entry per anchor with
 *     non-empty text, in document order; title is the anchor text and url
 *     is the anchor href.
 */
function lists(window){
    var anchors = window.document.body.querySelectorAll("#topicListern a");
    var entries = [];
    for( var i = 0; i < anchors.length; i++ ){
        var anchor = anchors[i];
        // An anchor without visible text cannot supply a title.
        if( anchor.text ){
            entries.push({
                title: anchor.text,
                url: anchor.href
            });
        }
    }
    return entries;
}
/**
 * Finds the download link on a download page.
 *
 * Scans the anchors under ".downloadboxlist" and returns the href of the
 * first one whose text contains "网通" or "联通" (China Netcom / China
 * Unicom — presumably the labels of the download mirrors; verify against
 * the site).
 *
 * @param {Object} window - window object whose document.body is queried.
 * @returns {string} href of the first matching anchor.
 * @throws {Error} when no anchor matches, so callers get a descriptive
 *     failure instead of a TypeError from dereferencing undefined.
 */
function getDownloadUrl(window){
    var links = window.document.body.querySelectorAll(".downloadboxlist a");
    for( var i = 0; i < links.length; i++ ){
        var label = links[i].text || '';
        if( label.indexOf('网通') >= 0 || label.indexOf('联通') >= 0 ){
            return links[i].href;
        }
    }
    throw new Error("No download link labelled 网通 or 联通 found under .downloadboxlist");
}
// Entry point: load the topic index page, collect the per-song page links
// from it and hand them to downloadMP3, which prints one wget command per
// song on stdout.
jsdom.env('http://www.youban.com/mp3-t4416.html', function(err, window) {
    if( err ){
        // stdout carries the generated wget commands, so diagnostics go to
        // stderr, and the underlying error is included for troubleshooting.
        console.error("error open url", err);
        return;
    }
    var mp3_page_list = lists(window);
    downloadMP3(mp3_page_list);
});
/**
 * Works through the queue of song pages one at a time and prints a
 * `wget -O '<title>.mp3' '<url>'` command on stdout for each song whose
 * download link can be resolved.
 *
 * The download page URL is derived from the song page URL by prefixing its
 * first run of four digits with "d". A page that fails to load is retried a
 * bounded number of times and then skipped, so one permanently broken page
 * cannot stall the run; a page without a usable download link is skipped
 * as well. Problems are reported on stderr so stdout stays a clean list of
 * commands.
 *
 * @param {Array<{title: string, url: string}>} mp3_page_list - queue of
 *     pages; it is consumed (shifted) as items are processed.
 * @param {number} [maxRetries=3] - how many times a failed page load is
 *     retried before the item is skipped.
 */
function downloadMP3(mp3_page_list, maxRetries){
    var retryLimit = (typeof maxRetries === 'number' && maxRetries >= 0) ? maxRetries : 3;
    var item = mp3_page_list.shift();
    if( !item ){
        return;
    }
    var title = item.title;
    var rurl = item.url.replace(/([0-9]{4})/, "d$1");
    var retriesLeft = retryLimit;

    // Wraps text in single quotes for a POSIX shell; an embedded single
    // quote is written as '\'' (close quote, escaped quote, reopen quote).
    function shellQuote(text){
        return "'" + String(text).replace(/'/g, "'\\''") + "'";
    }

    // Continues with the rest of the queue.
    function next(){
        downloadMP3(mp3_page_list, maxRetries);
    }

    // Loads the download page once; retries itself while retries remain.
    function attempt(){
        jsdom.env(rurl, function(err, window){
            if( err ){
                if( retriesLeft > 0 ){
                    retriesLeft -= 1;
                    attempt();
                    return;
                }
                console.error("giving up on", rurl, "after", retryLimit, "retries:", err);
                next();
                return;
            }
            var mp3_url = null;
            try {
                mp3_url = getDownloadUrl(window);
            } catch( e ){
                console.error("no download url for", title, "(" + rurl + "):", e.message);
            } finally {
                // Release the page's resources once the link has been read.
                if( window && typeof window.close === 'function' ){
                    window.close();
                }
            }
            if( mp3_url ){
                // Title and URL come from a remote page: quote both so that
                // quotes, spaces, '&' or '?' cannot break the command.
                console.log("wget", "-O", shellQuote(title + ".mp3"), shellQuote(mp3_url));
            }
            next();
        });
    }

    attempt();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment