Skip to content

Instantly share code, notes, and snippets.

@jazahn
Created May 2, 2014 16:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jazahn/2821faf2fd5576558695 to your computer and use it in GitHub Desktop.
Save jazahn/2821faf2fd5576558695 to your computer and use it in GitHub Desktop.
iTunesU data collection
var http = require("http");
var https = require("https");
var parseString = require('xml2js').parseString;
var Metadata = require('fluent-ffmpeg').Metadata;
var fs = require("fs");
var stream = fs.createWriteStream("itunesu.csv");
var jsdom = require("jsdom");
// Work queues of numeric channel ids, grouped by the kind of endpoint used to
// read them: url() builds a feed URL for "courses" ids and a viewPodcast URL
// for "other" ids. progress() consumes both arrays destructively with shift()
// (courses first, then other), so they are empty once the run has finished.
var channels = {courses: [819616149, 512201207, 571347835, 566636297, 786312091, 624655973, 613647256, 502492375,
529181544, 670536578, 670536610, 670535516, 625966490, 670536607, 670535213, 670662986, 625902714],
other: [411903143, 379063375, 379063381, 591098929, 567747501, 379063801, 587128452, 472446131, 399283453, 400741408,
714329509, 825877930, 673110189, 518318060, 518313357, 413553848, 592788196, 561512608, 623829264, 263483352, 398045010,
401164502, 401003463, 479663762, 417418575, 430450895, 545327510, 379063752, 379063759, 578784220, 515364871, 436371984,
478087810, 395952634, 453738055, 591462156, 465084893, 587134365, 431237603, 575900407, 521116421, 490169377, 439654236,
453238964, 444745048, 463528970, 671010496, 815668660, 434828355, 596136152, 434703168, 571645977, 379064741, 584028745,
379064174, 379064160, 479309361, 605400671, 533433841, 615412484, 825748928, 670965868, 573237224, 410612813,
429705849, 519683367, 415512057, 671078494, 561949070, 670983664, 587370423, 501794769, 587557773, 526156815, 411429327,
439300531, 578776681, 531461319, 524412107, 440723882, 521166717, 480999153, 379064095, 467626953, 502637008, 578778015,
578779029, 536305743, 631354973, 684872322, 718405737, 611185098, 780886169, 507487820, 569167537, 399227991, 432423871,
526404628, 526114704, 487252557, 474569709, 497804027, 403603467, 501192024, 489312990, 422641216, 432602394, 444271442,
399792369, 570216090, 575656854, 510666384, 433483802, 446399851, 614579847, 444002672, 479215163, 493052750, 643305020,
424040892, 474584559, 379063371, 457060447, 578690626, 394751011, 393343331, 418598278, 432140237, 379064041, 514573600,
425669026, 414024198, 583974449, 475836756]};
// Smaller queue kept for quick manual runs; swap it in for the declaration above.
//var channels = {courses: [], other: [411903143, 379063375, 379063381]};
/**
 * Build the request URL for a channel.
 *
 * @param {string} type - "courses" or "other".
 * @param {number|string} channel - channel id appended to the URL.
 * @returns {string|undefined} the URL, or undefined (after logging an
 *     error) when the type is not recognised.
 */
var url = function(type, channel){
    if(type === 'courses'){
        var feedBase = "https://itunesu.itunes.apple.com/WebObjects/LZDirectory.woa/ra/directory/courses/";
        return feedBase + channel + "/feed";
    }
    if(type === 'other'){
        var podcastBase = "http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewPodcast?id=";
        return podcastBase + channel;
    }
    // Unknown type: report it and fall through, returning undefined.
    console.log("error: invalid channel type");
};
/**
 * Request one channel's listing, follow the redirect the first request
 * answers with, and hand the resulting entries to the matching processor.
 *
 * "courses" channels: the redirect target is downloaded and parsed as XML
 * with parseString, then entries go to processChannelEntry one at a time.
 * "other" channels: the redirect target is loaded with jsdom and the
 * matching table rows go to processOtherEntry one at a time.
 *
 * Any failure (network error, missing redirect, unparsable or empty listing)
 * is logged and the channel is skipped by calling progress(), so a single
 * bad channel neither crashes nor stalls the whole run.
 *
 * @param {string} channelType - "courses" or "other"; anything else only
 *     logs and does nothing further.
 * @param {number|string} channel - channel id taken from the work queue.
 */
var dodata = function(channelType, channel){
    var target = url(channelType, channel);
    // Log the URL that is actually requested for this channel type.
    console.log(target);

    // Guards against advancing the queue twice for the same channel
    // (e.g. an "error" event arriving after the entries were handed off).
    var handedOff = false;
    var skip = function(reason){
        if(handedOff){
            return;
        }
        handedOff = true;
        console.log("error: skipping " + channelType + " channel " + channel + ": " + reason);
        progress();
    };
    var describe = function(err){
        return (err && err.message) ? err.message : String(err);
    };

    var req;
    if(channelType === "courses"){
        req = https.request(target, function(response){
            var location = response.headers.location;
            console.log("redirect: " + location);
            // Only the Location header is needed; drain the body so the socket is released.
            response.resume();
            if(!location){
                skip("no redirect location in response");
                return;
            }
            https.get(location, function(feedResponse){
                var body = '';
                feedResponse.on("data", function(chunk){
                    body += chunk;
                });
                feedResponse.on("end", function(){
                    parseString(body, function(err, result){
                        if(err){
                            skip("feed could not be parsed: " + describe(err));
                            return;
                        }
                        var entries = result && result.feed && result.feed.entry;
                        if(!entries || entries.length === 0){
                            skip("feed has no entries");
                            return;
                        }
                        if(handedOff){
                            return;
                        }
                        handedOff = true;
                        processChannelEntry(result, entries.shift());
                    });
                });
            }).on("error", function(err){
                skip("feed request failed: " + describe(err));
            });
        });
    } else if(channelType === "other"){
        req = http.request(target, function(response){
            var location = response.headers.location;
            console.log("redirect: " + location);
            response.resume();
            if(!location){
                skip("no redirect location in response");
                return;
            }
            jsdom.env(location, function(errors, window){
                if(!window){
                    skip("channel page could not be loaded: " + describe(errors));
                    return;
                }
                // Keep jQuery local; it is passed explicitly to the entry processors.
                var $ = require("jquery")(window);
                var rows = $('tr[kind=movie], tr[audio-preview-url]').toArray();
                if(rows.length === 0){
                    skip("channel page has no entries");
                    return;
                }
                if(handedOff){
                    return;
                }
                handedOff = true;
                processOtherEntry($, rows, rows.shift());
            });
        });
    } else {
        // url() has already reported the invalid type; nothing to request.
        return;
    }

    req.on("error", function(err){
        skip("request failed: " + describe(err));
    });
    // this is needed otherwise the req limit of 5 keeps it from continuing past 5...
    req.shouldKeepAlive = false;
    req.end();
};
/**
 * Write one row for a single entry of a parsed "courses" feed, then move on
 * to the next entry via progressChannelEntry.
 *
 * The entry's first link decides the media type ("video" or "audio" by its
 * type attribute). Entries with neither type, or without a usable link, are
 * skipped. The duration is read by probing the linked file with Metadata;
 * when no metadata comes back the entry is logged and skipped instead of
 * throwing.
 *
 * @param {Object} result - parsed feed; reads result.feed.title[0]._ for the
 *     channel title and is passed on to progressChannelEntry.
 * @param {Object} item - one element taken from result.feed.entry.
 */
var processChannelEntry = function(result, item){
    var channelTitle = result.feed.title[0]._;
    var title = item.title[0]._;
    var published = item.published;

    // Attributes of the first <link>; may be absent on malformed entries.
    var link = item.link && item.link[0] && item.link[0].$;
    var filename = link ? link.href : undefined;
    var mime = (link && link.type) || '';

    var type = '';
    if(/video/.test(mime)){
        type = "video";
    }
    // Checked second so "audio" wins if a type string mentions both.
    if(/audio/.test(mime)){
        type = "audio";
    }

    if(type === '' || !filename){
        progressChannelEntry(result);
        return;
    }

    new Metadata(
        filename,
        function(metadata, err) {
            if(!metadata){
                var reason = (err && err.message) ? err.message : String(err);
                console.log("error: no metadata for " + filename + ": " + reason);
                progressChannelEntry(result);
                return;
            }
            var duration = metadata.durationsec;
            var output = channelTitle + "\t" + title + "\t" + duration + "\t" + published + "\t" + type + "\n";
            console.log(output);
            stream.write(output);
            // go to next
            progressChannelEntry(result);
        }
    );
};
/**
 * Write one row for a single table row of an "other" channel page, then move
 * on to the next row via progressOtherEntry.
 *
 * All values are read from attributes of the row: preview-album,
 * preview-title, video-preview-url / audio-preview-url, preview-duration,
 * plus the sort-value attribute of the row's td.release-date cell.
 * Problems (unknown preview URL kind, unusable duration) are logged but the
 * row is still written.
 *
 * @param {Function} $ - jQuery bound to the channel page's window.
 * @param {Array} entries - remaining rows, passed on to progressOtherEntry.
 * @param {Object} entry - the row to process.
 */
var processOtherEntry = function($, entries, entry){
    var row = $(entry);
    var channelTitle = row.attr('preview-album');
    var title = row.attr('preview-title');
    var published = $('td.release-date', entry).attr('sort-value');
    var filename = '';
    var type = '';
    if(row.attr('video-preview-url')){
        type = 'video';
        filename = row.attr('video-preview-url');
    } else if(row.attr('audio-preview-url')){
        type = 'audio';
        filename = row.attr('audio-preview-url');
    } else {
        console.log("ERROR: unknown preview-url format: ");
        console.log(row.parent().html());
    }
    // preview-duration is in milliseconds; always parse as base 10.
    var duration = parseInt(row.attr('preview-duration'), 10) / 1000;
    // NaN (missing or non-numeric attribute) compares false against 1,
    // so it has to be tested for explicitly.
    if(isNaN(duration) || duration < 1){
        console.log("ERROR: duration no good");
    }
    var output = channelTitle + "\t" + title + "\t" + duration + "\t" + published + "\t" + type + "\n";
    console.log(output);
    stream.write(output);
    // go on to next
    progressOtherEntry($, entries);
};
/**
 * Advance within a "courses" feed: process the next remaining entry, or move
 * on to the next channel once the feed's entry list is exhausted.
 *
 * @param {Object} result - parsed feed whose result.feed.entry array is
 *     consumed with shift().
 */
var progressChannelEntry = function(result){
    var hasMore = result.feed.entry.length > 0;
    if(!hasMore){
        // Feed drained: hand control back to the channel queue.
        progress();
        return;
    }
    var nextItem = result.feed.entry.shift();
    processChannelEntry(result, nextItem);
};
/**
 * Advance within an "other" channel page: process the next remaining row, or
 * move on to the next channel once no rows are left.
 *
 * @param {Function} $ - jQuery instance handed through to processOtherEntry.
 * @param {Array} entries - remaining rows; consumed with shift().
 */
var progressOtherEntry = function($, entries){
    var hasMore = entries.length > 0;
    if(!hasMore){
        // Page drained: hand control back to the channel queue.
        progress();
        return;
    }
    var nextRow = entries.shift();
    processOtherEntry($, entries, nextRow);
};
/**
 * Start work on the next queued channel: "courses" ids are taken first, then
 * "other" ids. When both queues are empty the output stream is closed.
 */
var progress = function(){
    var kinds = ["courses", "other"];
    for(var i = 0; i < kinds.length; i++){
        var queue = channels[kinds[i]];
        if(queue.length > 0){
            dodata(kinds[i], queue.shift());
            return;
        }
    }
    // finished
    stream.end();
};
// Kick off the run with the first queued channel; each channel's processing
// calls progress() again when it is done, so channels are handled one at a time.
progress();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment