Created
May 2, 2014 16:18
-
-
Save jazahn/2821faf2fd5576558695 to your computer and use it in GitHub Desktop.
iTunesU data collection
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Core and third-party modules used by the collector.
var http = require("http");
var https = require("https");
var parseString = require('xml2js').parseString;
var Metadata = require('fluent-ffmpeg').Metadata;
var fs = require("fs");
// Every collected row is appended to this file. Despite the .csv name the
// rows written below are tab-separated.
var stream = fs.createWriteStream("itunesu.csv");
var jsdom = require("jsdom");
// Work queue of channel ids, grouped by how they are fetched:
//   courses - requested through the courses feed URL and parsed as XML
//   other   - requested through the store URL and scraped from the HTML page
// progress() consumes both lists with shift(), so they are emptied as the run advances.
var channels = {courses: [819616149, 512201207, 571347835, 566636297, 786312091, 624655973, 613647256, 502492375,
    529181544, 670536578, 670536610, 670535516, 625966490, 670536607, 670535213, 670662986, 625902714],
    other: [411903143, 379063375, 379063381, 591098929, 567747501, 379063801, 587128452, 472446131, 399283453, 400741408,
    714329509, 825877930, 673110189, 518318060, 518313357, 413553848, 592788196, 561512608, 623829264, 263483352, 398045010,
    401164502, 401003463, 479663762, 417418575, 430450895, 545327510, 379063752, 379063759, 578784220, 515364871, 436371984,
    478087810, 395952634, 453738055, 591462156, 465084893, 587134365, 431237603, 575900407, 521116421, 490169377, 439654236,
    453238964, 444745048, 463528970, 671010496, 815668660, 434828355, 596136152, 434703168, 571645977, 379064741, 584028745,
    379064174, 379064160, 479309361, 605400671, 533433841, 615412484, 825748928, 670965868, 573237224, 410612813,
    429705849, 519683367, 415512057, 671078494, 561949070, 670983664, 587370423, 501794769, 587557773, 526156815, 411429327,
    439300531, 578776681, 531461319, 524412107, 440723882, 521166717, 480999153, 379064095, 467626953, 502637008, 578778015,
    578779029, 536305743, 631354973, 684872322, 718405737, 611185098, 780886169, 507487820, 569167537, 399227991, 432423871,
    526404628, 526114704, 487252557, 474569709, 497804027, 403603467, 501192024, 489312990, 422641216, 432602394, 444271442,
    399792369, 570216090, 575656854, 510666384, 433483802, 446399851, 614579847, 444002672, 479215163, 493052750, 643305020,
    424040892, 474584559, 379063371, 457060447, 578690626, 394751011, 393343331, 418598278, 432140237, 379064041, 514573600,
    425669026, 414024198, 583974449, 475836756]};
// Presumably a shortened list for quick trial runs; swap it in for the declaration above:
//var channels = {courses: [], other: [411903143, 379063375, 379063381]};
/**
 * Build the entry-point URL for a channel.
 *
 * @param {string} type - Channel kind: 'courses' or 'other'.
 * @param {number|string} channel - Channel id; concatenated into the URL as-is.
 * @returns {string|undefined} The URL, or undefined (after logging an error)
 *     when the type is not recognised.
 */
var url = function(type, channel){
    switch(type){
        case 'courses':
            return "https://itunesu.itunes.apple.com/WebObjects/LZDirectory.woa/ra/directory/courses/"+channel+"/feed";
        case 'other':
            return "http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewPodcast?id="+channel;
        default:
            console.log("error: invalid channel type");
            return undefined;
    }
};
/**
 * Fetch a single channel and hand its entries to the matching processor.
 *
 * "courses": request the feed URL, follow the address in the Location header,
 * parse the XML body and start processChannelEntry on the first entry.
 * "other": request the store URL, load the address in the Location header with
 * jsdom + jQuery and start processOtherEntry on the first matching table row.
 *
 * Any failure (request error, missing redirect, unparsable or empty result) is
 * logged and the queue is advanced with progress(), so one bad channel does
 * not stop the run.
 *
 * @param {string} channelType - "courses" or "other".
 * @param {number|string} channel - Channel id to fetch.
 */
var dodata = function(channelType, channel){
    var target = url(channelType, channel);
    console.log(target);
    if(!target){
        // url() has already logged the invalid type; there is nothing to request.
        return;
    }

    // Set once the channel has either been handed to a processor or skipped,
    // so a late error event cannot advance the queue a second time.
    var settled = false;
    var reasonOf = function(err){
        return err && err.message ? err.message : String(err);
    };
    var skip = function(reason){
        if(settled){
            return;
        }
        settled = true;
        console.log("ERROR: skipping " + channelType + " channel " + channel + ": " + reason);
        progress();
    };

    var req;
    if(channelType === "courses"){
        req = https.request(target, function(response){
            // Only the Location header is used; drain the body so the socket is released.
            response.resume();
            var location = response.headers.location;
            console.log("redirect: " + location);
            if(!location){
                skip("no redirect location in response");
                return;
            }
            https.get(location, function(feedResponse){
                var body = '';
                feedResponse.on("data", function(chunk){
                    body += chunk;
                });
                feedResponse.on("error", function(err){
                    skip(reasonOf(err));
                });
                feedResponse.on("end", function(){
                    parseString(body, function(err, result){
                        if(err || !result || !result.feed){
                            skip(err ? reasonOf(err) : "unexpected feed structure");
                            return;
                        }
                        var entries = result.feed.entry;
                        if(!entries || entries.length === 0){
                            skip("feed has no entries");
                            return;
                        }
                        if(settled){
                            return;
                        }
                        settled = true;
                        processChannelEntry(result, entries.shift());
                    });
                });
            }).on("error", function(err){
                skip(reasonOf(err));
            });
        });
    } else {
        // url() returned a value, so the only remaining type is "other".
        req = http.request(target, function(response){
            response.resume();
            var location = response.headers.location;
            console.log("redirect: " + location);
            if(!location){
                skip("no redirect location in response");
                return;
            }
            jsdom.env(location, function(errors, window){
                if(!window){
                    skip("page could not be loaded");
                    return;
                }
                // Kept local: $ is passed explicitly to processOtherEntry.
                var $ = require("jquery")(window);
                var rows = $('tr[kind=movie], tr[audio-preview-url]').toArray();
                if(rows.length === 0){
                    skip("page has no media rows");
                    return;
                }
                if(settled){
                    return;
                }
                settled = true;
                processOtherEntry($, rows, rows.shift());
            });
        });
    }

    req.on("error", function(err){
        skip(reasonOf(err));
    });
    // this is needed otherwise the req limit of 5 keeps it from continuing past 5...
    req.shouldKeepAlive = false;
    req.end();
};
/**
 * Write one row for a feed entry, then move on to the next entry.
 *
 * The entry's first link decides the media type ("video" or "audio", from its
 * MIME type). Entries that are neither are skipped without writing a row. The
 * duration is read from the linked file through Metadata; when that fails the
 * error is logged and the row is written with an empty duration.
 *
 * @param {Object} result - Parsed feed; result.feed.title[0]._ is the channel title.
 * @param {Object} item - One element taken from result.feed.entry.
 */
var processChannelEntry = function(result, item){
    var channelTitle = result.feed.title[0]._;
    var title = item.title[0]._;
    var published = item.published;

    var link = item.link && item.link[0] && item.link[0].$;
    if(!link){
        console.log("ERROR: entry has no link: " + title);
        progressChannelEntry(result);
        return;
    }
    var filename = link.href;
    var mime = link.type || '';

    var type = '';
    if(/video/.test(mime)){
        type = "video";
    }
    if(/audio/.test(mime)){
        type = "audio";
    }
    if(type === ''){
        // Not a media entry: nothing to record.
        progressChannelEntry(result);
        return;
    }

    new Metadata(
        filename,
        function(metadata, err) {
            if(err || !metadata){
                console.log("ERROR: could not read metadata for " + filename + (err ? ": " + err : ""));
            }
            var duration = metadata ? metadata.durationsec : '';
            var output = channelTitle + "\t" + title + "\t" + duration + "\t" + published + "\t" + type + "\n";
            console.log(output);
            stream.write(output);
            // go to next
            progressChannelEntry(result);
        }
    );
};
/**
 * Write one row for a scraped table row, then move on to the next one.
 *
 * All values come from attributes on the row: preview-album, preview-title,
 * video-preview-url / audio-preview-url and preview-duration. The publish date
 * is the sort-value attribute of the row's td.release-date cell. Problems are
 * logged but the row is still written.
 *
 * @param {Function} $ - jQuery bound to the scraped page.
 * @param {Array} entries - Rows still to be processed after this one.
 * @param {Object} entry - The row to process.
 */
var processOtherEntry = function($, entries, entry){
    var row = $(entry);
    var channelTitle = row.attr('preview-album');
    var title = row.attr('preview-title');
    var published = $('td.release-date', entry).attr('sort-value');
    var type = '';

    if(row.attr('video-preview-url')){
        type = 'video';
    } else if(row.attr('audio-preview-url')){
        type = 'audio';
    } else {
        console.log("ERROR: unknown preview-url format: ");
        console.log(row.parent().html());
    }

    // duration is given in milliseconds
    var duration = parseInt(row.attr('preview-duration'), 10) / 1000;
    // NaN compares false against everything, so it needs its own check.
    if(isNaN(duration) || duration < 1){
        console.log("ERROR: duration no good");
    }

    var output = channelTitle + "\t" + title + "\t" + duration + "\t" + published + "\t" + type + "\n";
    console.log(output);
    stream.write(output);
    // go on to next
    progressOtherEntry($, entries);
};
/**
 * Process the next entry of a feed, or advance to the next channel when the
 * feed has no entries left (or never had an entry list).
 *
 * @param {Object} result - Parsed feed; result.feed.entry is consumed with shift().
 */
var progressChannelEntry = function(result){
    var remaining = result && result.feed && result.feed.entry;
    if(remaining && remaining.length > 0){
        processChannelEntry(result, remaining.shift());
    } else {
        progress();
    }
};
/**
 * Process the next scraped row, or advance to the next channel when no rows
 * are left.
 *
 * @param {Function} $ - jQuery bound to the scraped page; passed through unchanged.
 * @param {Array} entries - Remaining rows; consumed with shift().
 */
var progressOtherEntry = function($, entries){
    if(entries && entries.length > 0){
        processOtherEntry($, entries, entries.shift());
    } else {
        progress();
    }
};
/**
 * Start the next queued channel: all "courses" ids first, then the "other"
 * ids. When both lists in `channels` are empty the output stream is closed.
 * Each call removes the id it starts from its list.
 */
var progress = function(){
    if(channels.courses.length > 0){
        dodata("courses", channels.courses.shift());
    } else if(channels.other.length > 0){
        dodata("other", channels.other.shift());
    } else {
        // finished
        stream.end();
    }
};
// Kick off the run with the first queued channel.
progress();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment