// Gist grampelberg/605449 -- created September 30, 2010 22:35.
// NOTE: the hosting page flagged this file as containing bidirectional Unicode
// text that may be interpreted or compiled differently than it appears. Review
// it in an editor that reveals hidden Unicode characters.
// Third-party modules used by the scraper.
var couchdb = require('couchdb');
var request = require('request');
var sys = require("sys");
var jsdom = require("jsdom");
var _ = require('underscore')._;
var mustache = require('mustache');

// Empty jsdom window; scrape.__main() injects jQuery into it.
var window = jsdom.jsdom().createWindow();
// jQuery handle. Stays null until scrape.__main() assigns window.jQuery, so
// nothing that touches `$` may run before then.
var $ = null;

// CouchDB connection; all documents are stored in the 'ted' database.
// NOTE(review): host, port and credentials are hard-coded -- 'username' and
// 'password' look like placeholders; confirm before running.
var client = couchdb.createClient(5984, '127.0.0.1', 'username',
                                  'password');
var db = client.db('ted');
var scrape = { | |
stop: false, | |
ordered: false, | |
pages: [], | |
site: 'http://www.ted.com', | |
generator: 'http://tracker-01.apps.falcon.utorrent.com/url?' + | |
'url={{ site }}{{ url }}&name={{ title }}{{ ext }}', | |
sorting: [ 'MOSTTRANSLATED', 'MOSTEMAILED', 'MOSTDISCUSSED', | |
'MOSTFAVORITED', 'JAW-DROPPING', 'PERSUASIVE', | |
'COURAGEOUS', 'INGENIOUS', 'FASCINATING', 'INSPIRING', | |
'BEAUTIFUL', 'FUNNY', 'INFORMATIVE' ], | |
theme_sorting: [ 'MOSTDISCUSSED', 'NEWEST', 'MOSTEMAILED', 'MOSTTALKS' ], | |
search: 'http://www.ted.com/talks/searchRpc?tagid={{ id }}&' + | |
'orderedby={{ sort }}', | |
theme_search: 'http://www.ted.com/themes/searchRpc?tagid={{ id }}&' + | |
'orderedby={{ sort }}', | |
update_speaker: function(id, link, err, resp, body) { | |
if (err) { | |
console.log('Error(' + id + ') - ' + link); | |
return | |
} | |
console.log('speaker(' + id +') - ' + link); | |
var speaker = $("h1", body).find("span").text().split(': ')[0]; | |
if (!speaker) { | |
console.log('Error(' + id + ') - ' + link); | |
console.log('\t' + $("h1", body).find("span").text()); | |
return | |
} | |
var categories = {}; | |
var names = [ null, null, "speakers", "themes", "conferences" ]; | |
_.each($(".categories", body).find("dd"), function(elem, k) { | |
if (!names[k]) | |
return | |
categories[names[k]] = _.map($("li", elem), function(v) { | |
return $(v).text(); | |
}); | |
}); | |
var quote = { 'text': $(".why", body).find("blockquote").text(), | |
'cite': $(".why", body).find("cite").text() }; | |
var web = _.reduce( | |
$(".categories", body).find("dd:first").find("a"), function(acc, v) { | |
acc[$(v).text().split(': ')[1]] = $(v).attr("href"); | |
return acc | |
}, {}); | |
if (quote.text) | |
categories.quote = quote; | |
if (_.keys(web).length != 0) | |
categories.web = web; | |
db.getDoc(id, function(err, doc) { | |
doc.speaker = _.extend(categories, { | |
name: $(".viewTemplate", body).find("h1").find("span").text().split( | |
': ')[0], | |
caption: $(".viewTemplate", body).find("h1").find("span").text().split( | |
': ')[1], | |
descr: $("#speakerscontent", body).find("p:first").text(), | |
why: $(".why", body).find("p:first").html(), | |
images: { thumbnail: $("#speakerscontent", body).find("img").attr( | |
"src").split("_")[0] + '_132x99.jpg', | |
profile: $("#speakerscontent", body).find("img").attr("src") | |
}, | |
link: link | |
}); | |
db.saveDoc(id, doc, function(err, ok) { | |
if (err) throw new Error(JSON.stringify(err) + '\t' + id); | |
console.log('\tUpdated: ' + id + ' with ' + speaker); | |
}); | |
}); | |
}, | |
purify_page: function(talk, err, resp, body) { | |
if (err) { | |
console.log('Error - ' + talk.link); | |
return | |
} | |
try { | |
var id = $(".downloads", body).find("a").attr("href").split('/').slice( | |
-1)[0]; | |
} catch(err) { | |
console.log('Couldn\'t find ID (' + talk.link + ')'); | |
return | |
} | |
console.log(id + ' - ' + talk.link); | |
var downloads = _.reduce( | |
$(".downloads", body).find("a"), function(acc, elem) { | |
var types = { "Watch high-res video (MP4)": "HD", | |
"Download video to desktop (MP4)": "SD", | |
"Audio to desktop (MP3)": "Audio only" | |
}; | |
if ($(elem).text() in types) | |
acc[types[$(elem).text()]] = $(elem).attr('href'); | |
return acc | |
}, {}); | |
var title = $("#altHeadline", body).text().replace(/( )+$/, ''); | |
db.saveDoc( | |
id, | |
{ _id: id, | |
descr: $("#tagline", body).text(), | |
download: { | |
http: _.reduce(downloads, function(acc, v, k) { | |
acc[k] = scrape.site + v; | |
return acc; | |
}, {}), | |
torrent: _.reduce(downloads, function(acc, v, k) { | |
var extension = { 'HD': '.mp4', | |
'SD': '.mp4', | |
'Audio only': '.mp3' }; | |
acc[k] = mustache.to_html(scrape.generator, { | |
url: v, title: escape(title.replace(':', '-')), | |
site: scrape.site, ext: extension[k] }); | |
return acc; | |
}, {}) | |
}, | |
duration: talk.duration, | |
images: { thumbnail: talk.thumbnail, | |
large: talk.thumbnail.split('_')[0] + '_615x461.jpg' }, | |
languages: _.map($("#languageCode", body).find("option"), function(elem) { | |
return { 'code': $(elem).attr('value'), | |
'lang': $(elem).text() } | |
}), | |
link: talk.link, | |
recommendations: [ | |
scrape.site + | |
$(".watchnext", body).find('a:first').attr('href')], | |
posted: talk.post, | |
speaker: $("#aboutThisTalk", body).find("h3:last").text().split( | |
'About ')[1], | |
tags: _.map($(".tags", body).find("a"), function(elem) { | |
return $(elem).text() | |
}), | |
themes: _.map($(".relatedThemes", body).find("a"), function(elem) { | |
return $(elem).text() | |
}), | |
title: title, | |
type: 'talk' | |
}, function(er, ok) { | |
if (er) throw new Error(JSON.stringify(er) + '\t' + id); | |
console.log('\tSaved: ' + talk.link); | |
var speaker_link = scrape.site + $("#aboutThisTalk", body).find( | |
"p:last").find("a:last").attr("href") | |
request({ | |
uri: speaker_link | |
}, _.bind(scrape.update_speaker, this, id, speaker_link)); | |
}); | |
}, | |
request_page: function(page) { | |
request( | |
{ uri: 'http://www.ted.com/talks/list/page/' + page }, | |
function(err, resp, body) { | |
if (scrape.stop) return | |
var tmp = $(".talkMedallion", body).text(); | |
if (tmp == scrape._tmp) return | |
scrape._tmp = tmp; | |
var talks = _($(".talkMedallion", body)).chain().map(function(elem) { | |
return { link: scrape.site + | |
$(elem).find('.thumbnail').find('a').attr('href'), | |
thumbnail: $(elem).find('.thumbnail').find('img:last' | |
).attr('src'), | |
duration: $(elem).find('.date').text().split(' ')[0], | |
post: new Date($(elem).find('.date').text().split( | |
': ')[1]).toISOString() | |
} | |
}).each(function(talk) { | |
db.view('app', 'links', { key: talk.link }, function(e, resp) { | |
if (resp.rows.length != 0) { | |
console.log('\tskipping: ' + talk.link); | |
scrape.stop = true; | |
// After a scrape is completed, fire up the 'ordering' which is | |
// going to give things like most favorited or emailed. | |
if (!scrape.ordered) { | |
scrape.ordered = true; | |
scrape.__ordering(); | |
} | |
return | |
} | |
request({ uri: talk.link }, _.bind(scrape.purify_page, this, talk)); | |
}) | |
}); | |
scrape.request_page(page + 1); | |
}); | |
}, | |
__fixup_speakers: function() { | |
db.view('app', 'bad-speakers', { limit: 25 }, function(e, resp) { | |
console.log('fixing - ' + resp.rows.length); | |
_.each(resp.rows, function(talk) { | |
talk = talk.value; | |
request({ uri: talk.link }, function(err, resp, body) { | |
var speaker_link = scrape.site + $("#aboutThisTalk", body).find( | |
"p:last").find("a:last").attr("href"); | |
console.log('\t'+speaker_link); | |
request({ uri: speaker_link }, | |
_.bind(scrape.update_speaker, this, talk._id, speaker_link)); | |
}); | |
}); | |
}); | |
}, | |
__tag_ids: function() { | |
// This is so that there's a list of tag-name to numerical id translations | |
// and is then used for the searchrpc calls. | |
request({ uri: scrape.site + '/talks/tags' }, function(err, resp, body) { | |
console.log('Parsing tags...'); | |
var tags = _.map($("#maincontent", body).find("a"), function(elem) { | |
return { name: $(elem).text().split(' (')[0], | |
id: $(elem).attr('href').split('/').slice(-1)[0], | |
link: scrape.site + $(elem).attr('href'), | |
type: 'tag' } | |
}); | |
db.view('app', 'ids', { include_docs: true, key: 'tag' }, function(er, resp) { | |
var names = _.map(resp.rows, function(v) { return v.doc.name }); | |
tags = _.filter(tags, function(doc) { | |
return names.indexOf(doc.name) == -1; | |
}); | |
if (tags.length) { | |
client.uuids(tags.length, function(er, resp) { | |
_(tags).chain().zip(resp.uuids).each(function(v) { | |
db.saveDoc(v[0], v[1], function(er, ok) { | |
console.log(er, ok); | |
}); | |
}); | |
}); | |
} | |
}); | |
}); | |
}, | |
__theme_ids: function(page) { | |
// This is so that there's a list of theme-name to numerical id | |
// translations and then is used for the theme/searchrpc calls. | |
if (!page) page = 1; | |
console.log('page: ' + page); | |
request( | |
{ uri: scrape.site + '/themes/list/page/' + page }, | |
function(err, resp, body) { | |
if (scrape.stop) return | |
var tmp = $(".themeMedallion", body).text(); | |
if (tmp == scrape._tmp) return | |
scrape._tmp = tmp; | |
scrape.__theme_ids(page+1); | |
db.view('app', 'ids', {include_docs: true, key: 'theme'}, function(er, resp) { | |
var ids = resp ? | |
_.map(resp.rows, function(v) { return v.doc.link; }) : []; | |
var themes = _($(".themeMedallion", body)).chain().map(function(elem) { | |
return $(elem).find('a:first').attr('href'); | |
}).filter(function(v) { | |
return ids.indexOf(scrape.site + v) == -1 | |
}).each(function(link) { | |
console.log('\t' + link); | |
request({ uri: scrape.site + link }, function(err, resp, body) { | |
client.uuids(1, function(er, resp) { | |
var uuid = resp.uuids[0]; | |
db.saveDoc(uuid, { name: $("h1", body).find("span").text(), | |
id: body.match(/themeId\/(\d+)/)[1], | |
descr: $("#themeDescription", body).html(), | |
link: scrape.site + link, | |
type: 'theme', | |
thumbnail: $(".about", body).find( | |
"div:first").find( | |
"img:last").attr('src') | |
}, function(er, ok) { | |
console.log('saved...'); | |
} | |
); | |
}); | |
}); | |
}); | |
}); | |
}); | |
}, | |
__ordering: function() { | |
db.view('app', 'type', { key: 'talk', include_docs: true }, function(err, resp) { | |
var docs = _.map(resp.rows, function(v) { | |
if ('ranking' in v.doc) | |
delete v.doc.ranking; | |
return v.doc; | |
}); | |
db.bulkDocs({ docs: docs }, function(er, ok) { | |
function category_ranking(i) { | |
if (i >= scrape.sorting.length) return | |
request({ | |
uri: mustache.to_html( | |
scrape.search, { id: 0, sort: scrape.sorting[i] })}, | |
function(err, resp, body) { | |
var talks = JSON.parse(body).main | |
_.each(talks, function(talk, k) { | |
db.getDoc(talk.id, function(err, doc) { | |
if (err) return | |
if (!('ranking' in doc)) doc.ranking = {}; | |
doc.ranking[scrape.sorting[i]] = k; | |
db.saveDoc(doc._id, doc, function(err, ok) { | |
console.log('\tsaved: (' + doc._id + ') '+ doc.title); | |
}); | |
}); | |
}); | |
category_ranking(i+1); | |
}); | |
} | |
category_ranking(0); | |
}); | |
}); | |
}, | |
__theme_ordering: function() { | |
db.view('app', 'type', { key: 'theme', include_docs: true }, function(e, resp) { | |
var docs = _.map(resp.rows, function(v) { | |
if ('ranking' in v.doc) | |
delete v.doc.ranking; | |
return v.doc; | |
}); | |
db.bulkDocs({ docs: docs }, function(e, ok) { | |
function theme_ranking(i) { | |
if (i >= scrape.theme_sorting.length) return | |
console.log('Looking up ' + scrape.theme_sorting[i]); | |
console.log('\tURL: ' + mustache.to_html( | |
scrape.theme_search, { id: 0, sort: scrape.theme_sorting[i] })); | |
request({ | |
uri: mustache.to_html( | |
scrape.theme_search, { id: 0, sort: scrape.theme_sorting[i] }) | |
}, function(err, resp, body) { | |
try { | |
var themes = JSON.parse(body).main | |
} catch(err) { | |
console.log('\tProblem with ' + scrape.theme_sorting[i]); | |
return | |
} | |
_.each(themes, function(theme, k) { | |
_.each(docs, function(doc) { | |
if (doc.id != theme.themeId) return | |
db.getDoc(doc._id, function(err, doc) { | |
if (!('ranking' in doc)) doc.ranking = {}; | |
doc.ranking[scrape.theme_sorting[i]] = k; | |
db.saveDoc(doc, function(err, ok) { | |
console.log('\tsaved: (' + doc._id + ') ' + doc.name); | |
}); | |
}); | |
}); | |
}); | |
theme_ranking(i + 1); | |
}); | |
} | |
theme_ranking(0); | |
}); | |
}); | |
}, | |
__fixup_images: function() { | |
db.view('app', 'type', { key: 'talk', include_docs: true }, function(e, resp) { | |
var docs = _.map(resp.rows, function(v) { | |
return v.doc; | |
}); | |
_.each(docs, function(doc) { | |
_.each(doc.images, function(v, k) { | |
request({ uri: doc.images[k] }, function(err, resp, body) { | |
if (err || resp.statusCode != 404) return | |
db.getDoc(doc._id, function(err, doc) { | |
delete doc.images[k]; | |
db.saveDoc(doc, function(err, ok) { | |
console.log('\tremoved: (' + doc._id + ') ' + doc.title); | |
}); | |
}); | |
}); | |
}); | |
}); | |
}); | |
}, | |
__main: function() { | |
jsdom.jQueryify(window, function() { | |
$ = window.jQuery; | |
scrape.request_page(1); | |
scrape.__tag_ids(); | |
scrape.__theme_ids(); | |
scrape.__theme_ordering(); | |
scrape.__fixup_images(); | |
}); | |
} | |
} | |
scrape.__main(); |
// (Hosting page footer: sign up for free to join this conversation on GitHub,
// or sign in to comment if you already have an account.)