Skip to content

Instantly share code, notes, and snippets.

@grampelberg
Created September 30, 2010 22:35
Show Gist options
  • Save grampelberg/605449 to your computer and use it in GitHub Desktop.
Save grampelberg/605449 to your computer and use it in GitHub Desktop.
var couchdb = require('couchdb');
var request = require('request');
var sys = require("sys");
var jsdom = require("jsdom");
var _ = require('underscore')._;
var mustache = require('mustache');
// jsdom window that scrape.__main hands to jsdom.jQueryify.
var window = jsdom.jsdom().createWindow();
// jQuery handle; stays null until scrape.__main assigns window.jQuery to it,
// so nothing may use `$` before that callback has run.
var $ = null;
// CouchDB client for the local server on port 5984.
// NOTE(review): the credentials are hard-coded literals - presumably
// placeholders that need replacing before running; confirm.
var client = couchdb.createClient(5984, '127.0.0.1', 'username',
'password');
// Handle to the 'ted' database that every scrape method reads and writes.
var db = client.db('ted');
// Scraper state, configuration and methods, all kept on one object.
var scrape = {
// Set to true by request_page once a listed talk is already in the
// database; request_page and __theme_ids both stop paging when it is set.
stop: false,
// Guards __ordering so request_page only starts it once.
ordered: false,
// Not referenced by any method in this file.
pages: [],
// Base URL prepended to the relative hrefs found in scraped pages.
site: 'http://www.ted.com',
// Mustache template purify_page renders to build the `download.torrent`
// URLs from site, url, title and ext.
generator: 'http://tracker-01.apps.falcon.utorrent.com/url?' +
'url={{ site }}{{ url }}&name={{ title }}{{ ext }}',
// Sort orders __ordering requests through `search`; each value is also used
// as the key under a talk document's `ranking` object.
sorting: [ 'MOSTTRANSLATED', 'MOSTEMAILED', 'MOSTDISCUSSED',
'MOSTFAVORITED', 'JAW-DROPPING', 'PERSUASIVE',
'COURAGEOUS', 'INGENIOUS', 'FASCINATING', 'INSPIRING',
'BEAUTIFUL', 'FUNNY', 'INFORMATIVE' ],
// Sort orders __theme_ordering requests through `theme_search`; used as keys
// under a theme document's `ranking` object.
theme_sorting: [ 'MOSTDISCUSSED', 'NEWEST', 'MOSTEMAILED', 'MOSTTALKS' ],
// Mustache template for the talk search endpoint (rendered with id 0 by
// __ordering).
search: 'http://www.ted.com/talks/searchRpc?tagid={{ id }}&' +
'orderedby={{ sort }}',
// Mustache template for the theme search endpoint (rendered with id 0 by
// __theme_ordering).
theme_search: 'http://www.ted.com/themes/searchRpc?tagid={{ id }}&' +
'orderedby={{ sort }}',
update_speaker: function(id, link, err, resp, body) {
if (err) {
console.log('Error(' + id + ') - ' + link);
return
}
console.log('speaker(' + id +') - ' + link);
var speaker = $("h1", body).find("span").text().split(': ')[0];
if (!speaker) {
console.log('Error(' + id + ') - ' + link);
console.log('\t' + $("h1", body).find("span").text());
return
}
var categories = {};
var names = [ null, null, "speakers", "themes", "conferences" ];
_.each($(".categories", body).find("dd"), function(elem, k) {
if (!names[k])
return
categories[names[k]] = _.map($("li", elem), function(v) {
return $(v).text();
});
});
var quote = { 'text': $(".why", body).find("blockquote").text(),
'cite': $(".why", body).find("cite").text() };
var web = _.reduce(
$(".categories", body).find("dd:first").find("a"), function(acc, v) {
acc[$(v).text().split(': ')[1]] = $(v).attr("href");
return acc
}, {});
if (quote.text)
categories.quote = quote;
if (_.keys(web).length != 0)
categories.web = web;
db.getDoc(id, function(err, doc) {
doc.speaker = _.extend(categories, {
name: $(".viewTemplate", body).find("h1").find("span").text().split(
': ')[0],
caption: $(".viewTemplate", body).find("h1").find("span").text().split(
': ')[1],
descr: $("#speakerscontent", body).find("p:first").text(),
why: $(".why", body).find("p:first").html(),
images: { thumbnail: $("#speakerscontent", body).find("img").attr(
"src").split("_")[0] + '_132x99.jpg',
profile: $("#speakerscontent", body).find("img").attr("src")
},
link: link
});
db.saveDoc(id, doc, function(err, ok) {
if (err) throw new Error(JSON.stringify(err) + '\t' + id);
console.log('\tUpdated: ' + id + ' with ' + speaker);
});
});
},
purify_page: function(talk, err, resp, body) {
if (err) {
console.log('Error - ' + talk.link);
return
}
try {
var id = $(".downloads", body).find("a").attr("href").split('/').slice(
-1)[0];
} catch(err) {
console.log('Couldn\'t find ID (' + talk.link + ')');
return
}
console.log(id + ' - ' + talk.link);
var downloads = _.reduce(
$(".downloads", body).find("a"), function(acc, elem) {
var types = { "Watch high-res video (MP4)": "HD",
"Download video to desktop (MP4)": "SD",
"Audio to desktop (MP3)": "Audio only"
};
if ($(elem).text() in types)
acc[types[$(elem).text()]] = $(elem).attr('href');
return acc
}, {});
var title = $("#altHeadline", body).text().replace(/( )+$/, '');
db.saveDoc(
id,
{ _id: id,
descr: $("#tagline", body).text(),
download: {
http: _.reduce(downloads, function(acc, v, k) {
acc[k] = scrape.site + v;
return acc;
}, {}),
torrent: _.reduce(downloads, function(acc, v, k) {
var extension = { 'HD': '.mp4',
'SD': '.mp4',
'Audio only': '.mp3' };
acc[k] = mustache.to_html(scrape.generator, {
url: v, title: escape(title.replace(':', '-')),
site: scrape.site, ext: extension[k] });
return acc;
}, {})
},
duration: talk.duration,
images: { thumbnail: talk.thumbnail,
large: talk.thumbnail.split('_')[0] + '_615x461.jpg' },
languages: _.map($("#languageCode", body).find("option"), function(elem) {
return { 'code': $(elem).attr('value'),
'lang': $(elem).text() }
}),
link: talk.link,
recommendations: [
scrape.site +
$(".watchnext", body).find('a:first').attr('href')],
posted: talk.post,
speaker: $("#aboutThisTalk", body).find("h3:last").text().split(
'About ')[1],
tags: _.map($(".tags", body).find("a"), function(elem) {
return $(elem).text()
}),
themes: _.map($(".relatedThemes", body).find("a"), function(elem) {
return $(elem).text()
}),
title: title,
type: 'talk'
}, function(er, ok) {
if (er) throw new Error(JSON.stringify(er) + '\t' + id);
console.log('\tSaved: ' + talk.link);
var speaker_link = scrape.site + $("#aboutThisTalk", body).find(
"p:last").find("a:last").attr("href")
request({
uri: speaker_link
}, _.bind(scrape.update_speaker, this, id, speaker_link));
});
},
request_page: function(page) {
request(
{ uri: 'http://www.ted.com/talks/list/page/' + page },
function(err, resp, body) {
if (scrape.stop) return
var tmp = $(".talkMedallion", body).text();
if (tmp == scrape._tmp) return
scrape._tmp = tmp;
var talks = _($(".talkMedallion", body)).chain().map(function(elem) {
return { link: scrape.site +
$(elem).find('.thumbnail').find('a').attr('href'),
thumbnail: $(elem).find('.thumbnail').find('img:last'
).attr('src'),
duration: $(elem).find('.date').text().split(' ')[0],
post: new Date($(elem).find('.date').text().split(
': ')[1]).toISOString()
}
}).each(function(talk) {
db.view('app', 'links', { key: talk.link }, function(e, resp) {
if (resp.rows.length != 0) {
console.log('\tskipping: ' + talk.link);
scrape.stop = true;
// After a scrape is completed, fire up the 'ordering' which is
// going to give things like most favorited or emailed.
if (!scrape.ordered) {
scrape.ordered = true;
scrape.__ordering();
}
return
}
request({ uri: talk.link }, _.bind(scrape.purify_page, this, talk));
})
});
scrape.request_page(page + 1);
});
},
__fixup_speakers: function() {
db.view('app', 'bad-speakers', { limit: 25 }, function(e, resp) {
console.log('fixing - ' + resp.rows.length);
_.each(resp.rows, function(talk) {
talk = talk.value;
request({ uri: talk.link }, function(err, resp, body) {
var speaker_link = scrape.site + $("#aboutThisTalk", body).find(
"p:last").find("a:last").attr("href");
console.log('\t'+speaker_link);
request({ uri: speaker_link },
_.bind(scrape.update_speaker, this, talk._id, speaker_link));
});
});
});
},
__tag_ids: function() {
// This is so that there's a list of tag-name to numerical id translations
// and is then used for the searchrpc calls.
request({ uri: scrape.site + '/talks/tags' }, function(err, resp, body) {
console.log('Parsing tags...');
var tags = _.map($("#maincontent", body).find("a"), function(elem) {
return { name: $(elem).text().split(' (')[0],
id: $(elem).attr('href').split('/').slice(-1)[0],
link: scrape.site + $(elem).attr('href'),
type: 'tag' }
});
db.view('app', 'ids', { include_docs: true, key: 'tag' }, function(er, resp) {
var names = _.map(resp.rows, function(v) { return v.doc.name });
tags = _.filter(tags, function(doc) {
return names.indexOf(doc.name) == -1;
});
if (tags.length) {
client.uuids(tags.length, function(er, resp) {
_(tags).chain().zip(resp.uuids).each(function(v) {
db.saveDoc(v[0], v[1], function(er, ok) {
console.log(er, ok);
});
});
});
}
});
});
},
__theme_ids: function(page) {
// This is so that there's a list of theme-name to numerical id
// translations and then is used for the theme/searchrpc calls.
if (!page) page = 1;
console.log('page: ' + page);
request(
{ uri: scrape.site + '/themes/list/page/' + page },
function(err, resp, body) {
if (scrape.stop) return
var tmp = $(".themeMedallion", body).text();
if (tmp == scrape._tmp) return
scrape._tmp = tmp;
scrape.__theme_ids(page+1);
db.view('app', 'ids', {include_docs: true, key: 'theme'}, function(er, resp) {
var ids = resp ?
_.map(resp.rows, function(v) { return v.doc.link; }) : [];
var themes = _($(".themeMedallion", body)).chain().map(function(elem) {
return $(elem).find('a:first').attr('href');
}).filter(function(v) {
return ids.indexOf(scrape.site + v) == -1
}).each(function(link) {
console.log('\t' + link);
request({ uri: scrape.site + link }, function(err, resp, body) {
client.uuids(1, function(er, resp) {
var uuid = resp.uuids[0];
db.saveDoc(uuid, { name: $("h1", body).find("span").text(),
id: body.match(/themeId\/(\d+)/)[1],
descr: $("#themeDescription", body).html(),
link: scrape.site + link,
type: 'theme',
thumbnail: $(".about", body).find(
"div:first").find(
"img:last").attr('src')
}, function(er, ok) {
console.log('saved...');
}
);
});
});
});
});
});
},
__ordering: function() {
db.view('app', 'type', { key: 'talk', include_docs: true }, function(err, resp) {
var docs = _.map(resp.rows, function(v) {
if ('ranking' in v.doc)
delete v.doc.ranking;
return v.doc;
});
db.bulkDocs({ docs: docs }, function(er, ok) {
function category_ranking(i) {
if (i >= scrape.sorting.length) return
request({
uri: mustache.to_html(
scrape.search, { id: 0, sort: scrape.sorting[i] })},
function(err, resp, body) {
var talks = JSON.parse(body).main
_.each(talks, function(talk, k) {
db.getDoc(talk.id, function(err, doc) {
if (err) return
if (!('ranking' in doc)) doc.ranking = {};
doc.ranking[scrape.sorting[i]] = k;
db.saveDoc(doc._id, doc, function(err, ok) {
console.log('\tsaved: (' + doc._id + ') '+ doc.title);
});
});
});
category_ranking(i+1);
});
}
category_ranking(0);
});
});
},
__theme_ordering: function() {
db.view('app', 'type', { key: 'theme', include_docs: true }, function(e, resp) {
var docs = _.map(resp.rows, function(v) {
if ('ranking' in v.doc)
delete v.doc.ranking;
return v.doc;
});
db.bulkDocs({ docs: docs }, function(e, ok) {
function theme_ranking(i) {
if (i >= scrape.theme_sorting.length) return
console.log('Looking up ' + scrape.theme_sorting[i]);
console.log('\tURL: ' + mustache.to_html(
scrape.theme_search, { id: 0, sort: scrape.theme_sorting[i] }));
request({
uri: mustache.to_html(
scrape.theme_search, { id: 0, sort: scrape.theme_sorting[i] })
}, function(err, resp, body) {
try {
var themes = JSON.parse(body).main
} catch(err) {
console.log('\tProblem with ' + scrape.theme_sorting[i]);
return
}
_.each(themes, function(theme, k) {
_.each(docs, function(doc) {
if (doc.id != theme.themeId) return
db.getDoc(doc._id, function(err, doc) {
if (!('ranking' in doc)) doc.ranking = {};
doc.ranking[scrape.theme_sorting[i]] = k;
db.saveDoc(doc, function(err, ok) {
console.log('\tsaved: (' + doc._id + ') ' + doc.name);
});
});
});
});
theme_ranking(i + 1);
});
}
theme_ranking(0);
});
});
},
__fixup_images: function() {
db.view('app', 'type', { key: 'talk', include_docs: true }, function(e, resp) {
var docs = _.map(resp.rows, function(v) {
return v.doc;
});
_.each(docs, function(doc) {
_.each(doc.images, function(v, k) {
request({ uri: doc.images[k] }, function(err, resp, body) {
if (err || resp.statusCode != 404) return
db.getDoc(doc._id, function(err, doc) {
delete doc.images[k];
db.saveDoc(doc, function(err, ok) {
console.log('\tremoved: (' + doc._id + ') ' + doc.title);
});
});
});
});
});
});
},
__main: function() {
jsdom.jQueryify(window, function() {
$ = window.jQuery;
scrape.request_page(1);
scrape.__tag_ids();
scrape.__theme_ids();
scrape.__theme_ordering();
scrape.__fixup_images();
});
}
}
// Start scraping as soon as the script is loaded.
scrape.__main();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment