Skip to content

Instantly share code, notes, and snippets.

@igrigorik
Created May 5, 2010 19:34
Show Gist options
  • Star 11 You must be signed in to star a gist
  • Fork 7 You must be signed in to fork a gist
  • Save igrigorik/391312 to your computer and use it in GitHub Desktop.
Save igrigorik/391312 to your computer and use it in GitHub Desktop.
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'digest/md5'
require 'fastercsv'
require 'json'
require 'curb'
require 'pp'
# Accumulates one hash per talk: 'title', 'url', plus whatever metric keys
# the PostRank response supplies for that talk.
data = []
(1..69).each do |page|
  # fetch ted talks from each page
  # Block form so the open-uri handle is closed as soon as the body is read.
  # NOTE(review): Kernel#open on a URL is kept for compatibility with the Ruby
  # version this script targets (FasterCSV era); newer Rubies need URI.open.
  html = open("http://www.ted.com/talks/list/page/#{page}") { |io| io.read }
  doc = Nokogiri.parse(html)

  # Map absolute talk URL => talk title for every link found under a <dd>.
  talks = {}
  doc.search('dd a').each do |a|
    href = a['href']
    next unless href # an anchor without a link cannot be keyed by URL
    talks["http://www.ted.com#{href}"] = a['title']
  end

  # fetch postrank metrics data
  # The API is queried with the MD5 hex digest of each URL, and the response
  # is indexed by the same digests below.
  post_body = talks.keys.map { |url| "url[]=#{Digest::MD5.hexdigest(url)}" }.join("&")
  # Curl::Easy.http_post performs the request itself; no extra #perform call
  # is needed (it would send the same request a second time).
  response = Curl::Easy.http_post('http://api.postrank.com/v2/entry/metrics?appkey=TEDdemo', post_body)
  metrics = JSON.parse(response.body_str)

  talks.each do |url, title|
    # A talk absent from the response contributes no metric columns instead
    # of aborting the whole run with a TypeError from merge(nil).
    talk_metrics = metrics[Digest::MD5.hexdigest(url)] || {}
    data.push({'title' => title, 'url' => url}.merge(talk_metrics))
  end
  puts "processed page #{page}"
end
# Write every collected row to ted.csv: Title and URL first, then one column
# per metric key, in sorted order.
FasterCSV.open("ted.csv", "w") do |csv|
  # Union of all keys seen across rows, minus the two fixed leading columns.
  metric_names = data.map { |row| row.keys }.flatten.uniq.sort - ['title', 'url']

  csv << (["Title", "URL"] + metric_names)

  data.each do |row|
    # A row that lacks a given metric is written as 0 for that column.
    metric_values = metric_names.map { |name| row[name] || 0 }
    csv << ([row['title'], row['url']] + metric_values)
  end
end
# Blog post: http://blog.postrank.com/2010/05/and-the-most-engaging-ted-talk-is/
# Google spreadsheet data: https://spreadsheets0.google.com/ccc?key=tWri7T3f4Ex6-uVU8i9-FFQ&hl=en
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment