Created December 15, 2008 01:07
# vim: ts=2 sw=2 et
# Crawls Overcoming Bias, collects Eliezer Yudkowsky's posts, and emits a
# Graphviz digraph of the links between them on stdout.
require 'rubygems'
require 'open-uri'
require 'uri'
require 'hpricot'
require 'set'
require 'timeout'
class Article
  attr_reader :url, :title, :author

  def initialize(url)
    @url = url
    doc = Hpricot(fetch_url(url))
    @title = (doc/'.entry-header')[0].inner_text
    @author = /Posted by (.+) at/.match((doc/'.post-footers')[0].inner_text)[1]
  end

  # Graphviz node id; object_id guarantees uniqueness within a run.
  def node
    "x#{object_id}"
  end

  # All absolute URLs linked from the post body, skipping hrefs that fail
  # to resolve.
  def find_links
    result = Set.new
    doc = Hpricot(fetch_url(url))
    (doc/'.entry-content').each do |content_node|
      (content_node/'a').each do |link_node|
        next unless link_node['href']
        result.add(resolve_href(url, link_node['href'])) rescue next
      end
    end
    result
  end

  # The subset of find_links that point at other posts, normalized so they
  # match the keys produced by collect_articles_from.
  def article_links
    find_links.collect do |link_url|
      next unless link_url =~ %r{overcomingbias\.com/\d{4}/\d{2}/(.+)\.html}
      next if $1 == 'index'
      link_url = link_url.gsub(%r{http://http//}, 'http://') # fix bad hrefs
      link_url = link_url.gsub(%r{http://$}, '')             # fix bad hrefs
      link_url = link_url.gsub(/html[#?].+$/, 'html')        # strip anchors and params
      link_url
    end.compact.to_set
  end
end
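
# Illustration (hypothetical URL, not fetched here): constructing an Article
# downloads the post once and exposes its metadata:
#   a = Article.new('http://www.overcomingbias.com/2008/01/example.html')
#   a.title         # text of the .entry-header element
#   a.author        # name captured from "Posted by ... at" in .post-footers
#   a.article_links # Set of normalized post URLs linked from the body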

# Fetches a URL, memoizing the body in the global $url_cache hash.
def fetch_url2(url)
  STDERR.write "fetching #{url}\n" unless $url_cache[url]
  $url_cache[url] ||= open(url).read
end

# Hack to work around timeouts: retry up to three times before giving up.
def fetch_url(url)
  (1..3).each do |n|
    begin
      return fetch_url2(url)
    rescue Timeout::Error
      STDERR.write "fetch attempt #{n} failed for #{url}\n"
    end
  end
  raise "failed to fetch #{url}"
end

# Resolves an href against the page it appeared on, first patching up some
# malformed hrefs seen in the wild.
def resolve_href(url, href)
  href = href.strip.gsub(/ /, '%20') # hack to fix some bad hrefs
  href = 'http://' + href if href =~ /^www\.overcomingbias\.com/ # another fix
  URI.parse(url).merge(href).to_s
end
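
# URI#merge performs standard relative-reference resolution, e.g.
# (illustrative values):
#   resolve_href('http://www.overcomingbias.com/2008/01/post.html', '../02/other.html')
#   #=> "http://www.overcomingbias.com/2008/02/other.html"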

def collect_articles
  collect_articles_from('http://www.overcomingbias.com')
end

# Starts at index page `url' and follows "Next >>" links to collect all
# post URLs, returning a hash mapping each post URL to its Article.
def collect_articles_from(url)
  doc = Hpricot(fetch_url(url))
  result = {}
  (doc/'.entry-header a').each do |article_link|
    article_url = resolve_href(url, article_link['href'])
    result[article_url] = Article.new(article_url)
  end
  next_link = (doc/'.pager-right a')[0]
  if next_link
    raise "error finding Next link" unless next_link.inner_text =~ /^Next/
    next_url = resolve_href(url, next_link['href'])
    result.merge!(collect_articles_from(next_url))
  end
  result
end
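
# The returned hash has this shape (URL shown is illustrative):
#   { "http://www.overcomingbias.com/2008/01/example.html" => #<Article ...>,
#     ... }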

# Escapes double quotes for embedding in a DOT string literal.
def escape(s)
  s.gsub(/"/, '\"')
end

# Prints a Graphviz digraph of Eliezer Yudkowsky's posts to stdout, with an
# edge for every link from one of his posts to another.
def generate_graph(articles)
  puts "digraph map {"
  articles.values.each do |article|
    next unless article.author == 'Eliezer Yudkowsky'
    puts "#{article.node} [URL=\"#{article.url}\" label=\"#{escape(article.title)}\"];"
    article.article_links.each do |link_url|
      if linked_article = articles[link_url]
        puts "#{article.node} -> #{linked_article.node};" if linked_article.author == 'Eliezer Yudkowsky'
      else
        STDERR.write "#{article.url}: missing article: #{link_url}\n"
      end
    end
  end
  puts "}"
end
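
# The emitted DOT looks roughly like this (node ids come from object_id and
# so differ between runs; URL and label values are illustrative):
#   digraph map {
#     x70321 [URL="http://www.overcomingbias.com/2008/01/example.html" label="Example Post"];
#     x70321 -> x70898;
#   }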

def main
  # Load the URL cache from disk if present, so reruns skip completed fetches.
  $url_cache = Marshal.load(File.read('url_cache')) rescue {}
  begin
    generate_graph(collect_articles)
  ensure
    # Persist the cache even if the crawl died partway through.
    File.open('url_cache', 'wb') { |f| f.write(Marshal.dump($url_cache)) }
  end
end

main
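
# To render the graph, pipe the output through Graphviz (the script filename
# is whatever this gist is saved as; `dot` is the standard Graphviz renderer):
#   ruby graph.rb > map.dot
#   dot -Tsvg -o map.svg map.dot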