Gist by @jey, created December 15, 2008 01:07

# vim: ts=2 sw=2 et
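#
# Crawls www.overcomingbias.com by following the index pager's "Next" links,
# builds an Article for every post, and prints a Graphviz digraph (on stdout)
# of the links between posts by Eliezer Yudkowsky. Fetched pages are cached in
# a Marshal-dumped file named `url_cache' in the current directory.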
require 'rubygems'
require 'open-uri'
require 'hpricot'
require 'set'
require 'timeout'

# One blog post: fetches the page to extract its title and author, and can
# enumerate the post permalinks it links out to.
class Article
  attr_reader :url, :title, :author

  def initialize(url)
    @url = url
    doc = Hpricot(fetch_url(url))
    @title = (doc/'.entry-header')[0].inner_text
    @author = /Posted by (.+) at/.match((doc/'.post-footers')[0].inner_text)[1]
  end

  # Graphviz node name, unique per Article instance
  def node
    "x#{object_id}"
  end

  # every absolute URL linked from the post body
  def find_links
    result = Set.new
    doc = Hpricot(fetch_url(url))
    (doc/'.entry-content').each do |content_node|
      (content_node/'a').each do |link_node|
        next unless link_node['href']
        result.add(resolve_href(url, link_node['href'])) rescue next
      end
    end
    result
  end

  # the subset of linked URLs that look like Overcoming Bias post permalinks,
  # with a few kinds of malformed hrefs cleaned up along the way
  def article_links
    find_links.collect do |link_url|
      next unless link_url =~ %r{overcomingbias.com/\d{4}/\d{2}/(.+)\.html}
      next if $1 == 'index'
      link_url = link_url.gsub(%r{http://http//}, 'http://') # fix bad hrefs
      link_url = link_url.gsub(%r{http://$}, '')             # fix bad hrefs
      link_url = link_url.gsub(/html[#?].+$/, 'html')        # strip anchors and params
      link_url
    end.compact.to_set
  end
end

# fetch a URL, memoizing the response body in the global $url_cache
def fetch_url2(url)
  STDERR.write "fetching #{url}\n" unless $url_cache[url]
  $url_cache[url] ||= open(url).read
end

# hack to work around timeouts: retry the fetch up to three times
def fetch_url(url)
  (1..3).each do |n|
    begin
      return fetch_url2(url)
    rescue Timeout::Error
      STDERR.write "fetch attempt #{n} failed for #{url}\n"
    end
  end
  raise "failed to fetch #{url}"
end

# resolve a (possibly relative or malformed) href against the page it appeared on
def resolve_href(url, href)
  href = href.strip.gsub(/ /, '%20') # hack to fix some bad hrefs
  href = 'http://' + href if href =~ /^www\.overcomingbias\.com/ # another fix: add missing scheme
  URI.parse(url).merge(href).to_s
end

# crawl the whole blog starting from the front page
def collect_articles
  collect_articles_from('http://www.overcomingbias.com')
end

# starts at index page `url' and follows "Next >>" links to collect all post URLs
def collect_articles_from(url)
  doc = Hpricot(fetch_url(url))
  result = {}
  (doc/'.entry-header a').each do |article_link|
    article_url = resolve_href(url, article_link['href'])
    result[article_url] = Article.new(article_url)
  end
  next_link = (doc/'.pager-right a')[0]
  if next_link
    raise "error finding Next link" unless next_link.inner_text =~ /^Next/
    next_url = resolve_href(url, next_link['href'])
    result.merge!(collect_articles_from(next_url))
  end
  result
end

# escape double quotes for use inside a quoted Graphviz label
def escape(s)
  s.gsub(/"/, '\"')
end

# print a Graphviz digraph of the links between posts by Eliezer Yudkowsky
def generate_graph(articles)
  puts "digraph map {"
  articles.values.each do |article|
    next unless article.author == 'Eliezer Yudkowsky'
    puts "#{article.node} [URL=\"#{article.url}\" label=\"#{escape(article.title)}\"];"
    article.article_links.each do |link_url|
      if linked_article = articles[link_url]
        puts "#{article.node} -> #{linked_article.node};" if linked_article.author == 'Eliezer Yudkowsky'
      else
        STDERR.write "#{article.url}: missing article: #{link_url}\n"
      end
    end
  end
  puts "}"
end
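
# generate_graph above emits DOT of roughly this shape (the node ids and URL
# here are illustrative, not taken from a real crawl):
#
#   digraph map {
#     x1234 [URL="http://www.overcomingbias.com/2008/01/example.html" label="Example Post"];
#     x1234 -> x5678;
#   }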

# load the page cache, emit the graph, and persist the cache even if the crawl fails
def main
  $url_cache = Marshal.load(open('url_cache').read) rescue {}
  begin
    generate_graph(collect_articles)
  ensure
    # write through a block so the cache file is flushed and closed deterministically
    File.open('url_cache', 'w') { |f| f.write(Marshal.dump($url_cache)) }
  end
end

main
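
# Example usage, as a sketch: the filename `obgraph.rb' is arbitrary, and
# rendering the resulting graph requires Graphviz to be installed.
#   ruby obgraph.rb > map.dot
#   dot -Tsvg map.dot -o map.svg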