Gist by @jey, created December 15, 2008 01:07

# vim: ts=2 sw=2 et
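#
# Crawls www.overcomingbias.com by following the index pager's "Next" links,
# builds an Article for every post, and prints a Graphviz digraph (on stdout)
# of the links between posts by Eliezer Yudkowsky. Fetched pages are cached in
# a Marshal-dumped file named `url_cache' in the current directory.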
require 'rubygems'
require 'open-uri'
require 'hpricot'
require 'set'
require 'timeout'

# One blog post: fetches the page to extract its title and author, and can
# enumerate the post permalinks it links out to.
class Article
  attr_reader :url, :title, :author

  def initialize(url)
    @url = url
    doc = Hpricot(fetch_url(url))
    @title = (doc/'.entry-header')[0].inner_text
    @author = /Posted by (.+) at/.match((doc/'.post-footers')[0].inner_text)[1]
  end

  # Graphviz node name, unique per Article instance
  def node
    "x#{object_id}"
  end

  # every absolute URL linked from the post body
  def find_links
    result = Set.new
    doc = Hpricot(fetch_url(url))
    (doc/'.entry-content').each do |content_node|
      (content_node/'a').each do |link_node|
        next unless link_node['href']
        result.add(resolve_href(url, link_node['href'])) rescue next
      end
    end
    result
  end

  # the subset of linked URLs that look like Overcoming Bias post permalinks,
  # with a few kinds of malformed hrefs cleaned up along the way
  def article_links
    find_links.collect do |link_url|
      next unless link_url =~ %r{overcomingbias.com/\d{4}/\d{2}/(.+)\.html}
      next if $1 == 'index'
      link_url = link_url.gsub(%r{http://http//}, 'http://') # fix bad hrefs
      link_url = link_url.gsub(%r{http://$}, '')             # fix bad hrefs
      link_url = link_url.gsub(/html[#?].+$/, 'html')        # strip anchors and params
      link_url
    end.compact.to_set
  end
end

# fetch a URL, memoizing the response body in the global $url_cache
def fetch_url2(url)
  STDERR.write "fetching #{url}\n" unless $url_cache[url]
  $url_cache[url] ||= open(url).read
end

# hack to work around timeouts: retry the fetch up to three times
def fetch_url(url)
  (1..3).each do |n|
    begin
      return fetch_url2(url)
    rescue Timeout::Error
      STDERR.write "fetch attempt #{n} failed for #{url}\n"
    end
  end
  raise "failed to fetch #{url}"
end

# resolve a (possibly relative or malformed) href against the page it appeared on
def resolve_href(url, href)
  href = href.strip.gsub(/ /, '%20') # hack to fix some bad hrefs
  href = 'http://' + href if href =~ /^www\.overcomingbias\.com/ # another fix: add missing scheme
  URI.parse(url).merge(href).to_s
end

# crawl the whole blog starting from the front page
def collect_articles
  collect_articles_from('http://www.overcomingbias.com')
end

# starts at index page `url' and follows "Next >>" links to collect all post URLs
def collect_articles_from(url)
  doc = Hpricot(fetch_url(url))
  result = {}
  (doc/'.entry-header a').each do |article_link|
    article_url = resolve_href(url, article_link['href'])
    result[article_url] = Article.new(article_url)
  end
  next_link = (doc/'.pager-right a')[0]
  if next_link
    raise "error finding Next link" unless next_link.inner_text =~ /^Next/
    next_url = resolve_href(url, next_link['href'])
    result.merge!(collect_articles_from(next_url))
  end
  result
end

# escape double quotes for use inside a quoted Graphviz label
def escape(s)
  s.gsub(/"/, '\"')
end

# print a Graphviz digraph of the links between posts by Eliezer Yudkowsky
def generate_graph(articles)
  puts "digraph map {"
  articles.values.each do |article|
    next unless article.author == 'Eliezer Yudkowsky'
    puts "#{article.node} [URL=\"#{article.url}\" label=\"#{escape(article.title)}\"];"
    article.article_links.each do |link_url|
      if linked_article = articles[link_url]
        puts "#{article.node} -> #{linked_article.node};" if linked_article.author == 'Eliezer Yudkowsky'
      else
        STDERR.write "#{article.url}: missing article: #{link_url}\n"
      end
    end
  end
  puts "}"
end
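
# generate_graph above emits DOT of roughly this shape (the node ids and URL
# here are illustrative, not taken from a real crawl):
#
#   digraph map {
#     x1234 [URL="http://www.overcomingbias.com/2008/01/example.html" label="Example Post"];
#     x1234 -> x5678;
#   }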

# load the page cache, emit the graph, and persist the cache even if the crawl fails
def main
  $url_cache = Marshal.load(open('url_cache').read) rescue {}
  begin
    generate_graph(collect_articles)
  ensure
    # write through a block so the cache file is flushed and closed deterministically
    File.open('url_cache', 'w') { |f| f.write(Marshal.dump($url_cache)) }
  end
end

main
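
# Example usage, as a sketch: the filename `obgraph.rb' is arbitrary, and
# rendering the resulting graph requires Graphviz to be installed.
#   ruby obgraph.rb > map.dot
#   dot -Tsvg map.dot -o map.svg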