Created: May 26, 2011, 19:18
-
-
Save minrk/993842 to your computer and use it in GitHub Desktop.
See http://www.xkcd.com/903 for details
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# a script for graphing first non-italic non-parenthetical link paths on wikipedia | |
# per http://www.xkcd.com/903 | |
# | |
require 'rubygems' | |
require 'mechanize' | |
require 'rgl/adjacency' | |
require 'rgl/dot' | |
require 'rgl/traversal' | |
$wikipedia = "http://en.wikipedia.org" | |
$random_url = $wikipedia+"/wiki/Special:Random" | |
$scrub = "http://en.wikipedia.org/wiki/" | |
# the root node for the tree: | |
$root = "Philosophy" | |
# Any of the following will work equally well, because they form a closed loop: | |
# Philosophy | |
# Existence | |
# Sense | |
# Organism | |
# Biology | |
# Natural_science | |
# Science | |
# Knowledge | |
# Fact | |
# Information | |
# Sequence | |
# Mathematics | |
# Quantity | |
# Property_(philosophy) | |
# Modern_philosophy | |
# filename for saving/graphing | |
$basename = 'xkcd903' | |
# whether to perform analysis and report after each iteration | |
$report = false | |
$endpoints = [] | |
def graph_from_dotfile (file) | |
# from RGL doc examples | |
g = RGL::AdjacencyGraph.new | |
pattern = /\s*([^\"]+)[\"\s]*--[\"\s]*([^\"\[\;]+)/ # ugly but works | |
latest = '' | |
IO.foreach(file) { |line| | |
case line | |
when /^digraph/ | |
g = RGL::DirectedAdjacencyGraph.new | |
pattern = /\s*([^\"]+)[\"\s]*->[\"\s]*([^\"\[\;]+)/ | |
when pattern | |
g.add_edge $1.strip,$2.strip | |
when /(.+)\[/ | |
latest = $1.strip.sub(/^"/,'').sub(/"$/,'') | |
# puts latest | |
when /color = red/ | |
# puts latest | |
$endpoints << latest | |
latest = '' | |
else | |
nil | |
end | |
} | |
g | |
end | |
if File.exists? "#{$basename}.dot" | |
$graph = graph_from_dotfile("#{$basename}.dot") | |
else | |
$graph = RGL::DirectedAdjacencyGraph.new | |
# comment out next line to allow infinite path, | |
# which is safe because Science <-> Philosophy is a closed loop | |
# known unsafe pages include dates, e.g. 'October' | |
# $graph.add_vertex($root) | |
end | |
def print_distances | |
$distances.each do |url, dist| | |
puts " #{url} : #{dist}" | |
end | |
end | |
def walk_page(agent, page) | |
# walk a page until arriving at Philosophy | |
puts "Starting from: #{page.uri}" | |
name = page.uri.to_s.sub($scrub, '') | |
if not $endpoints.index(name) | |
$endpoints << name | |
end | |
while true | |
here = page.uri.to_s.sub($scrub, '') | |
nextpage = first_link(page) | |
there = nextpage.to_s.sub($scrub,'') | |
puts there | |
if nextpage.nil? | |
break | |
elsif $graph.has_vertex? there | |
$graph.add_edge(here, there) | |
break | |
end | |
$graph.add_edge(here, there) | |
page = agent.get(nextpage) | |
end | |
end | |
def in_paren(node) | |
# check whether we are inside parentheses | |
# this is not rigorous, it assumes use of parentheses is valid, which should be safe | |
previous = node.previous | |
prefix = '' | |
while previous | |
prefix = previous.text + prefix | |
previous = previous.previous | |
end | |
return prefix.count('(') > prefix.count(')') | |
end | |
def valid_link(node) | |
# ignore non-text links | |
if node.text.strip.empty? | |
# not a text link | |
return false | |
end | |
# validate href | |
href = node['href'] | |
if not href.match(/^\/wiki\//) | |
# not a page link | |
return false | |
elsif href.match(/^\/wiki\/(Wikipedia|File|Help)\:/) | |
# ignore meta-pages | |
return false | |
end | |
# skip parentheticals | |
if in_paren(node) | |
return false | |
end | |
# validate node context | |
valid_container = false | |
parent = node.parent | |
while parent['id'] != 'bodyContent' | |
if parent.name == 'p' | |
valid_container = true | |
elsif parent.name == 'li' | |
valid_container = true | |
elsif parent.name == 'i' | |
# ignore italics | |
return false | |
elsif parent.name == 'table' | |
# don't include content in tables | |
return false | |
elsif parent.name == 'div' | |
# ignore divs with *any* kind of class | |
# formatting divs don't seem have a class | |
if not parent['class'].to_s.strip.empty? | |
return false | |
end | |
# cls = parent['class'] | |
# if cls.match(/dablink/) or cls.match(/rellink/) | |
# | |
# return false | |
elsif parent.name == 'span' | |
# skip coordinates link | |
if parent['id'].to_s == 'coordinates' | |
return false | |
end | |
end | |
parent = parent.parent | |
end | |
return valid_container | |
end | |
def first_link(page) | |
# I bet this could be done with one clever XPATH | |
# loop through links in paragraphs or lists | |
bodyc = page.parser.xpath("//div[@id='bodyContent']")[0] | |
links = bodyc.xpath('//a') | |
links.each do |a| | |
href = a['href'] | |
if href.nil? | |
next | |
else | |
# scrub hash-link | |
href = href.rstrip.split('#')[0] | |
end | |
if (not page.uri.to_s.end_with? href) and valid_link(a) | |
# first link that is to another wiki page, and not in parenthesis | |
# various contexts excluded | |
return $wikipedia+href | |
end | |
end | |
nil | |
end | |
def skip_page(page) | |
# Don't try some categories of pages | |
if $graph.has_vertex? page.uri.to_s.sub($scrub, '') | |
# already visited | |
puts "ignoring page already visited: #{page.uri}" | |
return true | |
elsif page.title.start_with? "List of" | |
# skip "List of ..." pages | |
puts "ignoring list page: #{page.uri}" | |
return true | |
elsif not page.parser.xpath("//table[@id='disambigbox']").empty? | |
# skip disambiguation pages | |
puts "ignoring disambiguation page: #{page.uri}" | |
return true | |
elsif not page.parser.xpath("//a[@href='/wiki/Wikipedia:Stub']").empty? | |
# skip stubs | |
puts "ignoring stub: #{page.uri}" | |
return true | |
else | |
return false | |
end | |
end | |
def save_dotfile(graph, fmt, name) | |
# to DOT format | |
dot = graph.to_dot_graph | |
dot.each_element do |element| | |
if element.name == 'Philosophy' | |
element.options['color'] = 'blue' | |
elsif not element.name.nil? and $endpoints.index(element.name) | |
# mark starting points as red | |
element.options['color'] = 'red' | |
end | |
end | |
# write the dotfile | |
File.open("#{name}.dot", 'w') do |f| | |
f << dot.to_s << "\n" | |
end | |
# draw the graph | |
system( "dot -T#{fmt} #{name}.dot -o #{name}.#{fmt}" ) | |
end | |
def main(first_urls) | |
agent = Mechanize.new | |
# fake user-agent. Don't be a dick and abuse it | |
agent.user_agent_alias = 'Mac Safari' | |
i = 1 | |
first_urls=first_urls.reverse | |
while true | |
# get a new random article | |
if first_urls.empty? | |
url = $random_url | |
else | |
url = first_urls.pop | |
if not url.start_with? 'http://' | |
# assume article name, not full URL | |
url = $scrub + url | |
end | |
end | |
page = agent.get url | |
# skip disambiguations, stubs, 'List of' pages | |
while skip_page(page) | |
page = agent.get $random_url | |
end | |
# add it to the graph | |
walk_page(agent, page) | |
save_dotfile($graph, 'svg', $basename) | |
if $report | |
# calculate and replort the distances | |
# (this is incredibly wasteful, because there will only be one new value | |
# each iteration) | |
rev = $graph.reverse | |
visitor = RGL::DFSVisitor.new(rev) | |
visitor.attach_distance_map | |
distances = [] | |
rev.depth_first_visit('Philosophy', visitor) do |v| | |
if $endpoints.index(v) | |
d = visitor.distance_to_root(v) | |
distances << d | |
# puts "#{v} is #{d} steps from Philosophy" | |
end | |
end | |
avg = eval(distances.join('+')) / (1.0*distances.length) | |
puts "Found #{$endpoints.length} pages, averaging #{avg} links from Philosophy" | |
else | |
puts "visited #{$graph.count} sites, from #{$endpoints.count} starting pages, and found #{$graph.cycles.count} cycles" | |
# wait a second between walks, so we don't piss off Wikipedia | |
sleep rand | |
end | |
end | |
end | |
main(ARGV) | |
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.