Skip to content

Instantly share code, notes, and snippets.

@minrk
Created May 26, 2011 19:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save minrk/993842 to your computer and use it in GitHub Desktop.
Save minrk/993842 to your computer and use it in GitHub Desktop.
See http://www.xkcd.com/903 for details
#!/usr/bin/env ruby
# a script for graphing first non-italic non-parenthetical link paths on wikipedia
# per http://www.xkcd.com/903
#
require 'rubygems'
require 'mechanize'
require 'rgl/adjacency'
require 'rgl/dot'
require 'rgl/traversal'
$wikipedia = "http://en.wikipedia.org"
$random_url = $wikipedia+"/wiki/Special:Random"
$scrub = "http://en.wikipedia.org/wiki/"
# the root node for the tree:
$root = "Philosophy"
# Any of the following will work equally well, because they form a closed loop:
# Philosophy
# Existence
# Sense
# Organism
# Biology
# Natural_science
# Science
# Knowledge
# Fact
# Information
# Sequence
# Mathematics
# Quantity
# Property_(philosophy)
# Modern_philosophy
# filename for saving/graphing
$basename = 'xkcd903'
# whether to perform analysis and report after each iteration
$report = false
$endpoints = []
def graph_from_dotfile (file)
# from RGL doc examples
g = RGL::AdjacencyGraph.new
pattern = /\s*([^\"]+)[\"\s]*--[\"\s]*([^\"\[\;]+)/ # ugly but works
latest = ''
IO.foreach(file) { |line|
case line
when /^digraph/
g = RGL::DirectedAdjacencyGraph.new
pattern = /\s*([^\"]+)[\"\s]*->[\"\s]*([^\"\[\;]+)/
when pattern
g.add_edge $1.strip,$2.strip
when /(.+)\[/
latest = $1.strip.sub(/^"/,'').sub(/"$/,'')
# puts latest
when /color = red/
# puts latest
$endpoints << latest
latest = ''
else
nil
end
}
g
end
if File.exists? "#{$basename}.dot"
$graph = graph_from_dotfile("#{$basename}.dot")
else
$graph = RGL::DirectedAdjacencyGraph.new
# comment out next line to allow infinite path,
# which is safe because Science <-> Philosophy is a closed loop
# known unsafe pages include dates, e.g. 'October'
# $graph.add_vertex($root)
end
def print_distances
$distances.each do |url, dist|
puts " #{url} : #{dist}"
end
end
def walk_page(agent, page)
# walk a page until arriving at Philosophy
puts "Starting from: #{page.uri}"
name = page.uri.to_s.sub($scrub, '')
if not $endpoints.index(name)
$endpoints << name
end
while true
here = page.uri.to_s.sub($scrub, '')
nextpage = first_link(page)
there = nextpage.to_s.sub($scrub,'')
puts there
if nextpage.nil?
break
elsif $graph.has_vertex? there
$graph.add_edge(here, there)
break
end
$graph.add_edge(here, there)
page = agent.get(nextpage)
end
end
def in_paren(node)
# check whether we are inside parentheses
# this is not rigorous, it assumes use of parentheses is valid, which should be safe
previous = node.previous
prefix = ''
while previous
prefix = previous.text + prefix
previous = previous.previous
end
return prefix.count('(') > prefix.count(')')
end
def valid_link(node)
# ignore non-text links
if node.text.strip.empty?
# not a text link
return false
end
# validate href
href = node['href']
if not href.match(/^\/wiki\//)
# not a page link
return false
elsif href.match(/^\/wiki\/(Wikipedia|File|Help)\:/)
# ignore meta-pages
return false
end
# skip parentheticals
if in_paren(node)
return false
end
# validate node context
valid_container = false
parent = node.parent
while parent['id'] != 'bodyContent'
if parent.name == 'p'
valid_container = true
elsif parent.name == 'li'
valid_container = true
elsif parent.name == 'i'
# ignore italics
return false
elsif parent.name == 'table'
# don't include content in tables
return false
elsif parent.name == 'div'
# ignore divs with *any* kind of class
# formatting divs don't seem have a class
if not parent['class'].to_s.strip.empty?
return false
end
# cls = parent['class']
# if cls.match(/dablink/) or cls.match(/rellink/)
#
# return false
elsif parent.name == 'span'
# skip coordinates link
if parent['id'].to_s == 'coordinates'
return false
end
end
parent = parent.parent
end
return valid_container
end
def first_link(page)
# I bet this could be done with one clever XPATH
# loop through links in paragraphs or lists
bodyc = page.parser.xpath("//div[@id='bodyContent']")[0]
links = bodyc.xpath('//a')
links.each do |a|
href = a['href']
if href.nil?
next
else
# scrub hash-link
href = href.rstrip.split('#')[0]
end
if (not page.uri.to_s.end_with? href) and valid_link(a)
# first link that is to another wiki page, and not in parenthesis
# various contexts excluded
return $wikipedia+href
end
end
nil
end
def skip_page(page)
# Don't try some categories of pages
if $graph.has_vertex? page.uri.to_s.sub($scrub, '')
# already visited
puts "ignoring page already visited: #{page.uri}"
return true
elsif page.title.start_with? "List of"
# skip "List of ..." pages
puts "ignoring list page: #{page.uri}"
return true
elsif not page.parser.xpath("//table[@id='disambigbox']").empty?
# skip disambiguation pages
puts "ignoring disambiguation page: #{page.uri}"
return true
elsif not page.parser.xpath("//a[@href='/wiki/Wikipedia:Stub']").empty?
# skip stubs
puts "ignoring stub: #{page.uri}"
return true
else
return false
end
end
def save_dotfile(graph, fmt, name)
# to DOT format
dot = graph.to_dot_graph
dot.each_element do |element|
if element.name == 'Philosophy'
element.options['color'] = 'blue'
elsif not element.name.nil? and $endpoints.index(element.name)
# mark starting points as red
element.options['color'] = 'red'
end
end
# write the dotfile
File.open("#{name}.dot", 'w') do |f|
f << dot.to_s << "\n"
end
# draw the graph
system( "dot -T#{fmt} #{name}.dot -o #{name}.#{fmt}" )
end
def main(first_urls)
agent = Mechanize.new
# fake user-agent. Don't be a dick and abuse it
agent.user_agent_alias = 'Mac Safari'
i = 1
first_urls=first_urls.reverse
while true
# get a new random article
if first_urls.empty?
url = $random_url
else
url = first_urls.pop
if not url.start_with? 'http://'
# assume article name, not full URL
url = $scrub + url
end
end
page = agent.get url
# skip disambiguations, stubs, 'List of' pages
while skip_page(page)
page = agent.get $random_url
end
# add it to the graph
walk_page(agent, page)
save_dotfile($graph, 'svg', $basename)
if $report
# calculate and replort the distances
# (this is incredibly wasteful, because there will only be one new value
# each iteration)
rev = $graph.reverse
visitor = RGL::DFSVisitor.new(rev)
visitor.attach_distance_map
distances = []
rev.depth_first_visit('Philosophy', visitor) do |v|
if $endpoints.index(v)
d = visitor.distance_to_root(v)
distances << d
# puts "#{v} is #{d} steps from Philosophy"
end
end
avg = eval(distances.join('+')) / (1.0*distances.length)
puts "Found #{$endpoints.length} pages, averaging #{avg} links from Philosophy"
else
puts "visited #{$graph.count} sites, from #{$endpoints.count} starting pages, and found #{$graph.cycles.count} cycles"
# wait a second between walks, so we don't piss off Wikipedia
sleep rand
end
end
end
main(ARGV)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment