Created: May 26, 2011, 19:18
-
-
Save minrk/993842 to your computer and use it in GitHub Desktop.
See http://www.xkcd.com/903 for details
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# a script for graphing first non-italic non-parenthetical link paths on wikipedia | |
# per http://www.xkcd.com/903 | |
# | |
require 'rubygems' | |
require 'mechanize' | |
require 'rgl/adjacency' | |
require 'rgl/dot' | |
require 'rgl/traversal' | |
$wikipedia = "http://en.wikipedia.org" | |
$random_url = $wikipedia+"/wiki/Special:Random" | |
$scrub = "http://en.wikipedia.org/wiki/" | |
# the root node for the tree: | |
$root = "Philosophy" | |
# Any of the following will work equally well, because they form a closed loop: | |
# Philosophy | |
# Existence | |
# Sense | |
# Organism | |
# Biology | |
# Natural_science | |
# Science | |
# Knowledge | |
# Fact | |
# Information | |
# Sequence | |
# Mathematics | |
# Quantity | |
# Property_(philosophy) | |
# Modern_philosophy | |
# filename for saving/graphing | |
$basename = 'xkcd903' | |
# whether to perform analysis and report after each iteration | |
$report = false | |
$endpoints = [] | |
def graph_from_dotfile (file) | |
# from RGL doc examples | |
g = RGL::AdjacencyGraph.new | |
pattern = /\s*([^\"]+)[\"\s]*--[\"\s]*([^\"\[\;]+)/ # ugly but works | |
latest = '' | |
IO.foreach(file) { |line| | |
case line | |
when /^digraph/ | |
g = RGL::DirectedAdjacencyGraph.new | |
pattern = /\s*([^\"]+)[\"\s]*->[\"\s]*([^\"\[\;]+)/ | |
when pattern | |
g.add_edge $1.strip,$2.strip | |
when /(.+)\[/ | |
latest = $1.strip.sub(/^"/,'').sub(/"$/,'') | |
# puts latest | |
when /color = red/ | |
# puts latest | |
$endpoints << latest | |
latest = '' | |
else | |
nil | |
end | |
} | |
g | |
end | |
if File.exists? "#{$basename}.dot" | |
$graph = graph_from_dotfile("#{$basename}.dot") | |
else | |
$graph = RGL::DirectedAdjacencyGraph.new | |
# comment out next line to allow infinite path, | |
# which is safe because Science <-> Philosophy is a closed loop | |
# known unsafe pages include dates, e.g. 'October' | |
# $graph.add_vertex($root) | |
end | |
def print_distances | |
$distances.each do |url, dist| | |
puts " #{url} : #{dist}" | |
end | |
end | |
def walk_page(agent, page) | |
# walk a page until arriving at Philosophy | |
puts "Starting from: #{page.uri}" | |
name = page.uri.to_s.sub($scrub, '') | |
if not $endpoints.index(name) | |
$endpoints << name | |
end | |
while true | |
here = page.uri.to_s.sub($scrub, '') | |
nextpage = first_link(page) | |
there = nextpage.to_s.sub($scrub,'') | |
puts there | |
if nextpage.nil? | |
break | |
elsif $graph.has_vertex? there | |
$graph.add_edge(here, there) | |
break | |
end | |
$graph.add_edge(here, there) | |
page = agent.get(nextpage) | |
end | |
end | |
def in_paren(node) | |
# check whether we are inside parentheses | |
# this is not rigorous, it assumes use of parentheses is valid, which should be safe | |
previous = node.previous | |
prefix = '' | |
while previous | |
prefix = previous.text + prefix | |
previous = previous.previous | |
end | |
return prefix.count('(') > prefix.count(')') | |
end | |
def valid_link(node) | |
# ignore non-text links | |
if node.text.strip.empty? | |
# not a text link | |
return false | |
end | |
# validate href | |
href = node['href'] | |
if not href.match(/^\/wiki\//) | |
# not a page link | |
return false | |
elsif href.match(/^\/wiki\/(Wikipedia|File|Help)\:/) | |
# ignore meta-pages | |
return false | |
end | |
# skip parentheticals | |
if in_paren(node) | |
return false | |
end | |
# validate node context | |
valid_container = false | |
parent = node.parent | |
while parent['id'] != 'bodyContent' | |
if parent.name == 'p' | |
valid_container = true | |
elsif parent.name == 'li' | |
valid_container = true | |
elsif parent.name == 'i' | |
# ignore italics | |
return false | |
elsif parent.name == 'table' | |
# don't include content in tables | |
return false | |
elsif parent.name == 'div' | |
# ignore divs with *any* kind of class | |
# formatting divs don't seem have a class | |
if not parent['class'].to_s.strip.empty? | |
return false | |
end | |
# cls = parent['class'] | |
# if cls.match(/dablink/) or cls.match(/rellink/) | |
# | |
# return false | |
elsif parent.name == 'span' | |
# skip coordinates link | |
if parent['id'].to_s == 'coordinates' | |
return false | |
end | |
end | |
parent = parent.parent | |
end | |
return valid_container | |
end | |
def first_link(page) | |
# I bet this could be done with one clever XPATH | |
# loop through links in paragraphs or lists | |
bodyc = page.parser.xpath("//div[@id='bodyContent']")[0] | |
links = bodyc.xpath('//a') | |
links.each do |a| | |
href = a['href'] | |
if href.nil? | |
next | |
else | |
# scrub hash-link | |
href = href.rstrip.split('#')[0] | |
end | |
if (not page.uri.to_s.end_with? href) and valid_link(a) | |
# first link that is to another wiki page, and not in parenthesis | |
# various contexts excluded | |
return $wikipedia+href | |
end | |
end | |
nil | |
end | |
def skip_page(page) | |
# Don't try some categories of pages | |
if $graph.has_vertex? page.uri.to_s.sub($scrub, '') | |
# already visited | |
puts "ignoring page already visited: #{page.uri}" | |
return true | |
elsif page.title.start_with? "List of" | |
# skip "List of ..." pages | |
puts "ignoring list page: #{page.uri}" | |
return true | |
elsif not page.parser.xpath("//table[@id='disambigbox']").empty? | |
# skip disambiguation pages | |
puts "ignoring disambiguation page: #{page.uri}" | |
return true | |
elsif not page.parser.xpath("//a[@href='/wiki/Wikipedia:Stub']").empty? | |
# skip stubs | |
puts "ignoring stub: #{page.uri}" | |
return true | |
else | |
return false | |
end | |
end | |
def save_dotfile(graph, fmt, name) | |
# to DOT format | |
dot = graph.to_dot_graph | |
dot.each_element do |element| | |
if element.name == 'Philosophy' | |
element.options['color'] = 'blue' | |
elsif not element.name.nil? and $endpoints.index(element.name) | |
# mark starting points as red | |
element.options['color'] = 'red' | |
end | |
end | |
# write the dotfile | |
File.open("#{name}.dot", 'w') do |f| | |
f << dot.to_s << "\n" | |
end | |
# draw the graph | |
system( "dot -T#{fmt} #{name}.dot -o #{name}.#{fmt}" ) | |
end | |
def main(first_urls) | |
agent = Mechanize.new | |
# fake user-agent. Don't be a dick and abuse it | |
agent.user_agent_alias = 'Mac Safari' | |
i = 1 | |
first_urls=first_urls.reverse | |
while true | |
# get a new random article | |
if first_urls.empty? | |
url = $random_url | |
else | |
url = first_urls.pop | |
if not url.start_with? 'http://' | |
# assume article name, not full URL | |
url = $scrub + url | |
end | |
end | |
page = agent.get url | |
# skip disambiguations, stubs, 'List of' pages | |
while skip_page(page) | |
page = agent.get $random_url | |
end | |
# add it to the graph | |
walk_page(agent, page) | |
save_dotfile($graph, 'svg', $basename) | |
if $report | |
# calculate and replort the distances | |
# (this is incredibly wasteful, because there will only be one new value | |
# each iteration) | |
rev = $graph.reverse | |
visitor = RGL::DFSVisitor.new(rev) | |
visitor.attach_distance_map | |
distances = [] | |
rev.depth_first_visit('Philosophy', visitor) do |v| | |
if $endpoints.index(v) | |
d = visitor.distance_to_root(v) | |
distances << d | |
# puts "#{v} is #{d} steps from Philosophy" | |
end | |
end | |
avg = eval(distances.join('+')) / (1.0*distances.length) | |
puts "Found #{$endpoints.length} pages, averaging #{avg} links from Philosophy" | |
else | |
puts "visited #{$graph.count} sites, from #{$endpoints.count} starting pages, and found #{$graph.cycles.count} cycles" | |
# wait a second between walks, so we don't piss off Wikipedia | |
sleep rand | |
end | |
end | |
end | |
main(ARGV) | |
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.