@gvx
Created June 14, 2011 17:48
Path to Philosophy code
aliasdict.py

class Alias(object):
    def __init__(self, initial):
        self._set = {initial}
        self.initial = initial

    def add(self, alias):
        self._set.add(alias)

    def merge(self, other):
        self._set.update(other._set)

    def __iter__(self):
        return iter(self._set)

class AliasDict(object):
    def __init__(self):
        self._dict = {}

    def add(self, one, other):
        if one in self._dict:
            if other in self._dict: # both already known: merge!
                self._dict[one].merge(self._dict[other])
                for k in self._dict[other]:
                    self._dict[k] = self._dict[one]
            else:
                self._dict[one].add(other)
                self._dict[other] = self._dict[one] # register the new alias as a key too
        elif other in self._dict:
            self._dict[other].add(one)
            self._dict[one] = self._dict[other] # register the new alias as a key too
        else:
            self._dict[one] = self._dict[other] = Alias(one)
            self._dict[one].add(other)

    def get(self, n):
        return self._dict.get(n)

    def __contains__(self, s):
        return s in self._dict
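
For a sense of how the alias groups behave, here is a quick sketch (not part of the gist; the page titles are invented):

    d = AliasDict()
    d.add('Cat', 'Kitty')             # 'Kitty' redirects to 'Cat'
    d.add('Housecat', 'Kitty')        # joins the existing group
    print('Housecat' in d)            # True
    print(d.get('Housecat').initial)  # 'Cat', the first title seen for the group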
parser.py

from html.parser import HTMLParser

class LinkFound(Exception):
    def __init__(self, link):
        self.linkname = link

class WikipediaParser(HTMLParser):
    bad_namespaces = {'File', 'File_talk', 'Wikipedia', 'Wikipedia_talk',
                      'Template', 'Template_talk', 'Talk', 'User',
                      'User_talk', 'Help'}

    def __init__(self):
        HTMLParser.__init__(self)
        self.inside_italics = False
        self.open_parens = 0
        self.open_divs = 0
        self.started = False
        self.inside_table = False

    def handle_starttag(self, tag, attrs):
        if tag in ('i', 'em'):
            self.inside_italics = True
        elif tag == 'div':
            for key, value in attrs:
                if key == 'id' and value == 'bodyContent':
                    self.started = True # the article body starts here
                    self.open_divs = 0
                    return
            self.open_divs += 1
        elif tag == 'table':
            self.inside_table = True
        elif tag == 'a':
            # only take a link from plain running text: not italicized, not in
            # a table, not inside parentheses, not in a nested div
            if self.started and not self.inside_italics and not self.inside_table \
                    and self.open_parens <= 0 and self.open_divs <= 0:
                for key, value in attrs:
                    if key == 'href' and value.startswith('/wiki/'):
                        value = value[6:]
                        if value.split(':', 1)[0] not in self.bad_namespaces:
                            raise LinkFound(value)

    def handle_endtag(self, tag):
        if tag in ('i', 'em'):
            self.inside_italics = False
        elif tag == 'div':
            self.open_divs -= 1
        elif tag == 'table':
            self.inside_table = False

    def handle_data(self, data):
        # track unbalanced parentheses in the text seen so far
        self.open_parens += data.count('(') - data.count(')')
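
A minimal sketch of the parser at work, fed a hand-written fragment standing in for a real article page (the titles and links are invented):

    html = ('<div id="bodyContent"><p>'
            '(a parenthesized <a href="/wiki/Decoy">decoy</a>) and a '
            '<a href="/wiki/Help:Contents">namespaced link</a> are skipped, '
            'but <a href="/wiki/Philosophy">this one</a> is found.</p></div>')
    p = WikipediaParser()
    try:
        p.feed(html)
    except LinkFound as e:
        print(e.linkname) # Philosophy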
philosophy.py

from walk import Walker
from urllib.parse import unquote

def check_endpoint(go_from, endpoint):
    w = Walker(endpoint)
    w.start(go_from)
    return w

def quote(n): # make page title fit for printing
    return '"' + unquote(n).replace('_', ' ') + '"'

if __name__ == '__main__':
    import sys
    g = sys.argv[1] if len(sys.argv) > 1 else 'Special:Random'
    e = 'Philosophy' # change for another end point
    w = check_endpoint(g, e)
    l = [w.aliases.get(g).initial] # reconstruct the chain by following the cache
    while len(l) <= len(w.cache):
        l.append(w.cache[l[-1]])
    print('I', 'could' if e in w else 'could not', 'reach', quote(e),
          'from', quote(w.aliases.get(g).initial), 'in', len(w.cache), 'steps')
    print(' -> '.join(quote(x) for x in l))
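
Assuming the four files sit in one directory, a run looks like this; the step count and chain below are invented, only the output shape matches the print calls above:

    $ python3 philosophy.py Banana
    I could reach "Philosophy" from "Banana" in 12 steps
    "Banana" -> "Fruit" -> ... -> "Philosophy"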
walk.py

from parser import *
from aliasdict import *
from urllib.request import urlopen, Request

class Walker(object):
    def __init__(self, stop_at=None, lang='en'):
        self.lang = lang
        self.build_url()
        self.cache = {}
        self.stop_at = stop_at
        self.aliases = AliasDict()

    def start(self, url='Special:Random'):
        while url:
            url = self.walk_from(url)

    def build_url(self):
        self.built_url = 'http://' + self.lang + '.wikipedia.org/wiki/'

    def walk_from(self, url):
        if url in self.aliases: # already visited: we are in a loop, stop
            return
        wpp = WikipediaParser()
        # Wikipedia doesn't like Python's default user agent, so spoof a browser.
        resp = urlopen(Request(self.built_url + url, headers={'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11'}))
        try:
            text = resp.read().decode('utf-8') # assuming UTF-8; awful, I know
        except Exception as e:
            # for some reason it receives garbage on some articles;
            # if you know why this is, please let me know
            print('Ignored', e, 'for', url)
            text = ''
        try:
            wpp.feed(text)
        except LinkFound as e:
            n_url = resp.geturl()[len(self.built_url):] # canonical URL after redirects
            self.aliases.add(n_url, url)
            if self.stop_at in self.aliases:
                return
            self.cache[n_url] = e.linkname.split('#', 1)[0] # drop any fragment
            return self.cache[n_url]

    def __contains__(self, s):
        return s in self.aliases
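
The Walker can also be driven without philosophy.py; a small sketch (the start page is arbitrary, and this performs live HTTP requests to Wikipedia):

    w = Walker(stop_at='Philosophy')
    w.start('Banana')
    print('Philosophy' in w)   # did the walk reach the endpoint?
    for page, nxt in w.cache.items():
        print(page, '->', nxt) # each visited page and the link taken from it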
qubodup commented Jun 14, 2011

On Arch (Python 3.2):

$ python3 philosophy.py test
  File "philosophy.py", line 18
    z l = [w.aliases.get(g).initial]
      ^
SyntaxError: invalid syntax

gvx commented Jun 14, 2011

Damn, fixed. Some z got in the way, somehow.
