@gvx
Created June 14, 2011 17:48
Path to Philosophy code
aliasdict.py

class Alias(object):
    def __init__(self, initial):
        self._set = {initial}
        self.initial = initial

    def add(self, alias):
        self._set.add(alias)

    def merge(self, other):
        self._set.update(other._set)

    def __iter__(self):
        return iter(self._set)

class AliasDict(object):
    def __init__(self):
        self._dict = {}

    def add(self, one, other):
        if one in self._dict:
            if other in self._dict: # both already known: merge!
                self._dict[one].merge(self._dict[other])
                for k in self._dict[other]:
                    self._dict[k] = self._dict[one]
            else:
                self._dict[one].add(other)
                self._dict[other] = self._dict[one] # register the new alias as a key too
        elif other in self._dict:
            self._dict[other].add(one)
            self._dict[one] = self._dict[other] # register the new alias as a key too
        else:
            self._dict[one] = self._dict[other] = Alias(one)
            self._dict[one].add(other)

    def get(self, n):
        return self._dict.get(n)

    def __contains__(self, s):
        return s in self._dict
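
For a sense of how the alias groups behave, here is a quick sketch (not part of the gist; the page titles are invented):

    d = AliasDict()
    d.add('Cat', 'Kitty')             # 'Kitty' redirects to 'Cat'
    d.add('Housecat', 'Kitty')        # joins the existing group
    print('Housecat' in d)            # True
    print(d.get('Housecat').initial)  # 'Cat', the first title seen for the group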
parser.py

from html.parser import HTMLParser

class LinkFound(Exception):
    def __init__(self, link):
        self.linkname = link

class WikipediaParser(HTMLParser):
    bad_namespaces = {'File', 'File_talk', 'Wikipedia', 'Wikipedia_talk',
                      'Template', 'Template_talk', 'Talk', 'User',
                      'User_talk', 'Help'}

    def __init__(self):
        HTMLParser.__init__(self)
        self.inside_italics = False
        self.open_parens = 0
        self.open_divs = 0
        self.started = False
        self.inside_table = False

    def handle_starttag(self, tag, attrs):
        if tag in ('i', 'em'):
            self.inside_italics = True
        elif tag == 'div':
            for key, value in attrs:
                if key == 'id' and value == 'bodyContent':
                    self.started = True # the article body starts here
                    self.open_divs = 0
                    return
            self.open_divs += 1
        elif tag == 'table':
            self.inside_table = True
        elif tag == 'a':
            # only take a link from plain running text: not italicized, not in
            # a table, not inside parentheses, not in a nested div
            if self.started and not self.inside_italics and not self.inside_table \
                    and self.open_parens <= 0 and self.open_divs <= 0:
                for key, value in attrs:
                    if key == 'href' and value.startswith('/wiki/'):
                        value = value[6:]
                        if value.split(':', 1)[0] not in self.bad_namespaces:
                            raise LinkFound(value)

    def handle_endtag(self, tag):
        if tag in ('i', 'em'):
            self.inside_italics = False
        elif tag == 'div':
            self.open_divs -= 1
        elif tag == 'table':
            self.inside_table = False

    def handle_data(self, data):
        # track unbalanced parentheses in the text seen so far
        self.open_parens += data.count('(') - data.count(')')
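
A minimal sketch of the parser at work, fed a hand-written fragment standing in for a real article page (the titles and links are invented):

    html = ('<div id="bodyContent"><p>'
            '(a parenthesized <a href="/wiki/Decoy">decoy</a>) and a '
            '<a href="/wiki/Help:Contents">namespaced link</a> are skipped, '
            'but <a href="/wiki/Philosophy">this one</a> is found.</p></div>')
    p = WikipediaParser()
    try:
        p.feed(html)
    except LinkFound as e:
        print(e.linkname) # Philosophy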
philosophy.py

from walk import Walker
from urllib.parse import unquote

def check_endpoint(go_from, endpoint):
    w = Walker(endpoint)
    w.start(go_from)
    return w

def quote(n): # make page title fit for printing
    return '"' + unquote(n).replace('_', ' ') + '"'

if __name__ == '__main__':
    import sys
    g = sys.argv[1] if len(sys.argv) > 1 else 'Special:Random'
    e = 'Philosophy' # change for another end point
    w = check_endpoint(g, e)
    l = [w.aliases.get(g).initial] # reconstruct the chain by following the cache
    while len(l) <= len(w.cache):
        l.append(w.cache[l[-1]])
    print('I', 'could' if e in w else 'could not', 'reach', quote(e),
          'from', quote(w.aliases.get(g).initial), 'in', len(w.cache), 'steps')
    print(' -> '.join(quote(x) for x in l))
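
Assuming the four files sit in one directory, a run looks like this; the step count and chain below are invented, only the output shape matches the print calls above:

    $ python3 philosophy.py Banana
    I could reach "Philosophy" from "Banana" in 12 steps
    "Banana" -> "Fruit" -> ... -> "Philosophy"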
walk.py

from parser import *
from aliasdict import *
from urllib.request import urlopen, Request

class Walker(object):
    def __init__(self, stop_at=None, lang='en'):
        self.lang = lang
        self.build_url()
        self.cache = {}
        self.stop_at = stop_at
        self.aliases = AliasDict()

    def start(self, url='Special:Random'):
        while url:
            url = self.walk_from(url)

    def build_url(self):
        self.built_url = 'http://' + self.lang + '.wikipedia.org/wiki/'

    def walk_from(self, url):
        if url in self.aliases: # already visited: we are in a loop, stop
            return
        wpp = WikipediaParser()
        # Wikipedia doesn't like Python's default user agent, so spoof a browser.
        resp = urlopen(Request(self.built_url + url, headers={'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11'}))
        try:
            text = resp.read().decode('utf-8') # assuming UTF-8; awful, I know
        except Exception as e:
            # for some reason it receives garbage on some articles;
            # if you know why this is, please let me know
            print('Ignored', e, 'for', url)
            text = ''
        try:
            wpp.feed(text)
        except LinkFound as e:
            n_url = resp.geturl()[len(self.built_url):] # canonical URL after redirects
            self.aliases.add(n_url, url)
            if self.stop_at in self.aliases:
                return
            self.cache[n_url] = e.linkname.split('#', 1)[0] # drop any fragment
            return self.cache[n_url]

    def __contains__(self, s):
        return s in self.aliases
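
The Walker can also be driven without philosophy.py; a small sketch (the start page is arbitrary, and this performs live HTTP requests to Wikipedia):

    w = Walker(stop_at='Philosophy')
    w.start('Banana')
    print('Philosophy' in w)   # did the walk reach the endpoint?
    for page, nxt in w.cache.items():
        print(page, '->', nxt) # each visited page and the link taken from it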
qubodup commented Jun 14, 2011

On Arch (Python 3.2):

$ python3 philosophy.py test
  File "philosophy.py", line 18
    z l = [w.aliases.get(g).initial]
      ^
SyntaxError: invalid syntax

gvx commented Jun 14, 2011

Damn, fixed. Some z got in the way, somehow.
