Skip to content

Instantly share code, notes, and snippets.

@hlian
Created May 26, 2011 16:00
Show Gist options
  • Save hlian/993419 to your computer and use it in GitHub Desktop.
Save hlian/993419 to your computer and use it in GitHub Desktop.
EVERY WIKIPEDIA ARTICLE CONVERGES TO PHILOSOPHY
#!/usr/bin/env python
import gzip
import urllib2
from cStringIO import StringIO
from lxml import etree, html
from lxml.cssselect import CSSSelector
# CSS selectors, compiled once at import time and reused for every page.
PARAGRAPHS = CSSSelector('#bodyContent > p')  # <p> directly under #bodyContent
ITALIC_LINKS = CSSSelector('i a')  # anchors nested inside an <i>
ALL_LINKS = CSSSelector('a')  # every anchor

# Shared URL opener that sends a browser-like User-agent header.
# NOTE(review): presumably the default urllib2 agent gets refused by the
# server -- confirm before removing the header.
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
def to_unicode(element):
    """Return the text of `element` as a unicode string.

    Stands in for lxml's :meth:`Element.text_content`, which does not
    return unicode: the serializer is asked for text output with a unicode
    "encoding" instead (trick credited to Geoffrey Sneddon).  The
    element's tail is excluded via ``with_tail=False``.
    """
    options = dict(method='text', encoding=unicode, with_tail=False)
    return etree.tostring(element, **options)
def unparens(p):
    """Return a re-parsed copy of element `p` with parenthesized text removed.

    `p` is serialized to markup, every character that lies inside round
    brackets (brackets included) is dropped, and the remainder is parsed
    again with :func:`lxml.html.fragment_fromstring`.  Nesting is handled
    by counting depth, so markup that sits wholly inside brackets (for
    example a link) disappears with them.

    Brackets that occur inside a tag are never counted, so attribute
    values such as ``href="/wiki/Foo_(bar)"`` survive intact.  Quote
    characters are only interpreted inside tags; a ``"`` in ordinary prose
    no longer switches bracket handling on or off.

    An unmatched ``)`` is discarded without driving the depth negative, so
    it cannot suppress the text that follows it.

    :param p: an lxml element (the caller passes paragraph elements).
    :returns: the element produced by re-parsing the filtered markup.
    """
    markup = etree.tostring(p)
    kept = []  # collected pieces; joined once instead of quadratic +=
    depth = 0
    in_tag = False
    in_quote = False
    # NOTE(review): assumes the serializer escapes a literal '<' in text and
    # attribute values, so a raw '<' always opens a tag -- confirm against
    # lxml/libxml2 serialization rules.
    for c in markup:
        if in_tag:
            if c == '"':
                in_quote = not in_quote
            elif c == '>' and not in_quote:
                in_tag = False
        elif c == '<':
            in_tag = True
        elif c == '(':
            depth += 1
            continue
        elif c == ')':
            # Clamp at zero: a stray ')' must not hide later text.
            if depth > 0:
                depth -= 1
            continue
        if depth == 0:
            kept.append(c)
    return html.fragment_fromstring(''.join(kept))
def iterate(url):
    """Fetch `url` and return the first qualifying link in its body text.

    The page is downloaded through the module-level ``opener`` (2 second
    timeout) and the URL reported by the response object is printed; it
    may differ from `url` (presumably after a redirect such as
    Special:Random -- confirm).  A gzip-encoded body is decompressed
    before parsing.

    Each paragraph matched by ``PARAGRAPHS`` is passed through
    :func:`unparens`, then its anchors are scanned in document order.
    An anchor is skipped when it

    * sits inside an ``<i>`` element,
    * has text starting with ``[``,
    * has no ``href`` attribute (or an empty one), or
    * has an ``href`` containing ``File:``.

    The first anchor that survives wins.  Skipping happens per link, so a
    leading ``File:`` link no longer discards the rest of its paragraph.

    :param url: absolute URL of the page to fetch.
    :returns: the chosen link as a string, with ``http://en.wikipedia.org``
        prepended when the ``href`` starts with ``/``; ``None`` when no
        paragraph contains a qualifying link.
    """
    f = opener.open(url, timeout=2)
    try:
        print(f.url)
        body = f.read()
        if f.info().get('content-encoding') == 'gzip':
            body = gzip.GzipFile(fileobj=StringIO(body)).read()
    finally:
        # Release the connection even if reading or decompressing fails.
        f.close()
    root = html.document_fromstring(body)
    for p in PARAGRAPHS(root):
        p = unparens(p)
        italics = set(ITALIC_LINKS(p))
        for link in ALL_LINKS(p):
            if link in italics:
                continue
            if to_unicode(link).startswith('['):
                continue
            # .get() instead of attrib[...]: anchors without href exist.
            href = link.get('href')
            if not href or 'File:' in href:
                continue
            if href.startswith('/'):
                href = 'http://en.wikipedia.org' + href
            return href
    return None
if __name__ == '__main__':
    # Follow first links starting from a random article.  iterate() prints
    # each page as it is fetched.
    href = 'http://en.wikipedia.org/wiki/Special:Random'
    # Original author's note: start from
    # http://en.wikipedia.org/wiki/United_States instead to hit the gzip
    # branch of iterate().
    visited = set()
    # Stop when a page yields no link (iterate() returned None) or when the
    # chain returns to a URL already followed, instead of looping forever.
    while href and href not in visited:
        visited.add(href)
        href = iterate(href)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment