#! /usr/bin/env python3
# Copyright 2020 Hal W Canary, III.  All rights reserved.
import subprocess
import sys
import urllib.error
import urllib.request

import lxml.html
class WikipediaTextParser:
    '''
    Load a page from Wikipedia and provide access to the topics in the page
    as plain text.

    TODO: make a tree of subtopics by looking at h3 and h4 tags.
    '''
    def __init__(self):
        self._topics = []
    def load(self, uri):
        '''
        Load a page from Wikipedia.

        @param uri a string in the form 'https://en.wikipedia.org/wiki/TITLE'
        @return True if everything worked correctly.
        '''
        self._topics = []
        try:
            page = urllib.request.urlopen(urllib.request.Request(uri)).read()
        except (ValueError, urllib.error.URLError):
            # ValueError for malformed URIs; URLError for network failures.
            return False
        doc = lxml.html.document_fromstring(page)
        content = doc.find(".//div[@id='content']//div[@class='mw-parser-output']")
        if content is None:
            return False
        last = ''
        header = '[preface]'
        for child in content.iterchildren():
            if isinstance(child, lxml.html.HtmlComment):
                continue
            if child.tag == 'h2':
                if child.get('id') == 'mw-toc-heading':
                    continue
                headline = child.find(".//span[@class='mw-headline']")
                if headline is None:
                    continue
                # A new h2 headline closes out the previous topic.
                self._topics.append((header, last))
                header, last = ' '.join(x.strip() for x in headline.itertext()), ''
            last += ' '.join(child.itertext()) + '\n'
        self._topics.append((header, last))
        return True
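
    # Internal representation: after a successful load(), self._topics holds
    # (header, text) pairs in page order, e.g. [('[preface]', ...),
    # ('History', ...)]; the topic names shown here are illustrative only.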
    def topics(self):
        '''
        Returns a list of top-level topics about the subject.
        '''
        return [topic for topic, contents in self._topics]

    def get_topic(self, index):
        '''
        Given the index of a topic in the topic list, return information
        about the topic.
        '''
        return self._topics[index][1]
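
# Example usage (a sketch, not part of the original script; assumes network
# access and that the page's HTML matches the selectors above):
#
#     w = WikipediaTextParser()
#     if w.load('https://en.wikipedia.org/wiki/Python_(programming_language)'):
#         for i, t in enumerate(w.topics()):
#             print(i, t)
#         print(w.get_topic(0))   # plain text of the '[preface]' topic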
def more(string, width, height):
    '''
    Helper function that acts like the Unix `more` command.  Hit `enter` to
    view the next page.
    '''
    def fold(line, width):
        # Greedily wrap `line` at spaces so each piece fits within `width`
        # columns, hard-breaking any word longer than the width.
        assert width > 0
        index = 0
        while True:
            f = index + width
            if f >= len(line):
                yield line[index:]
                return
            # Scan backwards from the width limit for a space to break at.
            while line[f] != ' ':
                if f == index + 1:
                    # No space found: hard-break just before the limit.
                    f = index + width - 1
                    break
                f -= 1
            f += 1
            yield line[index:f]
            index = f
    count = 1
    for line in string.splitlines():
        for b in fold(line, width - 1):
            sys.stdout.write(b + '\n')
            count += 1
            if count == height:
                input(':')
                sys.stdout.write('\r ')
                count = 1
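
# fold() illustration: with width=10, the line 'the quick brown fox' folds
# into 'the quick ' and then 'brown fox'; the break lands just after the
# last space at or before the column limit.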
def browse(uri):
    '''
    Code to demonstrate use of the WikipediaTextParser class.
    '''
    # Query the terminal size via tput(1); this assumes a Unix-like system.
    width, height = (int(subprocess.check_output(['tput', x])) for x in ['cols', 'lines'])
    w = WikipediaTextParser()
    assert w.load(uri)
    rule = '_' * width + '\n\n'
    while True:
        sys.stdout.write(rule)
        for index, topic in enumerate(w.topics()):
            sys.stdout.write('%d. %s\n' % (index, topic))
        answer = input('which topic? ')
        try:
            topic_number = int(answer)
        except ValueError:
            # Any non-numeric answer exits the browser.
            break
        sys.stdout.write(rule)
        more(w.get_topic(topic_number), width, height)
        sys.stdout.write('\n')
DEFAULT_URI = 'https://en.wikipedia.org/wiki/Heidenheim_an_der_Brenz'
if __name__ == '__main__':
    browse(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_URI)
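
# Run from a shell, e.g. (the script filename here is hypothetical):
#     python3 wikipedia_text_parser.py 'https://en.wikipedia.org/wiki/TITLE'
# With no argument, the script browses DEFAULT_URI.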