Last active
June 7, 2023 12:59
-
-
Save HalCanary/0bffb2da97445b1ac93b38ed45b8a70a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# Copyright 2020 Hal W Canary, III. all rights reserved. | |
import lxml.html | |
import subprocess | |
import sys | |
import urllib2 | |
class WikipediaTextParser(object):
    '''
    Load a page from Wikipedia and provide access to the topics in the page as
    plain text.

    A topic is a top-level (h2) section of the article.  Everything that comes
    before the first section heading is stored under the name '[preface]'.

    TODO: make a tree of subtopics by looking at h3 and h4 tags.
    '''

    def __init__(self):
        # (heading, text) tuples in document order; rebuilt by every load().
        self._topics = []

    def load(self, uri):
        '''
        Load a page from Wikipedia, discarding any previously loaded topics.

        @param uri a string in the form 'https://en.wikipedia.org/wiki/TITLE'
        @return True if everything worked correctly.  False if the URI is
                malformed, the page could not be fetched or parsed, or the
                article body could not be found in the page.
        '''
        self._topics = []
        try:
            response = urllib2.urlopen(urllib2.Request(uri))
            try:
                page = response.read()
            finally:
                # Release the connection even if reading fails part-way.
                response.close()
        except (ValueError, urllib2.URLError):
            # ValueError: malformed URI.  URLError (HTTPError is a subclass):
            # the request failed or the server answered with an error status.
            return False
        try:
            doc = lxml.html.document_fromstring(page)
        except lxml.etree.ParserError:
            # lxml raises this when it cannot build a tree at all, e.g. for
            # an empty document.
            return False
        content = doc.find(".//div[@id='content']//div[@class='mw-parser-output']")
        if content is None:
            return False
        last = ''
        header = '[preface]'
        for child in content.iterchildren():
            if isinstance(child, lxml.html.HtmlComment):
                continue
            if child.tag == 'h2':
                # The table-of-contents heading is not a topic of its own.
                if child.get('id') == 'mw-toc-heading':
                    continue
                headline = child.find(".//span[@class='mw-headline']")
                if headline is None:
                    continue
                # A new section starts here: store the one just finished.
                self._topics.append((header, last))
                header, last = ' '.join(x.strip() for x in headline.itertext()), ''
            # Every remaining child, the h2 element itself included,
            # contributes its text to the current topic.
            last += ' '.join(child.itertext()) + '\n'
        self._topics.append((header, last))
        return True

    def topics(self):
        '''
        Returns a list of top-level topics about the subject.
        '''
        return [heading for heading, _ in self._topics]

    def get_topic(self, index):
        '''
        Given the index of a topic in the topic list, return information about
        the topic.

        @param index position of the topic in the list returned by topics()
        @return the plain text collected for that topic.
        '''
        return self._topics[index][1]
def more(string, width, height):
    '''
    Page `string` to standard output, like the Unix `more` command.

    Every line of `string` is wrapped using a limit of `width` - 1 columns.
    The row counter starts at 1; each time it reaches `height` a ':' prompt
    waits for `enter` before the next page is written.
    '''
    def fold(text, limit):
        # Yield `text` in consecutive pieces.  A piece ends just after the
        # right-most space found in positions start+1 .. start+limit, so it
        # can be `limit` + 1 characters long, trailing space included.  When
        # that window holds no space the piece is cut at exactly `limit`
        # characters.
        assert limit > 0
        start = 0
        while start + limit < len(text):
            cut = text.rfind(' ', start + 1, start + limit + 1)
            if cut < 0:
                cut = start + limit - 1
            stop = cut + 1
            yield text[start:stop]
            start = stop
        # Whatever is left fits within the limit (possibly an empty string).
        yield text[start:]

    rows = 1
    for source_line in string.splitlines():
        for piece in fold(source_line, width - 1):
            sys.stdout.write(piece.encode('utf-8') + '\n')
            rows += 1
            if rows != height:
                continue
            # Page is full: wait for the user, then start counting again.
            raw_input(':')
            sys.stdout.write('\r ')
            rows = 1
def browse(uri):
    '''
    Code to demonstrate use of WikipediaTextParser class.

    Loads the page at `uri`, then repeatedly prints the numbered list of
    topics and pages through the one the user picks.  Any answer that is not
    an integer ends the session.

    @param uri address of the page, passed to WikipediaTextParser.load()
    @raise RuntimeError if the page could not be loaded.
    '''
    # Terminal size as reported by `tput cols` and `tput lines`.
    width, height = (int(subprocess.check_output(['tput', x])) for x in ['cols', 'lines'])
    w = WikipediaTextParser()
    # Not an assert: under `python -O` an assert, and the load() call inside
    # it, would be removed entirely.
    if not w.load(uri):
        raise RuntimeError('could not load %r' % (uri,))
    rule = '_' * width + '\n\n'
    while True:
        sys.stdout.write(rule)
        topics = w.topics()
        for index, topic in enumerate(topics):
            sys.stdout.write('%d. %s\n' % (index, topic))
        answer = raw_input('which topic? ')
        try:
            topic_number = int(answer)
        except ValueError:
            break
        # Only the numbers shown in the menu are valid; anything else would
        # either raise IndexError or silently wrap around to another topic.
        if not 0 <= topic_number < len(topics):
            sys.stdout.write('no such topic: %d\n' % topic_number)
            continue
        sys.stdout.write(rule)
        more(w.get_topic(topic_number), width, height)
        sys.stdout.write('\n')
# Page shown when no address is given on the command line.
DEFAULT_URI = 'https://en.wikipedia.org/wiki/Heidenheim_an_der_Brenz'

if __name__ == '__main__':
    # The first command-line argument, when present, overrides the default.
    if len(sys.argv) > 1:
        target = sys.argv[1]
    else:
        target = DEFAULT_URI
    browse(target)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment