HalCanary/wikipedia_parser.py

## wikipedia_parser.py
#! /usr/bin/env python

# Copyright 2020 Hal W Canary, III.  all rights reserved.

import subprocess
import sys

try:
    import urllib2
except ImportError:
    # Python 3 moved urlopen() and Request into urllib.request.
    import urllib.request as urllib2

import lxml.html

class WikipediaTextParser(object):
    '''
    Load a page from Wikipedia and provide access to the topics in the page as
    plain text.

    A topic is one top-level (h2) section of the article.  The text found
    before the first section heading is stored under the name '[preface]'.

    TODO: make a tree of subtopics by looking at h3 and h4 tags.
    '''
    def __init__(self):
        # (heading, text) pairs in document order; rebuilt by every load().
        self._topics = []

    def load(self, uri):
        '''
        Load a page from Wikipedia, replacing any previously loaded topics.
        @param uri a string in the form 'https://en.wikipedia.org/wiki/TITLE'
        @return True if everything worked correctly.  False if the uri is
                malformed, the page could not be fetched or parsed, or the
                article body was not found; the topic list is then empty.
        '''
        self._topics = []
        try:
            response = urllib2.urlopen(urllib2.Request(uri))
            try:
                page = response.read()
            finally:
                # Release the connection even when read() fails.
                response.close()
        except (ValueError, IOError):
            # ValueError: malformed uri.  IOError: base class of URLError
            # (unreachable host, HTTP error status) and of socket errors.
            return False
        try:
            doc = lxml.html.document_fromstring(page)
        except lxml.etree.LxmlError:
            # e.g. an empty response body.
            return False
        # NOTE(review): these selectors expect the article body in a div
        # whose class is exactly 'mw-parser-output' and headings that contain
        # span.mw-headline -- confirm against the markup Wikipedia serves now.
        content = doc.find(".//div[@id='content']//div[@class='mw-parser-output']")
        if content is None:
            return False
        header = '[preface]'
        pieces = []
        for child in content.iterchildren():
            if isinstance(child, lxml.html.HtmlComment):
                continue
            if child.tag == 'h2':
                if child.get('id') == 'mw-toc-heading':
                    # Heading of the table of contents, not a topic.
                    continue
                headline = child.find(".//span[@class='mw-headline']")
                if headline is None:
                    continue
                # A new section starts: store the one collected so far.
                self._topics.append((header, ''.join(pieces)))
                header = ' '.join(x.strip() for x in headline.itertext())
                pieces = []
            # The heading element itself falls through to here, so each
            # section's text begins with the text of its own heading.
            pieces.append(' '.join(child.itertext()) + '\n')
        self._topics.append((header, ''.join(pieces)))
        return True

    def topics(self):
        '''
        Returns a list of top-level topics about the subject.
        @return the topic headings, in page order; the position of a heading
                in this list is the index accepted by get_topic().
        '''
        return [topic for topic, contents in self._topics]

    def get_topic(self, index):
        '''
        Given the index of a topic in the topic list, return information about
        the topic.
        @param index position of the topic in the list returned by topics()
        @return the plain text of that topic
        Raises IndexError when index is out of range.
        '''
        return self._topics[index][1]


def _fold(line, width):
    '''
    Split one line of text into pieces for display, breaking after a space.
    @param line the text to split; must not contain newlines
    @param width maximum number of characters before a break is forced; >= 1
    @return a generator of pieces whose concatenation is line.  A piece that
            was broken at a space keeps that space and may therefore be
            width + 1 characters long.
    '''
    start = 0
    while len(line) - start > width:
        # Right-most space among the next `width` characters, not counting
        # the first one.
        space = line.rfind(' ', start + 1, start + width + 1)
        # Without a space to break at, cut the word after `width` characters.
        end = start + width if space < 0 else space + 1
        yield line[start:end]
        start = end
    yield line[start:]


def more(string, width, height):
    '''
    Helper function that acts like the Unix `more` command.  Hit `enter` to view next page.
    @param string the text to show; may contain several lines
    @param width number of columns of the terminal
    @param height number of rows of the terminal; a ':' prompt is shown
           after every height - 1 rows of output
    '''
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    # One column is held back; never let the fold width drop below 1.
    fold_width = max(1, width - 1)
    count = 1
    for line in string.splitlines():
        for piece in _fold(line, fold_width):
            if not isinstance(piece, str):
                # Python 2 unicode text: write it out as UTF-8 bytes.
                piece = piece.encode('utf-8')
            sys.stdout.write(piece + '\n')
            count += 1
            if count == height:
                read_line(':')
                sys.stdout.write('\r ')
                count = 1


def browse(uri):
    '''
    Code to demonstrate use of WikipediaTextParser class.

    Shows a numbered menu of the topics of the page and pages through the
    topic that is chosen, until something that is not a number (or end of
    input) is entered.  The terminal size is read with `tput`.
    @param uri a string in the form 'https://en.wikipedia.org/wiki/TITLE'
    Raises AssertionError when the page can not be loaded.
    '''
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    width, height = (int(subprocess.check_output(['tput', x])) for x in ['cols', 'lines'])
    w = WikipediaTextParser()
    # load() is called outside of an assert statement so that the page is
    # still fetched when Python runs with -O, which strips asserts.
    if not w.load(uri):
        raise AssertionError('could not load %s' % uri)
    rule = '_' * width + '\n\n'
    topics = w.topics()
    while True:
        sys.stdout.write(rule)
        for index, topic in enumerate(topics):
            entry = '%d. %s\n' % (index, topic)
            if not isinstance(entry, str):
                # Python 2 unicode heading: write it out as UTF-8 bytes.
                entry = entry.encode('utf-8')
            sys.stdout.write(entry)
        try:
            topic_number = int(read_line('which topic? '))
        except (ValueError, EOFError):
            # Anything that is not a number, or end of input, ends the demo.
            break
        if not 0 <= topic_number < len(topics):
            # Only the numbers shown in the menu are accepted.
            sys.stdout.write('no such topic: %d\n' % topic_number)
            continue
        sys.stdout.write(rule)
        more(w.get_topic(topic_number), width, height)
    sys.stdout.write('\n')

# Page that is browsed when no uri is given on the command line.
DEFAULT_URI = 'https://en.wikipedia.org/wiki/Heidenheim_an_der_Brenz'

if __name__ == '__main__':
    # The first command-line argument, if there is one, is the page to browse.
    arguments = sys.argv[1:]
    browse(arguments[0] if arguments else DEFAULT_URI)
	#! /usr/bin/env python

	# Copyright 2020 Hal W Canary, III. all rights reserved.

	import lxml.html
	import subprocess
	import sys
	import urllib2

	class WikipediaTextParser(object):
	    '''
	    Load a page from Wikipedia and provide access to the topics in the page as
	    plain text.

	    A topic is one top-level (h2) section of the article.  The text found
	    before the first section heading is stored under the name '[preface]'.

	    TODO: make a tree of subtopics by looking at h3 and h4 tags.
	    '''
	    def __init__(self):
	        # (heading, text) pairs in document order; rebuilt by every load().
	        self._topics = []

	    def load(self, uri):
	        '''
	        Load a page from Wikipedia, replacing any previously loaded topics.
	        @param uri a string in the form 'https://en.wikipedia.org/wiki/TITLE'
	        @return True if everything worked correctly.  False if the uri is
	                malformed, the page could not be fetched or parsed, or the
	                article body was not found; the topic list is then empty.
	        '''
	        self._topics = []
	        try:
	            response = urllib2.urlopen(urllib2.Request(uri))
	            try:
	                page = response.read()
	            finally:
	                # Release the connection even when read() fails.
	                response.close()
	        except (ValueError, IOError):
	            # ValueError: malformed uri.  IOError: base class of URLError
	            # (unreachable host, HTTP error status) and of socket errors.
	            return False
	        try:
	            doc = lxml.html.document_fromstring(page)
	        except lxml.etree.LxmlError:
	            # e.g. an empty response body.
	            return False
	        # NOTE(review): these selectors expect the article body in a div
	        # whose class is exactly 'mw-parser-output' and headings that contain
	        # span.mw-headline -- confirm against the markup Wikipedia serves now.
	        content = doc.find(".//div[@id='content']//div[@class='mw-parser-output']")
	        if content is None:
	            return False
	        header = '[preface]'
	        pieces = []
	        for child in content.iterchildren():
	            if isinstance(child, lxml.html.HtmlComment):
	                continue
	            if child.tag == 'h2':
	                if child.get('id') == 'mw-toc-heading':
	                    # Heading of the table of contents, not a topic.
	                    continue
	                headline = child.find(".//span[@class='mw-headline']")
	                if headline is None:
	                    continue
	                # A new section starts: store the one collected so far.
	                self._topics.append((header, ''.join(pieces)))
	                header = ' '.join(x.strip() for x in headline.itertext())
	                pieces = []
	            # The heading element itself falls through to here, so each
	            # section's text begins with the text of its own heading.
	            pieces.append(' '.join(child.itertext()) + '\n')
	        self._topics.append((header, ''.join(pieces)))
	        return True

	    def topics(self):
	        '''
	        Returns a list of top-level topics about the subject.
	        @return the topic headings, in page order; the position of a heading
	                in this list is the index accepted by get_topic().
	        '''
	        return [topic for topic, contents in self._topics]

	    def get_topic(self, index):
	        '''
	        Given the index of a topic in the topic list, return information about
	        the topic.
	        @param index position of the topic in the list returned by topics()
	        @return the plain text of that topic
	        Raises IndexError when index is out of range.
	        '''
	        return self._topics[index][1]


	def _fold(line, width):
	    '''
	    Split one line of text into pieces for display, breaking after a space.
	    @param line the text to split; must not contain newlines
	    @param width maximum number of characters before a break is forced; >= 1
	    @return a generator of pieces whose concatenation is line.  A piece that
	            was broken at a space keeps that space and may therefore be
	            width + 1 characters long.
	    '''
	    start = 0
	    while len(line) - start > width:
	        # Right-most space among the next `width` characters, not counting
	        # the first one.
	        space = line.rfind(' ', start + 1, start + width + 1)
	        # Without a space to break at, cut the word after `width` characters.
	        end = start + width if space < 0 else space + 1
	        yield line[start:end]
	        start = end
	    yield line[start:]


	def more(string, width, height):
	    '''
	    Helper function that acts like the Unix `more` command.  Hit `enter` to view next page.
	    @param string the text to show; may contain several lines
	    @param width number of columns of the terminal
	    @param height number of rows of the terminal; a ':' prompt is shown
	           after every height - 1 rows of output
	    '''
	    try:
	        read_line = raw_input  # Python 2
	    except NameError:
	        read_line = input  # Python 3
	    # One column is held back; never let the fold width drop below 1.
	    fold_width = max(1, width - 1)
	    count = 1
	    for line in string.splitlines():
	        for piece in _fold(line, fold_width):
	            if not isinstance(piece, str):
	                # Python 2 unicode text: write it out as UTF-8 bytes.
	                piece = piece.encode('utf-8')
	            sys.stdout.write(piece + '\n')
	            count += 1
	            if count == height:
	                read_line(':')
	                sys.stdout.write('\r ')
	                count = 1


	def browse(uri):
	    '''
	    Code to demonstrate use of WikipediaTextParser class.

	    Shows a numbered menu of the topics of the page and pages through the
	    topic that is chosen, until something that is not a number (or end of
	    input) is entered.  The terminal size is read with `tput`.
	    @param uri a string in the form 'https://en.wikipedia.org/wiki/TITLE'
	    Raises AssertionError when the page can not be loaded.
	    '''
	    try:
	        read_line = raw_input  # Python 2
	    except NameError:
	        read_line = input  # Python 3
	    width, height = (int(subprocess.check_output(['tput', x])) for x in ['cols', 'lines'])
	    w = WikipediaTextParser()
	    # load() is called outside of an assert statement so that the page is
	    # still fetched when Python runs with -O, which strips asserts.
	    if not w.load(uri):
	        raise AssertionError('could not load %s' % uri)
	    rule = '_' * width + '\n\n'
	    topics = w.topics()
	    while True:
	        sys.stdout.write(rule)
	        for index, topic in enumerate(topics):
	            entry = '%d. %s\n' % (index, topic)
	            if not isinstance(entry, str):
	                # Python 2 unicode heading: write it out as UTF-8 bytes.
	                entry = entry.encode('utf-8')
	            sys.stdout.write(entry)
	        try:
	            topic_number = int(read_line('which topic? '))
	        except (ValueError, EOFError):
	            # Anything that is not a number, or end of input, ends the demo.
	            break
	        if not 0 <= topic_number < len(topics):
	            # Only the numbers shown in the menu are accepted.
	            sys.stdout.write('no such topic: %d\n' % topic_number)
	            continue
	        sys.stdout.write(rule)
	        more(w.get_topic(topic_number), width, height)
	    sys.stdout.write('\n')

	# Page that is browsed when no uri is given on the command line.
	DEFAULT_URI = 'https://en.wikipedia.org/wiki/Heidenheim_an_der_Brenz'

	if __name__ == '__main__':
	    # The first command-line argument, if there is one, is the page to browse.
	    browse(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_URI)