# paveldedik/parser.py

## parser.py
# -*- coding: utf-8 -*-


import re
import urllib


# Page downloaded by ``main``.
WIKI_URL = ('http://en.wikipedia.org/wiki/'
            'List_of_countries_and_dependencies_by_area')

# Token patterns, each anchored at the start of the remaining input.
# Plain raw strings are used instead of the ``ur''`` prefix: the patterns
# are pure ASCII, and ``ur''`` is a syntax error on Python 3.
START_RE = re.compile(r'^<[^<>]+>')  # any tag, so it must be tried last
END_RE = re.compile(r'^</[^<>]+>')  # closing tag such as </div>
SPECIAL_RE = re.compile(r'^<[^<>]+ ?/>')  # self-closing tag such as <br />
TEXT_RE = re.compile(r'^[^<>]+')  # run of characters without angle brackets
CDATA_RE = re.compile(r'^<!\[CDATA\[(.*?)\]\]>')  # group 1 is the payload

# Group 1 is the element name of a start or end tag.
ELEM_NAME_RE = re.compile(r'</?([^ <>]+)[^<>]*>')

# Patterns used to normalise the markup before tokenizing.
WHITESPACE_RE = re.compile(r'\s+')
COMMENT_RE = re.compile(r'<!--(.*?)-->', re.S)  # re.S: comments may span lines
DOCTYPE_RE = re.compile(r'<!DOCTYPE [^<>]+>', re.I)


def download(url):
    """Fetch ``url`` and return the response body.

    :param url: address passed unchanged to ``urllib.urlopen``.
    :returns: whatever ``read()`` on the opened stream yields.
    """
    stream = urllib.urlopen(url)
    try:
        return stream.read()
    finally:
        # Release the connection even when reading fails.
        stream.close()


def create_elem(name):
    """Build an empty element record from the markup of a tag.

    :param name: tag markup such as ``<div class="x">``; the element name
        is taken from the first group of ``ELEM_NAME_RE``.
    :returns: dict with the keys ``tag``, ``value`` (empty string) and
        ``subelements`` (empty list).
    """
    found = ELEM_NAME_RE.search(name)
    return dict(tag=found.group(1), value='', subelements=[])


def preprocess(html):
    """Normalise markup before it is tokenized.

    Removes DOCTYPE declarations and comments, collapses every run of
    whitespace into a single space and strips both ends of the result.

    :param html: markup to clean up.
    :returns: the normalised markup.
    """
    for pattern in (DOCTYPE_RE, COMMENT_RE):
        html = pattern.sub('', html)
    collapsed = WHITESPACE_RE.sub(' ', html)
    return collapsed.strip()


def parse(html):
    """Parse XHTML markup into a tree of element dictionaries.

    The input is normalised with ``preprocess`` and then consumed token by
    token.  A start tag opens a new element (see ``create_elem``); an end
    tag closes the innermost open element and attaches it to its parent.
    Every other token (text, self-closing tags, CDATA sections) is appended
    verbatim, stripped of surrounding whitespace, to the ``value`` of the
    innermost open element.

    :param html: markup to parse.
    :returns: the last top-level element that was closed.
    :raises RuntimeError: if no token pattern matches, if an end tag or
        other content appears outside of any open element, if elements are
        left unclosed, or if the input contains no element at all.
    """
    stack = []
    root = None
    html = preprocess(html)

    while html:
        # Order matters: START_RE also matches end and self-closing tags,
        # so the more specific patterns have to be tried first.
        match = (
            CDATA_RE.match(html) or
            SPECIAL_RE.match(html) or
            END_RE.match(html) or
            START_RE.match(html) or
            TEXT_RE.match(html)
        )

        if match is None:
            raise RuntimeError('Invalid XHTML.')

        token = match.group(0)

        if match.re == START_RE:
            stack.append(create_elem(token))
        elif match.re == END_RE:
            if not stack:
                # End tag without a matching start tag.
                raise RuntimeError('Invalid XHTML.')
            # The end tag's name is not compared with the element closed.
            elem = stack.pop()
            if stack:
                stack[-1]['subelements'].append(elem)
            else:
                root = elem
        else:
            if not stack:
                # Content outside of any element has nowhere to be stored.
                raise RuntimeError('Invalid XHTML.')
            stack[-1]['value'] += token.strip()

        html = html[len(token):].strip()

    if stack or root is None:
        # Unclosed elements remain, or the input held no element at all.
        raise RuntimeError('Invalid XHTML.')

    return root


def pretty(tree, indent=0):
    """Render an element tree as indented markup.

    An element without subelements is written on one line as
    ``<tag>value</tag>``.  Otherwise the opening tag, each subelement, the
    element's own ``value`` (if not empty) and the closing tag each get
    their own line; nested content is indented by four more spaces.

    :param tree: element dict with the keys ``tag``, ``value`` and
        ``subelements``.
    :param indent: number of spaces placed before this element's tags.
    :returns: the rendered markup as a single string.
    """
    pad = ' ' * indent
    opening = '<{}>'.format(tree['tag'])
    closing = '</{}>'.format(tree['tag'])

    if not tree['subelements']:
        return pad + opening + tree['value'] + closing

    lines = [pad + opening]
    lines.extend(
        pretty(subelem, indent=indent + 4) for subelem in tree['subelements']
    )
    if tree['value']:
        # Keep the element's own text apart from its last child's end tag.
        lines.append(' ' * (indent + 4) + tree['value'])
    # The closing tag lines up with the opening tag.
    lines.append(pad + closing)
    return '\n'.join(lines)


def main():
    """Download the page at ``WIKI_URL`` and parse it into an element tree."""
    tree = parse(download(WIKI_URL))
    # ...


# Run the download-and-parse pipeline only when executed as a script.
if __name__ == '__main__':
    main()
	# -*- coding: utf-8 -*-


	import re
	import urllib


	# Page downloaded by ``main``.
	WIKI_URL = ('http://en.wikipedia.org/wiki/'
	            'List_of_countries_and_dependencies_by_area')

	# Token patterns, each anchored at the start of the remaining input.
	# Plain raw strings are used instead of the ``ur''`` prefix: the patterns
	# are pure ASCII, and ``ur''`` is a syntax error on Python 3.
	START_RE = re.compile(r'^<[^<>]+>')  # any tag, so it must be tried last
	END_RE = re.compile(r'^</[^<>]+>')  # closing tag such as </div>
	SPECIAL_RE = re.compile(r'^<[^<>]+ ?/>')  # self-closing tag such as <br />
	TEXT_RE = re.compile(r'^[^<>]+')  # run of characters without angle brackets
	CDATA_RE = re.compile(r'^<!\[CDATA\[(.*?)\]\]>')  # group 1 is the payload

	# Group 1 is the element name of a start or end tag.
	ELEM_NAME_RE = re.compile(r'</?([^ <>]+)[^<>]*>')

	# Patterns used to normalise the markup before tokenizing.
	WHITESPACE_RE = re.compile(r'\s+')
	COMMENT_RE = re.compile(r'<!--(.*?)-->', re.S)  # re.S: comments may span lines
	DOCTYPE_RE = re.compile(r'<!DOCTYPE [^<>]+>', re.I)


	def download(url):
	    """Fetch ``url`` and return the response body.

	    :param url: address passed unchanged to ``urllib.urlopen``.
	    :returns: whatever ``read()`` on the opened stream yields.
	    """
	    stream = urllib.urlopen(url)
	    try:
	        return stream.read()
	    finally:
	        # Release the connection even when reading fails.
	        stream.close()


	def create_elem(name):
	    """Build an empty element record from the markup of a tag.

	    :param name: tag markup such as ``<div class="x">``; the element name
	        is taken from the first group of ``ELEM_NAME_RE``.
	    :returns: dict with the keys ``tag``, ``value`` (empty string) and
	        ``subelements`` (empty list).
	    """
	    tag = ELEM_NAME_RE.search(name).group(1)
	    return {'tag': tag, 'value': '', 'subelements': []}


	def preprocess(html):
	    """Normalise markup before it is tokenized.

	    Removes DOCTYPE declarations and comments, collapses every run of
	    whitespace into a single space and strips both ends of the result.

	    :param html: markup to clean up.
	    :returns: the normalised markup.
	    """
	    html = DOCTYPE_RE.sub('', html)
	    html = COMMENT_RE.sub('', html)
	    return WHITESPACE_RE.sub(' ', html).strip()


	def parse(html):
	    """Parse XHTML markup into a tree of element dictionaries.

	    The input is normalised with ``preprocess`` and then consumed token by
	    token.  A start tag opens a new element (see ``create_elem``); an end
	    tag closes the innermost open element and attaches it to its parent.
	    Every other token (text, self-closing tags, CDATA sections) is
	    appended verbatim, stripped of surrounding whitespace, to the
	    ``value`` of the innermost open element.

	    :param html: markup to parse.
	    :returns: the last top-level element that was closed.
	    :raises RuntimeError: if no token pattern matches, if an end tag or
	        other content appears outside of any open element, if elements
	        are left unclosed, or if the input contains no element at all.
	    """
	    stack = []
	    root = None
	    html = preprocess(html)

	    while html:
	        # Order matters: START_RE also matches end and self-closing tags,
	        # so the more specific patterns have to be tried first.
	        match = (
	            CDATA_RE.match(html) or
	            SPECIAL_RE.match(html) or
	            END_RE.match(html) or
	            START_RE.match(html) or
	            TEXT_RE.match(html)
	        )

	        if match is None:
	            raise RuntimeError('Invalid XHTML.')

	        token = match.group(0)

	        if match.re == START_RE:
	            stack.append(create_elem(token))
	        elif match.re == END_RE:
	            if not stack:
	                # End tag without a matching start tag.
	                raise RuntimeError('Invalid XHTML.')
	            # The end tag's name is not compared with the element closed.
	            elem = stack.pop()
	            if stack:
	                stack[-1]['subelements'].append(elem)
	            else:
	                root = elem
	        else:
	            if not stack:
	                # Content outside of any element has nowhere to be stored.
	                raise RuntimeError('Invalid XHTML.')
	            stack[-1]['value'] += token.strip()

	        html = html[len(token):].strip()

	    if stack or root is None:
	        # Unclosed elements remain, or the input held no element at all.
	        raise RuntimeError('Invalid XHTML.')

	    return root


	def pretty(tree, indent=0):
	    """Render an element tree as indented markup.

	    An element without subelements is written on one line as
	    ``<tag>value</tag>``.  Otherwise the opening tag, each subelement, the
	    element's own ``value`` (if not empty) and the closing tag each get
	    their own line; nested content is indented by four more spaces.

	    :param tree: element dict with the keys ``tag``, ``value`` and
	        ``subelements``.
	    :param indent: number of spaces placed before this element's tags.
	    :returns: the rendered markup as a single string.
	    """
	    pad = ' ' * indent
	    opening = '<{}>'.format(tree['tag'])
	    closing = '</{}>'.format(tree['tag'])

	    if not tree['subelements']:
	        return pad + opening + tree['value'] + closing

	    lines = [pad + opening]
	    lines.extend(
	        pretty(subelem, indent=indent + 4) for subelem in tree['subelements']
	    )
	    if tree['value']:
	        # Keep the element's own text apart from its last child's end tag.
	        lines.append(' ' * (indent + 4) + tree['value'])
	    # The closing tag lines up with the opening tag.
	    lines.append(pad + closing)
	    return '\n'.join(lines)


	def main():
	    """Download the page at ``WIKI_URL`` and parse it into an element tree."""
	    html = download(WIKI_URL)
	    tree = parse(html)
	    # ...


	# Run the download-and-parse pipeline only when executed as a script.
	if __name__ == '__main__':
	    main()