Skip to content

Instantly share code, notes, and snippets.

@paveldedik
Last active August 29, 2015 14:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save paveldedik/bdf30df08796375b946b to your computer and use it in GitHub Desktop.
Save paveldedik/bdf30df08796375b946b to your computer and use it in GitHub Desktop.
Parsing HTML using Regular Expressions and a Stack
# -*- coding: utf-8 -*-
import re
import urllib
# Page downloaded and parsed by ``main``.
WIKI_URL = ('http://en.wikipedia.org/wiki/'
            'List_of_countries_and_dependencies_by_area')

# All patterns are plain raw strings: they contain only ASCII, and the
# ``ur`` prefix is a syntax error on Python 3.

# Tokenizer patterns, each anchored at the start of the remaining input.
# START_RE matches any ``<...>`` token, including closing and self-closing
# tags, so ``parse`` must try END_RE and SPECIAL_RE before it.
START_RE = re.compile(r'^<[^<>]+>')
END_RE = re.compile(r'^</[^<>]+>')
# Self-closing tag such as ``<br/>`` or ``<br />``.
SPECIAL_RE = re.compile(r'^<[^<>]+ ?/>')
# Run of character data up to the next angle bracket.
TEXT_RE = re.compile(r'^[^<>]+')
# CDATA section; group 1 captures the payload between the delimiters.
CDATA_RE = re.compile(r'^<!\[CDATA\[(.*?)\]\]>')

# Extracts the tag name (group 1) from an opening or closing tag.
ELEM_NAME_RE = re.compile(r'</?([^ <>]+)[^<>]*>')

# Clean-up patterns applied by ``preprocess``.
WHITESPACE_RE = re.compile(r'\s+')
# re.S lets a comment span several lines.
COMMENT_RE = re.compile(r'<!--(.*?)-->', re.S)
# re.I accepts ``<!doctype ...>`` in any letter case.
DOCTYPE_RE = re.compile(r'<!DOCTYPE [^<>]+>', re.I)
def download(url):
    """Fetch *url* and return the raw response body.

    The response object is closed before returning, even when reading
    raises, so the underlying connection is not leaked.
    """
    stream = urllib.urlopen(url)
    try:
        return stream.read()
    finally:
        # Closed explicitly rather than via ``with``: the object returned
        # by Python 2's ``urllib.urlopen`` is not a context manager.
        stream.close()
def create_elem(name):
    """Build an empty element node from the tag markup *name*.

    The node is a dict with the tag name captured by ``ELEM_NAME_RE``
    under ``'tag'``, an empty string under ``'value'`` and an empty list
    under ``'subelements'``.
    """
    name_match = ELEM_NAME_RE.search(name)
    return dict(tag=name_match.group(1), value='', subelements=[])
def preprocess(html):
    """Return *html* stripped of markup the tokenizer does not handle.

    DOCTYPE declarations and comments are removed, every run of
    whitespace is collapsed to a single space, and leading/trailing
    whitespace is trimmed.
    """
    # Order matters only in that both removals happen before collapsing.
    for pattern in (DOCTYPE_RE, COMMENT_RE):
        html = pattern.sub('', html)
    collapsed = WHITESPACE_RE.sub(' ', html)
    return collapsed.strip()
def parse(html):
    """Parse an XHTML document into a tree of element dictionaries.

    The input is cleaned with ``preprocess`` and then consumed token by
    token from the front. Opening tags push a new element (see
    ``create_elem``) onto a stack; closing tags pop the innermost open
    element and attach it to its parent's ``'subelements'``. CDATA
    sections, self-closing tags and text are appended verbatim (stripped
    of surrounding whitespace) to the ``'value'`` of the innermost open
    element.

    Closing tag names are not compared with the element they close, and
    elements still open at the end of the input are not reported.

    Returns:
        The element that was opened or closed last. For a well-formed
        document with a single root this is the root element.

    Raises:
        RuntimeError: if the input cannot be tokenized, contains no
            elements at all, has a closing tag with no open element, or
            has text/CDATA/self-closing markup outside any element.
    """
    stack = []   # elements opened but not yet closed, innermost last
    elem = None  # most recently opened or closed element
    html = preprocess(html)
    while html:
        # Order matters: START_RE also matches closing and self-closing
        # tags, so the more specific patterns are tried first.
        match = (
            CDATA_RE.match(html) or
            SPECIAL_RE.match(html) or
            END_RE.match(html) or
            START_RE.match(html) or
            TEXT_RE.match(html)
        )
        if match is None:
            raise RuntimeError('Invalid XHTML.')
        token = match.group(0)
        if match.re is START_RE:
            elem = create_elem(token)
            stack.append(elem)
        elif match.re is END_RE:
            if not stack:
                # Previously surfaced as a bare IndexError from pop().
                raise RuntimeError(
                    'Invalid XHTML: unexpected closing tag {}.'.format(token))
            elem = stack.pop()
            if stack:
                stack[-1]['subelements'].append(elem)
        else:
            if not stack:
                # Previously surfaced as a bare IndexError from stack[-1].
                raise RuntimeError(
                    'Invalid XHTML: content outside of any element.')
            stack[-1]['value'] += token.strip()
        # Drop the consumed token plus any whitespace that followed it.
        html = html[len(token):].strip()
    if elem is None:
        # Empty input: previously an UnboundLocalError on ``return elem``.
        raise RuntimeError('Invalid XHTML: document contains no elements.')
    return elem
def pretty(tree, indent=0):
    """Render an element tree as indented pseudo-markup.

    Args:
        tree: element dict with ``'tag'``, ``'value'`` and
            ``'subelements'`` keys.
        indent: number of spaces to indent this element by; children are
            indented four spaces further.

    Returns:
        A string without a trailing newline. An element without children
        is rendered on one line as ``<tag>value</tag>``. An element with
        children puts its opening tag, each child, its own text (if any)
        and its closing tag on separate lines, with the closing tag
        aligned with the opening tag.
    """
    pad = ' ' * indent
    open_tag = '<{}>'.format(tree['tag'])
    close_tag = '</{}>'.format(tree['tag'])
    children = tree['subelements']
    if not children:
        return pad + open_tag + tree['value'] + close_tag

    lines = [pad + open_tag]
    lines.extend(pretty(child, indent=indent + 4) for child in children)
    if tree['value']:
        # Keep the element's own text on its own line so it is not glued
        # to the last child's closing tag.
        lines.append(' ' * (indent + 4) + tree['value'])
    # Align the closing tag with the opening tag instead of column 0.
    lines.append(pad + close_tag)
    return '\n'.join(lines)
def main():
    """Download the page at ``WIKI_URL`` and parse it into an element tree."""
    tree = parse(download(WIKI_URL))
    # ...
# Run the download-and-parse demo only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment