Skip to content

Instantly share code, notes, and snippets.

@jgraham
Created February 12, 2012 12:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jgraham/1808356 to your computer and use it in GitHub Desktop.
Save jgraham/1808356 to your computer and use it in GitHub Desktop.
HTMLParser backed by html5lib
import tokenizer
from constants import tokenTypes, tagTokenTypes
class HTMLParser(object):
    """Event-style HTML parser driven by an ``HTMLTokenizer``.

    Subclass this and override the public ``handle_*`` methods to receive
    events. The private ``_handle_*`` methods unpack tokenizer tokens and
    forward the relevant fields to their public counterparts, all of which
    do nothing by default.
    """

    def __init__(self):
        """Create a parser with no tokenizer attached."""
        self.reset()

    def reset(self):
        """Drop the current tokenizer, if any."""
        self._tokenizer = None

    def feed(self, data):
        """Tokenize *data* and dispatch an event for every token produced.

        Each call builds a new ``HTMLTokenizer`` from *data*; nothing is
        carried over from a previous call.
        """
        self._tokenizer = tokenizer.HTMLTokenizer(data)
        self._process()

    def close(self):
        """Do nothing; ``feed`` already processes its input completely."""
        pass

    def getpos(self):
        """Return the ``position`` attribute of the tokenizer's stream.

        Raises ``AttributeError`` when called before ``feed`` (there is no
        tokenizer yet).
        """
        # NOTE(review): if ``stream.position`` is a method rather than a
        # plain attribute this returns the bound method, not a position --
        # confirm against the tokenizer's stream class.
        return self._tokenizer.stream.position

    def _process(self):
        """Run the tokenizer to exhaustion, dispatching each token by type."""
        handlers = dict((tokenTypes[key], value) for key, value in [
            ("Doctype", self._handle_decl),
            ("Characters", self._handle_data),
            ("SpaceCharacters", self._handle_data),
            ("StartTag", self._handle_starttag),
            ("EndTag", self._handle_endtag),
            ("EmptyTag", self._handle_starttag),
            ("Comment", self._handle_comment),
            # Parse errors are deliberately ignored (no handler).
            ("ParseError", None)])
        for token in self._tokenizer:
            handler = handlers[token["type"]]
            if handler is None:
                # Calling None would raise TypeError; skip the token instead.
                continue
            handler(token)

    def _handle_decl(self, token):
        """Build a ``DOCTYPE <name>`` string and pass it to ``handle_decl``."""
        data = "DOCTYPE %s" % token["name"]
        # TODO: add the remaining doctype data here.
        self.handle_decl(data)

    def handle_decl(self, data):
        """Called with the declaration text; override in a subclass."""
        pass

    def _handle_data(self, token):
        """Forward the token's ``data`` field to ``handle_data``."""
        self.handle_data(token["data"])

    def handle_data(self, data):
        """Called with character data; override in a subclass."""
        pass

    def _handle_starttag(self, token):
        """Dispatch a start tag, switching tokenizer state where required.

        ``script`` and ``style`` start tags move the tokenizer into its
        script-data and rawtext states respectively before the event is
        delivered. Tokens flagged ``selfClosing`` are reported through
        ``handle_startendtag``; all others through ``handle_starttag``.
        """
        name = token["name"]
        if name == "script":
            self._tokenizer.state = self._tokenizer.scriptDataState
        elif name == "style":
            self._tokenizer.state = self._tokenizer.rawtextState
        if token["selfClosing"]:
            self.handle_startendtag(name, token["data"])
        else:
            self.handle_starttag(name, token["data"])

    def handle_starttag(self, name, attrs):
        """Called with the tag name and attributes; override in a subclass."""
        pass

    def handle_startendtag(self, name, attrs):
        """Report a self-closing tag as a start tag followed by an end tag."""
        self.handle_starttag(name, attrs)
        # handle_endtag accepts only the tag name; attributes belong to the
        # start-tag event.
        self.handle_endtag(name)

    def _handle_endtag(self, token):
        """Forward the token's ``name`` field to ``handle_endtag``."""
        self.handle_endtag(token["name"])

    def handle_endtag(self, name):
        """Called with the tag name; override in a subclass."""
        pass

    def _handle_comment(self, token):
        """Forward the token's ``data`` field to ``handle_comment``."""
        self.handle_comment(token["data"])

    def handle_comment(self, data):
        """Called with the comment text; override in a subclass."""
        pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment