Created
September 28, 2016 22:42
-
-
Save arciisine/b58e681a38161d66d405d19b9663eb07 to your computer and use it in GitHub Desktop.
Python HTML Parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import sys | |
class Node(object):
    """Common base class for every node of the parsed tree."""

    def __init__(self):
        """Set up nothing; exists so subclasses have a constructor to chain to."""
class TextNode(Node):
    """Leaf node holding a run of character data.

    Both ``str()`` and ``repr()`` of the node yield the stored text
    unchanged (no quoting, no escaping).
    """

    def __init__(self, text=''):
        """Store *text* (defaults to the empty string) on the node."""
        Node.__init__(self)
        self.text = text

    def __repr__(self):
        # The serialised form of a text node is simply its raw text.
        return self.text

    def __str__(self):
        # Delegate so that str() and repr() can never disagree.
        return repr(self)
class TagNode(Node):
    """Element node: a tag name, an attribute mapping and ordered children.

    ``str()``/``repr()`` serialise the node as
    ``<name attrs>children</name>``, serialising each child with ``str()``.
    """

    def __init__(self, tagName = None, attributes = None, children = None):
        """Create an element.

        tagName    -- name used in the opening and closing tag.
        attributes -- mapping of attribute name to value; a value of None
                      denotes a bare attribute without a value. A fresh
                      dict is created when omitted.
        children   -- list of child nodes. A fresh list is created when
                      omitted.
        """
        Node.__init__(self)
        self.tagName = tagName
        # Build new containers per instance so nodes never share state.
        self.children = children if children is not None else []
        self.attributes = attributes if attributes is not None else {}

    @staticmethod
    def _format_attribute(key, value):
        """Return one attribute rendered with a leading space.

        A None value renders the bare key. Otherwise the value is wrapped
        in double quotes, unless it contains a double quote itself: then
        single quotes are used, and if it contains both kinds the double
        quotes are written as ``&quot;`` so the markup stays well formed.
        """
        if value is None:
            return ' %s' % (key,)
        text = '%s' % (value,)
        if '"' not in text:
            return ' %s="%s"' % (key, text)
        if "'" not in text:
            return " %s='%s'" % (key, text)
        return ' %s="%s"' % (key, text.replace('"', '&quot;'))

    def __str__(self):
        return repr(self)

    def __repr__(self):
        attrs = ''.join(
            TagNode._format_attribute(key, value)
            for key, value in self.attributes.items()
        )
        children = ''.join(map(str, self.children))
        return '<%(name)s%(attrs)s>%(children)s</%(name)s>' % \
            {"name": self.tagName, "children": children, "attrs": attrs}
class Parser(object):
    """Character-at-a-time parser for a small, strict subset of HTML.

    Feed characters to ``consume``; the tree is built under ``self.stack[-1]``
    (an unnamed root ``TagNode``) while ``self.top`` tracks the innermost
    element that is still open. ``self.stack`` is ordered innermost-first.

    Supported input: text, start tags with optional attributes (bare, or
    with a single- or double-quoted value) and end tags. The name in an end
    tag is not compared with the element being closed. Anything else
    raises ``Exception``.
    """

    @staticmethod
    def is_valid_ident_letter(ch):
        """Return True when *ch* is an ASCII letter."""
        return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z'

    @staticmethod
    def is_valid_ident_start(ch):
        """Return True when *ch* may begin a tag or attribute name."""
        return Parser.is_valid_ident_letter(ch) or ch in ('-', '$', ':')

    @staticmethod
    def is_valid_ident_char(ch):
        """Return True when *ch* may appear after the first character of a name.

        Accepts everything ``is_valid_ident_start`` accepts plus ASCII
        digits and ``_`` so that names such as ``h1`` or ``data-id`` parse.
        """
        return Parser.is_valid_ident_start(ch) or '0' <= ch <= '9' or ch == '_'

    @staticmethod
    def is_whitespace(ch):
        """Return True when *ch* is a space, tab, newline or carriage return."""
        return ch in (' ', '\t', '\n', '\r')

    def __init__(self):
        """Create a parser whose tree consists of an empty, unnamed root."""
        self.top = TagNode('')
        self.stack = [self.top]

        # Where in the grammar the parser currently is.
        self.in_start_tag = False   # None right after '<', then True/False
        self.in_tag = False
        self.is_start_tag = None    # not read anywhere in this class
        self.in_tag_name = False
        self.in_attr_key = False
        self.in_attr_value = False
        self.attr_quote = None      # quote character of the value being read

        # Pieces collected for the construct currently being read.
        self.pending_tag = None     # not read anywhere in this class
        self.pending_text = ''
        self.pending_node = None    # not read anywhere in this class
        self.pending_name = None
        self.pending_attrs = {}
        self.pending_key = None
        self.pending_value = None   # not read anywhere in this class

    def startTag(self):
        """Enter tag mode after a '<', flushing any text collected so far."""
        if self.pending_text:
            self.flushText()
        self.in_tag = True
        self.in_start_tag = None
        self.in_tag_name = False
        # Clear attribute state too: flushName/flushAttr leave in_attr_key
        # set, and without this reset it would leak into the next tag.
        self.in_attr_key = False
        self.in_attr_value = False
        self.attr_quote = None
        self.pending_name = None
        self.pending_key = None
        self.pending_attrs = {}
        self.pending_text = ''

    def flushName(self):
        """Turn the collected text into the tag name.

        Raises Exception("Missing name") when no text was collected.
        """
        if not self.pending_text:
            raise Exception("Missing name")
        self.pending_name = self.pending_text
        self.pending_text = ''
        self.in_tag_name = False
        self.in_attr_key = True

    def flushText(self):
        """Append the collected text to the open element as a TextNode."""
        self.top.children.append(TextNode(self.pending_text))
        self.pending_text = ''

    def flushKey(self):
        """Turn the collected text into an attribute key; a value follows."""
        self.in_attr_key = False
        self.in_attr_value = True
        self.pending_key = self.pending_text
        self.pending_text = ''

    def flushAttr(self):
        """Record the attribute collected so far and expect another key.

        With a pending key the collected text becomes its value; without
        one, non-empty collected text is recorded as a bare attribute whose
        value is None. When nothing was collected nothing is recorded.
        """
        if self.pending_key:
            self.pending_attrs[self.pending_key] = self.pending_text
        elif self.pending_text:
            self.pending_attrs[self.pending_text] = None
        self.pending_text = ''
        self.pending_key = ''
        self.in_attr_key = True
        self.in_attr_value = False
        self.attr_quote = None

    def push(self):
        """Finish a start tag: create the element and make it the open one."""
        if self.in_tag_name:
            self.flushName()
        elif self.in_attr_key:
            self.flushAttr()
        node = TagNode(self.pending_name, self.pending_attrs)
        self.top.children.append(node)
        self.stack.insert(0, node)
        self.top = node
        self.in_tag = False
        self.in_start_tag = None
        self.pending_name = None
        self.pending_attrs = {}

    def pop(self):
        """Finish an end tag: close the innermost open element.

        Raises Exception("Unexpected closing tag") when only the root is
        left on the stack, instead of failing with a bare IndexError.
        """
        if len(self.stack) < 2:
            raise Exception("Unexpected closing tag")
        self.stack = self.stack[1:]
        self.top = self.stack[0]
        self.in_tag = False
        self.in_start_tag = None
        self.pending_text = ''
        self.pending_name = None

    def err(self, ch):
        """Raise the parse error for an unexpected character *ch*."""
        raise Exception("Invalid token: "+ch)

    def consume(self, ch):
        """Advance the parser by the single character *ch*.

        Raises Exception on input outside the supported grammar.
        """
        # Outside a tag: '<' opens one, everything else is text.
        if not self.in_tag:
            if ch == '<':
                self.startTag()
            else:
                self.pending_text += ch
            return

        # '>' ends the tag, except inside a quoted attribute value where it
        # is ordinary data.
        if ch == '>' and not self.attr_quote:
            if self.in_start_tag is None:
                # "<>": nothing was read between the brackets.
                raise Exception("Missing name")
            if self.in_start_tag:
                self.push()
            else:
                self.pop()
            return

        if self.pending_name is None:
            # Still reading the tag name.
            if not self.in_tag_name:
                # First character after '<' decides start tag vs end tag.
                self.in_tag_name = True
                self.in_start_tag = ch != '/'
                if not self.in_start_tag:
                    return
            if not self.pending_text:
                # Skip whitespace before the name; validate its first char.
                if Parser.is_whitespace(ch):
                    return
                if not Parser.is_valid_ident_start(ch):
                    self.err(ch)
            elif Parser.is_whitespace(ch):
                self.flushName()
                return
            elif not Parser.is_valid_ident_char(ch):
                self.err(ch)
        elif self.in_start_tag:
            # Name is known: reading attributes of a start tag.
            if self.in_attr_key:
                if ch == '=':
                    self.flushKey()
                    return
                if Parser.is_whitespace(ch):
                    self.flushAttr()
                    return
                if self.pending_text:
                    valid = Parser.is_valid_ident_char(ch)
                else:
                    valid = Parser.is_valid_ident_start(ch)
                if not valid:
                    self.err(ch)
            elif self.in_attr_value:
                if self.attr_quote:
                    # Inside the quotes: only the matching quote is special.
                    if ch == self.attr_quote:
                        self.flushAttr()
                        return
                elif ch == "'" or ch == '"':
                    self.attr_quote = ch
                    return
                elif Parser.is_whitespace(ch):
                    # Whitespace between '=' and the opening quote is not
                    # part of the value.
                    return
                else:
                    self.err(ch)

        self.pending_text += ch
def parse(text):
    """Parse *text* and return the root node of the resulting tree.

    text -- an iterable of string chunks (for example an open file, which
            yields lines). A single string also works, since iterating it
            yields one-character strings. Chunks that are not ``str``
            instances are skipped.

    The returned root is an unnamed TagNode whose children are the
    top-level nodes of the document. Errors raised by ``Parser.consume``
    propagate to the caller.
    """
    parser = Parser()
    for block in text:
        if isinstance(block, str):
            for ch in block:
                parser.consume(ch)
    # Text after the last tag is only flushed when another '<' arrives, so
    # flush it here or it would be lost. Skip this inside an unterminated
    # tag, where pending_text holds a partial name or attribute, not text.
    if parser.pending_text and not parser.in_tag:
        parser.flushText()
    # The stack is ordered innermost-first, so its last entry is the root.
    # Returning it (rather than parser.top) yields the whole tree even
    # when some elements were never closed.
    return parser.stack[-1]
if __name__ == '__main__':
    # Parse standard input and print the serialised tree. The
    # single-argument call form is valid on both Python 2 and Python 3,
    # unlike the print statement.
    print(parse(sys.stdin))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment