Skip to content

Instantly share code, notes, and snippets.

@arciisine
Created September 28, 2016 22:42
Show Gist options
  • Save arciisine/b58e681a38161d66d405d19b9663eb07 to your computer and use it in GitHub Desktop.
Save arciisine/b58e681a38161d66d405d19b9663eb07 to your computer and use it in GitHub Desktop.
Python HTML Parser
#!/usr/bin/python
import sys
class Node(object):
    """Common base class for every node in the parsed document tree.

    It carries no state of its own; subclasses call ``Node.__init__``
    explicitly from their constructors.
    """

    def __init__(self):
        """Create an empty node (nothing to initialise)."""
class TextNode(Node):
    """Leaf node that stores a run of literal text.

    Both ``str()`` and ``repr()`` yield the stored text unchanged, which is
    what lets a parent node serialise its children by simply joining their
    string forms.
    """

    def __init__(self, text = ''):
        """Store ``text`` (defaults to the empty string) on the node."""
        Node.__init__(self)
        self.text = text

    def __repr__(self):
        """Return the raw text exactly as it was stored."""
        return self.text

    def __str__(self):
        """Return the same value as ``repr(self)``."""
        return repr(self)
class TagNode(Node):
    """Element node: a tag name, an attribute mapping and ordered children.

    ``attributes`` maps an attribute name to its value; a value of ``None``
    marks a bare attribute that is written without ``="..."``.
    """

    def __init__(self, tagName = None, attributes = None, children = None):
        """Store the tag name, creating fresh containers when none are given."""
        Node.__init__(self)
        self.tagName = tagName
        # Fresh containers per instance so nodes never share mutable state.
        if children is None:
            children = []
        self.children = children
        if attributes is None:
            attributes = {}
        self.attributes = attributes

    def __repr__(self):
        """Serialise the element, its attributes and all children as markup."""
        attr_parts = []
        for pair in self.attributes.items():
            if pair[1] is None:
                # Bare attribute: emit the name only.
                attr_parts.append(' %s'%pair[0])
            else:
                attr_parts.append(' %s="%s"'%pair)
        child_parts = [str(child) for child in self.children]
        fields = {
            "name": self.tagName,
            "children": ''.join(child_parts),
            "attrs": ''.join(attr_parts),
        }
        return '<%(name)s%(attrs)s>%(children)s</%(name)s>' % fields

    def __str__(self):
        """Return the same markup as ``repr(self)``."""
        return repr(self)
class Parser(object):
    """Character-driven parser for a small HTML-like subset.

    Characters are fed one at a time to :meth:`consume`. Elements are
    collected into a tree of ``TagNode``/``TextNode`` objects. ``stack`` holds
    the open elements with the innermost one first, so ``stack[0]`` (also
    available as ``top``) is the element currently being filled and
    ``stack[-1]`` is the synthetic root created in ``__init__``.

    Known limitations of the accepted syntax:

    * after their first character, tag names and attribute names may only
      contain ASCII letters (see :meth:`is_valid_ident_letter`);
    * attribute values must be wrapped in single or double quotes;
    * the name in a closing tag is not compared with the open element;
    * text is only stored in the tree when the next ``<`` arrives, so a
      caller must call :meth:`flushText` for text after the final tag.
    """

    @staticmethod
    def is_valid_ident_letter(ch):
        """Return True when ``ch`` is an ASCII letter."""
        return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z'

    @staticmethod
    def is_valid_ident_start(ch):
        """Return True when ``ch`` is an ASCII letter or one of ``-``, ``$``, ``:``."""
        return Parser.is_valid_ident_letter(ch) or ch in ('-', '$', ':')

    @staticmethod
    def is_whitespace(ch):
        """Return True for space, tab, line feed and carriage return."""
        return ch in (' ', '\t', '\n', '\r')

    def __init__(self):
        """Create a parser whose tree contains only the empty-named root."""
        self.top = TagNode('')
        self.stack = [self.top]
        self.in_start_tag = False
        self.in_tag = False
        self.in_tag_name = False
        self.in_attr_key = False
        self.in_attr_value = False
        self.attr_quote = None
        self.pending_text = ''
        self.pending_name = None
        self.pending_attrs = {}
        self.pending_key = None
        # The attributes below are never read by this class; they are kept
        # so that external code inspecting a parser keeps working.
        self.is_start_tag = None
        self.pending_tag = None
        self.pending_node = None
        self.pending_value = None

    def startTag(self):
        """Begin a new tag after ``<``, storing any text buffered before it."""
        if self.pending_text:
            self.flushText()
        self.in_tag = True
        self.in_start_tag = None
        self.in_tag_name = False
        self.pending_name = None
        self.pending_text = ''
        # Clear attribute state left behind by the previous tag. Without this
        # a stale in_attr_key/in_attr_value flag made the first character of
        # the next tag name go through attribute validation, so e.g. the tag
        # following "<a b=>" was rejected with "Invalid token".
        self.in_attr_key = False
        self.in_attr_value = False
        self.attr_quote = None
        self.pending_key = None
        self.pending_attrs = {}

    def flushName(self):
        """Turn the buffered text into the tag name and expect attributes next.

        Raises:
            Exception: if no name characters were buffered.
        """
        if not self.pending_text:
            raise Exception("Missing name")
        self.pending_name = self.pending_text
        self.pending_text = ''
        self.in_tag_name = False
        self.in_attr_key = True

    def flushText(self):
        """Append the buffered text to the current element as a ``TextNode``."""
        self.top.children.append(TextNode(self.pending_text))
        self.pending_text = ''

    def flushKey(self):
        """Turn the buffered text into an attribute name and expect its value."""
        self.in_attr_key = False
        self.in_attr_value = True
        self.pending_key = self.pending_text
        self.pending_text = ''

    def flushAttr(self):
        """Record the attribute being built and get ready for the next one.

        With a pending key the buffered text becomes its value; with only
        buffered text a bare attribute (value ``None``) is recorded; with
        neither, nothing is recorded.
        """
        if self.pending_key:
            self.pending_attrs[self.pending_key] = self.pending_text
        elif self.pending_text:
            self.pending_attrs[self.pending_text] = None
        self.pending_text = ''
        self.pending_key = ''
        self.in_attr_key = True
        self.in_attr_value = False
        self.attr_quote = None

    def push(self):
        """Finish a start tag: create its node and make it the current element."""
        if self.in_tag_name:
            self.flushName()
        elif self.in_attr_key or self.in_attr_value:
            # in_attr_value covers "<a b=>": keep "b" (with an empty value)
            # instead of silently dropping it.
            self.flushAttr()
        node = TagNode(self.pending_name, self.pending_attrs)
        self.top.children.append(node)
        self.stack.insert(0, node)
        self.top = node
        self.in_tag = False
        self.in_start_tag = None
        self.pending_name = None
        # New dict: the previous one now belongs to the node just created.
        self.pending_attrs = {}

    def pop(self):
        """Finish a closing tag: make the parent the current element again.

        Raises:
            Exception: if only the root is open, i.e. the closing tag has no
                matching start tag. The stack is left untouched in that case.
        """
        if len(self.stack) < 2:
            raise Exception("Unexpected closing tag")
        self.stack = self.stack[1:]
        self.top = self.stack[0]
        self.in_tag = False
        self.in_start_tag = None
        self.pending_text = ''
        self.pending_name = None

    def err(self, ch):
        """Raise an ``Exception`` reporting ``ch`` as an invalid token."""
        raise Exception("Invalid token: "+ch)

    def consume(self, ch):
        """Advance the parser by one character.

        Raises:
            Exception: on a character that is not allowed at the current
                position, on a missing tag name, or on an unmatched closing
                tag.
        """
        # A '<' outside of a tag opens a new one.
        if not self.in_tag and ch == '<':
            self.startTag()
            return

        if self.in_tag:
            # Inside a quoted attribute value '>' is ordinary data.
            inside_quotes = self.in_attr_value and self.attr_quote
            if ch == '>' and not inside_quotes:
                if self.in_start_tag:
                    self.push()
                else:
                    self.pop()
                return
            elif self.pending_name is None:
                # The tag name has not been completed yet.
                if not self.in_tag_name:
                    # First character after '<': '/' marks a closing tag and
                    # is skipped; anything else starts the name and falls
                    # through to be buffered below.
                    self.in_tag_name = True
                    self.in_start_tag = not ch == '/'
                    if not self.in_start_tag:
                        return
                elif not self.pending_text:
                    if Parser.is_whitespace(ch):
                        return
                    elif not Parser.is_valid_ident_start(ch):
                        self.err(ch)
                elif Parser.is_whitespace(ch):
                    self.flushName()
                    return
                elif not Parser.is_valid_ident_letter(ch):
                    self.err(ch)

            # Attributes are only parsed inside start tags.
            if self.in_start_tag:
                if self.in_attr_key:
                    if ch == '=':
                        self.flushKey()
                        return
                    elif Parser.is_whitespace(ch):
                        self.flushAttr()
                        return
                    elif not Parser.is_valid_ident_letter(ch):
                        self.err(ch)

                if self.in_attr_value:
                    if self.attr_quote:
                        if ch == self.attr_quote:
                            # Closing quote ends the value.
                            self.flushAttr()
                            return
                    elif ch == "'" or ch == '"':
                        self.attr_quote = ch
                        return
                    elif not Parser.is_whitespace(ch):
                        # Unquoted values are not supported.
                        self.err(ch)

        # Everything that was not handled above is buffered: document text,
        # tag-name characters, attribute names or attribute values.
        self.pending_text += ch
def parse(text):
    """Parse ``text`` and return the root node of the resulting tree.

    Args:
        text: an iterable of string chunks, for example an open text file
            (iterated line by line) or a plain string (iterated character by
            character). Chunks that are not ``str`` instances are skipped.

    Returns:
        The synthetic root ``TagNode`` (empty tag name) that contains every
        parsed node, even when some elements were left unclosed.

    Raises:
        Exception: propagated from ``Parser.consume`` on malformed input.
    """
    parser = Parser()
    for block in text:
        if isinstance(block, str):
            for ch in block:
                parser.consume(ch)
    # consume() only stores buffered text when the next '<' arrives, so text
    # after the last tag would otherwise be lost. While a tag is still open
    # the buffer holds tag syntax rather than text, so it is left alone.
    if parser.pending_text and not parser.in_tag:
        parser.flushText()
    # The root is always the last stack entry; parser.top would be the
    # innermost unclosed element and hide everything outside it.
    return parser.stack[-1]
if __name__ == '__main__':
    # Read markup from standard input and print the serialised tree.
    # The parenthesised form works on both Python 2 and Python 3; the bare
    # print statement is a SyntaxError on Python 3.
    print(parse(sys.stdin))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment