Skip to content

Instantly share code, notes, and snippets.

@frobnitzem
Created February 3, 2021 16:07
Show Gist options
  • Save frobnitzem/5a0784ae255ff6dadb59e5454588bf3a to your computer and use it in GitHub Desktop.
Save frobnitzem/5a0784ae255ff6dadb59e5454588bf3a to your computer and use it in GitHub Desktop.
Parse an HTML table into JSON.
#!/usr/bin/env python3
import json
from html.parser import HTMLParser
# Void elements: HTML tags that are never closed, so the parsers
# in this file must not wait for a matching end tag for them.
voids = {
    'area', 'base', 'br', 'col',
    'command', 'embed', 'hr', 'img',
    'input', 'keygen', 'link', 'meta',
    'param', 'source', 'track', 'wbr',
}
# This parser lists out tag paths present in the data.
# Use it to determine what HTML tags to consider
# as internal nodes and leaf nodes.
class GetElems(HTMLParser):
    """Collect the dotted path of every start tag seen while parsing.

    After ``feed()``, ``elems`` is a set of strings such as
    ``'html.body.h1'``: the names of the currently open tags joined
    with ``'.'``, ending with the tag that was just opened.
    ``path`` is the list of tags that are open at the moment.
    """

    def __init__(self):
        super().__init__()
        self.elems = set()
        self.path = []

    def handle_starttag(self, tag, attrs):
        """Record the path of *tag*; void tags are recorded but not kept open."""
        self.path.append(tag)
        self.elems.add('.'.join(self.path))
        if tag in voids:
            # No end tag will follow, so close it immediately.
            self.path.pop()

    def handle_endtag(self, tag):
        """Close the innermost open *tag*, together with anything left open inside it.

        A closing tag that matches no open element is ignored rather
        than popping an unrelated element (or an empty path).
        """
        if tag in voids:
            return
        for depth in range(len(self.path) - 1, -1, -1):
            if self.path[depth] == tag:
                # Drop the matched element and any unclosed descendants.
                del self.path[depth:]
                return
# This class creates a "collapsed" HTML hierarchy.
#
# Internal nodes have the type:
# { 'child-tag 1': [type 1 children], 'child-tag-2': [type 2 children]}
#
# Leaf nodes have the type:
# [ "string1", "string2", ... ]
#
# The document root (p.doc) is itself an internal node, so every
# tag name maps to a *list* of the children carrying that tag:
#
# >>> p = ElemParser(['div'], ['p'])
# >>> p.feed('<div><p>Test</p> <p>Parse me!</p></div>')
# >>> p.doc
# {'div': [{'p': [['Test'], ['Parse me!']]}]}
#
# obviously, 'elems' and 'leaves' should be disjoint
# sets of HTML tag names - not including voids.
class ElemParser(HTMLParser):
    """Build a nested dict/list structure from selected HTML tags.

    Parameters:
        elems:  iterable of tag names to treat as internal nodes (dicts).
        leaves: iterable of tag names to treat as leaf nodes (lists of
                text chunks).

    Tags in neither collection are skipped, but still counted so that
    start and end tags can be paired up by nesting depth.  The result
    accumulates in ``doc``.
    """

    def __init__(self, elems, leaves):
        super().__init__()
        self.doc = {}
        # HTMLParser reports tag names in lower case, so normalise the
        # requested names to make matching case-insensitive.
        self.elems = {name.lower() for name in elems}
        self.leaves = {name.lower() for name in leaves}
        # parse stack of (depth, tag, node): right-most entry is the
        # current doc element; the first entry is the root sentinel.
        self.loc = [(0, '', self.doc)]
        self.level = 0  # stack-level (number of outer start-tags)
        # 0 = parsing node
        # 1 = parsing leaf
        self.state = 0

    def handle_starttag(self, tag, attrs):
        """Open a new node for a tracked *tag* under the current node."""
        if tag in voids:
            return
        self.level += 1
        tracked = tag in self.elems or tag in self.leaves
        # Inside a leaf nothing new is opened; only its text is collected.
        if self.state != 0 or not tracked:
            return
        _, _, parent = self.loc[-1]
        siblings = parent.setdefault(tag, [])
        if tag in self.leaves:
            siblings.append([])
            self.state = 1
        else:
            siblings.append({})
        self.loc.append((self.level, tag, siblings[-1]))

    def handle_endtag(self, tag):
        """Close the current node if *tag* ends it.

        Raises:
            ValueError: if the closing tag sits at the depth of the
                current node but names a different tag, or if it has no
                open element at all.
        """
        if tag in voids:
            return
        self.level -= 1
        opened_at, open_tag, _ = self.loc[-1]
        # A closing tag at any other depth belongs to an untracked element.
        if opened_at != self.level + 1:
            return
        if len(self.loc) == 1:
            # Only the root sentinel is left: more tags closed than opened.
            raise ValueError(f"Unexpected closing tag {tag}: no open element")
        if open_tag != tag:
            raise ValueError(f"Start/end tag mismatch expected {open_tag}, found {tag}")
        # pop the current doc. element
        self.loc.pop()
        if open_tag in self.leaves:
            self.state = 0

    def handle_data(self, data):
        """Append a text chunk to the open leaf; text elsewhere is dropped."""
        if self.state != 1:
            return
        _, _, leaf = self.loc[-1]
        leaf.append(data)
def test():
    """Run the built-in demo on two small HTML snippets."""
    page = ('<html><head><title>Test</title></head>'
            '<body><h1>Parse me!</h1></body></html>')
    fragment = '<div><p>Test</p> <p>Parse me!</p></div>'
    # Show the tag paths first, then the collapsed JSON of the fragment.
    print_elems(page)
    print_elems(fragment)
    print_json(fragment, ['div'], ['p'])
def print_elems(html):
    """Print the set of dotted tag paths that GetElems finds in *html*."""
    collector = GetElems()
    collector.feed(html)
    print(collector.elems)
def print_json(html, elems, leaves):
    """Parse *html* with ElemParser and print the result as indented JSON.

    Parameters:
        html:   the HTML text to parse.
        elems:  tag names to treat as internal nodes.
        leaves: tag names to treat as leaf (text-collecting) nodes.
    """
    p = ElemParser(elems, leaves)
    p.feed(html)
    # HTMLParser buffers text until the next tag arrives; close() flushes
    # whatever is left so text after the last tag is not lost.
    p.close()
    print(json.dumps(p.doc, indent=4))
if __name__ == "__main__":
    import sys

    # Run the tests if there's no input file.
    if len(sys.argv) == 1:
        test()
        # sys.exit, not the interactive-only exit() helper added by `site`.
        sys.exit(0)
    with open(sys.argv[1], encoding='utf-8') as f:
        html = f.read()
    # Step 1: determine what paths are present
    #print_elems(html)
    # Step 2: this worked for my table
    print_json(html, ['tbody', 'thead', 'tr'], ['td', 'th'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment