Skip to content

Instantly share code, notes, and snippets.

@agumonkey
Last active February 1, 2019 15:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save agumonkey/df5d8bcb6e4249e4df09f37327295aae to your computer and use it in GitHub Desktop.
Save agumonkey/df5d8bcb6e4249e4df09f37327295aae to your computer and use it in GitHub Desktop.
petit arbre html
from pprint import pprint as pp
from html.parser import HTMLParser
from collections import namedtuple
import requests
#######
# dom #
#######
class Form(HTMLParser):
    """
    Faux parser that aggregates some tags.

    No tree is built. While markup is fed, the ``(tag, attrs)`` pair of
    selected start tags is recorded:

    title  -- the last ``<title>`` start tag seen, or None
    form   -- the last ``<form>`` start tag seen, or None
    links  -- every ``<a>`` start tag, in document order
    frames -- every ``<iframe>`` start tag, in document order
    """

    def __init__(self):
        # Initialise the parser machinery first, then our own collectors.
        super().__init__()
        self.title = None
        self.form = None
        self.links = []
        self.frames = []

    def get(self):
        """
        Return everything collected so far as a dict with the keys
        'title', 'form', 'links' and 'frames'.

        The lists are the live internal lists, not copies.
        """
        return {
            'title': self.title,
            'form': self.form,
            'links': self.links,
            'frames': self.frames,
        }

    def handle_starttag(self, t, a):
        """
        HTMLParser callback: record the start tag ``t`` (with its
        attribute pairs ``a``) when it is one of the tracked tags.
        """
        if t == 'form':
            self.form = (t, a)
        elif t == 'a':
            self.links.append((t, a))
        elif t == 'iframe':
            self.frames.append((t, a))
        elif t == 'title':
            self.title = (t, a)
# Plain record for one element: tag name, attribute pairs, list of children.
Nod = namedtuple('Nod', ['tag', 'attrs', 'children'])


def f_attrs(a, f=str):
    """
    Summarise attributes as their names joined by dots.

    a -- attributes in any form ``dict()`` accepts, e.g. a list of
         ``(name, value)`` pairs
    f -- formatter applied to each name (default ``str``)

    Values are ignored, and a repeated name shows up once because the
    pairs go through a dict.
    """
    return '.'.join(f(k) for k in dict(a))


class NODE(Nod):
    """
    Tree node: a ``Nod`` record with compact display helpers.
    """

    def __repr__(self):
        """
        Short form: ``<tag attr-names child-count>``.
        """
        return '<%s %s %d>' % (self.tag, f_attrs(self.attrs), len(self.children))

    def format(self, i=0, p=' '):
        """
        Render the subtree as indented pseudo-markup and return it.

        i -- nesting depth of this node
        p -- padding string repeated once per depth level

        Only attribute names are shown, not their values. A node
        without children is rendered on a single line.
        """
        pad = p * i
        names = (' ' + f_attrs(self.attrs)) if self.attrs else ''
        open_t = '<%s%s>' % (self.tag, names)
        end_t = '</%s>' % self.tag
        if not self.children:
            return pad + open_t + end_t
        # Hand ``p`` down so a custom padding applies at every depth,
        # not just at the top level.
        inner = '\n'.join(c.format(i + 1, p) for c in self.children)
        return pad + open_t + '\n' + inner + '\n' + pad + end_t

    def view(self, i=0, p='.'):
        """
        Print one line per descendant (the node itself is not printed):
        the prefix ``p`` repeated ``i`` times, the tag, and the
        attribute names.
        """
        for c in self.children:
            print(i * p, c.tag, f_attrs(c.attrs))
            # Hand ``p`` down so the chosen prefix is kept below the first level.
            c.view(i + 1, p)
class Dom(HTMLParser):
    """
    Creates a tree of NODEs.

    A stack of open elements is kept, with a synthetic 'root' node at
    the bottom. Start tags push a node, end tags pop it and attach it to
    its parent. Every callback prints a trace line to stdout.
    """

    # Elements that never have an end tag (HTML "void elements"). They
    # must not be pushed on the stack, or they would never be popped and
    # every later end tag would be reported as a mismatch.
    VOID_TAGS = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
        'input', 'link', 'meta', 'source', 'track', 'wbr',
    ])

    def __init__(self):
        super().__init__()
        #+ stack :: [NODE]
        self.stack = [NODE('root', [], [])]

    def handle_starttag(self, t, a):
        """
        Open element ``t`` with attribute pairs ``a``: void elements
        become leaves straight away, anything else is pushed on the stack.
        """
        print('--' * len(self.stack), '>', t, f_attrs(a))
        if t in self.VOID_TAGS:
            print('--' * len(self.stack), '!', t)
            self.handle_startendtag(t, a)
        else:
            self.stack.append(NODE(t, a, []))

    # Too noisy for now
    # def handle_data(self,d):
    #     self.handle_startendtag('@',[('text', d)])

    def handle_comment(self, c):
        """Trace a comment; it is not stored in the tree."""
        print('..' * len(self.stack), '#', c)

    def handle_decl(self, d):
        """Trace a declaration (e.g. a doctype); it is not stored."""
        print('[decl]', d)

    def handle_pi(self, p):
        """Trace a processing instruction; it is not stored."""
        print('[pi]', p)

    def handle_startendtag(self, t, x):
        """
        Attach a childless node for tag ``t`` (attribute pairs ``x``) to
        the element currently on top of the stack.
        """
        print('.:' * len(self.stack), t)
        self.stack[-1].children.append(NODE(t, x, []))

    def handle_endtag(self, etag):
        """
        Close the element on top of the stack when it matches ``etag``.

        A mismatching end tag does not unwind the stack: it is recorded
        as a leaf carrying ``class="fix-fake"`` under the current
        element. The synthetic root is never popped.
        """
        # Depth as it would be once the top element is closed.
        depth = len(self.stack) - 1
        print('--' * depth, '<', etag)
        if etag in self.VOID_TAGS:
            # The start tag already produced the leaf; there is nothing to close.
            return
        if depth > 0 and self.stack[-1].tag == etag:
            closed = self.stack.pop()
            self.stack[-1].children.append(closed)
        else:
            print('Fix', '[fake-startend]', etag, "should be", self.stack[-1].tag)
            self.handle_startendtag(etag, [('class', 'fix-fake')])

    def dom(self, response):
        """
        Feed ``response.content`` (bytes, decoded with the default
        codec) to the parser and return the root NODE.

        Elements still open at the end of input are closed implicitly,
        innermost first, so they stay nested under their parents.
        """
        self.feed(response.content.decode())
        while len(self.stack) > 1:
            unclosed = self.stack.pop()
            self.stack[-1].children.append(unclosed)
        return self.stack[0]
########
# test #
########
# Minimal response stand-in: exposes only the ``content`` bytes that
# ``Dom.dom`` decodes and feeds to the parser.
R = namedtuple('Resp', ['content'])

# Fixtures of growing complexity.
# One nested element.
test1 = R(content=b'<html><a></a></html>')
# Two levels of nesting.
test2 = R(content=b'<html><a><b></b></a></html>')
# Siblings inside a nested element.
test3 = R(content=b'<html><a><b></b><c></c></a></html>')
# Self-closing tags mixed with regular siblings.
test4 = R(content=b'<html><a><s/><s/><b></b><c></c></a></html>')
# Same as test4, followed by a body holding a self-closing <hr/>.
test5 = R(
    content=(
        b'<html>'
        b'<a><s/><s/><b></b><c></c></a>'
        b'<body><div></div><hr/><div></div></body>'
        b'</html>'
    ),
)
#########
# xpath #
#########
def fold(f, l, z):
    """
    Left fold: combine the items of ``l`` into one value.

    f -- binary function called as ``f(accumulator, item)``
    l -- iterable of items, consumed left to right
    z -- initial accumulator, returned unchanged when ``l`` is empty
    """
    # Local import so the block does not rely on the file's import header.
    from functools import reduce
    return reduce(f, l, z)
def concat(ls):
    """
    Join a sequence of lists into one new list, one level deep.

    ls -- iterable of lists (any iterables are accepted)

    Returns a fresh list; an empty ``ls`` gives ``[]``.
    """
    # chain walks every element once, where repeated ``a + b`` would
    # copy the growing accumulator at each step.
    from itertools import chain
    return list(chain.from_iterable(ls))
def flatten(l):
    """
    Flatten arbitrarily nested lists into a single flat list.

    l -- a list (possibly containing further lists) or any other value

    A non-list value is wrapped as ``[l]``. Only lists are descended
    into; tuples and other containers are kept as single items.
    """
    if not isinstance(l, list):
        return [l]
    flat = []
    for e in l:
        # Grow one accumulator instead of concatenating intermediate lists.
        flat.extend(flatten(e))
    return flat
def x(d, s):
    """
    Apply one path step and return the matching child nodes as a list.

    d -- a node (anything with ``tag`` and ``children``) or a list of
         nodes, possibly nested
    s -- tag name to select, or '*' for every child

    For a list, the step is applied to each member and the results are
    merged, in order, into one flat list.
    """
    if isinstance(d, list):
        matches = []
        for e in d:
            matches.extend(x(e, s))
        return matches
    if s == '*':
        # Copy, so callers cannot modify the node's own child list.
        return list(d.children)
    return [e for e in d.children if e.tag == s]
def xs(d, steps):
    """
    Apply a sequence of path steps, left to right, starting from ``d``.

    d     -- starting node or list of nodes
    steps -- iterable of steps as understood by ``x`` (a tag name or '*')

    Each step is applied to the result of the previous one. With no
    steps, ``d`` itself is returned.
    """
    selected = d
    for step in steps:
        selected = x(selected, step)
    return selected
## TODO: dom traversal generator
## TODO: stream based HTMLParser xpath-filter HTMLParser /a/b/c/d ..
def xpath(d, *p):
    """
    Follow a path of tag names from ``d`` and return the node reached.

    d -- starting node (anything with ``tag`` and ``children``)
    p -- tag names, one per level; at each level the first child with
         that tag is taken

    With no path, ``d`` itself is returned. Raises IndexError when a
    step has no matching child.
    """
    node = d
    for step in p:
        candidates = [e for e in node.children if e.tag == step]
        if not candidates:
            raise IndexError('no child <%s> under <%s>' % (step, node.tag))
        node = candidates[0]
    return node
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment