Last active
February 1, 2019 15:52
-
-
Save agumonkey/df5d8bcb6e4249e4df09f37327295aae to your computer and use it in GitHub Desktop.
petit arbre html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pprint import pprint as pp | |
from html.parser import HTMLParser | |
from collections import namedtuple | |
import requests | |
####### | |
# dom # | |
####### | |
class Form(HTMLParser):
    """
    Lightweight parser that collects a handful of interesting start tags.

    It does not build a tree: it only remembers the last <title> and
    <form> start tags seen and accumulates every <a> and <iframe> start
    tag, each stored as a ``(tag, attrs)`` pair.
    """

    def __init__(self):
        # Collected results; populated while feeding markup.
        self.title = None
        self.form = None
        self.links = []
        self.frames = []
        super().__init__()

    def get(self):
        """Return the collected tags as a dict keyed by category."""
        categories = ('title', 'form', 'links', 'frames')
        return {name: getattr(self, name) for name in categories}

    def handle_starttag(self, t, a):
        """Record the start tag if it is one of the tracked kinds."""
        entry = (t, a)
        if t == 'a':
            self.links.append(entry)
        elif t == 'iframe':
            self.frames.append(entry)
        elif t in ('form', 'title'):
            # Single-valued slots: a later tag overwrites an earlier one.
            setattr(self, t, entry)
# Plain record for one element: tag name, attribute pairs, child nodes.
Nod = namedtuple('Nod', 'tag attrs children')
def f_attrs(a, f=str):
    """
    Render the attribute *names* of ``a`` as a dot-separated string.

    ``a`` is anything ``dict()`` accepts (typically a list of
    ``(name, value)`` pairs); duplicate names collapse to one entry and
    values are ignored. ``f`` converts each name to text.
    """
    names = dict(a)
    return '.'.join(map(f, names))
class NODE(Nod):
    """
    Tree node with compact display helpers on top of the ``Nod`` record.
    """

    def __repr__(self):
        """Short form: ``<tag attr.names child_count>``."""
        return '<%s %s %d>' % (
            self.tag,
            f_attrs(self.attrs),
            len(self.children),
        )

    def format(self, i=0, p=' '):
        """
        Return the subtree as indented pseudo-markup.

        Only attribute names are shown (via ``f_attrs``). ``i`` is the
        current depth and ``p`` the string repeated once per depth level.
        """
        indent = p * i
        attrs = (' ' + f_attrs(self.attrs)) if self.attrs else ''
        open_t = '<%s%s>' % (self.tag, attrs)
        end_t = '</%s>' % self.tag
        if not self.children:
            # Leaf: open and close tag share one line.
            return indent + open_t + end_t
        # Forward ``p`` so a custom pad applies at every depth, not only
        # to the node the caller invoked format() on.
        inner = '\n'.join(c.format(i + 1, p) for c in self.children)
        return indent + open_t + '\n' + inner + '\n' + indent + end_t

    def view(self, i=0, p='.'):
        """
        Print one line per descendant: depth marker, tag, attribute names.

        ``i`` is the current depth and ``p`` the marker repeated per level.
        """
        for c in self.children:
            print(i * p, c.tag, f_attrs(c.attrs))
            # Forward ``p`` so nested levels use the caller's marker too.
            c.view(i + 1, p)
class Dom(HTMLParser):
    """
    Creates a tree of NODEs from fed markup.

    Open elements live on ``self.stack`` (bottom entry is a synthetic
    'root' node); a matching end tag pops the element and attaches it to
    its parent. Every parser event also prints a trace line to stdout.
    """

    # Elements that never have an end tag. A bare start tag for one of
    # these must be attached immediately instead of being pushed on the
    # stack, otherwise everything that follows nests under it.
    VOID_TAGS = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ])

    def __init__(self):
        #+ stack :: [NODE]
        self.stack = [NODE('root', [], [])]
        super().__init__()

    def handle_starttag(self, t, a):
        """Push a new open element, or attach it directly if it is void."""
        print('--' * len(self.stack), '>', t, f_attrs(a))
        if t in self.VOID_TAGS:
            print('--' * len(self.stack), '!', t)
            self.handle_startendtag(t, a)
        else:
            self.stack.append(NODE(t, a, []))

    # Too noisy for now
    # def handle_data(self,d):
    #     self.handle_startendtag('@',[('text', d)])

    def handle_comment(self, c):
        """Trace comments; they are not added to the tree."""
        print('..' * len(self.stack), '#', c)

    def handle_decl(self, d):
        """Trace declarations; they are not added to the tree."""
        print('[decl]', d)

    def handle_pi(self, p):
        """Trace processing instructions; they are not added to the tree."""
        print('[pi]', p)

    def handle_startendtag(self, t, x):
        """Attach a childless node to the element currently on top."""
        print('.:' * len(self.stack), t)
        self.stack[-1].children.append(NODE(t, x, []))

    def handle_endtag(self, etag):
        """
        Close the top element if ``etag`` matches it.

        A non-matching end tag leaves the stack untouched and is recorded
        as a childless placeholder node with attrs
        ``[('class', 'fix-fake')]``.
        """
        top = self.stack[-1]
        # Depth is reported as if the top element were already popped.
        print('--' * (len(self.stack) - 1), '<', etag)
        # The root sentinel is never popped: a literal </root> in the
        # input would otherwise empty the stack and raise IndexError.
        if len(self.stack) > 1 and top.tag == etag:
            self.stack.pop()
            self.stack[-1].children.append(top)
        else:
            print('Fix', '[fake-startend]', etag, "should be", top.tag)
            self.handle_startendtag(etag, [('class', 'fix-fake')])

    def dom(self, response):
        """
        Parse ``response.content`` and return the root NODE.

        ``response`` only needs a ``content`` attribute holding bytes; it
        is decoded with the default codec of ``bytes.decode()``.
        """
        self.feed(response.content.decode())
        r, *css = self.stack
        # NOTE: elements still open at end of input are attached straight
        # to the root, as siblings, rather than nested in one another.
        for e in css:
            r.children.append(e)
        return r
######## | |
# test # | |
######## | |
# Minimal stand-in for an HTTP response: just a bytes ``content`` field.
R = namedtuple('Resp', ['content'])

# Fixtures of increasing complexity: nesting, siblings, self-closing tags.
test1 = R(content=b'<html><a></a></html>')
test2 = R(content=b'<html><a><b></b></a></html>')
test3 = R(content=b'<html><a><b></b><c></c></a></html>')
test4 = R(content=b'<html><a><s/><s/><b></b><c></c></a></html>')
test5 = R(content=b'<html><a><s/><s/><b></b><c></c></a><body><div></div><hr/><div></div></body></html>')
######### | |
# xpath # | |
######### | |
def fold(f, l, z):
    """
    Left fold: combine the items of ``l`` into one value.

    Starts from ``z`` and applies ``f(accumulator, item)`` for each item
    in order; returns ``z`` unchanged when ``l`` is empty.
    """
    acc = z
    for item in l:
        acc = f(acc, item)
    return acc
def concat(ls):
    """
    Join the sequences in ``ls`` end to end into one new list.

    Builds the result by extending a single list, which is linear in the
    total number of items (repeated ``a + b`` copies the growing prefix
    on every step). The inputs are not modified; any iterable is
    accepted as an element of ``ls``.
    """
    joined = []
    for part in ls:
        joined.extend(part)
    return joined
def flatten(l):
    """
    Flatten arbitrarily nested lists into one flat list.

    A non-list argument is wrapped as ``[l]``. Only ``list`` instances
    (including subclasses) are descended into; tuples, and therefore
    namedtuple-based nodes, are treated as leaves.
    """
    if not isinstance(l, list):
        return [l]
    flat = []
    for e in l:
        # extend keeps the whole pass linear in the number of leaves.
        flat.extend(flatten(e))
    return flat
def x(d, s):
    """
    Apply one path step ``s`` to ``d`` and return the matching children.

    ``d`` is either a single node (anything with ``children`` whose
    items have ``tag``) or a list of nodes; for a list the step is
    applied to each element and the results are flattened into one list.
    ``s`` is a tag name, or ``'*'`` to select every child.

    Note that for ``'*'`` on a single node the node's own ``children``
    list is returned, not a copy.
    """
    # isinstance (rather than an exact type comparison) so list
    # subclasses are treated as node lists too.
    if isinstance(d, list):
        return flatten([x(e, s) for e in d])
    if s == '*':
        return d.children
    return [e for e in d.children if e.tag == s]
def xs(d, steps):
    """
    Apply a sequence of path steps to ``d``, left to right.

    Each step is resolved with ``x`` against the result of the previous
    one; with no steps, ``d`` itself is returned.
    """
    current = d
    for step in steps:
        current = x(current, step)
    return current
## TODO: dom traversal generator | |
## TODO: stream based HTMLParser xpath-filter HTMLParser /a/b/c/d .. | |
def xpath(d, *p):
    """
    Walk down from ``d`` following tag names ``p``; return the node reached.

    At each step the first child whose ``tag`` equals the step is taken.
    With no steps, ``d`` itself is returned.

    Raises:
        IndexError: if some step has no matching child.
    """
    node = d
    # Every step is applied, including the last one, and the remaining
    # steps stay separate arguments rather than collapsing into a single
    # list that would stop the walk after the first step.
    for step in p:
        found = next((c for c in node.children if c.tag == step), None)
        if found is None:
            raise IndexError('no child %r under %r' % (step, node.tag))
        node = found
    return node
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment