# agumonkey/dom.py

## dom.py
from collections import namedtuple
from functools import reduce
from html.parser import HTMLParser
from itertools import chain
from pprint import pprint as pp

import requests

#######
# dom #
#######

class Form(HTMLParser):
    """
    Lightweight collector that records the start tags of interest.

    After markup has been fed, ``title`` and ``form`` hold the most recently
    seen ``(tag, attrs)`` pair for ``<title>`` / ``<form>`` (``None`` if the
    tag never occurred); ``links`` and ``frames`` hold one ``(tag, attrs)``
    pair per ``<a>`` / ``<iframe>`` start tag, in document order.
    """
    def __init__(self):
        super().__init__()
        self.frames = []
        self.links = []
        self.form = None
        self.title = None

    def get(self):
        """Return everything collected so far as a dict keyed by category."""
        summary = dict(title=self.title, form=self.form)
        summary['links'] = self.links
        summary['frames'] = self.frames
        return summary

    def handle_starttag(self,t,a):
        """Record start tag *t* (attribute pairs *a*) when it is tracked."""
        entry = (t, a)
        if t in ('form', 'title'):
            # Single-valued slots: a later occurrence replaces an earlier one.
            setattr(self, t, entry)
            return
        if t == 'a':
            self.links.append(entry)
        if t == 'iframe':
            self.frames.append(entry)

# Plain record behind NODE: tag name, attribute pairs, list of child nodes.
Nod = namedtuple('Nod', ['tag','attrs','children'])

def f_attrs(a,f=str):
    """
    Join the attribute names found in *a* with '.'.

    a -- iterable of (name, value) pairs (anything ``dict()`` accepts);
         a repeated name is listed once, at its first position.
    f -- formatter applied to each name (``str`` by default).
    """
    return '.'.join(f(k) for k in dict(a))

class NODE(Nod):
    """Tree node with a compact repr and two pretty-printing helpers."""

    def __repr__(self):
        """Return ``<tag attr-names child-count>``."""
        t = self.tag
        a = f_attrs(self.attrs)
        c = len(self.children)
        return '<%s %s %d>' % (t,a,c)

    def format(self,i=0,p=' '):
        """
        Render the subtree as indented pseudo-markup and return the string.

        i -- nesting depth of this node.
        p -- padding unit, repeated once per depth level.
        """
        ii = p * i

        # The pad must travel down the recursion, otherwise only the top
        # level honours a caller-supplied ``p``.
        ss = '\n'.join(c.format(i+1, p) for c in self.children)
        ss = ('\n' + ss + '\n') if self.children else ''

        aa = f_attrs(self.attrs)
        aa = (' ' + aa) if self.attrs else ''

        open_t = '<%s%s>' % (self.tag, aa)
        end_t = '</%s>' % self.tag
        # Childless nodes stay on one line; otherwise the closing tag gets
        # its own line at this node's depth.
        return ii + open_t + ss + (ii if self.children else '') + end_t

    def view(self,i=0,p='.'):
        """
        Print one line per descendant: depth marker, tag, attribute names.

        i -- depth used for the direct children of this node.
        p -- marker repeated once per depth level.
        """
        for c in self.children:
            print(i * p, c.tag, f_attrs(c.attrs))
            # Forward ``p`` so deeper levels use the same marker.
            c.view(i+1, p)


class Dom(HTMLParser):
    """
    Creates a tree of Nodes

    Open elements are kept on ``self.stack`` (the bottom entry is a
    synthetic ``root`` node).  A matching end tag pops the element and
    attaches it to its parent.  An end tag that does not match the innermost
    open element closes nothing; it is recorded as an empty ``fix-fake``
    child instead.

    verbose -- when true (the default) every parser event is traced to
               stdout; pass ``False`` to build the tree silently.
    """
    # Void elements never take an end tag, so they are attached as leaves
    # right away instead of being pushed on the open-element stack.
    VOID_TAGS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    def __init__(self, verbose=True):
        #+ stack :: [Node]
        self.stack = [NODE('root',[],[])]
        self.verbose = verbose
        super().__init__()

    def _trace(self, *args):
        """Print a debug line unless tracing was switched off."""
        if self.verbose:
            print(*args)

    def handle_starttag(self,t,a):
        """Open element *t*; void elements are attached as leaves at once."""
        self._trace('--' * len(self.stack), '>', t, f_attrs(a))
        if t in self.VOID_TAGS:
            self._trace('--' * len(self.stack), '!', t)
            self.handle_startendtag(t,a)
        else:
            self.stack.append(NODE(t,a,[]))

    # Too noisy for now
    # def handle_data(self,d):
    #     self.handle_startendtag('@',[('text', d)])

    def handle_comment(self,c):
        """Trace a comment; comments are not added to the tree."""
        self._trace('..' * len(self.stack), '#', c)

    def handle_decl(self,d):
        """Trace a declaration such as a doctype; not added to the tree."""
        self._trace('[decl]', d)

    def handle_pi(self,p):
        """Trace a processing instruction; not added to the tree."""
        self._trace('[pi]', p)

    def handle_startendtag(self,t,x):
        """Attach a childless node *t* to the innermost open element."""
        self._trace('.:' * len(self.stack), t)
        self.stack[-1].children.append(NODE(t,x,[]))

    def handle_endtag(self,etag):
        """
        Close the innermost open element when *etag* matches it.

        On a mismatch nothing is closed and a ``fix-fake`` leaf named after
        *etag* is attached instead.
        """
        c = self.stack[-1]
        self._trace('--' * (len(self.stack) - 1),'<', etag)
        # The synthetic root is never popped, even for a literal </root>;
        # popping it would leave the stack empty and break every later event.
        if len(self.stack) > 1 and c.tag == etag:
            self.stack.pop()
            self.stack[-1].children.append(c)
        else:
            self._trace('Fix','[fake-startend]', etag, "should be", c.tag)
            self.handle_startendtag(etag, [('class','fix-fake')])

    def dom(self, response):
        """
        Parse *response* and return the root node of the resulting tree.

        response -- object with a bytes ``content`` attribute and, optionally,
                    an ``encoding`` attribute (as ``requests`` responses
                    have); UTF-8 is used when no encoding is available.
        """
        encoding = getattr(response, 'encoding', None) or 'utf-8'
        # Undecodable bytes become U+FFFD rather than aborting the parse.
        self.feed(response.content.decode(encoding, errors='replace'))
        # Flush whatever the parser is still buffering.
        self.close()
        r,*css = self.stack
        # Elements never closed in the input are attached directly to root.
        for e in css:
            r.children.append(e)
        # Keep only root on the stack so a second call cannot attach the
        # same unclosed elements again.
        self.stack = [r]
        return r

########
# test #
########

# Stand-in for an HTTP response: it carries the raw bytes in ``content``,
# which is what Dom.dom() decodes.
R = namedtuple('Resp', ['content'])

# Sample documents, each adding a little more structure than the previous one.
test1 = R(content=b'<html><a></a></html>')
test2 = R(content=b'<html><a><b></b></a></html>')
test3 = R(content=b'<html><a><b></b><c></c></a></html>')
test4 = R(content=b'<html><a><s/><s/><b></b><c></c></a></html>')
test5 = R(content=b'<html><a><s/><s/><b></b><c></c></a><body><div></div><hr/><div></div></body></html>')

#########
# xpath #
#########

def fold(f,l,z):
    """
    Left fold: combine the items of *l* into one value.

    f -- two-argument function called as ``f(accumulator, item)``.
    l -- iterable of items, consumed left to right.
    z -- initial accumulator; returned unchanged when *l* is empty.
    """
    # functools.reduce is the standard-library form of this loop.
    return reduce(f, l, z)

def concat(ls):
    """
    Concatenate the sequences in *ls* into one new list.

    ls -- iterable of iterables (typically a list of lists).

    Returns a fresh list holding every item in order.  This runs in a
    single pass, whereas repeated ``list + list`` re-copies the accumulated
    result for each element of *ls*.
    """
    return list(chain.from_iterable(ls))

def flatten(l):
    """
    Flatten arbitrarily nested lists into a single flat list.

    l -- any value.  A non-list is returned wrapped in a one-item list;
         a list (or list subclass) is flattened recursively.  Tuples and
         other containers count as atoms.
    """
    if not isinstance(l, list):
        return [l]
    out = []
    for e in l:
        # extend() appends in place, so the accumulated result is not
        # re-copied for every element.
        out.extend(flatten(e))
    return out

def x(d,s):
    """
    Apply one path step: select children of *d* by tag.

    d -- a node (object with ``tag`` and ``children``) or a possibly nested
         list of nodes; for a list the step is applied to every node and
         the matches are merged into one flat list.
    s -- tag name to match, or '*' for every child.

    Always returns a new list, so callers can modify the result without
    touching the tree.
    """
    if isinstance(d, list):
        found = []
        for e in d:
            found.extend(x(e, s))
        return found
    if s == '*':
        # Copy rather than hand out the node's own children list.
        return list(d.children)
    return [e for e in d.children if e.tag == s]

def xs(d, steps):
    """
    Run the path *steps* against *d*, one ``x`` step after another.

    d     -- starting node or list of nodes.
    steps -- iterable of tag names / '*' wildcards, applied in order; each
             step receives the result of the previous one.
    """
    current = d
    for step in steps:
        current = x(current, step)
    return current

## TODO: dom traversal generator
## TODO: stream based HTMLParser xpath-filter HTMLParser /a/b/c/d ..

def xpath(d,*p):
    """
    Follow the tag path *p* downward from *d*, taking the first match.

    d -- starting node (object with ``tag`` and ``children``).
    p -- zero or more tag names; each step descends into the first child
         of the current node carrying that tag.

    Returns the node reached after the last step (*d* itself for an empty
    path).  Raises IndexError when a step has no matching child.
    """
    node = d
    # Walk every step; the previous recursion re-packed the remaining steps
    # into a single argument and therefore stopped after the first one.
    for s in p:
        matches = [e for e in node.children if e.tag == s]
        if not matches:
            raise IndexError('no child <%s> under <%s>' % (s, node.tag))
        node = matches[0]
    return node
	from pprint import pprint as pp
	from html.parser import HTMLParser
	from collections import namedtuple

	import requests

	#######
	# dom #
	#######

	class Form(HTMLParser):
		"""
		Lightweight collector that records the start tags of interest.

		After markup has been fed, ``title`` and ``form`` hold the most
		recently seen ``(tag, attrs)`` pair for ``<title>`` / ``<form>``
		(``None`` if the tag never occurred); ``links`` and ``frames`` hold
		one ``(tag, attrs)`` pair per ``<a>`` / ``<iframe>`` start tag, in
		document order.
		"""
		def __init__(self):
			self.title = None
			self.form = None
			self.links = []
			self.frames = []
			super().__init__()

		def get(self):
			"""Return everything collected so far as a dict keyed by category."""
			return {
				'title': self.title,
				'form': self.form,
				'links': self.links,
				'frames': self.frames,
			}

		def handle_starttag(self,t,a):
			"""Record start tag *t* (attribute pairs *a*) when it is tracked."""
			if t == 'form':
				# Single-valued slot: a later <form> replaces an earlier one.
				self.form = (t,a)
			elif t == 'a':
				self.links.append((t,a))
			elif t == 'iframe':
				self.frames.append((t,a))
			elif t == 'title':
				self.title = (t,a)

	# Plain record behind NODE: tag name, attribute pairs, list of child nodes.
	Nod = namedtuple('Nod', ['tag','attrs','children'])

	def f_attrs(a,f=str):
		"""
		Join the attribute names found in *a* with '.'.

		a -- iterable of (name, value) pairs (anything ``dict()`` accepts);
		     a repeated name is listed once, at its first position.
		f -- formatter applied to each name (``str`` by default).
		"""
		return '.'.join(f(k) for k in dict(a))

	class NODE(Nod):
		"""Tree node with a compact repr and two pretty-printing helpers."""

		def __repr__(self):
			"""Return ``<tag attr-names child-count>``."""
			t = self.tag
			a = f_attrs(self.attrs)
			c = len(self.children)
			return '<%s %s %d>' % (t,a,c)

		def format(self,i=0,p=' '):
			"""
			Render the subtree as indented pseudo-markup and return the string.

			i -- nesting depth of this node.
			p -- padding unit, repeated once per depth level.
			"""
			ii = p * i

			# The pad must travel down the recursion, otherwise only the top
			# level honours a caller-supplied ``p``.
			ss = '\n'.join(c.format(i+1, p) for c in self.children)
			ss = ('\n' + ss + '\n') if self.children else ''

			aa = f_attrs(self.attrs)
			aa = (' ' + aa) if self.attrs else ''

			open_t = '<%s%s>' % (self.tag, aa)
			end_t = '</%s>' % self.tag
			# Childless nodes stay on one line; otherwise the closing tag
			# gets its own line at this node's depth.
			return ii + open_t + ss + (ii if self.children else '') + end_t

		def view(self,i=0,p='.'):
			"""
			Print one line per descendant: depth marker, tag, attribute names.

			i -- depth used for the direct children of this node.
			p -- marker repeated once per depth level.
			"""
			for c in self.children:
				print(i * p, c.tag, f_attrs(c.attrs))
				# Forward ``p`` so deeper levels use the same marker.
				c.view(i+1, p)


	class Dom(HTMLParser):
		"""
		Creates a tree of Nodes

		Open elements are kept on ``self.stack`` (the bottom entry is a
		synthetic ``root`` node).  A matching end tag pops the element and
		attaches it to its parent.  An end tag that does not match the
		innermost open element closes nothing; it is recorded as an empty
		``fix-fake`` child instead.

		verbose -- when true (the default) every parser event is traced to
		           stdout; pass ``False`` to build the tree silently.
		"""
		# Void elements never take an end tag, so they are attached as leaves
		# right away instead of being pushed on the open-element stack.
		VOID_TAGS = frozenset((
			'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
			'link', 'meta', 'param', 'source', 'track', 'wbr',
		))

		def __init__(self, verbose=True):
			#+ stack :: [Node]
			self.stack = [NODE('root',[],[])]
			self.verbose = verbose
			super().__init__()

		def _trace(self, *args):
			"""Print a debug line unless tracing was switched off."""
			if self.verbose:
				print(*args)

		def handle_starttag(self,t,a):
			"""Open element *t*; void elements are attached as leaves at once."""
			self._trace('--' * len(self.stack), '>', t, f_attrs(a))
			if t in self.VOID_TAGS:
				self._trace('--' * len(self.stack), '!', t)
				self.handle_startendtag(t,a)
			else:
				self.stack.append(NODE(t,a,[]))

		# Too noisy for now
		# def handle_data(self,d):
		#     self.handle_startendtag('@',[('text', d)])

		def handle_comment(self,c):
			"""Trace a comment; comments are not added to the tree."""
			self._trace('..' * len(self.stack), '#', c)

		def handle_decl(self,d):
			"""Trace a declaration such as a doctype; not added to the tree."""
			self._trace('[decl]', d)

		def handle_pi(self,p):
			"""Trace a processing instruction; not added to the tree."""
			self._trace('[pi]', p)

		def handle_startendtag(self,t,x):
			"""Attach a childless node *t* to the innermost open element."""
			self._trace('.:' * len(self.stack), t)
			self.stack[-1].children.append(NODE(t,x,[]))

		def handle_endtag(self,etag):
			"""
			Close the innermost open element when *etag* matches it.

			On a mismatch nothing is closed and a ``fix-fake`` leaf named
			after *etag* is attached instead.
			"""
			c = self.stack[-1]
			self._trace('--' * (len(self.stack) - 1),'<', etag)
			# The synthetic root is never popped, even for a literal </root>;
			# popping it would leave the stack empty and break later events.
			if len(self.stack) > 1 and c.tag == etag:
				self.stack.pop()
				self.stack[-1].children.append(c)
			else:
				self._trace('Fix','[fake-startend]', etag, "should be", c.tag)
				self.handle_startendtag(etag, [('class','fix-fake')])

		def dom(self, response):
			"""
			Parse *response* and return the root node of the resulting tree.

			response -- object with a bytes ``content`` attribute and,
			            optionally, an ``encoding`` attribute (as ``requests``
			            responses have); UTF-8 is used when none is available.
			"""
			encoding = getattr(response, 'encoding', None) or 'utf-8'
			# Undecodable bytes become U+FFFD rather than aborting the parse.
			self.feed(response.content.decode(encoding, errors='replace'))
			# Flush whatever the parser is still buffering.
			self.close()
			r,*css = self.stack
			# Elements never closed in the input are attached directly to root.
			for e in css:
				r.children.append(e)
			# Keep only root on the stack so a second call cannot attach the
			# same unclosed elements again.
			self.stack = [r]
			return r

	########
	# test #
	########

	# Stand-in for an HTTP response: it carries the raw bytes in ``content``,
	# which is what Dom.dom() decodes.
	R = namedtuple('Resp', ['content'])

	# Sample documents, each adding a little more structure than the previous one.
	test1 = R(content=b'<html><a></a></html>')
	test2 = R(content=b'<html><a><b></b></a></html>')
	test3 = R(content=b'<html><a><b></b><c></c></a></html>')
	test4 = R(content=b'<html><a><s/><s/><b></b><c></c></a></html>')
	test5 = R(content=b'<html><a><s/><s/><b></b><c></c></a><body><div></div><hr/><div></div></body></html>')

	#########
	# xpath #
	#########

	def fold(f,l,z):
		"""
		Left fold: combine the items of *l* into one value.

		f -- two-argument function called as ``f(accumulator, item)``.
		l -- iterable of items, consumed left to right.
		z -- initial accumulator; returned unchanged when *l* is empty.
		"""
		# functools.reduce is the standard-library form of this loop.
		return reduce(f, l, z)

	def concat(ls):
		"""
		Concatenate the sequences in *ls* into one new list.

		ls -- iterable of iterables (typically a list of lists).

		Returns a fresh list holding every item in order.  This runs in a
		single pass, whereas repeated ``list + list`` re-copies the
		accumulated result for each element of *ls*.
		"""
		return list(chain.from_iterable(ls))

	def flatten(l):
		"""
		Flatten arbitrarily nested lists into a single flat list.

		l -- any value.  A non-list is returned wrapped in a one-item list;
		     a list (or list subclass) is flattened recursively.  Tuples and
		     other containers count as atoms.
		"""
		if not isinstance(l, list):
			return [l]
		out = []
		for e in l:
			# extend() appends in place, so the accumulated result is not
			# re-copied for every element.
			out.extend(flatten(e))
		return out

	def x(d,s):
		"""
		Apply one path step: select children of *d* by tag.

		d -- a node (object with ``tag`` and ``children``) or a possibly
		     nested list of nodes; for a list the step is applied to every
		     node and the matches are merged into one flat list.
		s -- tag name to match, or '*' for every child.

		Always returns a new list, so callers can modify the result without
		touching the tree.
		"""
		if isinstance(d, list):
			found = []
			for e in d:
				found.extend(x(e, s))
			return found
		if s == '*':
			# Copy rather than hand out the node's own children list.
			return list(d.children)
		return [e for e in d.children if e.tag == s]

	def xs(d, steps):
		"""
		Run the path *steps* against *d*, one ``x`` step after another.

		d     -- starting node or list of nodes.
		steps -- iterable of tag names / '*' wildcards, applied in order;
		         each step receives the result of the previous one.
		"""
		return fold(x, steps, d)

	## TODO: dom traversal generator
	## TODO: stream based HTMLParser xpath-filter HTMLParser /a/b/c/d ..

	def xpath(d,*p):
		"""
		Follow the tag path *p* downward from *d*, taking the first match.

		d -- starting node (object with ``tag`` and ``children``).
		p -- zero or more tag names; each step descends into the first
		     child of the current node carrying that tag.

		Returns the node reached after the last step (*d* itself for an
		empty path).  Raises IndexError when a step has no matching child.
		"""
		node = d
		# Walk every step; the previous recursion re-packed the remaining
		# steps into a single argument and so stopped after the first one.
		for s in p:
			matches = [e for e in node.children if e.tag == s]
			if not matches:
				raise IndexError('no child <%s> under <%s>' % (s, node.tag))
			node = matches[0]
		return node