# alexdelorenzo/wrapper.py

## wrapper.py
from functools import lru_cache

from lxml.html import HtmlElement, Element, fromstring
from lxml.etree import XPath
from bs4 import BeautifulSoup, Tag


class BeauToLxml(object):
    """
    An adapter for a small subset of the BeautifulSoup4 API,
    translating calls to an lxml backend instead of going through
    bs4's own (slower) lxml-based engine.

    The wrapped lxml element is stored in ``self.html``.  Attribute
    access that normal lookup cannot satisfy is resolved, in order, as
    a descendant tag lookup (``wrapper.a`` behaves like
    ``wrapper.find('a')``) and then as an attribute of the wrapped
    element; ``None`` is returned when neither exists.
    """
    def __init__(self, html):
        """
        Wrap *html*, which may be markup (``str``/``bytes``), another
        ``BeauToLxml``, an lxml ``HtmlElement`` or a bs4
        ``Tag``/``BeautifulSoup`` object.

        Raises ``TypeError`` for any other input.
        """
        super().__init__()

        # isinstance() rather than exact type comparison: lxml.html hands
        # out HtmlElement subclasses for some tags (form controls, for
        # example), and those have to be wrappable as well.
        if isinstance(html, (str, bytes)):
            self.html = fromstring(html)

        elif isinstance(html, BeauToLxml):
            self.html = html.html

        elif isinstance(html, HtmlElement):
            self.html = html

        elif isinstance(html, (Tag, BeautifulSoup)):
            # Round-trip through markup so lxml re-parses the bs4 tree.
            self.html = fromstring(str(html))

        else:
            # Leaving self.html unset would only surface later as a
            # RecursionError inside __getattr__.
            raise TypeError(
                'BeauToLxml cannot wrap objects of type %s' % type(html).__name__
            )

    def __repr__(self):
        return 'BeauToLxml: ' + repr(self.html)

    def __str__(self):
        # .text is None when the element has no leading text, and
        # __str__ must always return a string.
        return self.html.text or ''

    def __getitem__(self, item):
        """
        Return the value of attribute *item* of the wrapped element.

        ``class`` is returned as a list of class names, every other
        attribute as the raw string.  Raises ``KeyError`` when the
        attribute is absent.
        """
        value = self.html.attrib[item]

        if item == 'class':
            # split() without an argument collapses runs of whitespace,
            # so 'a  b' does not produce an empty class name.
            return value.split()

        return value

    def __getattr__(self, item):
        """
        Fallback lookup: first descendant tag named *item*, then the
        wrapped element's own attribute of that name, else ``None``.
        """
        # Only reached when normal lookup fails.  A missing 'html' means
        # the instance was never initialised (e.g. built via __new__);
        # resolving it through find() would recurse forever.
        if item == 'html':
            raise AttributeError(item)

        val = self.find(item)

        if val is not None:
            return val

        return getattr(self.html, item, None)

    @property
    def text(self) -> str:
        """Text content of the wrapped element and its descendants."""
        return self.html.text_content() or ''

    def find(self, tag: str, _class: str=None, **kwargs):
        """Return the first matching descendant wrapped, or ``None``."""
        return find(self.html, tag, _class, **kwargs)

    def find_all(self, tag: str, _class: str=None, **kwargs) -> tuple:
        """Return a tuple of all matching descendants, each wrapped."""
        return find_all(self.html, tag, _class, **kwargs)


def find(html: Element, tag: str, _class: str=None, **kwargs) -> BeauToLxml or None:
    """
    Return the first descendant of *html* matching *tag* and the
    attribute filters, wrapped in BeauToLxml, or None without a match.
    """
    # gen=True asks find_all() for a lazy iterator so that only the
    # first match gets wrapped; an empty tuple signals "no match".
    matches = find_all(html, tag, _class, gen=True, **kwargs)

    if not matches:
        return None

    return next(iter(matches))


def find_all(html: Element, tag: str, _class: str=None, gen: bool=False, **kwargs) -> iter or tuple:
    """
    Find every descendant of *html* matching *tag* and the attribute
    filters (*_class* filters on the ``class`` attribute).

    Returns an empty tuple when nothing matches.  Otherwise returns a
    tuple of BeauToLxml wrappers, or a lazy iterator of them when
    *gen* is true.
    """
    def hashable(value):
        # get_xpath() is memoised with lru_cache, which hashes its
        # arguments; lists and sets would raise TypeError there.
        return tuple(value) if isinstance(value, (list, set)) else value

    filters = {attr: hashable(val) for attr, val in kwargs.items()}
    xpath = get_xpath(tag, hashable(_class), **filters)
    elems = xpath(html)

    if not elems:
        return tuple()

    wrapper_map = map(BeauToLxml, elems)  # lazy: wraps on demand

    return wrapper_map if gen else tuple(wrapper_map)


def _xpath_literal(value) -> str:
    """Render *value* (via str()) as an XPath 1.0 string literal."""
    text = str(value)

    # XPath 1.0 has no escape sequences inside string literals, so use
    # whichever quote character the text does not contain.
    if '"' not in text:
        return '"%s"' % text

    if "'" not in text:
        return "'%s'" % text

    # Both quote kinds present: join the double-quote-free pieces with
    # concat(), re-inserting each double quote as its own literal.
    pieces = ('"%s"' % piece for piece in text.split('"'))
    return 'concat(%s)' % ", '\"', ".join(pieces)


@lru_cache(maxsize=None)
def get_xpath(tag: str, _class: str=None, **kwargs) -> XPath:
    """
    Build a compiled XPath selecting descendant *tag* elements that
    satisfy the given attribute filters.  Results are memoised, so all
    arguments must be hashable (pass tuples rather than lists or sets).

    *_class*, when truthy, is used as the filter for the ``class``
    attribute.  Every keyword ``attr=value`` adds one predicate:

    * ``True`` / ``False`` -- the attribute must be present / absent
    * ``str``              -- the attribute must contain the string
    * collection           -- the attribute must contain at least one
                              of the items (an empty collection matches
                              nothing)
    * anything else        -- the attribute must equal ``str(value)``

    Note that ``contains()`` is a substring test, so filtering on
    ``"foo"`` also matches an attribute value of ``"foobar"``.
    """
    if _class:
        kwargs['class'] = _class

    predicates = []

    for attr, val in kwargs.items():
        attr_xp = '@' + attr

        if isinstance(val, bool):
            predicate = attr_xp if val else 'not(%s)' % attr_xp

        elif isinstance(val, (set, frozenset, list, tuple)):
            # contains() takes exactly two arguments, so emit one test
            # per item and OR them together.
            tests = [
                'contains(%s, %s)' % (attr_xp, _xpath_literal(item))
                for item in val
            ]
            predicate = ' or '.join(tests) if tests else 'false()'

        elif isinstance(val, str):
            predicate = 'contains(%s, %s)' % (attr_xp, _xpath_literal(val))

        else:
            predicate = '%s=%s' % (attr_xp, _xpath_literal(val))

        predicates.append('[%s]' % predicate)

    return XPath('.//' + tag + ''.join(predicates))
	from functools import lru_cache

	from lxml.html import HtmlElement, Element, fromstring
	from lxml.etree import XPath
	from bs4 import BeautifulSoup, Tag


	class BeauToLxml(object):
	    """
	    An adapter for a small subset of the BeautifulSoup4 API,
	    translating calls to an lxml backend instead of going through
	    bs4's own (slower) lxml-based engine.

	    The wrapped lxml element is stored in ``self.html``.  Attribute
	    access that normal lookup cannot satisfy is resolved, in order, as
	    a descendant tag lookup (``wrapper.a`` behaves like
	    ``wrapper.find('a')``) and then as an attribute of the wrapped
	    element; ``None`` is returned when neither exists.
	    """
	    def __init__(self, html):
	        """
	        Wrap *html*, which may be markup (``str``/``bytes``), another
	        ``BeauToLxml``, an lxml ``HtmlElement`` or a bs4
	        ``Tag``/``BeautifulSoup`` object.

	        Raises ``TypeError`` for any other input.
	        """
	        super().__init__()

	        # isinstance() rather than exact type comparison: lxml.html hands
	        # out HtmlElement subclasses for some tags (form controls, for
	        # example), and those have to be wrappable as well.
	        if isinstance(html, (str, bytes)):
	            self.html = fromstring(html)

	        elif isinstance(html, BeauToLxml):
	            self.html = html.html

	        elif isinstance(html, HtmlElement):
	            self.html = html

	        elif isinstance(html, (Tag, BeautifulSoup)):
	            # Round-trip through markup so lxml re-parses the bs4 tree.
	            self.html = fromstring(str(html))

	        else:
	            # Leaving self.html unset would only surface later as a
	            # RecursionError inside __getattr__.
	            raise TypeError(
	                'BeauToLxml cannot wrap objects of type %s' % type(html).__name__
	            )

	    def __repr__(self):
	        return 'BeauToLxml: ' + repr(self.html)

	    def __str__(self):
	        # .text is None when the element has no leading text, and
	        # __str__ must always return a string.
	        return self.html.text or ''

	    def __getitem__(self, item):
	        """
	        Return the value of attribute *item* of the wrapped element.

	        ``class`` is returned as a list of class names, every other
	        attribute as the raw string.  Raises ``KeyError`` when the
	        attribute is absent.
	        """
	        value = self.html.attrib[item]

	        if item == 'class':
	            # split() without an argument collapses runs of whitespace,
	            # so 'a  b' does not produce an empty class name.
	            return value.split()

	        return value

	    def __getattr__(self, item):
	        """
	        Fallback lookup: first descendant tag named *item*, then the
	        wrapped element's own attribute of that name, else ``None``.
	        """
	        # Only reached when normal lookup fails.  A missing 'html' means
	        # the instance was never initialised (e.g. built via __new__);
	        # resolving it through find() would recurse forever.
	        if item == 'html':
	            raise AttributeError(item)

	        val = self.find(item)

	        if val is not None:
	            return val

	        return getattr(self.html, item, None)

	    @property
	    def text(self) -> str:
	        """Text content of the wrapped element and its descendants."""
	        return self.html.text_content() or ''

	    def find(self, tag: str, _class: str=None, **kwargs):
	        """Return the first matching descendant wrapped, or ``None``."""
	        return find(self.html, tag, _class, **kwargs)

	    def find_all(self, tag: str, _class: str=None, **kwargs) -> tuple:
	        """Return a tuple of all matching descendants, each wrapped."""
	        return find_all(self.html, tag, _class, **kwargs)


	def find(html: Element, tag: str, _class: str=None, **kwargs) -> BeauToLxml or None:
	    """
	    Return the first descendant of *html* matching *tag* and the
	    attribute filters, wrapped in BeauToLxml, or None without a match.
	    """
	    # gen=True asks find_all() for a lazy iterator so that only the
	    # first match gets wrapped; an empty tuple signals "no match".
	    matches = find_all(html, tag, _class, gen=True, **kwargs)

	    if not matches:
	        return None

	    return next(iter(matches))


	def find_all(html: Element, tag: str, _class: str=None, gen: bool=False, **kwargs) -> iter or tuple:
	    """
	    Find every descendant of *html* matching *tag* and the attribute
	    filters (*_class* filters on the ``class`` attribute).

	    Returns an empty tuple when nothing matches.  Otherwise returns a
	    tuple of BeauToLxml wrappers, or a lazy iterator of them when
	    *gen* is true.
	    """
	    def hashable(value):
	        # get_xpath() is memoised with lru_cache, which hashes its
	        # arguments; lists and sets would raise TypeError there.
	        return tuple(value) if isinstance(value, (list, set)) else value

	    filters = {attr: hashable(val) for attr, val in kwargs.items()}
	    xpath = get_xpath(tag, hashable(_class), **filters)
	    elems = xpath(html)

	    if not elems:
	        return tuple()

	    wrapper_map = map(BeauToLxml, elems)  # lazy: wraps on demand

	    return wrapper_map if gen else tuple(wrapper_map)


	def _xpath_literal(value) -> str:
	    """Render *value* (via str()) as an XPath 1.0 string literal."""
	    text = str(value)

	    # XPath 1.0 has no escape sequences inside string literals, so use
	    # whichever quote character the text does not contain.
	    if '"' not in text:
	        return '"%s"' % text

	    if "'" not in text:
	        return "'%s'" % text

	    # Both quote kinds present: join the double-quote-free pieces with
	    # concat(), re-inserting each double quote as its own literal.
	    pieces = ('"%s"' % piece for piece in text.split('"'))
	    return 'concat(%s)' % ", '\"', ".join(pieces)


	@lru_cache(maxsize=None)
	def get_xpath(tag: str, _class: str=None, **kwargs) -> XPath:
	    """
	    Build a compiled XPath selecting descendant *tag* elements that
	    satisfy the given attribute filters.  Results are memoised, so all
	    arguments must be hashable (pass tuples rather than lists or sets).

	    *_class*, when truthy, is used as the filter for the ``class``
	    attribute.  Every keyword ``attr=value`` adds one predicate:

	    * ``True`` / ``False`` -- the attribute must be present / absent
	    * ``str``              -- the attribute must contain the string
	    * collection           -- the attribute must contain at least one
	                              of the items (an empty collection matches
	                              nothing)
	    * anything else        -- the attribute must equal ``str(value)``

	    Note that ``contains()`` is a substring test, so filtering on
	    ``"foo"`` also matches an attribute value of ``"foobar"``.
	    """
	    if _class:
	        kwargs['class'] = _class

	    predicates = []

	    for attr, val in kwargs.items():
	        attr_xp = '@' + attr

	        if isinstance(val, bool):
	            predicate = attr_xp if val else 'not(%s)' % attr_xp

	        elif isinstance(val, (set, frozenset, list, tuple)):
	            # contains() takes exactly two arguments, so emit one test
	            # per item and OR them together.
	            tests = [
	                'contains(%s, %s)' % (attr_xp, _xpath_literal(item))
	                for item in val
	            ]
	            predicate = ' or '.join(tests) if tests else 'false()'

	        elif isinstance(val, str):
	            predicate = 'contains(%s, %s)' % (attr_xp, _xpath_literal(val))

	        else:
	            predicate = '%s=%s' % (attr_xp, _xpath_literal(val))

	        predicates.append('[%s]' % predicate)

	    return XPath('.//' + tag + ''.join(predicates))