# kotfic/bs4Proxy.py

## bs4Proxy.py
import re

class SoupProxy(object):
    """Regex-based stand-in for a parsed markup node.

    A proxy wraps a string of markup (``contents``) and remembers where
    that string sits inside its ``parent`` proxy (``start``/``end``
    offsets into ``parent.contents``).  Attributes can be read and
    written either as attributes or with item syntax (``proxy['TYPE']``).
    """

    def __init__(self, contents, **kwargs):
        """Wrap *contents*, which is either a string or an object with ``read()``.

        Every keyword argument is stored as an instance attribute, after
        the defaults below, so ``parent``, ``start`` and ``end`` can be
        overridden by the caller.
        """
        # Only the attribute lookup is guarded, so an AttributeError raised
        # *inside* read() is not mistaken for "this is a plain string".
        try:
            read = contents.read
        except AttributeError:
            self.contents = contents
        else:
            self.contents = read()

        # Defaults
        self.parent = None
        self.start = 0
        self.end = len(self.contents)

        for k, v in kwargs.items():
            setattr(self, k, v)

    def __getitem__(self, k):
        """Return the attribute named *k* (item-style attribute access)."""
        return getattr(self, k)

    def __setitem__(self, k, v):
        """Set the attribute named *k* to *v* (item-style attribute assignment)."""
        setattr(self, k, v)

    @property
    def text(self):
        """Return ``contents`` with every ``<...>`` span removed."""
        return re.sub('<[^>]*>', '', self.contents)

    @property
    def string(self):
        """Return the same value as ``text``."""
        return self.text

    @string.setter
    def string(self, value):
        """Replace ``contents`` with *value* and splice it into every ancestor.

        HACK: this exists so a value can be replaced in place all the way
        up the parent chain.  It only works because the replacement has
        the same length as the current contents, which keeps the
        ``start``/``end`` offsets of this proxy and its siblings valid.

        Raises:
            AssertionError: if ``len(value) != len(self.contents)``.
        """
        # Raised explicitly (rather than via ``assert``) so the check still
        # runs under ``python -O``; the exception type is unchanged.
        if len(value) != len(self.contents):
            raise AssertionError("Length of new string value must be the same as current string value.")

        self.contents = value
        if self.parent:
            parent = self.parent
            # Assigning to the parent's ``string`` recurses up the chain.
            parent.string = parent.contents[:self.start] + value + parent.contents[self.end:]

    def __str__(self):
        return self.contents

    def __repr__(self):
        return self.contents

    def __unicode__(self):
        # NOTE(review): this returns the UTF-8 *encoded* contents; confirm
        # that callers really expect encoded data here rather than text.
        return self.contents.encode("utf-8")

    def keys(self):
        """Return the names of the instance attributes currently set."""
        return self.__dict__.keys()

    def find_all(self, tag):
        """Return a list of SoupProxy objects, one per ``<tag ...>...</tag>`` match.

        *tag* is interpolated into the regular expression unescaped.  The
        pattern requires a space after the tag name, so only opening tags
        written with at least one attribute (or a trailing space) match.

        Each result wraps the text between the opening and closing tag and
        carries ``name``, ``start``/``end`` (offsets of that text within
        ``self.contents``), ``parent`` (this proxy) and one attribute per
        ``key="value"`` pair in the opening tag.  A markup attribute with
        one of those four names overwrites the structural value.
        """
        # This is a bad idea, please see: http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
        pattern = r"<(%s) (.*?)>(.*?)</%s>" % (tag, tag)
        found = []
        for match in re.finditer(pattern, self.contents, flags=re.DOTALL):
            name, attrs, text = match.groups()
            start, end = match.span(3)
            props = {'name': name,
                     'start': start,
                     'end': end,
                     'parent': self}

            # Markup attributes are applied last, matching the original order.
            props.update(re.findall(r'([^ ]*?)="(.*?)"', attrs, re.DOTALL))

            found.append(SoupProxy(text, **props))

        return found


class BeautifulSoup(SoupProxy):
    """Document-level SoupProxy constructed with a ``BeautifulSoup(markup, parser)`` call shape."""

    def __init__(self, contents, parser=None, **kwargs):
        """Wrap *contents* exactly as SoupProxy does.

        *parser* is accepted so existing ``BeautifulSoup(markup, parser)``
        call sites keep working; it is never used, so it defaults to
        ``None`` and may be omitted.
        """
        super(BeautifulSoup, self).__init__(contents, **kwargs)
	import re

	class SoupProxy(object):
		"""Regex-based stand-in for a parsed markup node.

		A proxy wraps a string of markup (``contents``) and remembers where
		that string sits inside its ``parent`` proxy (``start``/``end``
		offsets into ``parent.contents``).  Attributes can be read and
		written either as attributes or with item syntax (``proxy['TYPE']``).
		"""

		def __init__(self, contents, **kwargs):
			"""Wrap *contents*, which is either a string or an object with ``read()``.

			Every keyword argument is stored as an instance attribute, after
			the defaults below, so ``parent``, ``start`` and ``end`` can be
			overridden by the caller.
			"""
			# Only the attribute lookup is guarded, so an AttributeError raised
			# *inside* read() is not mistaken for "this is a plain string".
			try:
				read = contents.read
			except AttributeError:
				self.contents = contents
			else:
				self.contents = read()

			# Defaults
			self.parent = None
			self.start = 0
			self.end = len(self.contents)

			for k, v in kwargs.items():
				setattr(self, k, v)

		def __getitem__(self, k):
			"""Return the attribute named *k* (item-style attribute access)."""
			return getattr(self, k)

		def __setitem__(self, k, v):
			"""Set the attribute named *k* to *v* (item-style attribute assignment)."""
			setattr(self, k, v)

		@property
		def text(self):
			"""Return ``contents`` with every ``<...>`` span removed."""
			return re.sub('<[^>]*>', '', self.contents)

		@property
		def string(self):
			"""Return the same value as ``text``."""
			return self.text

		@string.setter
		def string(self, value):
			"""Replace ``contents`` with *value* and splice it into every ancestor.

			HACK: this exists so a value can be replaced in place all the way
			up the parent chain.  It only works because the replacement has
			the same length as the current contents, which keeps the
			``start``/``end`` offsets of this proxy and its siblings valid.

			Raises:
				AssertionError: if ``len(value) != len(self.contents)``.
			"""
			# Raised explicitly (rather than via ``assert``) so the check still
			# runs under ``python -O``; the exception type is unchanged.
			if len(value) != len(self.contents):
				raise AssertionError("Length of new string value must be the same as current string value.")

			self.contents = value
			if self.parent:
				parent = self.parent
				# Assigning to the parent's ``string`` recurses up the chain.
				parent.string = parent.contents[:self.start] + value + parent.contents[self.end:]

		def __str__(self):
			return self.contents

		def __repr__(self):
			return self.contents

		def __unicode__(self):
			# NOTE(review): this returns the UTF-8 *encoded* contents; confirm
			# that callers really expect encoded data here rather than text.
			return self.contents.encode("utf-8")

		def keys(self):
			"""Return the names of the instance attributes currently set."""
			return self.__dict__.keys()

		def find_all(self, tag):
			"""Return a list of SoupProxy objects, one per ``<tag ...>...</tag>`` match.

			*tag* is interpolated into the regular expression unescaped.  The
			pattern requires a space after the tag name, so only opening tags
			written with at least one attribute (or a trailing space) match.

			Each result wraps the text between the opening and closing tag and
			carries ``name``, ``start``/``end`` (offsets of that text within
			``self.contents``), ``parent`` (this proxy) and one attribute per
			``key="value"`` pair in the opening tag.  A markup attribute with
			one of those four names overwrites the structural value.
			"""
			# This is a bad idea, please see: http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
			pattern = r"<(%s) (.*?)>(.*?)</%s>" % (tag, tag)
			found = []
			for match in re.finditer(pattern, self.contents, flags=re.DOTALL):
				name, attrs, text = match.groups()
				start, end = match.span(3)
				props = {'name': name,
				         'start': start,
				         'end': end,
				         'parent': self}

				# Markup attributes are applied last, so they win on a name clash.
				props.update(re.findall(r'([^ ]*?)="(.*?)"', attrs, re.DOTALL))

				found.append(SoupProxy(text, **props))

			return found


	class BeautifulSoup(SoupProxy):
		"""Document-level SoupProxy constructed with a ``BeautifulSoup(markup, parser)`` call shape."""

		def __init__(self, contents, parser=None, **kwargs):
			"""Wrap *contents* exactly as SoupProxy does.

			*parser* is accepted so existing ``BeautifulSoup(markup, parser)``
			call sites keep working; it is never used, so it defaults to
			``None`` and may be omitted.
			"""
			super(BeautifulSoup, self).__init__(contents, **kwargs)