Skip to content

Instantly share code, notes, and snippets.

@kotfic
Created January 14, 2014 21:34
Show Gist options
  • Save kotfic/8426220 to your computer and use it in GitHub Desktop.
Save kotfic/8426220 to your computer and use it in GitHub Desktop.
import re
class SoupProxy(object):
    """Minimal, regex-based stand-in for a BeautifulSoup document or tag.

    A proxy wraps a string of markup (``contents``).  Proxies returned by
    :meth:`find_all` remember their ``parent`` and the ``start``/``end``
    offsets of their text inside ``parent.contents``, so that assigning a
    same-length replacement to :attr:`string` can be written back up the
    parent chain.

    Attributes are also reachable with item syntax (``proxy["class"]``),
    which is how matched tag attributes are read.
    """

    def __init__(self, contents, **kwargs):
        """Wrap *contents* and apply any attribute overrides.

        Args:
            contents: Either the markup itself or an object with a
                ``read()`` method whose return value is the markup.
            **kwargs: Set verbatim as instance attributes after the
                defaults, so they may override ``parent``, ``start`` and
                ``end``.
        """
        # Look the reader up explicitly so an AttributeError raised *inside*
        # read() is not mistaken for "this object has no read()".
        reader = getattr(contents, "read", None)
        self.contents = contents if reader is None else reader()

        # Defaults
        self.parent = None
        self.start = 0
        self.end = len(self.contents)

        for key, value in kwargs.items():
            setattr(self, key, value)

    def __getitem__(self, k):
        """Return the attribute named *k* (raises AttributeError if absent)."""
        return getattr(self, k)

    def __setitem__(self, k, v):
        """Set the attribute named *k* to *v*."""
        return setattr(self, k, v)

    @property
    def text(self):
        """The contents with every ``<...>`` span removed."""
        return re.sub('<[^>]*>', '', self.contents)

    @property
    def string(self):
        """Alias of :attr:`text` when read; see the setter for writes."""
        return self.text

    @string.setter
    def string(self, value):
        """Replace the contents in place and propagate the change upward.

        HACK: this exists so a PHI value can be replaced all the way up the
        SoupProxy parent chain.  It only works because the replacement has
        exactly the same length as the current contents, which keeps the
        ``start``/``end`` offsets of this proxy and its siblings valid.

        Raises:
            AssertionError: if ``len(value) != len(self.contents)``.
        """
        # Raised explicitly (rather than via ``assert``) so the invariant
        # still holds when Python runs with -O; the exception type is kept.
        if len(value) != len(self.contents):
            raise AssertionError("Length of new string value must be the same as current string value.")

        self.contents = value

        parent = self.parent
        if parent is not None:
            # Splice the new value into the parent's markup; this re-enters
            # the parent's setter, which continues up the chain.
            parent.string = parent.contents[:self.start] + value + parent.contents[self.end:]

    def __str__(self):
        return self.contents

    def __repr__(self):
        return self.contents

    def __unicode__(self):
        """Return the contents as text, decoding UTF-8 bytes if necessary."""
        if isinstance(self.contents, bytes):
            return self.contents.decode("utf-8")
        return self.contents

    def keys(self):
        """Return the names of all instance attributes."""
        return self.__dict__.keys()

    def find_all(self, tag):
        """Return a list containing SoupProxy objects that match tags related to the query string

        *tag* is interpolated into a regular expression as-is.  Only
        elements of the form ``<tag ...>...</tag>`` are matched: the pattern
        requires a space directly after the tag name, so an opening tag
        written without one (e.g. ``<b>``) is not found.

        Each result wraps the element's inner markup and carries ``name``,
        ``start``, ``end`` (offsets of that inner markup in
        ``self.contents``), ``parent`` and one attribute per ``key="value"``
        pair found in the opening tag.  NOTE: those pairs are applied last,
        so an HTML attribute called ``name``, ``start``, ``end`` or
        ``parent`` overwrites the bookkeeping value of the same name.
        """
        # This is a bad idea, please see: http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
        pattern = r"<(%s) (.*?)>(.*?)</%s>" % (tag, tag)

        found = []
        for match in re.finditer(pattern, self.contents, flags=re.DOTALL):
            name, attrs, text = match.groups()
            start, end = match.span(3)
            fields = {'name': name,
                      'start': start,
                      'end': end,
                      'parent': self}
            for key, value in re.findall(r'([^ ]*?)="(.*?)"', attrs, re.DOTALL):
                fields[key] = value
            found.append(SoupProxy(text, **fields))
        return found
class BeautifulSoup(SoupProxy):
    """Root proxy constructed with a BeautifulSoup-style ``(contents, parser)`` call."""

    def __init__(self, contents, parser, **kwargs):
        """Initialise the proxy from *contents*.

        ``parser`` is a required positional argument but is not used by
        this constructor; ``kwargs`` are forwarded to ``SoupProxy``.
        """
        super(BeautifulSoup, self).__init__(contents, **kwargs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment