Created
January 14, 2014 21:34
-
-
Save kotfic/8426220 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
class SoupProxy(object):
    """Small regex-based stand-in for a BeautifulSoup document or tag.

    An instance wraps a markup string in ``contents``. Instances returned by
    ``find_all`` additionally record their ``parent`` proxy and the
    ``start``/``end`` offsets of their text inside ``parent.contents``, so a
    same-length replacement assigned to ``string`` can be written back up the
    parent chain.

    Every instance attribute is also reachable with item syntax
    (``proxy["name"]``), which is how parsed markup attributes are exposed.
    """

    def __init__(self, contents, **kwargs):
        """Wrap ``contents`` and apply ``kwargs`` as instance attributes.

        ``contents`` may be a string or any object with a ``read()`` method,
        in which case the result of ``read()`` is stored. Keyword arguments
        are set as attributes after the defaults, so they can override
        ``parent``, ``start`` and ``end``.
        """
        try:
            self.contents = contents.read()
        except AttributeError:
            # Not file-like: treat the argument as the markup itself.
            self.contents = contents

        # Defaults for a top-level proxy; kwargs below may override them.
        self.parent = None
        self.start = 0
        self.end = len(self.contents)

        for key, value in kwargs.items():
            setattr(self, key, value)

    def __getitem__(self, k):
        """Return attribute ``k``; raises AttributeError if it is not set."""
        return getattr(self, k)

    def __setitem__(self, k, v):
        """Set attribute ``k`` to ``v`` using item-assignment syntax."""
        # The original call omitted ``v``, so every item assignment raised
        # TypeError.
        setattr(self, k, v)

    @property
    def text(self):
        """The contents with every ``<...>`` span removed."""
        return re.sub('<[^>]*>', '', self.contents)

    @property
    def string(self):
        """Alias for ``text`` when read; see the setter for assignment."""
        return self.text

    @string.setter
    def string(self, value):
        """Replace ``contents`` with ``value`` and patch every ancestor.

        HACK: this exists so a PHI value can be replaced in place all the way
        up the SoupProxy parent chain. It only works because the replacement
        must have exactly the same length as the current contents, which
        keeps the recorded ``start``/``end`` offsets valid.

        Raises:
            AssertionError: if ``len(value) != len(self.contents)``.
        """
        # Raised explicitly rather than via ``assert`` so the check is not
        # stripped under ``python -O``; a skipped check would silently
        # corrupt the parent's contents. Exception type and message are
        # unchanged.
        if len(value) != len(self.contents):
            raise AssertionError("Length of new string value must be the same as current string value.")

        self.contents = value
        if self.parent:
            parent = self.parent
            # Assigning through the parent's own setter recurses up the chain.
            parent.string = parent.contents[:self.start] + value + parent.contents[self.end:]

    def __str__(self):
        return self.contents

    def __repr__(self):
        return self.contents

    def __unicode__(self):
        """Return the contents as text (only consulted by Python 2)."""
        # The original returned ``contents.encode("utf-8")``, i.e. bytes,
        # which is the opposite of what ``__unicode__`` must produce.
        contents = self.contents
        if isinstance(contents, bytes):
            return contents.decode("utf-8")
        return contents

    def keys(self):
        """Return the names of all instance attributes."""
        return self.__dict__.keys()

    def find_all(self, tag):
        """Return a list containing SoupProxy objects that match tags related to the query string

        Only elements written as ``<tag attrs>...</tag>`` are matched: the
        pattern requires a space after the tag name. Each result wraps the
        element's inner text, with ``name``, ``start``, ``end`` (offsets of
        that text in ``self.contents``) and ``parent`` set, plus one attribute
        per ``key="value"`` pair found in the opening tag.
        """
        # This is a bad idea, please see: http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
        found = []
        pattern = r"<(%s) (.*?)>(.*?)</%s>" % (tag, tag)
        for match in re.finditer(pattern, self.contents, flags=re.DOTALL):
            name, attrs, text = match.groups()
            start, end = match.span(3)
            properties = {'name': name,
                          'start': start,
                          'end': end,
                          'parent': self}
            # NOTE: markup attributes are applied last, so an attribute named
            # ``name``, ``start``, ``end`` or ``parent`` overrides the
            # structural value above.
            for key, value in re.findall(r'([^ ]*?)="(.*?)"', attrs, re.DOTALL):
                properties[key] = value
            found.append(SoupProxy(text, **properties))
        return found
class BeautifulSoup(SoupProxy):
    """Top-level document proxy with a BeautifulSoup-like constructor."""

    def __init__(self, contents, parser=None, **kwargs):
        """Wrap ``contents`` as the root SoupProxy.

        ``parser`` is accepted only for call-signature compatibility and is
        never used, so it now defaults to ``None`` and may be omitted.
        Remaining keyword arguments are forwarded to ``SoupProxy.__init__``.
        """
        super(BeautifulSoup, self).__init__(contents, **kwargs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment