Skip to content

Instantly share code, notes, and snippets.

@aGHz
Created January 8, 2014 18:39
Show Gist options
  • Save aGHz/8321991 to your computer and use it in GitHub Desktop.
Save aGHz/8321991 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import functools
import inspect
class Field(object):
    """Declarative field that extracts elements from a parsed document.

    A field carries a CSS selector. ``parse`` runs that selector against a
    soup object (anything exposing ``select(selector)``) and stores the
    matches as the field's value.
    """

    def __init__(self, selector=None):
        self._value = None
        self.selector = selector
        # Nothing has been parsed yet; parse() stores the soup here.
        self.soup = None

    def _get_element(self, selector=None):
        """Return the elements matching *selector* (default: the field's own).

        An empty list is returned when no selector is available or when no
        soup has been parsed yet.
        """
        selector = selector or self.selector
        if not selector or self.soup is None:
            return []
        return self.soup.select(selector)

    @property
    def element(self):
        """Elements currently matched by the field's selector."""
        return self._get_element()

    @property
    def value(self):
        """Value produced by the most recent ``parse`` call (initially None)."""
        return self._value

    def parse(self, soup):
        """Remember *soup*, select the field's elements and return them."""
        self.soup = soup
        self._value = self._get_element()
        return self._value

    def parser(self, selector=None):
        """Return a decorator that post-processes this field's parsed value.

        The decorated function is called as ``fn(soup, value)``, where *value*
        is what the field's current ``parse`` produced; whatever it returns
        becomes the field's value. When *selector* is given it replaces the
        field's selector; when omitted the existing selector is kept.
        """
        def decorator(fn):
            # Only override the selector when one was actually supplied, so
            # ``Field('#x').parser()`` does not silently discard '#x'.
            if selector is not None:
                self.selector = selector
            # Capture the current parse in the closure rather than looking it
            # up through ``self._parse`` at call time: a second decorator on
            # the same field would otherwise rebind ``_parse`` to the first
            # wrapper and make that wrapper call itself forever.
            inner_parse = self.parse
            self._parse = inner_parse  # kept for code that reads the attribute

            def new_parse(field, soup):
                value = inner_parse(soup)
                field._value = fn(soup, value)
                return field.value

            # Bind the wrapper to this instance so it shadows the class method.
            self.parse = new_parse.__get__(self, self.__class__)
            return fn  # nobody should call this, we don't care
        return decorator
class StringsField(Field):
    """Field whose value is the flat list of stripped strings of its elements."""

    def parse(self, soup):
        """Select the field's elements and collect their ``stripped_strings``.

        Returns a single list with the strings of every matched element, in
        document order; the list is empty when nothing matched.
        """
        elements = super(StringsField, self).parse(soup)
        # Extend one list in place: linear in the number of strings, and it
        # avoids the bare ``reduce`` builtin, which no longer exists on
        # Python 3.
        strings = []
        for element in elements:
            strings.extend(element.stripped_strings)
        self._value = strings
        return self._value
class TextField(StringsField):
    """Field whose value is the stripped strings joined into one string."""

    def __init__(self, selector=None, separator=' ', *args, **kwargs):
        """Store *separator*, the string placed between the joined pieces."""
        super(TextField, self).__init__(selector=selector, *args, **kwargs)
        self.separator = separator

    def parse(self, soup):
        """Join the parent's list of strings with ``self.separator``."""
        pieces = super(TextField, self).parse(soup)
        joined = self.separator.join(pieces)
        self._value = joined
        return joined
class IntField(TextField):
    """Field whose text is concatenated without a separator and read as int."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to TextField, forcing an empty separator."""
        super(IntField, self).__init__(separator='', *args, **kwargs)

    def parse(self, soup):
        """Convert the joined text to ``int``.

        ``int`` raises ValueError when the text is not a valid integer
        literal (including when it is empty).
        """
        text = super(IntField, self).parse(soup)
        number = int(text)
        self._value = number
        return number
class ListField(Field):
    """Field whose value is the list of elements matched by its selector."""

    def parse(self, soup):
        """Select the field's elements and return them as a plain list.

        TODO: the original implementation stopped at an unfinished statement
        (``lis = self._value.``), which was a SyntaxError for the whole
        module. Whatever per-item extraction was planned is not implemented;
        for now the matched elements are returned unchanged.
        """
        elements = super(ListField, self).parse(soup)
        self._value = list(elements)
        return self._value
class Document(object):
    """Base class that parses markup and feeds it to its Field attributes.

    Subclasses declare ``Field`` instances as attributes; every one of them
    is parsed against the document's soup when ``parse`` runs.
    """

    def __init__(self, content=None):
        """Collect the Field attributes and parse *content* if it is truthy."""
        # getmembers() sees class-level attributes too, so fields declared in
        # a subclass body are picked up here.
        self.__fields = {name: attr
                         for (name, attr) in inspect.getmembers(self)
                         if isinstance(attr, Field)}
        if content:
            self.parse(content)

    def parse(self, content):
        """Build a soup from *content* and run every collected field on it."""
        self.content = content
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed, so results can differ between
        # machines (and recent versions emit a warning).
        self.soup = BeautifulSoup(content, 'html.parser')
        for field in self.__fields.values():
            field.parse(self.soup)
class MyDoc(Document):
    """Example document: an integer ``age`` and the ``sum`` of list items."""

    # Integer read from the text of the element with id "age".
    age = IntField('#age')
    # Strings under the element with id "qwer"; the selector is supplied by
    # the parser decorator below, which also reduces them to their total.
    sum = StringsField()

    @sum.parser('#qwer')
    def sum_items(soup, parsed):
        # Class-body names are not visible from inside a function body, so
        # ``sum`` here is the builtin, not the field declared above.
        return sum(int(text) for text in parsed)
# Sample markup exercised by the demo below.
content = """
<span id="age">42</span>
<ul id="qwer">
<li>1</li>
<li>2</li>
<li>3</li>
</ul>
"""
document = MyDoc(content)
# A parenthesised single-argument print behaves exactly like the print
# statement on Python 2 and is also valid on Python 3.
print(document.age.value)
print(document.sum.value)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment