Created
January 8, 2014 18:39
-
-
Save aGHz/8321991 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import functools | |
import inspect | |
class Field(object):
    """Base field: picks elements out of a parsed document by CSS selector.

    After ``parse(soup)`` the selected elements are available as ``value``.
    Subclasses refine ``parse`` to turn those elements into other types.
    """

    def __init__(self, selector=None):
        """Create a field bound to *selector* (a CSS selector string or None)."""
        self._value = None
        # No document is attached until parse() runs; defining the attribute
        # here lets `element` be read safely before that happens.
        self.soup = None
        self.selector = selector

    def _get_element(self, selector=None):
        """Return the elements matching *selector* (default: ``self.selector``).

        Returns an empty list when no selector is available or when no
        document has been parsed yet.
        """
        selector = selector or self.selector
        if not selector or self.soup is None:
            return []
        return self.soup.select(selector)

    @property
    def element(self):
        """Elements currently matched by this field's selector."""
        return self._get_element()

    @property
    def value(self):
        """Result of the most recent ``parse`` call (None before any parse)."""
        return self._value

    def parse(self, soup):
        """Remember *soup*, select this field's elements from it, return them."""
        self.soup = soup
        self._value = self._get_element()
        return self._value

    def parser(self, selector=None):
        """Return a decorator that post-processes this field's parsed value.

        The decorated function is called as ``fn(soup, value)``, where
        *value* is whatever the field's current ``parse`` produced; its
        return value becomes the field's value. When *selector* is given it
        replaces the field's selector; when omitted the existing selector is
        kept.
        """
        def decorator(fn):
            if selector is not None:
                self.selector = selector
            # Capture the current parse in the closure. Looking it up on the
            # instance at call time would make a second @parser on the same
            # field call itself recursively.
            previous_parse = self.parse
            self._parse = previous_parse

            def new_parse(field, soup):
                value = previous_parse(soup)
                field._value = fn(soup, value)
                return field.value

            # Bind to this instance so it is called like a regular method.
            self.parse = new_parse.__get__(self, self.__class__)
            return fn  # nobody should call this, we don't care
        return decorator
class StringsField(Field):
    """Field whose value is the flat list of stripped strings of all matches."""

    def parse(self, soup):
        """Select the field's elements and collect their ``stripped_strings``.

        Returns a single list containing the strings of every matched
        element, in selection order (empty when nothing matches).
        """
        elements = super(StringsField, self).parse(soup)
        # A flat comprehension avoids re-copying the accumulated list for
        # every element and does not depend on the Python 2 `reduce` builtin.
        self._value = [
            text
            for element in elements
            for text in element.stripped_strings
        ]
        return self._value
class TextField(StringsField):
    """Field whose value is the matched strings joined into one string."""

    def __init__(self, selector=None, separator=' ', *args, **kwargs):
        """Store *separator*; everything else is forwarded to the base class."""
        super(TextField, self).__init__(selector=selector, *args, **kwargs)
        self.separator = separator

    def parse(self, soup):
        """Join the strings found by the base class with ``self.separator``."""
        pieces = super(TextField, self).parse(soup)
        joined = self.separator.join(pieces)
        self._value = joined
        return joined
class IntField(TextField):
    """Field whose matched text is converted to an ``int``."""

    def __init__(self, *args, **kwargs):
        """Build a text field that joins its strings with an empty separator."""
        super(IntField, self).__init__(separator='', *args, **kwargs)

    def parse(self, soup):
        """Parse the text as the base class does and convert it with ``int``.

        ``int`` raises ``ValueError`` when the joined text is not a valid
        integer literal.
        """
        digits = super(IntField, self).parse(soup)
        number = int(digits)
        self._value = number
        return number
class ListField(Field):
    """Field intended to expose the matched elements as a list."""

    def parse(self, soup):
        """Select the field's elements and return them.

        TODO(review): the original method stopped mid-statement
        (``lis = self._value.``), which is a syntax error. The intended
        per-item processing is unknown, so for now this returns the selected
        elements unchanged -- confirm the desired behaviour.
        """
        super(ListField, self).parse(soup)
        return self._value
class Document(object):
    """Container that parses markup and feeds it to its ``Field`` attributes."""

    def __init__(self, content=None):
        """Collect the ``Field`` members and parse *content* when it is truthy."""
        self.__fields = {
            name: attr
            for (name, attr) in inspect.getmembers(self)
            if isinstance(attr, Field)
        }
        if content:
            self.parse(content)

    def parse(self, content):
        """Build a soup from *content* and run every collected field over it."""
        self.content = content
        # Name the parser explicitly: otherwise bs4 picks whichever parser
        # library is installed, so the tree can differ between environments
        # (and recent bs4 versions emit a warning).
        self.soup = BeautifulSoup(content, 'html.parser')
        for field in self.__fields.values():
            field.parse(self.soup)
class MyDoc(Document):
    """Example document: an integer ``age`` and a ``sum`` of list items."""

    age = IntField('#age')
    sum = StringsField()

    @sum.parser('#qwer')
    def sum_items(soup, parsed):
        """Add up the integer value of each parsed string."""
        # Class-level names are not visible inside function bodies, so `sum`
        # here is the builtin, not the field defined above.
        return sum(int(item) for item in parsed)
# Demo: parse a small snippet and show both field values.
content = """
<span id="age">42</span>
<ul id="qwer">
<li>1</li>
<li>2</li>
<li>3</li>
</ul>
"""
document = MyDoc(content)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(document.age.value)
print(document.sum.value)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment