Last active
August 29, 2015 14:15
-
-
Save umrashrf/2f090797bdc25857325b to your computer and use it in GitHub Desktop.
Extracting the selectors module from Scrapy into a standalone package
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From 2b781158628b586bf085888dbd8d32334fa4bf6a Mon Sep 17 00:00:00 2001 | |
From: Umair Ashraf <umr.ashrf@gmail.com> | |
Date: Sat, 21 Feb 2015 20:04:07 +0500 | |
Subject: [PATCH 1/5] added basic selectors files -- gitignore and readme | |
--- | |
.gitignore | 11 +++++++++++ | |
README.md | 3 +++ | |
2 files changed, 14 insertions(+) | |
create mode 100644 .gitignore | |
create mode 100644 README.md | |
diff --git a/.gitignore b/.gitignore | |
new file mode 100644 | |
index 0000000..837a67b | |
--- /dev/null | |
+++ b/.gitignore | |
@@ -0,0 +1,11 @@ | |
+*.pyc | |
+_trial_temp* | |
+dropin.cache | |
+docs/build | |
+*egg-info | |
+.tox | |
+venv | |
+build | |
+dist | |
+.idea | |
+.html | |
diff --git a/README.md b/README.md | |
new file mode 100644 | |
index 0000000..6196415 | |
--- /dev/null | |
+++ b/README.md | |
@@ -0,0 +1,3 @@ | |
+# Selectors | |
+ | |
+Selectors provide high level API for XML and HTML parsing using XPath and CSS selectors in Python. | |
-- | |
1.9.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From eb60eb2ceaac01d23ec8167f4f3d797ed6c03014 Mon Sep 17 00:00:00 2001 | |
From: Umair Ashraf <umr.ashrf@gmail.com> | |
Date: Sat, 21 Feb 2015 20:13:35 +0500 | |
Subject: [PATCH 2/5] made selectors independent of scrapy | |
--- | |
selectors/__init__.py | 10 +- | |
selectors/common.py | 22 ++++ | |
selectors/csstranslator.py | 16 +-- | |
selectors/exceptions.py | 5 + | |
selectors/lxmldocument.py | 31 ------ | |
selectors/lxmlsel.py | 50 --------- | |
selectors/unified.py | 63 +++-------- | |
selectors/utils/decorator.py | 23 +--- | |
selectors/utils/misc.py | 86 +-------------- | |
selectors/utils/python.py | 249 ------------------------------------------- | |
10 files changed, 58 insertions(+), 497 deletions(-) | |
create mode 100644 selectors/common.py | |
create mode 100644 selectors/exceptions.py | |
delete mode 100644 selectors/lxmldocument.py | |
delete mode 100644 selectors/lxmlsel.py | |
diff --git a/selectors/__init__.py b/selectors/__init__.py | |
index bfbde4d..97eb9d5 100644 | |
--- a/selectors/__init__.py | |
+++ b/selectors/__init__.py | |
@@ -1,5 +1,5 @@ | |
-""" | |
-Selectors | |
-""" | |
-from scrapy.selector.unified import * | |
-from scrapy.selector.lxmlsel import * | |
+ | |
+__version__ = '0.0.1' | |
+ | |
+ | |
+from selectors.unified import * | |
diff --git a/selectors/common.py b/selectors/common.py | |
new file mode 100644 | |
index 0000000..4cbf1ec | |
--- /dev/null | |
+++ b/selectors/common.py | |
@@ -0,0 +1,22 @@ | |
+""" | |
+We need these things in Scrapy and Selectors packages both | |
+""" | |
+from lxml import etree | |
+ | |
+from .csstranslator import SelectorHTMLTranslator, SelectorGenericTranslator | |
+ | |
+ | |
+class SafeXMLParser(etree.XMLParser): | |
+ def __init__(self, *args, **kwargs): | |
+ kwargs.setdefault('resolve_entities', False) | |
+ super(SafeXMLParser, self).__init__(*args, **kwargs) | |
+ | |
+ | |
+_ctgroup = { | |
+ 'html': {'_parser': etree.HTMLParser, | |
+ '_csstranslator': SelectorHTMLTranslator(), | |
+ '_tostring_method': 'html'}, | |
+ 'xml': {'_parser': SafeXMLParser, | |
+ '_csstranslator': SelectorGenericTranslator(), | |
+ '_tostring_method': 'xml'}, | |
+} | |
diff --git a/selectors/csstranslator.py b/selectors/csstranslator.py | |
index 7482837..2148a10 100644 | |
--- a/selectors/csstranslator.py | |
+++ b/selectors/csstranslator.py | |
@@ -3,7 +3,7 @@ from cssselect.xpath import _unicode_safe_getattr, XPathExpr, ExpressionError | |
from cssselect.parser import FunctionalPseudoElement | |
-class ScrapyXPathExpr(XPathExpr): | |
+class SelectorXPathExpr(XPathExpr): | |
textnode = False | |
attribute = None | |
@@ -16,7 +16,7 @@ class ScrapyXPathExpr(XPathExpr): | |
return x | |
def __str__(self): | |
- path = super(ScrapyXPathExpr, self).__str__() | |
+ path = super(SelectorXPathExpr, self).__str__() | |
if self.textnode: | |
if path == '*': | |
path = 'text()' | |
@@ -33,7 +33,7 @@ class ScrapyXPathExpr(XPathExpr): | |
return path | |
def join(self, combiner, other): | |
- super(ScrapyXPathExpr, self).join(combiner, other) | |
+ super(SelectorXPathExpr, self).join(combiner, other) | |
self.textnode = other.textnode | |
self.attribute = other.attribute | |
return self | |
@@ -43,7 +43,7 @@ class TranslatorMixin(object): | |
def xpath_element(self, selector): | |
xpath = super(TranslatorMixin, self).xpath_element(selector) | |
- return ScrapyXPathExpr.from_xpath(xpath) | |
+ return SelectorXPathExpr.from_xpath(xpath) | |
def xpath_pseudo_element(self, xpath, pseudo_element): | |
if isinstance(pseudo_element, FunctionalPseudoElement): | |
@@ -71,18 +71,18 @@ class TranslatorMixin(object): | |
raise ExpressionError( | |
"Expected a single string or ident for ::attr(), got %r" | |
% function.arguments) | |
- return ScrapyXPathExpr.from_xpath(xpath, | |
+ return SelectorXPathExpr.from_xpath(xpath, | |
attribute=function.arguments[0].value) | |
def xpath_text_simple_pseudo_element(self, xpath): | |
"""Support selecting text nodes using ::text pseudo-element""" | |
- return ScrapyXPathExpr.from_xpath(xpath, textnode=True) | |
+ return SelectorXPathExpr.from_xpath(xpath, textnode=True) | |
-class ScrapyGenericTranslator(TranslatorMixin, GenericTranslator): | |
+class SelectorGenericTranslator(TranslatorMixin, GenericTranslator): | |
pass | |
-class ScrapyHTMLTranslator(TranslatorMixin, HTMLTranslator): | |
+class SelectorHTMLTranslator(TranslatorMixin, HTMLTranslator): | |
pass | |
diff --git a/selectors/exceptions.py b/selectors/exceptions.py | |
new file mode 100644 | |
index 0000000..9ed8b6b | |
--- /dev/null | |
+++ b/selectors/exceptions.py | |
@@ -0,0 +1,5 @@ | |
+class SelectorsDeprecationWarning(Warning): | |
+ """Warning category for deprecated features, since the default | |
+ DeprecationWarning is silenced on Python 2.7+ | |
+ """ | |
+ pass | |
diff --git a/selectors/lxmldocument.py b/selectors/lxmldocument.py | |
deleted file mode 100644 | |
index 817349b..0000000 | |
--- a/selectors/lxmldocument.py | |
+++ /dev/null | |
@@ -1,31 +0,0 @@ | |
-""" | |
-This module contains a simple class (LxmlDocument) which provides cache and | |
-garbage collection to lxml element tree documents. | |
-""" | |
- | |
-import weakref | |
-from lxml import etree | |
-from scrapy.utils.trackref import object_ref | |
- | |
- | |
-def _factory(response, parser_cls): | |
- url = response.url | |
- body = response.body_as_unicode().strip().encode('utf8') or '<html/>' | |
- parser = parser_cls(recover=True, encoding='utf8') | |
- return etree.fromstring(body, parser=parser, base_url=url) | |
- | |
- | |
-class LxmlDocument(object_ref): | |
- | |
- cache = weakref.WeakKeyDictionary() | |
- __slots__ = ['__weakref__'] | |
- | |
- def __new__(cls, response, parser=etree.HTMLParser): | |
- cache = cls.cache.setdefault(response, {}) | |
- if parser not in cache: | |
- obj = object_ref.__new__(cls) | |
- cache[parser] = _factory(response, parser) | |
- return cache[parser] | |
- | |
- def __str__(self): | |
- return "<LxmlDocument %s>" % self.root.tag | |
diff --git a/selectors/lxmlsel.py b/selectors/lxmlsel.py | |
deleted file mode 100644 | |
index 070cb23..0000000 | |
--- a/selectors/lxmlsel.py | |
+++ /dev/null | |
@@ -1,50 +0,0 @@ | |
-""" | |
-XPath selectors based on lxml | |
-""" | |
-from scrapy.utils.deprecate import create_deprecated_class | |
-from .unified import Selector, SelectorList | |
- | |
- | |
-__all__ = ['HtmlXPathSelector', 'XmlXPathSelector', 'XPathSelector', | |
- 'XPathSelectorList'] | |
- | |
-def _xpathselector_css(self, *a, **kw): | |
- raise RuntimeError('.css() method not available for %s, ' | |
- 'instantiate scrapy.Selector ' | |
- 'instead' % type(self).__name__) | |
- | |
-XPathSelector = create_deprecated_class( | |
- 'XPathSelector', | |
- Selector, | |
- { | |
- '__slots__': (), | |
- '_default_type': 'html', | |
- 'css': _xpathselector_css, | |
- }, | |
- new_class_path='scrapy.Selector', | |
- old_class_path='scrapy.selector.XPathSelector', | |
-) | |
- | |
-XmlXPathSelector = create_deprecated_class( | |
- 'XmlXPathSelector', | |
- XPathSelector, | |
- clsdict={ | |
- '__slots__': (), | |
- '_default_type': 'xml', | |
- }, | |
- new_class_path='scrapy.Selector', | |
- old_class_path='scrapy.selector.XmlXPathSelector', | |
-) | |
- | |
-HtmlXPathSelector = create_deprecated_class( | |
- 'HtmlXPathSelector', | |
- XPathSelector, | |
- clsdict={ | |
- '__slots__': (), | |
- '_default_type': 'html', | |
- }, | |
- new_class_path='scrapy.Selector', | |
- old_class_path='scrapy.selector.HtmlXPathSelector', | |
-) | |
- | |
-XPathSelectorList = create_deprecated_class('XPathSelectorList', SelectorList) | |
diff --git a/selectors/unified.py b/selectors/unified.py | |
index b8a3678..77b363a 100644 | |
--- a/selectors/unified.py | |
+++ b/selectors/unified.py | |
@@ -1,57 +1,24 @@ | |
""" | |
XPath selectors based on lxml | |
""" | |
+import re | |
from lxml import etree | |
-from scrapy.utils.misc import extract_regex | |
-from scrapy.utils.trackref import object_ref | |
-from scrapy.utils.python import unicode_to_str, flatten | |
-from scrapy.utils.decorator import deprecated | |
-from scrapy.http import HtmlResponse, XmlResponse | |
-from .lxmldocument import LxmlDocument | |
-from .csstranslator import ScrapyHTMLTranslator, ScrapyGenericTranslator | |
+from .utils.misc import extract_regex | |
+from .utils.python import flatten | |
+from .utils.decorator import deprecated | |
+from .common import _ctgroup | |
__all__ = ['Selector', 'SelectorList'] | |
-class SafeXMLParser(etree.XMLParser): | |
- def __init__(self, *args, **kwargs): | |
- kwargs.setdefault('resolve_entities', False) | |
- super(SafeXMLParser, self).__init__(*args, **kwargs) | |
+class Selector(object): | |
-_ctgroup = { | |
- 'html': {'_parser': etree.HTMLParser, | |
- '_csstranslator': ScrapyHTMLTranslator(), | |
- '_tostring_method': 'html'}, | |
- 'xml': {'_parser': SafeXMLParser, | |
- '_csstranslator': ScrapyGenericTranslator(), | |
- '_tostring_method': 'xml'}, | |
-} | |
+ __slots__ = ['text', 'namespaces', 'type', '_expr', '_root', | |
+ '_parser', '_csstranslator', '_tostring_method'] | |
- | |
-def _st(response, st): | |
- if st is None: | |
- return 'xml' if isinstance(response, XmlResponse) else 'html' | |
- elif st in ('xml', 'html'): | |
- return st | |
- else: | |
- raise ValueError('Invalid type: %s' % st) | |
- | |
- | |
-def _response_from_text(text, st): | |
- rt = XmlResponse if st == 'xml' else HtmlResponse | |
- return rt(url='about:blank', encoding='utf-8', | |
- body=unicode_to_str(text, 'utf-8')) | |
- | |
- | |
-class Selector(object_ref): | |
- | |
- __slots__ = ['response', 'text', 'namespaces', 'type', '_expr', '_root', | |
- '__weakref__', '_parser', '_csstranslator', '_tostring_method'] | |
- | |
- _default_type = None | |
_default_namespaces = { | |
"re": "http://exslt.org/regular-expressions", | |
@@ -65,23 +32,23 @@ class Selector(object_ref): | |
} | |
_lxml_smart_strings = False | |
- def __init__(self, response=None, text=None, type=None, namespaces=None, | |
+ def __init__(self, text=None, url=None, type='html', namespaces=None, | |
_root=None, _expr=None): | |
- self.type = st = _st(response, type or self._default_type) | |
+ self.type = st = type | |
self._parser = _ctgroup[st]['_parser'] | |
self._csstranslator = _ctgroup[st]['_csstranslator'] | |
self._tostring_method = _ctgroup[st]['_tostring_method'] | |
+ self.text = text | |
if text is not None: | |
- response = _response_from_text(text, st) | |
+ body = text.strip().encode('utf8') or '<html/>' | |
+ parser_obj = self._parser(recover=True, encoding='utf8') | |
+ _root = etree.fromstring(body, base_url=url, parser=parser_obj) | |
- if response is not None: | |
- _root = LxmlDocument(response, self._parser) | |
- | |
- self.response = response | |
self.namespaces = dict(self._default_namespaces) | |
if namespaces is not None: | |
self.namespaces.update(namespaces) | |
+ | |
self._root = _root | |
self._expr = _expr | |
diff --git a/selectors/utils/decorator.py b/selectors/utils/decorator.py | |
index 38bee1a..2177a9a 100644 | |
--- a/selectors/utils/decorator.py | |
+++ b/selectors/utils/decorator.py | |
@@ -1,9 +1,7 @@ | |
import warnings | |
from functools import wraps | |
-from twisted.internet import defer, threads | |
- | |
-from scrapy.exceptions import ScrapyDeprecationWarning | |
+from selectors.exceptions import SelectorsDeprecationWarning | |
def deprecated(use_instead=None): | |
@@ -17,7 +15,7 @@ def deprecated(use_instead=None): | |
message = "Call to deprecated function %s." % func.__name__ | |
if use_instead: | |
message += " Use %s instead." % use_instead | |
- warnings.warn(message, category=ScrapyDeprecationWarning, stacklevel=2) | |
+ warnings.warn(message, category=SelectorsDeprecationWarning, stacklevel=2) | |
return func(*args, **kwargs) | |
return wrapped | |
@@ -25,20 +23,3 @@ def deprecated(use_instead=None): | |
deco = deco(use_instead) | |
use_instead = None | |
return deco | |
- | |
- | |
-def defers(func): | |
- """Decorator to make sure a function always returns a deferred""" | |
- @wraps(func) | |
- def wrapped(*a, **kw): | |
- return defer.maybeDeferred(func, *a, **kw) | |
- return wrapped | |
- | |
-def inthread(func): | |
- """Decorator to call a function in a thread and return a deferred with the | |
- result | |
- """ | |
- @wraps(func) | |
- def wrapped(*a, **kw): | |
- return threads.deferToThread(func, *a, **kw) | |
- return wrapped | |
diff --git a/selectors/utils/misc.py b/selectors/utils/misc.py | |
index 3152db6..969e78e 100644 | |
--- a/selectors/utils/misc.py | |
+++ b/selectors/utils/misc.py | |
@@ -1,76 +1,9 @@ | |
"""Helper functions which doesn't fit anywhere else""" | |
import re | |
-import hashlib | |
-from importlib import import_module | |
-from pkgutil import iter_modules | |
-import six | |
from w3lib.html import replace_entities | |
-from scrapy.utils.python import flatten | |
-from scrapy.item import BaseItem | |
- | |
- | |
-_ITERABLE_SINGLE_VALUES = dict, BaseItem, six.text_type, bytes | |
- | |
- | |
-def arg_to_iter(arg): | |
- """Convert an argument to an iterable. The argument can be a None, single | |
- value, or an iterable. | |
- | |
- Exception: if arg is a dict, [arg] will be returned | |
- """ | |
- if arg is None: | |
- return [] | |
- elif not isinstance(arg, _ITERABLE_SINGLE_VALUES) and hasattr(arg, '__iter__'): | |
- return arg | |
- else: | |
- return [arg] | |
- | |
- | |
-def load_object(path): | |
- """Load an object given its absolute object path, and return it. | |
- | |
- object can be a class, function, variable o instance. | |
- path ie: 'scrapy.contrib.downloadermiddelware.redirect.RedirectMiddleware' | |
- """ | |
- | |
- try: | |
- dot = path.rindex('.') | |
- except ValueError: | |
- raise ValueError("Error loading object '%s': not a full path" % path) | |
- | |
- module, name = path[:dot], path[dot+1:] | |
- mod = import_module(module) | |
- | |
- try: | |
- obj = getattr(mod, name) | |
- except AttributeError: | |
- raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name)) | |
- | |
- return obj | |
- | |
- | |
-def walk_modules(path): | |
- """Loads a module and all its submodules from a the given module path and | |
- returns them. If *any* module throws an exception while importing, that | |
- exception is thrown back. | |
- | |
- For example: walk_modules('scrapy.utils') | |
- """ | |
- | |
- mods = [] | |
- mod = import_module(path) | |
- mods.append(mod) | |
- if hasattr(mod, '__path__'): | |
- for _, subpath, ispkg in iter_modules(mod.__path__): | |
- fullpath = path + '.' + subpath | |
- if ispkg: | |
- mods += walk_modules(fullpath) | |
- else: | |
- submod = import_module(fullpath) | |
- mods.append(submod) | |
- return mods | |
+from .python import flatten | |
def extract_regex(regex, text, encoding='utf-8'): | |
@@ -94,20 +27,3 @@ def extract_regex(regex, text, encoding='utf-8'): | |
return [replace_entities(s, keep=['lt', 'amp']) for s in strings] | |
else: | |
return [replace_entities(unicode(s, encoding), keep=['lt', 'amp']) for s in strings] | |
- | |
- | |
-def md5sum(file): | |
- """Calculate the md5 checksum of a file-like object without reading its | |
- whole content in memory. | |
- | |
- >>> from io import BytesIO | |
- >>> md5sum(BytesIO(b'file content to hash')) | |
- '784406af91dd5a54fbb9c84c2236595a' | |
- """ | |
- m = hashlib.md5() | |
- while 1: | |
- d = file.read(8096) | |
- if not d: | |
- break | |
- m.update(d) | |
- return m.hexdigest() | |
diff --git a/selectors/utils/python.py b/selectors/utils/python.py | |
index 551d337..beb62f0 100644 | |
--- a/selectors/utils/python.py | |
+++ b/selectors/utils/python.py | |
@@ -1,19 +1,3 @@ | |
-""" | |
-This module contains essential stuff that should've come with Python itself ;) | |
- | |
-It also contains functions (or functionality) which is in Python versions | |
-higher than 2.5 which used to be the lowest version supported by Scrapy. | |
- | |
-""" | |
-import os | |
-import re | |
-import inspect | |
-import weakref | |
-import errno | |
-import six | |
-from functools import partial, wraps | |
- | |
- | |
def flatten(x): | |
"""flatten(sequence) -> list | |
@@ -34,236 +18,3 @@ def flatten(x): | |
else: | |
result.append(el) | |
return result | |
- | |
- | |
-def unique(list_, key=lambda x: x): | |
- """efficient function to uniquify a list preserving item order""" | |
- seen = set() | |
- result = [] | |
- for item in list_: | |
- seenkey = key(item) | |
- if seenkey in seen: | |
- continue | |
- seen.add(seenkey) | |
- result.append(item) | |
- return result | |
- | |
- | |
-def str_to_unicode(text, encoding=None, errors='strict'): | |
- """Return the unicode representation of text in the given encoding. Unlike | |
- .encode(encoding) this function can be applied directly to a unicode | |
- object without the risk of double-decoding problems (which can happen if | |
- you don't use the default 'ascii' encoding) | |
- """ | |
- | |
- if encoding is None: | |
- encoding = 'utf-8' | |
- if isinstance(text, str): | |
- return text.decode(encoding, errors) | |
- elif isinstance(text, unicode): | |
- return text | |
- else: | |
- raise TypeError('str_to_unicode must receive a str or unicode object, got %s' % type(text).__name__) | |
- | |
-def unicode_to_str(text, encoding=None, errors='strict'): | |
- """Return the str representation of text in the given encoding. Unlike | |
- .encode(encoding) this function can be applied directly to a str | |
- object without the risk of double-decoding problems (which can happen if | |
- you don't use the default 'ascii' encoding) | |
- """ | |
- | |
- if encoding is None: | |
- encoding = 'utf-8' | |
- if isinstance(text, unicode): | |
- return text.encode(encoding, errors) | |
- elif isinstance(text, str): | |
- return text | |
- else: | |
- raise TypeError('unicode_to_str must receive a unicode or str object, got %s' % type(text).__name__) | |
- | |
-def re_rsearch(pattern, text, chunk_size=1024): | |
- """ | |
- This function does a reverse search in a text using a regular expression | |
- given in the attribute 'pattern'. | |
- Since the re module does not provide this functionality, we have to find for | |
- the expression into chunks of text extracted from the end (for the sake of efficiency). | |
- At first, a chunk of 'chunk_size' kilobytes is extracted from the end, and searched for | |
- the pattern. If the pattern is not found, another chunk is extracted, and another | |
- search is performed. | |
- This process continues until a match is found, or until the whole file is read. | |
- In case the pattern wasn't found, None is returned, otherwise it returns a tuple containing | |
- the start position of the match, and the ending (regarding the entire text). | |
- """ | |
- def _chunk_iter(): | |
- offset = len(text) | |
- while True: | |
- offset -= (chunk_size * 1024) | |
- if offset <= 0: | |
- break | |
- yield (text[offset:], offset) | |
- yield (text, 0) | |
- | |
- pattern = re.compile(pattern) if isinstance(pattern, basestring) else pattern | |
- for chunk, offset in _chunk_iter(): | |
- matches = [match for match in pattern.finditer(chunk)] | |
- if matches: | |
- return (offset + matches[-1].span()[0], offset + matches[-1].span()[1]) | |
- return None | |
- | |
-def memoizemethod_noargs(method): | |
- """Decorator to cache the result of a method (without arguments) using a | |
- weak reference to its object | |
- """ | |
- cache = weakref.WeakKeyDictionary() | |
- @wraps(method) | |
- def new_method(self, *args, **kwargs): | |
- if self not in cache: | |
- cache[self] = method(self, *args, **kwargs) | |
- return cache[self] | |
- return new_method | |
- | |
-_BINARYCHARS = set(map(chr, range(32))) - set(["\0", "\t", "\n", "\r"]) | |
- | |
-def isbinarytext(text): | |
- """Return True if the given text is considered binary, or false | |
- otherwise, by looking for binary bytes at their chars | |
- """ | |
- assert isinstance(text, str), "text must be str, got '%s'" % type(text).__name__ | |
- return any(c in _BINARYCHARS for c in text) | |
- | |
-def get_func_args(func, stripself=False): | |
- """Return the argument name list of a callable""" | |
- if inspect.isfunction(func): | |
- func_args, _, _, _ = inspect.getargspec(func) | |
- elif inspect.isclass(func): | |
- return get_func_args(func.__init__, True) | |
- elif inspect.ismethod(func): | |
- return get_func_args(func.__func__, True) | |
- elif inspect.ismethoddescriptor(func): | |
- return [] | |
- elif isinstance(func, partial): | |
- return [x for x in get_func_args(func.func)[len(func.args):] | |
- if not (func.keywords and x in func.keywords)] | |
- elif hasattr(func, '__call__'): | |
- if inspect.isroutine(func): | |
- return [] | |
- elif getattr(func, '__name__', None) == '__call__': | |
- return [] | |
- else: | |
- return get_func_args(func.__call__, True) | |
- else: | |
- raise TypeError('%s is not callable' % type(func)) | |
- if stripself: | |
- func_args.pop(0) | |
- return func_args | |
- | |
-def get_spec(func): | |
- """Returns (args, kwargs) tuple for a function | |
- >>> import re | |
- >>> get_spec(re.match) | |
- (['pattern', 'string'], {'flags': 0}) | |
- | |
- >>> class Test(object): | |
- ... def __call__(self, val): | |
- ... pass | |
- ... def method(self, val, flags=0): | |
- ... pass | |
- | |
- >>> get_spec(Test) | |
- (['self', 'val'], {}) | |
- | |
- >>> get_spec(Test.method) | |
- (['self', 'val'], {'flags': 0}) | |
- | |
- >>> get_spec(Test().method) | |
- (['self', 'val'], {'flags': 0}) | |
- """ | |
- | |
- if inspect.isfunction(func) or inspect.ismethod(func): | |
- spec = inspect.getargspec(func) | |
- elif hasattr(func, '__call__'): | |
- spec = inspect.getargspec(func.__call__) | |
- else: | |
- raise TypeError('%s is not callable' % type(func)) | |
- | |
- defaults = spec.defaults or [] | |
- | |
- firstdefault = len(spec.args) - len(defaults) | |
- args = spec.args[:firstdefault] | |
- kwargs = dict(zip(spec.args[firstdefault:], defaults)) | |
- return args, kwargs | |
- | |
-def equal_attributes(obj1, obj2, attributes): | |
- """Compare two objects attributes""" | |
- # not attributes given return False by default | |
- if not attributes: | |
- return False | |
- | |
- for attr in attributes: | |
- # support callables like itemgetter | |
- if callable(attr): | |
- if not attr(obj1) == attr(obj2): | |
- return False | |
- else: | |
- # check that objects has attribute | |
- if not hasattr(obj1, attr): | |
- return False | |
- if not hasattr(obj2, attr): | |
- return False | |
- # compare object attributes | |
- if not getattr(obj1, attr) == getattr(obj2, attr): | |
- return False | |
- # all attributes equal | |
- return True | |
- | |
- | |
-class WeakKeyCache(object): | |
- | |
- def __init__(self, default_factory): | |
- self.default_factory = default_factory | |
- self._weakdict = weakref.WeakKeyDictionary() | |
- | |
- def __getitem__(self, key): | |
- if key not in self._weakdict: | |
- self._weakdict[key] = self.default_factory(key) | |
- return self._weakdict[key] | |
- | |
- | |
-def stringify_dict(dct_or_tuples, encoding='utf-8', keys_only=True): | |
- """Return a (new) dict with the unicode keys (and values if, keys_only is | |
- False) of the given dict converted to strings. `dct_or_tuples` can be a | |
- dict or a list of tuples, like any dict constructor supports. | |
- """ | |
- d = {} | |
- for k, v in six.iteritems(dict(dct_or_tuples)): | |
- k = k.encode(encoding) if isinstance(k, unicode) else k | |
- if not keys_only: | |
- v = v.encode(encoding) if isinstance(v, unicode) else v | |
- d[k] = v | |
- return d | |
- | |
-def is_writable(path): | |
- """Return True if the given path can be written (if it exists) or created | |
- (if it doesn't exist) | |
- """ | |
- if os.path.exists(path): | |
- return os.access(path, os.W_OK) | |
- else: | |
- return os.access(os.path.dirname(path), os.W_OK) | |
- | |
-def setattr_default(obj, name, value): | |
- """Set attribute value, but only if it's not already set. Similar to | |
- setdefault() for dicts. | |
- """ | |
- if not hasattr(obj, name): | |
- setattr(obj, name, value) | |
- | |
- | |
-def retry_on_eintr(function, *args, **kw): | |
- """Run a function and retry it while getting EINTR errors""" | |
- while True: | |
- try: | |
- return function(*args, **kw) | |
- except IOError as e: | |
- if e.errno != errno.EINTR: | |
- raise | |
-- | |
1.9.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From 7cf971c08ece0007d891045497f2e164b3bf8044 Mon Sep 17 00:00:00 2001 | |
From: Umair Ashraf <umr.ashrf@gmail.com> | |
Date: Sat, 21 Feb 2015 20:05:30 +0500 | |
Subject: [PATCH 3/5] added selectors specific python package files | |
--- | |
MANIFEST.in | 2 ++ | |
requirements.txt | 3 +++ | |
setup.py | 28 ++++++++++++++++++++++++++++ | |
3 files changed, 33 insertions(+) | |
create mode 100644 MANIFEST.in | |
create mode 100644 requirements.txt | |
create mode 100644 setup.py | |
diff --git a/MANIFEST.in b/MANIFEST.in | |
new file mode 100644 | |
index 0000000..2970947 | |
--- /dev/null | |
+++ b/MANIFEST.in | |
@@ -0,0 +1,2 @@ | |
+include README.md | |
+include MANIFEST.in | |
diff --git a/requirements.txt b/requirements.txt | |
new file mode 100644 | |
index 0000000..9a0bc80 | |
--- /dev/null | |
+++ b/requirements.txt | |
@@ -0,0 +1,3 @@ | |
+lxml | |
+w3lib>=1.8.0 | |
+cssselect>=0.9 | |
diff --git a/setup.py b/setup.py | |
new file mode 100644 | |
index 0000000..c3f8aa0 | |
--- /dev/null | |
+++ b/setup.py | |
@@ -0,0 +1,28 @@ | |
+import re | |
+ | |
+from setuptools import setup, find_packages | |
+ | |
+ | |
+(version, ) = re.findall(r"__version__[^=]*=[^']*[']([^']+)[']", | |
+ open('selectors/__init__.py').read()) | |
+ | |
+ | |
+setup( | |
+ name='Selectors', | |
+ version=version, | |
+ url='http://github.com/scrapy/selectors', | |
+ description='Selectors used by Scrapy framework', | |
+ long_description=open('README.md').read(), | |
+ author='Selectors developers', | |
+ maintainer='Scrapy developers', | |
+ maintainer_email='info@scrapy.org', | |
+ license='BSD', | |
+ packages=find_packages(exclude=('tests', 'tests.*')), | |
+ include_package_data=True, | |
+ zip_safe=False, | |
+ install_requires=[ | |
+ 'lxml', | |
+ 'w3lib>=1.8.0', | |
+ 'cssselect>=0.9', | |
+ ], | |
+) | |
-- | |
1.9.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From 109c2e096de49494e745c885df8901698b58d11e Mon Sep 17 00:00:00 2001 | |
From: Umair Ashraf <umr.ashrf@gmail.com> | |
Date: Sat, 21 Feb 2015 20:24:31 +0500 | |
Subject: [PATCH 4/5] removed scrapy dependent tests and changed code to suit | |
selectors package | |
--- | |
tests/__init__.py | 14 --- | |
tests/test_selector.py | 222 ++++------------------------------- | |
tests/test_selector_csstranslator.py | 12 +- | |
3 files changed, 31 insertions(+), 217 deletions(-) | |
delete mode 100644 tests/__init__.py | |
diff --git a/tests/__init__.py b/tests/__init__.py | |
deleted file mode 100644 | |
index 54e79b3..0000000 | |
--- a/tests/__init__.py | |
+++ /dev/null | |
@@ -1,14 +0,0 @@ | |
-""" | |
-tests: this package contains all Scrapy unittests | |
- | |
-see http://doc.scrapy.org/en/latest/contributing.html#running-tests | |
-""" | |
- | |
-import os | |
- | |
-tests_datadir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'sample_data') | |
- | |
-def get_testdata(*paths): | |
- """Return test data""" | |
- path = os.path.join(tests_datadir, *paths) | |
- return open(path, 'rb').read() | |
diff --git a/tests/test_selector.py b/tests/test_selector.py | |
index 6fbb451..91c7d31 100644 | |
--- a/tests/test_selector.py | |
+++ b/tests/test_selector.py | |
@@ -1,11 +1,10 @@ | |
import re | |
import warnings | |
import weakref | |
+ | |
from twisted.trial import unittest | |
-from scrapy.exceptions import ScrapyDeprecationWarning | |
-from scrapy.http import TextResponse, HtmlResponse, XmlResponse | |
-from scrapy.selector import Selector | |
-from scrapy.selector.lxmlsel import XmlXPathSelector, HtmlXPathSelector, XPathSelector | |
+ | |
+from selectors import Selector | |
class SelectorTestCase(unittest.TestCase): | |
@@ -15,8 +14,7 @@ class SelectorTestCase(unittest.TestCase): | |
def test_simple_selection(self): | |
"""Simple selector tests""" | |
body = "<p><input name='a'value='1'/><input name='b'value='2'/></p>" | |
- response = TextResponse(url="http://example.com", body=body) | |
- sel = self.sscls(response) | |
+ sel = self.sscls(url="http://example.com", text=body) | |
xl = sel.xpath('//input') | |
self.assertEqual(2, len(xl)) | |
@@ -38,8 +36,7 @@ class SelectorTestCase(unittest.TestCase): | |
def test_representation_slice(self): | |
body = u"<p><input name='{}' value='\xa9'/></p>".format(50 * 'b') | |
- response = TextResponse(url="http://example.com", body=body, encoding='utf8') | |
- sel = self.sscls(response) | |
+ sel = self.sscls(url="http://example.com", text=body) | |
self.assertEqual( | |
map(repr, sel.xpath('//input/@name')), | |
@@ -48,8 +45,7 @@ class SelectorTestCase(unittest.TestCase): | |
def test_representation_unicode_query(self): | |
body = u"<p><input name='{}' value='\xa9'/></p>".format(50 * 'b') | |
- response = TextResponse(url="http://example.com", body=body, encoding='utf8') | |
- sel = self.sscls(response) | |
+ sel = self.sscls(url="http://example.com", text=body) | |
self.assertEqual( | |
map(repr, sel.xpath(u'//input[@value="\xa9"]/@value')), | |
["<Selector xpath=u'//input[@value=\"\\xa9\"]/@value' data=u'\\xa9'>"] | |
@@ -57,8 +53,7 @@ class SelectorTestCase(unittest.TestCase): | |
def test_select_unicode_query(self): | |
body = u"<p><input name='\xa9' value='1'/></p>" | |
- response = TextResponse(url="http://example.com", body=body, encoding='utf8') | |
- sel = self.sscls(response) | |
+ sel = self.sscls(url="http://example.com", text=body) | |
self.assertEqual(sel.xpath(u'//input[@name="\xa9"]/@value').extract(), [u'1']) | |
def test_list_elements_type(self): | |
@@ -69,8 +64,7 @@ class SelectorTestCase(unittest.TestCase): | |
def test_boolean_result(self): | |
body = "<p><input name='a'value='1'/><input name='b'value='2'/></p>" | |
- response = TextResponse(url="http://example.com", body=body) | |
- xs = self.sscls(response) | |
+ xs = self.sscls(url="http://example.com", text=body) | |
self.assertEquals(xs.xpath("//input[@name='a']/@name='a'").extract(), [u'1']) | |
self.assertEquals(xs.xpath("//input[@name='a']/@name='n'").extract(), [u'0']) | |
@@ -86,18 +80,6 @@ class SelectorTestCase(unittest.TestCase): | |
self.assertEqual(xs.xpath("//div").extract(), | |
[u'<div><img src="a.jpg"><p>Hello</p></img></div>']) | |
- def test_flavor_detection(self): | |
- text = '<div><img src="a.jpg"><p>Hello</div>' | |
- sel = self.sscls(XmlResponse('http://example.com', body=text)) | |
- self.assertEqual(sel.type, 'xml') | |
- self.assertEqual(sel.xpath("//div").extract(), | |
- [u'<div><img src="a.jpg"><p>Hello</p></img></div>']) | |
- | |
- sel = self.sscls(HtmlResponse('http://example.com', body=text)) | |
- self.assertEqual(sel.type, 'html') | |
- self.assertEqual(sel.xpath("//div").extract(), | |
- [u'<div><img src="a.jpg"><p>Hello</p></div>']) | |
- | |
def test_nested_selectors(self): | |
"""Nested selector tests""" | |
body = """<body> | |
@@ -113,8 +95,7 @@ class SelectorTestCase(unittest.TestCase): | |
</div> | |
</body>""" | |
- response = HtmlResponse(url="http://example.com", body=body) | |
- x = self.sscls(response) | |
+ x = self.sscls(url="http://example.com", text=body) | |
divtwo = x.xpath('//div[@class="two"]') | |
self.assertEqual(divtwo.xpath("//li").extract(), | |
["<li>one</li>", "<li>two</li>", "<li>four</li>", "<li>five</li>", "<li>six</li>"]) | |
@@ -145,8 +126,7 @@ class SelectorTestCase(unittest.TestCase): | |
</test> | |
""" | |
- response = XmlResponse(url="http://example.com", body=body) | |
- x = self.sscls(response) | |
+ x = self.sscls(url="http://example.com", text=body, type="xml") | |
x.register_namespace("somens", "http://scrapy.org") | |
self.assertEqual(x.xpath("//somens:a/text()").extract(), | |
@@ -162,8 +142,7 @@ class SelectorTestCase(unittest.TestCase): | |
<p:SecondTestTag><material>iron</material><price>90</price><p:name>Dried Rose</p:name></p:SecondTestTag> | |
</BrowseNode> | |
""" | |
- response = XmlResponse(url="http://example.com", body=body) | |
- x = self.sscls(response) | |
+ x = self.sscls(url="http://example.com", text=body, type="xml") | |
x.register_namespace("xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05") | |
x.register_namespace("p", "http://www.scrapy.org/product") | |
x.register_namespace("b", "http://somens.com") | |
@@ -184,8 +163,7 @@ class SelectorTestCase(unittest.TestCase): | |
</ul> | |
Age: 20 | |
</div>""" | |
- response = HtmlResponse(url="http://example.com", body=body) | |
- x = self.sscls(response) | |
+ x = self.sscls(url="http://example.com", text=body) | |
name_re = re.compile("Name: (\w+)") | |
self.assertEqual(x.xpath("//ul/li").re(name_re), | |
@@ -193,12 +171,6 @@ class SelectorTestCase(unittest.TestCase): | |
self.assertEqual(x.xpath("//ul/li").re("Age: (\d+)"), | |
["10", "20"]) | |
- def test_re_intl(self): | |
- body = """<div>Evento: cumplea\xc3\xb1os</div>""" | |
- response = HtmlResponse(url="http://example.com", body=body, encoding='utf-8') | |
- x = self.sscls(response) | |
- self.assertEqual(x.xpath("//div").re("Evento: (\w+)"), [u'cumplea\xf1os']) | |
- | |
def test_selector_over_text(self): | |
hs = self.sscls(text='<root>lala</root>') | |
self.assertEqual(hs.extract(), u'<html><body><root>lala</root></body></html>') | |
@@ -207,8 +179,7 @@ class SelectorTestCase(unittest.TestCase): | |
self.assertEqual(xs.xpath('.').extract(), [u'<root>lala</root>']) | |
def test_invalid_xpath(self): | |
- response = XmlResponse(url="http://example.com", body="<html></html>") | |
- x = self.sscls(response) | |
+ x = self.sscls(url="http://example.com", text="<html></html>") | |
xpath = "//test[@foo='bar]" | |
try: | |
x.xpath(xpath) | |
@@ -219,43 +190,16 @@ class SelectorTestCase(unittest.TestCase): | |
else: | |
raise AssertionError("A invalid XPath does not raise an exception") | |
- def test_http_header_encoding_precedence(self): | |
- # u'\xa3' = pound symbol in unicode | |
- # u'\xc2\xa3' = pound symbol in utf-8 | |
- # u'\xa3' = pound symbol in latin-1 (iso-8859-1) | |
- | |
- meta = u'<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">' | |
- head = u'<head>' + meta + u'</head>' | |
- body_content = u'<span id="blank">\xa3</span>' | |
- body = u'<body>' + body_content + u'</body>' | |
- html = u'<html>' + head + body + u'</html>' | |
- encoding = 'utf-8' | |
- html_utf8 = html.encode(encoding) | |
- | |
- headers = {'Content-Type': ['text/html; charset=utf-8']} | |
- response = HtmlResponse(url="http://example.com", headers=headers, body=html_utf8) | |
- x = self.sscls(response) | |
- self.assertEquals(x.xpath("//span[@id='blank']/text()").extract(), | |
- [u'\xa3']) | |
- | |
def test_empty_bodies(self): | |
# shouldn't raise errors | |
- r1 = TextResponse('http://www.example.com', body='') | |
- self.sscls(r1).xpath('//text()').extract() | |
+ self.sscls(url='http://www.example.com', text='').xpath('//text()').extract() | |
def test_null_bytes(self): | |
# shouldn't raise errors | |
- r1 = TextResponse('http://www.example.com', \ | |
- body='<root>pre\x00post</root>', \ | |
- encoding='utf-8') | |
- self.sscls(r1).xpath('//text()').extract() | |
- | |
- def test_badly_encoded_body(self): | |
- # \xe9 alone isn't valid utf8 sequence | |
- r1 = TextResponse('http://www.example.com', \ | |
- body='<html><p>an Jos\xe9 de</p><html>', \ | |
- encoding='utf-8') | |
- self.sscls(r1).xpath('//text()').extract() | |
+ self.sscls(url='http://www.example.com', | |
+ text='<root>pre\x00post</root>', | |
+ type='xml') \ | |
+ .xpath('//text()').extract() | |
def test_select_on_unevaluable_nodes(self): | |
r = self.sscls(text=u'<span class="big">some text</span>') | |
@@ -284,13 +228,6 @@ class SelectorTestCase(unittest.TestCase): | |
self.assertEquals(x2.extract(), [u'<b>Options:</b>']) | |
test_nested_select_on_text_nodes.skip = "Text nodes lost parent node reference in lxml" | |
- def test_weakref_slots(self): | |
- """Check that classes are using slots and are weak-referenceable""" | |
- x = self.sscls() | |
- weakref.ref(x) | |
- assert not hasattr(x, '__dict__'), "%s does not use __slots__" % \ | |
- x.__class__.__name__ | |
- | |
def test_remove_namespaces(self): | |
xml = """<?xml version="1.0" encoding="UTF-8"?> | |
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en-US" xmlns:media="http://search.yahoo.com/mrss/"> | |
@@ -298,7 +235,7 @@ class SelectorTestCase(unittest.TestCase): | |
<link type="application/atom+xml"> | |
</feed> | |
""" | |
- sel = self.sscls(XmlResponse("http://example.com/feed.atom", body=xml)) | |
+ sel = self.sscls(url="http://example.com/feed.atom", text=xml, type="xml") | |
self.assertEqual(len(sel.xpath("//link")), 0) | |
sel.remove_namespaces() | |
self.assertEqual(len(sel.xpath("//link")), 2) | |
@@ -310,7 +247,7 @@ class SelectorTestCase(unittest.TestCase): | |
<link atom:type="application/atom+xml"> | |
</feed> | |
""" | |
- sel = self.sscls(XmlResponse("http://example.com/feed.atom", body=xml)) | |
+ sel = self.sscls(url="http://example.com/feed.atom", text=xml, type="xml") | |
self.assertEqual(len(sel.xpath("//link/@type")), 0) | |
sel.remove_namespaces() | |
self.assertEqual(len(sel.xpath("//link/@type")), 2) | |
@@ -334,17 +271,15 @@ class SelectorTestCase(unittest.TestCase): | |
</div> | |
</body>""" | |
- response = HtmlResponse(url="http://example.com", body=body) | |
- | |
# .getparent() is available for text nodes and attributes | |
# only when smart_strings are on | |
- x = self.sscls(response) | |
+ x = self.sscls(url="http://example.com", text=body) | |
li_text = x.xpath('//li/text()') | |
self.assertFalse(any(map(lambda e: hasattr(e._root, 'getparent'), li_text))) | |
div_class = x.xpath('//div/@class') | |
self.assertFalse(any(map(lambda e: hasattr(e._root, 'getparent'), div_class))) | |
- x = SmartStringsSelector(response) | |
+ x = SmartStringsSelector(url="http://example.com", text=body) | |
li_text = x.xpath('//li/text()') | |
self.assertTrue(all(map(lambda e: hasattr(e._root, 'getparent'), li_text))) | |
div_class = x.xpath('//div/@class') | |
@@ -355,116 +290,11 @@ class SelectorTestCase(unittest.TestCase): | |
'<!DOCTYPE foo [ <!ELEMENT foo ANY > <!ENTITY xxe SYSTEM '\ | |
'"file:///etc/passwd" >]><foo>&xxe;</foo>' | |
- response = XmlResponse('http://example.com', body=malicious_xml) | |
- sel = self.sscls(response=response) | |
+ sel = self.sscls(url='http://example.com', text=malicious_xml, type="xml") | |
self.assertEqual(sel.extract(), '<foo>&xxe;</foo>') | |
-class DeprecatedXpathSelectorTest(unittest.TestCase): | |
- | |
- text = '<div><img src="a.jpg"><p>Hello</div>' | |
- | |
- def test_warnings_xpathselector(self): | |
- cls = XPathSelector | |
- with warnings.catch_warnings(record=True) as w: | |
- class UserClass(cls): | |
- pass | |
- | |
- # subclassing must issue a warning | |
- self.assertEqual(len(w), 1, str(cls)) | |
- self.assertIn('scrapy.Selector', str(w[0].message)) | |
- | |
- # subclass instance doesn't issue a warning | |
- usel = UserClass(text=self.text) | |
- self.assertEqual(len(w), 1) | |
- | |
- # class instance must issue a warning | |
- sel = cls(text=self.text) | |
- self.assertEqual(len(w), 2, str((cls, [x.message for x in w]))) | |
- self.assertIn('scrapy.Selector', str(w[1].message)) | |
- | |
- # subclass and instance checks | |
- self.assertTrue(issubclass(cls, Selector)) | |
- self.assertTrue(isinstance(sel, Selector)) | |
- self.assertTrue(isinstance(usel, Selector)) | |
- | |
- def test_warnings_xmlxpathselector(self): | |
- cls = XmlXPathSelector | |
- with warnings.catch_warnings(record=True) as w: | |
- class UserClass(cls): | |
- pass | |
- | |
- # subclassing must issue a warning | |
- self.assertEqual(len(w), 1, str(cls)) | |
- self.assertIn('scrapy.Selector', str(w[0].message)) | |
- | |
- # subclass instance doesn't issue a warning | |
- usel = UserClass(text=self.text) | |
- self.assertEqual(len(w), 1) | |
- | |
- # class instance must issue a warning | |
- sel = cls(text=self.text) | |
- self.assertEqual(len(w), 2, str((cls, [x.message for x in w]))) | |
- self.assertIn('scrapy.Selector', str(w[1].message)) | |
- | |
- # subclass and instance checks | |
- self.assertTrue(issubclass(cls, Selector)) | |
- self.assertTrue(issubclass(cls, XPathSelector)) | |
- self.assertTrue(isinstance(sel, Selector)) | |
- self.assertTrue(isinstance(usel, Selector)) | |
- self.assertTrue(isinstance(sel, XPathSelector)) | |
- self.assertTrue(isinstance(usel, XPathSelector)) | |
- | |
- def test_warnings_htmlxpathselector(self): | |
- cls = HtmlXPathSelector | |
- with warnings.catch_warnings(record=True) as w: | |
- class UserClass(cls): | |
- pass | |
- | |
- # subclassing must issue a warning | |
- self.assertEqual(len(w), 1, str(cls)) | |
- self.assertIn('scrapy.Selector', str(w[0].message)) | |
- | |
- # subclass instance doesn't issue a warning | |
- usel = UserClass(text=self.text) | |
- self.assertEqual(len(w), 1) | |
- | |
- # class instance must issue a warning | |
- sel = cls(text=self.text) | |
- self.assertEqual(len(w), 2, str((cls, [x.message for x in w]))) | |
- self.assertIn('scrapy.Selector', str(w[1].message)) | |
- | |
- # subclass and instance checks | |
- self.assertTrue(issubclass(cls, Selector)) | |
- self.assertTrue(issubclass(cls, XPathSelector)) | |
- self.assertTrue(isinstance(sel, Selector)) | |
- self.assertTrue(isinstance(usel, Selector)) | |
- self.assertTrue(isinstance(sel, XPathSelector)) | |
- self.assertTrue(isinstance(usel, XPathSelector)) | |
- | |
- def test_xpathselector(self): | |
- with warnings.catch_warnings(record=True): | |
- hs = XPathSelector(text=self.text) | |
- self.assertEqual(hs.select("//div").extract(), | |
- [u'<div><img src="a.jpg"><p>Hello</p></div>']) | |
- self.assertRaises(RuntimeError, hs.css, 'div') | |
- | |
- def test_htmlxpathselector(self): | |
- with warnings.catch_warnings(record=True): | |
- hs = HtmlXPathSelector(text=self.text) | |
- self.assertEqual(hs.select("//div").extract(), | |
- [u'<div><img src="a.jpg"><p>Hello</p></div>']) | |
- self.assertRaises(RuntimeError, hs.css, 'div') | |
- | |
- def test_xmlxpathselector(self): | |
- with warnings.catch_warnings(record=True): | |
- xs = XmlXPathSelector(text=self.text) | |
- self.assertEqual(xs.select("//div").extract(), | |
- [u'<div><img src="a.jpg"><p>Hello</p></img></div>']) | |
- self.assertRaises(RuntimeError, xs.css, 'div') | |
- | |
- | |
class ExsltTestCase(unittest.TestCase): | |
sscls = Selector | |
@@ -479,8 +309,7 @@ class ExsltTestCase(unittest.TestCase): | |
<a href="http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml">EXSLT match example</a> | |
</div> | |
""" | |
- response = TextResponse(url="http://example.com", body=body) | |
- sel = self.sscls(response) | |
+ sel = self.sscls(url="http://example.com", text=body) | |
# re:test() | |
self.assertEqual( | |
@@ -557,8 +386,7 @@ class ExsltTestCase(unittest.TestCase): | |
</div> | |
</div> | |
""" | |
- response = TextResponse(url="http://example.com", body=body) | |
- sel = self.sscls(response) | |
+ sel = self.sscls(url="http://example.com", text=body) | |
self.assertEqual( | |
sel.xpath('''//div[@itemtype="http://schema.org/Event"] | |
diff --git a/tests/test_selector_csstranslator.py b/tests/test_selector_csstranslator.py | |
index 7ef9003..b648320 100644 | |
--- a/tests/test_selector_csstranslator.py | |
+++ b/tests/test_selector_csstranslator.py | |
@@ -2,9 +2,10 @@ | |
Selector tests for cssselect backend | |
""" | |
from twisted.trial import unittest | |
-from scrapy.http import HtmlResponse | |
-from scrapy.selector.csstranslator import ScrapyHTMLTranslator | |
-from scrapy.selector import Selector | |
+ | |
+from selectors import Selector | |
+from selectors.csstranslator import SelectorHTMLTranslator | |
+ | |
from cssselect.parser import SelectorSyntaxError | |
from cssselect.xpath import ExpressionError | |
@@ -47,7 +48,7 @@ HTMLBODY = ''' | |
class TranslatorMixinTest(unittest.TestCase): | |
- tr_cls = ScrapyHTMLTranslator | |
+ tr_cls = SelectorHTMLTranslator | |
def setUp(self): | |
self.tr = self.tr_cls() | |
@@ -119,8 +120,7 @@ class CSSSelectorTest(unittest.TestCase): | |
sscls = Selector | |
def setUp(self): | |
- self.htmlresponse = HtmlResponse('http://example.com', body=HTMLBODY) | |
- self.sel = self.sscls(self.htmlresponse) | |
+ self.sel = self.sscls(url='http://example.com', text=HTMLBODY) | |
def x(self, *a, **kw): | |
return [v.strip() for v in self.sel.css(*a, **kw).extract() if v.strip()] | |
-- | |
1.9.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From eab029568b37d564bc8bbdb5b836cf0976a39cd1 Mon Sep 17 00:00:00 2001 | |
From: Umair Ashraf <umr.ashrf@gmail.com> | |
Date: Sat, 21 Feb 2015 20:25:01 +0500 | |
Subject: [PATCH 5/5] added selectors tests specific support files | |
--- | |
pytest.ini | 4 ++++ | |
tests/requirements.txt | 3 +++ | |
tox.ini | 14 ++++++++++++++ | |
3 files changed, 21 insertions(+) | |
create mode 100644 pytest.ini | |
create mode 100644 tests/requirements.txt | |
create mode 100644 tox.ini | |
diff --git a/pytest.ini b/pytest.ini | |
new file mode 100644 | |
index 0000000..cc48090 | |
--- /dev/null | |
+++ b/pytest.ini | |
@@ -0,0 +1,4 @@ | |
+[pytest] | |
+python_files=test_*.py __init__.py | |
+addopts = --doctest-modules --assert=plain | |
+twisted = 1 | |
diff --git a/tests/requirements.txt b/tests/requirements.txt | |
new file mode 100644 | |
index 0000000..18ae516 | |
--- /dev/null | |
+++ b/tests/requirements.txt | |
@@ -0,0 +1,3 @@ | |
+Twisted>=10.0.0 | |
+pytest-twisted | |
+mock | |
diff --git a/tox.ini b/tox.ini | |
new file mode 100644 | |
index 0000000..617df26 | |
--- /dev/null | |
+++ b/tox.ini | |
@@ -0,0 +1,14 @@ | |
+# Tox (http://tox.testrun.org/) is a tool for running tests | |
+# in multiple virtualenvs. This configuration file will run the | |
+# test suite on all supported python versions. To use it, "pip install tox" | |
+# and then run "tox" from this directory. | |
+ | |
+[tox] | |
+envlist = py27 | |
+ | |
+[testenv] | |
+deps = | |
+ -rrequirements.txt | |
+ -rtests/requirements.txt | |
+commands = | |
+ py.test {posargs:selectors tests} | |
-- | |
1.9.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Extract the selector code from a Scrapy checkout into a standalone
# "selectors" history:
#   1. split scrapy/selector into its own branch, moved under selectors/
#   2. split a subset of scrapy/utils, moved under selectors/utils/
#   3. split the two selector test modules, kept under tests/
#   4. combine the three branches on an orphan "selectors" branch and
#      apply the local *.patch series on top with git am.
#
# WARNING: uses git filter-branch, which rewrites history destructively.
# Run this only in a throwaway clone, from the repository root, with the
# selector patches (*.patch) present in the current directory.

# start over: return to master and drop branches left by a previous run
# (branch -D may report missing branches on the first run; that is harmless)
git checkout master
git branch -D selectors selector-code utils-code tests-code

# split scrapy/selector dir to selector-code branch
git checkout -b selector-code
git filter-branch -f --prune-empty \
    --subdirectory-filter scrapy/selector -- selector-code

# move the files under selectors/ by rewriting each commit's index
# (prefixing every path) rather than creating a new move commit
git filter-branch -f \
    --index-filter '
        git ls-files -s \
        | sed "s-\t-&selectors/-" \
        | GIT_INDEX_FILE=$GIT_INDEX_FILE.new git update-index --index-info \
        && mv $GIT_INDEX_FILE.new $GIT_INDEX_FILE'

# now we need to split utils
git checkout master

# split scrapy/utils dir to utils-code branch
git checkout -b utils-code
git filter-branch -f --prune-empty \
    --subdirectory-filter scrapy/utils -- utils-code

# only keep the utils files the selectors package needs; everything not
# whitelisted below is removed from every commit
git filter-branch -f \
    --prune-empty \
    --index-filter '
        git ls-tree -z -r --name-only --full-tree $GIT_COMMIT \
        | grep -z -v "^__init__.py$" \
        | grep -z -v "^decorator.py$" \
        | grep -z -v "^misc.py$" \
        | grep -z -v "^python.py$" \
        | xargs -0 -r git rm --cached -r
    ' \
    -- \
    utils-code

# move the kept files under selectors/utils/ without a new commit
git filter-branch -f \
    --index-filter '
        git ls-files -s \
        | sed "s-\t-&selectors/utils/-" \
        | GIT_INDEX_FILE=$GIT_INDEX_FILE.new git update-index --index-info \
        && mv $GIT_INDEX_FILE.new $GIT_INDEX_FILE'

# now we need to split tests
git checkout master

# split tests dir to tests-code branch
git checkout -b tests-code
git filter-branch -f --prune-empty \
    --subdirectory-filter tests -- tests-code

# only keep the selector test files
git filter-branch -f \
    --prune-empty \
    --index-filter '
        git ls-tree -z -r --name-only --full-tree $GIT_COMMIT \
        | grep -z -v "^__init__.py$" \
        | grep -z -v "^test_selector.py$" \
        | grep -z -v "^test_selector_csstranslator.py$" \
        | xargs -0 -r git rm --cached -r
    ' \
    -- \
    tests-code

# move the kept files back under tests/ without a new commit
git filter-branch -f \
    --index-filter '
        git ls-files -s \
        | sed "s-\t-&tests/-" \
        | GIT_INDEX_FILE=$GIT_INDEX_FILE.new git update-index --index-info \
        && mv $GIT_INDEX_FILE.new $GIT_INDEX_FILE'

# centralized branch for all selectors code: start from an empty orphan
git checkout --orphan selectors
git rm -r -f .

# merge and rebase the separate histories together
git merge selector-code
git rebase utils-code
git rebase tests-code

# release the temporary branches
git branch -D selector-code utils-code tests-code

# now we can apply the selectors patches in order
# (glob directly; never parse ls output -- ShellCheck SC2045)
for f in *.patch; do
    git am < "$f";
done

# now we can remove selectors from scrapy

# references
# http://git-scm.com/docs/git-filter-branch
# http://git-scm.com/docs/git-ls-tree

# examples
# https://stackoverflow.com/questions/359424/detach-subdirectory-into-separate-git-repository
# https://github.com/apenwarr/git-subtree/blob/master/git-subtree.txt
# https://stackoverflow.com/questions/6403715/git-how-to-split-off-library-from-project-filter-branch-subtree?rq=1
# https://stackoverflow.com/questions/5998987/splitting-a-set-of-files-within-a-git-repo-into-their-own-repository-preserving
# https://www.kernel.org/pub/software/scm/git/docs/git-filter-branch.html
# http://stackoverflow.com/a/7396584
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment