Extracting the selector code from Scrapy into a standalone package
From 2b781158628b586bf085888dbd8d32334fa4bf6a Mon Sep 17 00:00:00 2001
From: Umair Ashraf <umr.ashrf@gmail.com>
Date: Sat, 21 Feb 2015 20:04:07 +0500
Subject: [PATCH 1/5] added basic selectors files -- gitignore and readme
---
.gitignore | 11 +++++++++++
README.md | 3 +++
2 files changed, 14 insertions(+)
create mode 100644 .gitignore
create mode 100644 README.md
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..837a67b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,11 @@
+*.pyc
+_trial_temp*
+dropin.cache
+docs/build
+*egg-info
+.tox
+venv
+build
+dist
+.idea
+.html
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6196415
--- /dev/null
+++ b/README.md
@@ -0,0 +1,3 @@
+# Selectors
+
+Selectors provide high level API for XML and HTML parsing using XPath and CSS selectors in Python.
--
1.9.1
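The README's one-line summary is the whole point of this series, so here is a minimal usage sketch of the extracted package. It is hedged: it assumes the package is importable as `selectors` (the name used throughout these patches) and uses only the `Selector`, `.xpath()`, `.css()`, and `.extract()` names that appear in the patches below — an illustration, not shipped documentation.

```python
# Minimal sketch, assuming the package from these patches is installed
# and importable as `selectors` (Python 2, per the rest of the series).
from selectors import Selector

html = '<html><body><p class="msg">Hello</p></body></html>'
sel = Selector(text=html)                  # standalone text, no Scrapy Response needed
print(sel.xpath('//p/text()').extract())   # [u'Hello']
print(sel.css('p.msg::text').extract())    # [u'Hello']
```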
From eb60eb2ceaac01d23ec8167f4f3d797ed6c03014 Mon Sep 17 00:00:00 2001
From: Umair Ashraf <umr.ashrf@gmail.com>
Date: Sat, 21 Feb 2015 20:13:35 +0500
Subject: [PATCH 2/5] made selectors independent of scrapy
---
selectors/__init__.py | 10 +-
selectors/common.py | 22 ++++
selectors/csstranslator.py | 16 +--
selectors/exceptions.py | 5 +
selectors/lxmldocument.py | 31 ------
selectors/lxmlsel.py | 50 ---------
selectors/unified.py | 63 +++--------
selectors/utils/decorator.py | 23 +---
selectors/utils/misc.py | 86 +--------------
selectors/utils/python.py | 249 -------------------------------------------
10 files changed, 58 insertions(+), 497 deletions(-)
create mode 100644 selectors/common.py
create mode 100644 selectors/exceptions.py
delete mode 100644 selectors/lxmldocument.py
delete mode 100644 selectors/lxmlsel.py
diff --git a/selectors/__init__.py b/selectors/__init__.py
index bfbde4d..97eb9d5 100644
--- a/selectors/__init__.py
+++ b/selectors/__init__.py
@@ -1,5 +1,5 @@
-"""
-Selectors
-"""
-from scrapy.selector.unified import *
-from scrapy.selector.lxmlsel import *
+
+__version__ = '0.0.1'
+
+
+from selectors.unified import *
diff --git a/selectors/common.py b/selectors/common.py
new file mode 100644
index 0000000..4cbf1ec
--- /dev/null
+++ b/selectors/common.py
@@ -0,0 +1,22 @@
+"""
+We need these things in Scrapy and Selectors packages both
+"""
+from lxml import etree
+
+from .csstranslator import SelectorHTMLTranslator, SelectorGenericTranslator
+
+
+class SafeXMLParser(etree.XMLParser):
+ def __init__(self, *args, **kwargs):
+ kwargs.setdefault('resolve_entities', False)
+ super(SafeXMLParser, self).__init__(*args, **kwargs)
+
+
+_ctgroup = {
+ 'html': {'_parser': etree.HTMLParser,
+ '_csstranslator': SelectorHTMLTranslator(),
+ '_tostring_method': 'html'},
+ 'xml': {'_parser': SafeXMLParser,
+ '_csstranslator': SelectorGenericTranslator(),
+ '_tostring_method': 'xml'},
+}
diff --git a/selectors/csstranslator.py b/selectors/csstranslator.py
index 7482837..2148a10 100644
--- a/selectors/csstranslator.py
+++ b/selectors/csstranslator.py
@@ -3,7 +3,7 @@ from cssselect.xpath import _unicode_safe_getattr, XPathExpr, ExpressionError
from cssselect.parser import FunctionalPseudoElement
-class ScrapyXPathExpr(XPathExpr):
+class SelectorXPathExpr(XPathExpr):
textnode = False
attribute = None
@@ -16,7 +16,7 @@ class ScrapyXPathExpr(XPathExpr):
return x
def __str__(self):
- path = super(ScrapyXPathExpr, self).__str__()
+ path = super(SelectorXPathExpr, self).__str__()
if self.textnode:
if path == '*':
path = 'text()'
@@ -33,7 +33,7 @@ class ScrapyXPathExpr(XPathExpr):
return path
def join(self, combiner, other):
- super(ScrapyXPathExpr, self).join(combiner, other)
+ super(SelectorXPathExpr, self).join(combiner, other)
self.textnode = other.textnode
self.attribute = other.attribute
return self
@@ -43,7 +43,7 @@ class TranslatorMixin(object):
def xpath_element(self, selector):
xpath = super(TranslatorMixin, self).xpath_element(selector)
- return ScrapyXPathExpr.from_xpath(xpath)
+ return SelectorXPathExpr.from_xpath(xpath)
def xpath_pseudo_element(self, xpath, pseudo_element):
if isinstance(pseudo_element, FunctionalPseudoElement):
@@ -71,18 +71,18 @@ class TranslatorMixin(object):
raise ExpressionError(
"Expected a single string or ident for ::attr(), got %r"
% function.arguments)
- return ScrapyXPathExpr.from_xpath(xpath,
+ return SelectorXPathExpr.from_xpath(xpath,
attribute=function.arguments[0].value)
def xpath_text_simple_pseudo_element(self, xpath):
"""Support selecting text nodes using ::text pseudo-element"""
- return ScrapyXPathExpr.from_xpath(xpath, textnode=True)
+ return SelectorXPathExpr.from_xpath(xpath, textnode=True)
-class ScrapyGenericTranslator(TranslatorMixin, GenericTranslator):
+class SelectorGenericTranslator(TranslatorMixin, GenericTranslator):
pass
-class ScrapyHTMLTranslator(TranslatorMixin, HTMLTranslator):
+class SelectorHTMLTranslator(TranslatorMixin, HTMLTranslator):
pass
diff --git a/selectors/exceptions.py b/selectors/exceptions.py
new file mode 100644
index 0000000..9ed8b6b
--- /dev/null
+++ b/selectors/exceptions.py
@@ -0,0 +1,5 @@
+class SelectorsDeprecationWarning(Warning):
+ """Warning category for deprecated features, since the default
+ DeprecationWarning is silenced on Python 2.7+
+ """
+ pass
diff --git a/selectors/lxmldocument.py b/selectors/lxmldocument.py
deleted file mode 100644
index 817349b..0000000
--- a/selectors/lxmldocument.py
+++ /dev/null
@@ -1,31 +0,0 @@
-"""
-This module contains a simple class (LxmlDocument) which provides cache and
-garbage collection to lxml element tree documents.
-"""
-
-import weakref
-from lxml import etree
-from scrapy.utils.trackref import object_ref
-
-
-def _factory(response, parser_cls):
- url = response.url
- body = response.body_as_unicode().strip().encode('utf8') or '<html/>'
- parser = parser_cls(recover=True, encoding='utf8')
- return etree.fromstring(body, parser=parser, base_url=url)
-
-
-class LxmlDocument(object_ref):
-
- cache = weakref.WeakKeyDictionary()
- __slots__ = ['__weakref__']
-
- def __new__(cls, response, parser=etree.HTMLParser):
- cache = cls.cache.setdefault(response, {})
- if parser not in cache:
- obj = object_ref.__new__(cls)
- cache[parser] = _factory(response, parser)
- return cache[parser]
-
- def __str__(self):
- return "<LxmlDocument %s>" % self.root.tag
diff --git a/selectors/lxmlsel.py b/selectors/lxmlsel.py
deleted file mode 100644
index 070cb23..0000000
--- a/selectors/lxmlsel.py
+++ /dev/null
@@ -1,50 +0,0 @@
-"""
-XPath selectors based on lxml
-"""
-from scrapy.utils.deprecate import create_deprecated_class
-from .unified import Selector, SelectorList
-
-
-__all__ = ['HtmlXPathSelector', 'XmlXPathSelector', 'XPathSelector',
- 'XPathSelectorList']
-
-def _xpathselector_css(self, *a, **kw):
- raise RuntimeError('.css() method not available for %s, '
- 'instantiate scrapy.Selector '
- 'instead' % type(self).__name__)
-
-XPathSelector = create_deprecated_class(
- 'XPathSelector',
- Selector,
- {
- '__slots__': (),
- '_default_type': 'html',
- 'css': _xpathselector_css,
- },
- new_class_path='scrapy.Selector',
- old_class_path='scrapy.selector.XPathSelector',
-)
-
-XmlXPathSelector = create_deprecated_class(
- 'XmlXPathSelector',
- XPathSelector,
- clsdict={
- '__slots__': (),
- '_default_type': 'xml',
- },
- new_class_path='scrapy.Selector',
- old_class_path='scrapy.selector.XmlXPathSelector',
-)
-
-HtmlXPathSelector = create_deprecated_class(
- 'HtmlXPathSelector',
- XPathSelector,
- clsdict={
- '__slots__': (),
- '_default_type': 'html',
- },
- new_class_path='scrapy.Selector',
- old_class_path='scrapy.selector.HtmlXPathSelector',
-)
-
-XPathSelectorList = create_deprecated_class('XPathSelectorList', SelectorList)
diff --git a/selectors/unified.py b/selectors/unified.py
index b8a3678..77b363a 100644
--- a/selectors/unified.py
+++ b/selectors/unified.py
@@ -1,57 +1,24 @@
"""
XPath selectors based on lxml
"""
+import re
from lxml import etree
-from scrapy.utils.misc import extract_regex
-from scrapy.utils.trackref import object_ref
-from scrapy.utils.python import unicode_to_str, flatten
-from scrapy.utils.decorator import deprecated
-from scrapy.http import HtmlResponse, XmlResponse
-from .lxmldocument import LxmlDocument
-from .csstranslator import ScrapyHTMLTranslator, ScrapyGenericTranslator
+from .utils.misc import extract_regex
+from .utils.python import flatten
+from .utils.decorator import deprecated
+from .common import _ctgroup
__all__ = ['Selector', 'SelectorList']
-class SafeXMLParser(etree.XMLParser):
- def __init__(self, *args, **kwargs):
- kwargs.setdefault('resolve_entities', False)
- super(SafeXMLParser, self).__init__(*args, **kwargs)
+class Selector(object):
-_ctgroup = {
- 'html': {'_parser': etree.HTMLParser,
- '_csstranslator': ScrapyHTMLTranslator(),
- '_tostring_method': 'html'},
- 'xml': {'_parser': SafeXMLParser,
- '_csstranslator': ScrapyGenericTranslator(),
- '_tostring_method': 'xml'},
-}
+ __slots__ = ['text', 'namespaces', 'type', '_expr', '_root',
+ '_parser', '_csstranslator', '_tostring_method']
-
-def _st(response, st):
- if st is None:
- return 'xml' if isinstance(response, XmlResponse) else 'html'
- elif st in ('xml', 'html'):
- return st
- else:
- raise ValueError('Invalid type: %s' % st)
-
-
-def _response_from_text(text, st):
- rt = XmlResponse if st == 'xml' else HtmlResponse
- return rt(url='about:blank', encoding='utf-8',
- body=unicode_to_str(text, 'utf-8'))
-
-
-class Selector(object_ref):
-
- __slots__ = ['response', 'text', 'namespaces', 'type', '_expr', '_root',
- '__weakref__', '_parser', '_csstranslator', '_tostring_method']
-
- _default_type = None
_default_namespaces = {
"re": "http://exslt.org/regular-expressions",
@@ -65,23 +32,23 @@ class Selector(object_ref):
}
_lxml_smart_strings = False
- def __init__(self, response=None, text=None, type=None, namespaces=None,
+ def __init__(self, text=None, url=None, type='html', namespaces=None,
_root=None, _expr=None):
- self.type = st = _st(response, type or self._default_type)
+ self.type = st = type
self._parser = _ctgroup[st]['_parser']
self._csstranslator = _ctgroup[st]['_csstranslator']
self._tostring_method = _ctgroup[st]['_tostring_method']
+ self.text = text
if text is not None:
- response = _response_from_text(text, st)
+ body = text.strip().encode('utf8') or '<html/>'
+ parser_obj = self._parser(recover=True, encoding='utf8')
+ _root = etree.fromstring(body, base_url=url, parser=parser_obj)
- if response is not None:
- _root = LxmlDocument(response, self._parser)
-
- self.response = response
self.namespaces = dict(self._default_namespaces)
if namespaces is not None:
self.namespaces.update(namespaces)
+
self._root = _root
self._expr = _expr
diff --git a/selectors/utils/decorator.py b/selectors/utils/decorator.py
index 38bee1a..2177a9a 100644
--- a/selectors/utils/decorator.py
+++ b/selectors/utils/decorator.py
@@ -1,9 +1,7 @@
import warnings
from functools import wraps
-from twisted.internet import defer, threads
-
-from scrapy.exceptions import ScrapyDeprecationWarning
+from selectors.exceptions import SelectorsDeprecationWarning
def deprecated(use_instead=None):
@@ -17,7 +15,7 @@ def deprecated(use_instead=None):
message = "Call to deprecated function %s." % func.__name__
if use_instead:
message += " Use %s instead." % use_instead
- warnings.warn(message, category=ScrapyDeprecationWarning, stacklevel=2)
+ warnings.warn(message, category=SelectorsDeprecationWarning, stacklevel=2)
return func(*args, **kwargs)
return wrapped
@@ -25,20 +23,3 @@ def deprecated(use_instead=None):
deco = deco(use_instead)
use_instead = None
return deco
-
-
-def defers(func):
- """Decorator to make sure a function always returns a deferred"""
- @wraps(func)
- def wrapped(*a, **kw):
- return defer.maybeDeferred(func, *a, **kw)
- return wrapped
-
-def inthread(func):
- """Decorator to call a function in a thread and return a deferred with the
- result
- """
- @wraps(func)
- def wrapped(*a, **kw):
- return threads.deferToThread(func, *a, **kw)
- return wrapped
diff --git a/selectors/utils/misc.py b/selectors/utils/misc.py
index 3152db6..969e78e 100644
--- a/selectors/utils/misc.py
+++ b/selectors/utils/misc.py
@@ -1,76 +1,9 @@
"""Helper functions which doesn't fit anywhere else"""
import re
-import hashlib
-from importlib import import_module
-from pkgutil import iter_modules
-import six
from w3lib.html import replace_entities
-from scrapy.utils.python import flatten
-from scrapy.item import BaseItem
-
-
-_ITERABLE_SINGLE_VALUES = dict, BaseItem, six.text_type, bytes
-
-
-def arg_to_iter(arg):
- """Convert an argument to an iterable. The argument can be a None, single
- value, or an iterable.
-
- Exception: if arg is a dict, [arg] will be returned
- """
- if arg is None:
- return []
- elif not isinstance(arg, _ITERABLE_SINGLE_VALUES) and hasattr(arg, '__iter__'):
- return arg
- else:
- return [arg]
-
-
-def load_object(path):
- """Load an object given its absolute object path, and return it.
-
- object can be a class, function, variable o instance.
- path ie: 'scrapy.contrib.downloadermiddelware.redirect.RedirectMiddleware'
- """
-
- try:
- dot = path.rindex('.')
- except ValueError:
- raise ValueError("Error loading object '%s': not a full path" % path)
-
- module, name = path[:dot], path[dot+1:]
- mod = import_module(module)
-
- try:
- obj = getattr(mod, name)
- except AttributeError:
- raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))
-
- return obj
-
-
-def walk_modules(path):
- """Loads a module and all its submodules from a the given module path and
- returns them. If *any* module throws an exception while importing, that
- exception is thrown back.
-
- For example: walk_modules('scrapy.utils')
- """
-
- mods = []
- mod = import_module(path)
- mods.append(mod)
- if hasattr(mod, '__path__'):
- for _, subpath, ispkg in iter_modules(mod.__path__):
- fullpath = path + '.' + subpath
- if ispkg:
- mods += walk_modules(fullpath)
- else:
- submod = import_module(fullpath)
- mods.append(submod)
- return mods
+from .python import flatten
def extract_regex(regex, text, encoding='utf-8'):
@@ -94,20 +27,3 @@ def extract_regex(regex, text, encoding='utf-8'):
return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
else:
return [replace_entities(unicode(s, encoding), keep=['lt', 'amp']) for s in strings]
-
-
-def md5sum(file):
- """Calculate the md5 checksum of a file-like object without reading its
- whole content in memory.
-
- >>> from io import BytesIO
- >>> md5sum(BytesIO(b'file content to hash'))
- '784406af91dd5a54fbb9c84c2236595a'
- """
- m = hashlib.md5()
- while 1:
- d = file.read(8096)
- if not d:
- break
- m.update(d)
- return m.hexdigest()
diff --git a/selectors/utils/python.py b/selectors/utils/python.py
index 551d337..beb62f0 100644
--- a/selectors/utils/python.py
+++ b/selectors/utils/python.py
@@ -1,19 +1,3 @@
-"""
-This module contains essential stuff that should've come with Python itself ;)
-
-It also contains functions (or functionality) which is in Python versions
-higher than 2.5 which used to be the lowest version supported by Scrapy.
-
-"""
-import os
-import re
-import inspect
-import weakref
-import errno
-import six
-from functools import partial, wraps
-
-
def flatten(x):
"""flatten(sequence) -> list
@@ -34,236 +18,3 @@ def flatten(x):
else:
result.append(el)
return result
-
-
-def unique(list_, key=lambda x: x):
- """efficient function to uniquify a list preserving item order"""
- seen = set()
- result = []
- for item in list_:
- seenkey = key(item)
- if seenkey in seen:
- continue
- seen.add(seenkey)
- result.append(item)
- return result
-
-
-def str_to_unicode(text, encoding=None, errors='strict'):
- """Return the unicode representation of text in the given encoding. Unlike
- .encode(encoding) this function can be applied directly to a unicode
- object without the risk of double-decoding problems (which can happen if
- you don't use the default 'ascii' encoding)
- """
-
- if encoding is None:
- encoding = 'utf-8'
- if isinstance(text, str):
- return text.decode(encoding, errors)
- elif isinstance(text, unicode):
- return text
- else:
- raise TypeError('str_to_unicode must receive a str or unicode object, got %s' % type(text).__name__)
-
-def unicode_to_str(text, encoding=None, errors='strict'):
- """Return the str representation of text in the given encoding. Unlike
- .encode(encoding) this function can be applied directly to a str
- object without the risk of double-decoding problems (which can happen if
- you don't use the default 'ascii' encoding)
- """
-
- if encoding is None:
- encoding = 'utf-8'
- if isinstance(text, unicode):
- return text.encode(encoding, errors)
- elif isinstance(text, str):
- return text
- else:
- raise TypeError('unicode_to_str must receive a unicode or str object, got %s' % type(text).__name__)
-
-def re_rsearch(pattern, text, chunk_size=1024):
- """
- This function does a reverse search in a text using a regular expression
- given in the attribute 'pattern'.
- Since the re module does not provide this functionality, we have to find for
- the expression into chunks of text extracted from the end (for the sake of efficiency).
- At first, a chunk of 'chunk_size' kilobytes is extracted from the end, and searched for
- the pattern. If the pattern is not found, another chunk is extracted, and another
- search is performed.
- This process continues until a match is found, or until the whole file is read.
- In case the pattern wasn't found, None is returned, otherwise it returns a tuple containing
- the start position of the match, and the ending (regarding the entire text).
- """
- def _chunk_iter():
- offset = len(text)
- while True:
- offset -= (chunk_size * 1024)
- if offset <= 0:
- break
- yield (text[offset:], offset)
- yield (text, 0)
-
- pattern = re.compile(pattern) if isinstance(pattern, basestring) else pattern
- for chunk, offset in _chunk_iter():
- matches = [match for match in pattern.finditer(chunk)]
- if matches:
- return (offset + matches[-1].span()[0], offset + matches[-1].span()[1])
- return None
-
-def memoizemethod_noargs(method):
- """Decorator to cache the result of a method (without arguments) using a
- weak reference to its object
- """
- cache = weakref.WeakKeyDictionary()
- @wraps(method)
- def new_method(self, *args, **kwargs):
- if self not in cache:
- cache[self] = method(self, *args, **kwargs)
- return cache[self]
- return new_method
-
-_BINARYCHARS = set(map(chr, range(32))) - set(["\0", "\t", "\n", "\r"])
-
-def isbinarytext(text):
- """Return True if the given text is considered binary, or false
- otherwise, by looking for binary bytes at their chars
- """
- assert isinstance(text, str), "text must be str, got '%s'" % type(text).__name__
- return any(c in _BINARYCHARS for c in text)
-
-def get_func_args(func, stripself=False):
- """Return the argument name list of a callable"""
- if inspect.isfunction(func):
- func_args, _, _, _ = inspect.getargspec(func)
- elif inspect.isclass(func):
- return get_func_args(func.__init__, True)
- elif inspect.ismethod(func):
- return get_func_args(func.__func__, True)
- elif inspect.ismethoddescriptor(func):
- return []
- elif isinstance(func, partial):
- return [x for x in get_func_args(func.func)[len(func.args):]
- if not (func.keywords and x in func.keywords)]
- elif hasattr(func, '__call__'):
- if inspect.isroutine(func):
- return []
- elif getattr(func, '__name__', None) == '__call__':
- return []
- else:
- return get_func_args(func.__call__, True)
- else:
- raise TypeError('%s is not callable' % type(func))
- if stripself:
- func_args.pop(0)
- return func_args
-
-def get_spec(func):
- """Returns (args, kwargs) tuple for a function
- >>> import re
- >>> get_spec(re.match)
- (['pattern', 'string'], {'flags': 0})
-
- >>> class Test(object):
- ... def __call__(self, val):
- ... pass
- ... def method(self, val, flags=0):
- ... pass
-
- >>> get_spec(Test)
- (['self', 'val'], {})
-
- >>> get_spec(Test.method)
- (['self', 'val'], {'flags': 0})
-
- >>> get_spec(Test().method)
- (['self', 'val'], {'flags': 0})
- """
-
- if inspect.isfunction(func) or inspect.ismethod(func):
- spec = inspect.getargspec(func)
- elif hasattr(func, '__call__'):
- spec = inspect.getargspec(func.__call__)
- else:
- raise TypeError('%s is not callable' % type(func))
-
- defaults = spec.defaults or []
-
- firstdefault = len(spec.args) - len(defaults)
- args = spec.args[:firstdefault]
- kwargs = dict(zip(spec.args[firstdefault:], defaults))
- return args, kwargs
-
-def equal_attributes(obj1, obj2, attributes):
- """Compare two objects attributes"""
- # not attributes given return False by default
- if not attributes:
- return False
-
- for attr in attributes:
- # support callables like itemgetter
- if callable(attr):
- if not attr(obj1) == attr(obj2):
- return False
- else:
- # check that objects has attribute
- if not hasattr(obj1, attr):
- return False
- if not hasattr(obj2, attr):
- return False
- # compare object attributes
- if not getattr(obj1, attr) == getattr(obj2, attr):
- return False
- # all attributes equal
- return True
-
-
-class WeakKeyCache(object):
-
- def __init__(self, default_factory):
- self.default_factory = default_factory
- self._weakdict = weakref.WeakKeyDictionary()
-
- def __getitem__(self, key):
- if key not in self._weakdict:
- self._weakdict[key] = self.default_factory(key)
- return self._weakdict[key]
-
-
-def stringify_dict(dct_or_tuples, encoding='utf-8', keys_only=True):
- """Return a (new) dict with the unicode keys (and values if, keys_only is
- False) of the given dict converted to strings. `dct_or_tuples` can be a
- dict or a list of tuples, like any dict constructor supports.
- """
- d = {}
- for k, v in six.iteritems(dict(dct_or_tuples)):
- k = k.encode(encoding) if isinstance(k, unicode) else k
- if not keys_only:
- v = v.encode(encoding) if isinstance(v, unicode) else v
- d[k] = v
- return d
-
-def is_writable(path):
- """Return True if the given path can be written (if it exists) or created
- (if it doesn't exist)
- """
- if os.path.exists(path):
- return os.access(path, os.W_OK)
- else:
- return os.access(os.path.dirname(path), os.W_OK)
-
-def setattr_default(obj, name, value):
- """Set attribute value, but only if it's not already set. Similar to
- setdefault() for dicts.
- """
- if not hasattr(obj, name):
- setattr(obj, name, value)
-
-
-def retry_on_eintr(function, *args, **kw):
- """Run a function and retry it while getting EINTR errors"""
- while True:
- try:
- return function(*args, **kw)
- except IOError as e:
- if e.errno != errno.EINTR:
- raise
--
1.9.1
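The rename of the `Scrapy*` translator classes to `Selector*` in this patch is mechanical, but the translator is what makes `::text` and `::attr()` work: `TranslatorMixin` extends cssselect's CSS-to-XPath compilation. A hedged sketch of what that buys — `css_to_xpath()` is cssselect's own API, and the exact XPath strings it emits vary by cssselect version:

```python
# Sketch: compile CSS (with the ::text / ::attr() pseudo-elements
# handled by TranslatorMixin above) down to XPath via cssselect.
from selectors.csstranslator import SelectorHTMLTranslator

tr = SelectorHTMLTranslator()
print(tr.css_to_xpath('a::attr(href)'))
# roughly: descendant-or-self::a/@href
print(tr.css_to_xpath('p.msg::text'))
# roughly: descendant-or-self::p[... class test ...]/text()
```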
From 7cf971c08ece0007d891045497f2e164b3bf8044 Mon Sep 17 00:00:00 2001
From: Umair Ashraf <umr.ashrf@gmail.com>
Date: Sat, 21 Feb 2015 20:05:30 +0500
Subject: [PATCH 3/5] added selectors specific python package files
---
MANIFEST.in | 2 ++
requirements.txt | 3 +++
setup.py | 28 ++++++++++++++++++++++++++++
3 files changed, 33 insertions(+)
create mode 100644 MANIFEST.in
create mode 100644 requirements.txt
create mode 100644 setup.py
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..2970947
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+include README.md
+include MANIFEST.in
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..9a0bc80
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+lxml
+w3lib>=1.8.0
+cssselect>=0.9
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..c3f8aa0
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,28 @@
+import re
+
+from setuptools import setup, find_packages
+
+
+(version, ) = re.findall(r"__version__[^=]*=[^']*[']([^']+)[']",
+ open('selectors/__init__.py').read())
+
+
+setup(
+ name='Selectors',
+ version=version,
+ url='http://github.com/scrapy/selectors',
+ description='Selectors used by Scrapy framework',
+ long_description=open('README.md').read(),
+ author='Selectors developers',
+ maintainer='Scrapy developers',
+ maintainer_email='info@scrapy.org',
+ license='BSD',
+ packages=find_packages(exclude=('tests', 'tests.*')),
+ include_package_data=True,
+ zip_safe=False,
+ install_requires=[
+ 'lxml',
+ 'w3lib>=1.8.0',
+ 'cssselect>=0.9',
+ ],
+)
--
1.9.1
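Note that setup.py reads the version out of `selectors/__init__.py` with a regex rather than importing the package, presumably so installation never has to import the package (and its lxml dependency). A standalone check of that regex:

```python
# Stand-alone check of the version-extraction regex from setup.py above.
import re

init_src = "__version__ = '0.0.1'\n"  # stand-in for selectors/__init__.py
(version, ) = re.findall(r"__version__[^=]*=[^']*[']([^']+)[']", init_src)
print(version)  # -> 0.0.1
```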
From 109c2e096de49494e745c885df8901698b58d11e Mon Sep 17 00:00:00 2001
From: Umair Ashraf <umr.ashrf@gmail.com>
Date: Sat, 21 Feb 2015 20:24:31 +0500
Subject: [PATCH 4/5] removed scrapy dependent tests and changed code to suit
selectors package
---
tests/__init__.py | 14 ---
tests/test_selector.py | 222 ++++-------------------------------
tests/test_selector_csstranslator.py | 12 +-
3 files changed, 31 insertions(+), 217 deletions(-)
delete mode 100644 tests/__init__.py
diff --git a/tests/__init__.py b/tests/__init__.py
deleted file mode 100644
index 54e79b3..0000000
--- a/tests/__init__.py
+++ /dev/null
@@ -1,14 +0,0 @@
-"""
-tests: this package contains all Scrapy unittests
-
-see http://doc.scrapy.org/en/latest/contributing.html#running-tests
-"""
-
-import os
-
-tests_datadir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'sample_data')
-
-def get_testdata(*paths):
- """Return test data"""
- path = os.path.join(tests_datadir, *paths)
- return open(path, 'rb').read()
diff --git a/tests/test_selector.py b/tests/test_selector.py
index 6fbb451..91c7d31 100644
--- a/tests/test_selector.py
+++ b/tests/test_selector.py
@@ -1,11 +1,10 @@
import re
import warnings
import weakref
+
from twisted.trial import unittest
-from scrapy.exceptions import ScrapyDeprecationWarning
-from scrapy.http import TextResponse, HtmlResponse, XmlResponse
-from scrapy.selector import Selector
-from scrapy.selector.lxmlsel import XmlXPathSelector, HtmlXPathSelector, XPathSelector
+
+from selectors import Selector
class SelectorTestCase(unittest.TestCase):
@@ -15,8 +14,7 @@ class SelectorTestCase(unittest.TestCase):
def test_simple_selection(self):
"""Simple selector tests"""
body = "<p><input name='a'value='1'/><input name='b'value='2'/></p>"
- response = TextResponse(url="http://example.com", body=body)
- sel = self.sscls(response)
+ sel = self.sscls(url="http://example.com", text=body)
xl = sel.xpath('//input')
self.assertEqual(2, len(xl))
@@ -38,8 +36,7 @@ class SelectorTestCase(unittest.TestCase):
def test_representation_slice(self):
body = u"<p><input name='{}' value='\xa9'/></p>".format(50 * 'b')
- response = TextResponse(url="http://example.com", body=body, encoding='utf8')
- sel = self.sscls(response)
+ sel = self.sscls(url="http://example.com", text=body)
self.assertEqual(
map(repr, sel.xpath('//input/@name')),
@@ -48,8 +45,7 @@ class SelectorTestCase(unittest.TestCase):
def test_representation_unicode_query(self):
body = u"<p><input name='{}' value='\xa9'/></p>".format(50 * 'b')
- response = TextResponse(url="http://example.com", body=body, encoding='utf8')
- sel = self.sscls(response)
+ sel = self.sscls(url="http://example.com", text=body)
self.assertEqual(
map(repr, sel.xpath(u'//input[@value="\xa9"]/@value')),
["<Selector xpath=u'//input[@value=\"\\xa9\"]/@value' data=u'\\xa9'>"]
@@ -57,8 +53,7 @@ class SelectorTestCase(unittest.TestCase):
def test_select_unicode_query(self):
body = u"<p><input name='\xa9' value='1'/></p>"
- response = TextResponse(url="http://example.com", body=body, encoding='utf8')
- sel = self.sscls(response)
+ sel = self.sscls(url="http://example.com", text=body)
self.assertEqual(sel.xpath(u'//input[@name="\xa9"]/@value').extract(), [u'1'])
def test_list_elements_type(self):
@@ -69,8 +64,7 @@ class SelectorTestCase(unittest.TestCase):
def test_boolean_result(self):
body = "<p><input name='a'value='1'/><input name='b'value='2'/></p>"
- response = TextResponse(url="http://example.com", body=body)
- xs = self.sscls(response)
+ xs = self.sscls(url="http://example.com", text=body)
self.assertEquals(xs.xpath("//input[@name='a']/@name='a'").extract(), [u'1'])
self.assertEquals(xs.xpath("//input[@name='a']/@name='n'").extract(), [u'0'])
@@ -86,18 +80,6 @@ class SelectorTestCase(unittest.TestCase):
self.assertEqual(xs.xpath("//div").extract(),
[u'<div><img src="a.jpg"><p>Hello</p></img></div>'])
- def test_flavor_detection(self):
- text = '<div><img src="a.jpg"><p>Hello</div>'
- sel = self.sscls(XmlResponse('http://example.com', body=text))
- self.assertEqual(sel.type, 'xml')
- self.assertEqual(sel.xpath("//div").extract(),
- [u'<div><img src="a.jpg"><p>Hello</p></img></div>'])
-
- sel = self.sscls(HtmlResponse('http://example.com', body=text))
- self.assertEqual(sel.type, 'html')
- self.assertEqual(sel.xpath("//div").extract(),
- [u'<div><img src="a.jpg"><p>Hello</p></div>'])
-
def test_nested_selectors(self):
"""Nested selector tests"""
body = """<body>
@@ -113,8 +95,7 @@ class SelectorTestCase(unittest.TestCase):
</div>
</body>"""
- response = HtmlResponse(url="http://example.com", body=body)
- x = self.sscls(response)
+ x = self.sscls(url="http://example.com", text=body)
divtwo = x.xpath('//div[@class="two"]')
self.assertEqual(divtwo.xpath("//li").extract(),
["<li>one</li>", "<li>two</li>", "<li>four</li>", "<li>five</li>", "<li>six</li>"])
@@ -145,8 +126,7 @@ class SelectorTestCase(unittest.TestCase):
</test>
"""
- response = XmlResponse(url="http://example.com", body=body)
- x = self.sscls(response)
+ x = self.sscls(url="http://example.com", text=body, type="xml")
x.register_namespace("somens", "http://scrapy.org")
self.assertEqual(x.xpath("//somens:a/text()").extract(),
@@ -162,8 +142,7 @@ class SelectorTestCase(unittest.TestCase):
<p:SecondTestTag><material>iron</material><price>90</price><p:name>Dried Rose</p:name></p:SecondTestTag>
</BrowseNode>
"""
- response = XmlResponse(url="http://example.com", body=body)
- x = self.sscls(response)
+ x = self.sscls(url="http://example.com", text=body, type="xml")
x.register_namespace("xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05")
x.register_namespace("p", "http://www.scrapy.org/product")
x.register_namespace("b", "http://somens.com")
@@ -184,8 +163,7 @@ class SelectorTestCase(unittest.TestCase):
</ul>
Age: 20
</div>"""
- response = HtmlResponse(url="http://example.com", body=body)
- x = self.sscls(response)
+ x = self.sscls(url="http://example.com", text=body)
name_re = re.compile("Name: (\w+)")
self.assertEqual(x.xpath("//ul/li").re(name_re),
@@ -193,12 +171,6 @@ class SelectorTestCase(unittest.TestCase):
self.assertEqual(x.xpath("//ul/li").re("Age: (\d+)"),
["10", "20"])
- def test_re_intl(self):
- body = """<div>Evento: cumplea\xc3\xb1os</div>"""
- response = HtmlResponse(url="http://example.com", body=body, encoding='utf-8')
- x = self.sscls(response)
- self.assertEqual(x.xpath("//div").re("Evento: (\w+)"), [u'cumplea\xf1os'])
-
def test_selector_over_text(self):
hs = self.sscls(text='<root>lala</root>')
self.assertEqual(hs.extract(), u'<html><body><root>lala</root></body></html>')
@@ -207,8 +179,7 @@ class SelectorTestCase(unittest.TestCase):
self.assertEqual(xs.xpath('.').extract(), [u'<root>lala</root>'])
def test_invalid_xpath(self):
- response = XmlResponse(url="http://example.com", body="<html></html>")
- x = self.sscls(response)
+ x = self.sscls(url="http://example.com", text="<html></html>")
xpath = "//test[@foo='bar]"
try:
x.xpath(xpath)
@@ -219,43 +190,16 @@ class SelectorTestCase(unittest.TestCase):
else:
raise AssertionError("A invalid XPath does not raise an exception")
- def test_http_header_encoding_precedence(self):
- # u'\xa3' = pound symbol in unicode
- # u'\xc2\xa3' = pound symbol in utf-8
- # u'\xa3' = pound symbol in latin-1 (iso-8859-1)
-
- meta = u'<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">'
- head = u'<head>' + meta + u'</head>'
- body_content = u'<span id="blank">\xa3</span>'
- body = u'<body>' + body_content + u'</body>'
- html = u'<html>' + head + body + u'</html>'
- encoding = 'utf-8'
- html_utf8 = html.encode(encoding)
-
- headers = {'Content-Type': ['text/html; charset=utf-8']}
- response = HtmlResponse(url="http://example.com", headers=headers, body=html_utf8)
- x = self.sscls(response)
- self.assertEquals(x.xpath("//span[@id='blank']/text()").extract(),
- [u'\xa3'])
-
def test_empty_bodies(self):
# shouldn't raise errors
- r1 = TextResponse('http://www.example.com', body='')
- self.sscls(r1).xpath('//text()').extract()
+ self.sscls(url='http://www.example.com', text='').xpath('//text()').extract()
def test_null_bytes(self):
# shouldn't raise errors
- r1 = TextResponse('http://www.example.com', \
- body='<root>pre\x00post</root>', \
- encoding='utf-8')
- self.sscls(r1).xpath('//text()').extract()
-
- def test_badly_encoded_body(self):
- # \xe9 alone isn't valid utf8 sequence
- r1 = TextResponse('http://www.example.com', \
- body='<html><p>an Jos\xe9 de</p><html>', \
- encoding='utf-8')
- self.sscls(r1).xpath('//text()').extract()
+ self.sscls(url='http://www.example.com',
+ text='<root>pre\x00post</root>',
+ type='xml') \
+ .xpath('//text()').extract()
def test_select_on_unevaluable_nodes(self):
r = self.sscls(text=u'<span class="big">some text</span>')
@@ -284,13 +228,6 @@ class SelectorTestCase(unittest.TestCase):
self.assertEquals(x2.extract(), [u'<b>Options:</b>'])
test_nested_select_on_text_nodes.skip = "Text nodes lost parent node reference in lxml"
- def test_weakref_slots(self):
- """Check that classes are using slots and are weak-referenceable"""
- x = self.sscls()
- weakref.ref(x)
- assert not hasattr(x, '__dict__'), "%s does not use __slots__" % \
- x.__class__.__name__
-
def test_remove_namespaces(self):
xml = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en-US" xmlns:media="http://search.yahoo.com/mrss/">
@@ -298,7 +235,7 @@ class SelectorTestCase(unittest.TestCase):
<link type="application/atom+xml">
</feed>
"""
- sel = self.sscls(XmlResponse("http://example.com/feed.atom", body=xml))
+ sel = self.sscls(url="http://example.com/feed.atom", text=xml, type="xml")
self.assertEqual(len(sel.xpath("//link")), 0)
sel.remove_namespaces()
self.assertEqual(len(sel.xpath("//link")), 2)
@@ -310,7 +247,7 @@ class SelectorTestCase(unittest.TestCase):
<link atom:type="application/atom+xml">
</feed>
"""
- sel = self.sscls(XmlResponse("http://example.com/feed.atom", body=xml))
+ sel = self.sscls(url="http://example.com/feed.atom", text=xml, type="xml")
self.assertEqual(len(sel.xpath("//link/@type")), 0)
sel.remove_namespaces()
self.assertEqual(len(sel.xpath("//link/@type")), 2)
@@ -334,17 +271,15 @@ class SelectorTestCase(unittest.TestCase):
</div>
</body>"""
- response = HtmlResponse(url="http://example.com", body=body)
-
# .getparent() is available for text nodes and attributes
# only when smart_strings are on
- x = self.sscls(response)
+ x = self.sscls(url="http://example.com", text=body)
li_text = x.xpath('//li/text()')
self.assertFalse(any(map(lambda e: hasattr(e._root, 'getparent'), li_text)))
div_class = x.xpath('//div/@class')
self.assertFalse(any(map(lambda e: hasattr(e._root, 'getparent'), div_class)))
- x = SmartStringsSelector(response)
+ x = SmartStringsSelector(url="http://example.com", text=body)
li_text = x.xpath('//li/text()')
self.assertTrue(all(map(lambda e: hasattr(e._root, 'getparent'), li_text)))
div_class = x.xpath('//div/@class')
@@ -355,116 +290,11 @@ class SelectorTestCase(unittest.TestCase):
'<!DOCTYPE foo [ <!ELEMENT foo ANY > <!ENTITY xxe SYSTEM '\
'"file:///etc/passwd" >]><foo>&xxe;</foo>'
- response = XmlResponse('http://example.com', body=malicious_xml)
- sel = self.sscls(response=response)
+ sel = self.sscls(url='http://example.com', text=malicious_xml, type="xml")
self.assertEqual(sel.extract(), '<foo>&xxe;</foo>')
-class DeprecatedXpathSelectorTest(unittest.TestCase):
-
- text = '<div><img src="a.jpg"><p>Hello</div>'
-
- def test_warnings_xpathselector(self):
- cls = XPathSelector
- with warnings.catch_warnings(record=True) as w:
- class UserClass(cls):
- pass
-
- # subclassing must issue a warning
- self.assertEqual(len(w), 1, str(cls))
- self.assertIn('scrapy.Selector', str(w[0].message))
-
- # subclass instance doesn't issue a warning
- usel = UserClass(text=self.text)
- self.assertEqual(len(w), 1)
-
- # class instance must issue a warning
- sel = cls(text=self.text)
- self.assertEqual(len(w), 2, str((cls, [x.message for x in w])))
- self.assertIn('scrapy.Selector', str(w[1].message))
-
- # subclass and instance checks
- self.assertTrue(issubclass(cls, Selector))
- self.assertTrue(isinstance(sel, Selector))
- self.assertTrue(isinstance(usel, Selector))
-
- def test_warnings_xmlxpathselector(self):
- cls = XmlXPathSelector
- with warnings.catch_warnings(record=True) as w:
- class UserClass(cls):
- pass
-
- # subclassing must issue a warning
- self.assertEqual(len(w), 1, str(cls))
- self.assertIn('scrapy.Selector', str(w[0].message))
-
- # subclass instance doesn't issue a warning
- usel = UserClass(text=self.text)
- self.assertEqual(len(w), 1)
-
- # class instance must issue a warning
- sel = cls(text=self.text)
- self.assertEqual(len(w), 2, str((cls, [x.message for x in w])))
- self.assertIn('scrapy.Selector', str(w[1].message))
-
- # subclass and instance checks
- self.assertTrue(issubclass(cls, Selector))
- self.assertTrue(issubclass(cls, XPathSelector))
- self.assertTrue(isinstance(sel, Selector))
- self.assertTrue(isinstance(usel, Selector))
- self.assertTrue(isinstance(sel, XPathSelector))
- self.assertTrue(isinstance(usel, XPathSelector))
-
- def test_warnings_htmlxpathselector(self):
- cls = HtmlXPathSelector
- with warnings.catch_warnings(record=True) as w:
- class UserClass(cls):
- pass
-
- # subclassing must issue a warning
- self.assertEqual(len(w), 1, str(cls))
- self.assertIn('scrapy.Selector', str(w[0].message))
-
- # subclass instance doesn't issue a warning
- usel = UserClass(text=self.text)
- self.assertEqual(len(w), 1)
-
- # class instance must issue a warning
- sel = cls(text=self.text)
- self.assertEqual(len(w), 2, str((cls, [x.message for x in w])))
- self.assertIn('scrapy.Selector', str(w[1].message))
-
- # subclass and instance checks
- self.assertTrue(issubclass(cls, Selector))
- self.assertTrue(issubclass(cls, XPathSelector))
- self.assertTrue(isinstance(sel, Selector))
- self.assertTrue(isinstance(usel, Selector))
- self.assertTrue(isinstance(sel, XPathSelector))
- self.assertTrue(isinstance(usel, XPathSelector))
-
- def test_xpathselector(self):
- with warnings.catch_warnings(record=True):
- hs = XPathSelector(text=self.text)
- self.assertEqual(hs.select("//div").extract(),
- [u'<div><img src="a.jpg"><p>Hello</p></div>'])
- self.assertRaises(RuntimeError, hs.css, 'div')
-
- def test_htmlxpathselector(self):
- with warnings.catch_warnings(record=True):
- hs = HtmlXPathSelector(text=self.text)
- self.assertEqual(hs.select("//div").extract(),
- [u'<div><img src="a.jpg"><p>Hello</p></div>'])
- self.assertRaises(RuntimeError, hs.css, 'div')
-
- def test_xmlxpathselector(self):
- with warnings.catch_warnings(record=True):
- xs = XmlXPathSelector(text=self.text)
- self.assertEqual(xs.select("//div").extract(),
- [u'<div><img src="a.jpg"><p>Hello</p></img></div>'])
- self.assertRaises(RuntimeError, xs.css, 'div')
-
-
class ExsltTestCase(unittest.TestCase):
sscls = Selector
@@ -479,8 +309,7 @@ class ExsltTestCase(unittest.TestCase):
<a href="http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml">EXSLT match example</a>
</div>
"""
- response = TextResponse(url="http://example.com", body=body)
- sel = self.sscls(response)
+ sel = self.sscls(url="http://example.com", text=body)
# re:test()
self.assertEqual(
@@ -557,8 +386,7 @@ class ExsltTestCase(unittest.TestCase):
</div>
</div>
"""
- response = TextResponse(url="http://example.com", body=body)
- sel = self.sscls(response)
+ sel = self.sscls(url="http://example.com", text=body)
self.assertEqual(
sel.xpath('''//div[@itemtype="http://schema.org/Event"]
diff --git a/tests/test_selector_csstranslator.py b/tests/test_selector_csstranslator.py
index 7ef9003..b648320 100644
--- a/tests/test_selector_csstranslator.py
+++ b/tests/test_selector_csstranslator.py
@@ -2,9 +2,10 @@
Selector tests for cssselect backend
"""
from twisted.trial import unittest
-from scrapy.http import HtmlResponse
-from scrapy.selector.csstranslator import ScrapyHTMLTranslator
-from scrapy.selector import Selector
+
+from selectors import Selector
+from selectors.csstranslator import SelectorHTMLTranslator
+
from cssselect.parser import SelectorSyntaxError
from cssselect.xpath import ExpressionError
@@ -47,7 +48,7 @@ HTMLBODY = '''
class TranslatorMixinTest(unittest.TestCase):
- tr_cls = ScrapyHTMLTranslator
+ tr_cls = SelectorHTMLTranslator
def setUp(self):
self.tr = self.tr_cls()
@@ -119,8 +120,7 @@ class CSSSelectorTest(unittest.TestCase):
sscls = Selector
def setUp(self):
- self.htmlresponse = HtmlResponse('http://example.com', body=HTMLBODY)
- self.sel = self.sscls(self.htmlresponse)
+ self.sel = self.sscls(url='http://example.com', text=HTMLBODY)
def x(self, *a, **kw):
return [v.strip() for v in self.sel.css(*a, **kw).extract() if v.strip()]
--
1.9.1
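The retained XXE test above (the malicious `<!ENTITY xxe SYSTEM "file:///etc/passwd">` document serializing back as `<foo>&xxe;</foo>`) is the observable effect of `SafeXMLParser` from patch 2 setting `resolve_entities=False`. A hedged standalone demonstration of that underlying lxml behaviour:

```python
# Sketch of the behaviour the XXE test asserts: with
# resolve_entities=False, lxml keeps the entity reference unexpanded.
from lxml import etree

malicious = (b'<!DOCTYPE foo [ <!ELEMENT foo ANY >'
             b'<!ENTITY xxe SYSTEM "file:///etc/passwd" >]>'
             b'<foo>&xxe;</foo>')
safe_parser = etree.XMLParser(resolve_entities=False)
root = etree.fromstring(malicious, parser=safe_parser)
print(etree.tostring(root))  # <foo>&xxe;</foo>
```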
From eab029568b37d564bc8bbdb5b836cf0976a39cd1 Mon Sep 17 00:00:00 2001
From: Umair Ashraf <umr.ashrf@gmail.com>
Date: Sat, 21 Feb 2015 20:25:01 +0500
Subject: [PATCH 5/5] added selectors tests specific support files
---
pytest.ini | 4 ++++
tests/requirements.txt | 3 +++
tox.ini | 14 ++++++++++++++
3 files changed, 21 insertions(+)
create mode 100644 pytest.ini
create mode 100644 tests/requirements.txt
create mode 100644 tox.ini
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..cc48090
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,4 @@
+[pytest]
+python_files=test_*.py __init__.py
+addopts = --doctest-modules --assert=plain
+twisted = 1
diff --git a/tests/requirements.txt b/tests/requirements.txt
new file mode 100644
index 0000000..18ae516
--- /dev/null
+++ b/tests/requirements.txt
@@ -0,0 +1,3 @@
+Twisted>=10.0.0
+pytest-twisted
+mock
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..617df26
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,14 @@
+# Tox (http://tox.testrun.org/) is a tool for running tests
+# in multiple virtualenvs. This configuration file will run the
+# test suite on all supported python versions. To use it, "pip install tox"
+# and then run "tox" from this directory.
+
+[tox]
+envlist = py27
+
+[testenv]
+deps =
+ -rrequirements.txt
+ -rtests/requirements.txt
+commands =
+ py.test {posargs:selectors tests}
--
1.9.1
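The shell script below is what produces the standalone repository these patches apply to: it uses `git filter-branch` to split `scrapy/selector`, the needed `scrapy/utils` modules, and the selector tests out of Scrapy's history, stitches the results onto one branch, and then applies the patches above with `git am`.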
#!/bin/bash
# start over: remove branches left behind by a previous run
git checkout master
git branch -D selectors selector-code utils-code tests-code
# split scrapy/selector dir to selector-code branch
git checkout -b selector-code
git filter-branch -f --prune-empty \
--subdirectory-filter scrapy/selector -- selector-code
# mv files to selectors/ dir without new commit
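# (`git ls-files -s` prints "<mode> <sha> <stage>\t<path>"; the sed below
#  inserts "selectors/" right after the tab, prefixing every path, and
#  `git update-index --index-info` writes that listing back as the index)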
git filter-branch -f \
--index-filter '
git ls-files -s \
| sed "s-\t-&selectors/-" \
| GIT_INDEX_FILE=$GIT_INDEX_FILE.new git update-index --index-info \
&& mv $GIT_INDEX_FILE.new $GIT_INDEX_FILE'
# now we need to split utils
git checkout master
# split scrapy/utils dir to utils-code branch
git checkout -b utils-code
git filter-branch -f --prune-empty \
--subdirectory-filter scrapy/utils -- utils-code
# only keep required utils files
git filter-branch -f \
--prune-empty \
--index-filter '
git ls-tree -z -r --name-only --full-tree $GIT_COMMIT \
| grep -z -v "^__init__.py$" \
| grep -z -v "^decorator.py$" \
| grep -z -v "^misc.py$" \
| grep -z -v "^python.py$" \
| xargs -0 -r git rm --cached -r
' \
-- \
utils-code
# mv files to selectors/utils/ dir without new commit
git filter-branch -f \
--index-filter '
git ls-files -s \
| sed "s-\t-&selectors/utils/-" \
| GIT_INDEX_FILE=$GIT_INDEX_FILE.new git update-index --index-info \
&& mv $GIT_INDEX_FILE.new $GIT_INDEX_FILE'
# now we need to split tests
git checkout master
# split tests dir to tests-code branch
git checkout -b tests-code
git filter-branch -f --prune-empty \
--subdirectory-filter tests -- tests-code
# only keep required tests files
git filter-branch -f \
--prune-empty \
--index-filter '
git ls-tree -z -r --name-only --full-tree $GIT_COMMIT \
| grep -z -v "^__init__.py$" \
| grep -z -v "^test_selector.py$" \
| grep -z -v "^test_selector_csstranslator.py$" \
| xargs -0 -r git rm --cached -r
' \
-- \
tests-code
# mv files to tests/ dir without new commit
git filter-branch -f \
--index-filter '
git ls-files -s \
| sed "s-\t-&tests/-" \
| GIT_INDEX_FILE=$GIT_INDEX_FILE.new git update-index --index-info \
&& mv $GIT_INDEX_FILE.new $GIT_INDEX_FILE'
# centralized branch for all selectors code
git checkout --orphan selectors
git rm -r -f .
# merge and rebase separate branches
git merge selector-code
git rebase utils-code
git rebase tests-code
# clean up the temporary branches
git branch -D selector-code utils-code tests-code
# now we can apply selectors patches
for f in *.patch; do
    git am < "$f"
done
# now we can remove selectors from scrapy
# references
# http://git-scm.com/docs/git-filter-branch
# http://git-scm.com/docs/git-ls-tree
# examples
# https://stackoverflow.com/questions/359424/detach-subdirectory-into-separate-git-repository
# https://github.com/apenwarr/git-subtree/blob/master/git-subtree.txt
# https://stackoverflow.com/questions/6403715/git-how-to-split-off-library-from-project-filter-branch-subtree?rq=1
# https://stackoverflow.com/questions/5998987/splitting-a-set-of-files-within-a-git-repo-into-their-own-repository-preserving
# https://www.kernel.org/pub/software/scm/git/docs/git-filter-branch.html
# http://stackoverflow.com/a/7396584