Nicolás Ramírez (nramirezuy), Montevideo, Uruguay
diff --git a/docs/faq.rst b/docs/faq.rst
index 47bfede..2e36216 100644
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -66,6 +66,16 @@ Yes. Support for HTTP proxies is provided (since Scrapy 0.8) through the HTTP
Proxy downloader middleware. See
:class:`~scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware`.
+Can Scrapy execute JavaScript while scraping the web?
+-----------------------------------------------------
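The body of the new FAQ entry is not included in the hunk. As a rough sketch of one common approach, rendering the page in a real browser and feeding the result back into Scrapy (Selenium and a local Firefox are assumptions here, not necessarily what the patch proposes):

from selenium import webdriver
from scrapy.http import HtmlResponse

def render(url):
    # Drive a real browser so the page's JavaScript runs, then hand the
    # rendered DOM back to Scrapy as an HtmlResponse for normal selectors.
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        return HtmlResponse(url, body=driver.page_source, encoding='utf-8')
    finally:
        driver.quit()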
diff --git a/scrapy/selector/unified.py b/scrapy/selector/unified.py
index b8a3678..28d8ac9 100644
--- a/scrapy/selector/unified.py
+++ b/scrapy/selector/unified.py
@@ -46,6 +46,38 @@ def _response_from_text(text, st):
body=unicode_to_str(text, 'utf-8'))
+class SelectorContext(object):
+
diff --git a/scrapy/selector/unified.py b/scrapy/selector/unified.py
index b8a3678..6ce1de1 100644
--- a/scrapy/selector/unified.py
+++ b/scrapy/selector/unified.py
@@ -46,10 +46,40 @@ def _response_from_text(text, st):
body=unicode_to_str(text, 'utf-8'))
+import threading
+data = threading.local()
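The hunk ends after introducing the module-level `threading.local()`. As a standalone illustration of the semantics it relies on (each thread sees its own attributes on the shared `data` object; this example is not part of the patch):

import threading

data = threading.local()

def worker(tag):
    data.value = tag  # each thread writes to its own private slot
    print '{}: {}'.format(tag, data.value)

threads = [threading.Thread(target=worker, args=(t,)) for t in ('a', 'b')]
for t in threads:
    t.start()
for t in threads:
    t.join()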
from scrapy.spider import Spider


class SeveralNamesSpider(Spider):

    def start_requests(self):
        print 'name: {}, start_urls: {}'.format(self.name, self.start_urls)
        for name, start_urls in (('name1', ('url1',)), ('name2', ('url2',))):
            # Loop body elided in the original snippet.
            pass
from scrapy.spider import Spider
from scrapy.http import Request
from scrapy.item import Item


class CategorySpider(Spider):  # class declaration assumed; the snippet shows only the callbacks

    def parse_category(self, response):
        item = Item()
        item['category'] = get_category(response)
        for url in get_product_urls(response):
            # Copy the item so each product request carries its own instance.
            yield Request(url, callback=self.parse_product, meta={'item': item.copy()})

    def parse_product(self, response):
        item = response.meta.get('item', {})
        ...
        return item
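`get_category` and `get_product_urls` are not defined in the snippet. A minimal sketch of what they might look like, with hypothetical XPath expressions standing in for the real page structure:

def get_category(response):
    # Hypothetical markup: the category name sits in a breadcrumb element.
    return response.xpath('//div[@class="breadcrumb"]/text()').extract()[0]

def get_product_urls(response):
    # Hypothetical markup: product links carry a "product" class.
    return response.xpath('//a[@class="product"]/@href').extract()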
from scrapy.spider import Spider
from scrapy.http import Request


class StateSpider(Spider):
    name = 'state'

    def start_requests(self):
        print 'State:', getattr(self, 'state', None)
        yield Request('http://example.com')
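When the crawl runs with `-s JOBDIR=...`, the built-in SpiderState extension restores `spider.state` from the job directory at startup and pickles it back on close; without JOBDIR the attribute is never set, which is why the `getattr` above falls back to `None`. A callback like the following (an assumed addition inside the same class, not part of the original snippet) could persist a counter across runs:

    def parse(self, response):
        # `self.state` is a plain dict persisted in JOBDIR between runs.
        self.state['times_crawled'] = self.state.get('times_crawled', 0) + 1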
LOG 1
=====
scrapy crawl state -s JOBDIR=test
/home/scrapinghub/Devel/testspiders/testspiders/spiders/dummy.py:3: ScrapyDeprecationWarning: testspiders.spiders.dummy.DummySpider inherits from deprecated class scrapy.spider.BaseSpider, please inherit from scrapy.spider.Spider. (warning only on first subclass, there may be others)
class DummySpider(BaseSpider):
/home/scrapinghub/Devel/scrapy/scrapy/contrib/linkextractors/sgml.py:106: ScrapyDeprecationWarning: SgmlLinkExtractor is deprecated and will be removed in future releases. Please use scrapy.contrib.linkextractors.LinkExtractor
ScrapyDeprecationWarning
2014-08-21 14:30:41-0300 [scrapy] INFO: Scrapy 0.25.1 started (bot: testspiders)
2014-08-21 14:30:41-0300 [scrapy] INFO: Optional features available: ssl, http11, boto
2014-08-21 14:30:41-0300 [scrapy] INFO: Overridden settings: {'CLOSESPIDER_TIMEOUT': 3600, 'CLOSESPIDER_PAGECOUNT': 1000, 'SPIDER_MODULES': ['testspiders.spiders'], 'NEWSPIDER_MODULE': 'testspiders.spiders', 'BOT_NAME': 'testspiders'}
import collections
import json
from urllib import urlretrieve
from urlparse import urljoin
from csv import DictReader, reader as csv_reader

import scrapinghub
from project.settings import SH_APIKEY
from scrapy.spider import Spider
from scrapy import log


class DummySpider(Spider):
    name = "dummy"
    allowed_domains = ["example.com", "iana.org"]
    start_urls = (
        'http://www.example.com/',
    )
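The imports above (`scrapinghub`, `SH_APIKEY`, `urlretrieve`, `DictReader`) suggest the rest of the script pulls scraped items back from Scrapinghub, but that part is not shown. A rough sketch with the legacy `scrapinghub` client; the project id, spider filter, and overall flow are assumptions:

conn = scrapinghub.Connection(SH_APIKEY)
project = conn['12345']  # hypothetical project id
for job in project.jobs(spider='dummy'):
    for item in job.items():
        print json.dumps(item)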
def dictpath(dct, path):
    """Resolve dictpath

    >>> r = {'also_viewed': ['url1', 'url2']}
    >>> list(dictpath(r, 'also_viewed'))
    ['url1', 'url2']
    >>> r = {'related': [{'url': 'url1'}, {'url': 'url2'}]}
    >>> list(dictpath(r, 'related:url'))
    ['url1', 'url2']
    >>> r = {'related': [{'urls': ['url1', 'url2']}, {'urls': ['url3', 'url4']}]}
    >>> list(dictpath(r, 'related:urls'))
    ['url1', 'url2', 'url3', 'url4']
    """
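    # Body reconstructed as a sketch that satisfies the doctests above; ':' is
    # assumed to separate path segments, and lists are flattened along the way.
    key, _, rest = path.partition(':')
    values = dct[key]
    if not isinstance(values, list):
        values = [values]
    for value in values:
        if rest:
            for leaf in dictpath(value, rest):
                yield leaf
        else:
            yield value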