Skip to content

Instantly share code, notes, and snippets.

View nramirezuy's full-sized avatar

Nicolás Ramírez nramirezuy

  • Montevideo. Uruguay
View GitHub Profile
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
import sys
from twisted.internet import defer, reactor
from twisted.python import log
@defer.inlineCallbacks
def raise_typeerror():
try:
1 / 0
except:
""" Announcer Extension
This extension has the objective of logging useful information at startup.
Usage:
EXTENSIONS = {
'toolbox.extensions.announcer.AnnouncerExtension': 1,
}
@nramirezuy
nramirezuy / Spider code
Last active August 29, 2015 14:21
Boro, where is my traceback?
from scrapy.spider import Spider
class TestSpider(Spider):
    """Minimal spider whose parse callback always fails.

    Exists to demonstrate how the crawler surfaces a traceback when a
    callback raises during a crawl of the single start URL.
    """

    name = 'test'
    start_urls = ['http://example.com']

    def parse(self, response):
        # Fail unconditionally so the framework's error reporting is exercised.
        raise Exception
@nramirezuy
nramirezuy / output_test_dclass
Last active August 29, 2015 14:17
Item memory leak
Starting memory: 19332 (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
Ending memory: 19456 (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
Filename: test_dclass.py
Line # Mem usage Increment Line Contents
================================================
11 19.000 MiB 0.000 MiB class _DummyA(cls):
diff --git a/scrapy/commands/shell.py b/scrapy/commands/shell.py
index 7c07064..d57643a 100644
--- a/scrapy/commands/shell.py
+++ b/scrapy/commands/shell.py
@@ -16,7 +16,8 @@ from scrapy.utils.spider import spidercls_for_request, DefaultSpider
class Command(ScrapyCommand):
requires_project = False
- default_settings = {'KEEP_ALIVE': True, 'LOGSTATS_INTERVAL': 0}
+ default_settings = {'KEEP_ALIVE': True, 'LOGSTATS_INTERVAL': 0,
>>> fetch('http://example.com')
2015-03-18 17:41:40-0300 [default] DEBUG: Crawled (200) <GET http://example.com> (referer: None)
[s] Available Scrapy objects:
[s] crawler <scrapy.crawler.Crawler object at 0x18f6910>
[s] item {}
[s] request <GET http://example.com>
[s] response <200 http://example.com>
[s] settings <scrapy.settings.Settings object at 0x18f7f90>
[s] spider <DefaultSpider 'default' at 0x1d98450>
[s] Useful shortcuts:
from pprint import pprint
from time import sleep, time
from twisted.internet import defer, reactor
def stop():
    """Stop the reactor, but only if no delayed calls are still pending.

    If anything is still scheduled, this is a no-op (it does not
    reschedule itself).
    """
    pending = reactor.getDelayedCalls()
    if pending:
        return
    reactor.stop()


# Check once, shortly after startup, whether the reactor can be shut down.
reactor.callLater(0.1, stop)
scrapy/contrib/pipeline/images.py:112: if self.IMAGES_RESULT_FIELD in item.fields:
scrapy/contrib/pipeline/files.py:270: if self.FILES_RESULT_FIELD in item.fields:
scrapy/contrib/loader/__init__.py:122: value = self.item.fields[field_name].get(key, default)
scrapy/commands/parse.py:110: if isinstance(x, BaseItem):
scrapy/contracts/default.py:86: if isinstance(x, BaseItem):
scrapy/contrib/spiders/feed.py:129: if isinstance(ret, (BaseItem, Request)):
scrapy/contrib/exporter/__init__.py:243: if isinstance(value, BaseItem):
scrapy/contrib/loader/__init__.py:121: if isinstance(self.item, Item):
scrapy/core/scraper.py:177: elif isinstance(output, BaseItem):
def dictpath(dct, path):
"""Resolve dictpath
>>> r = {'also_viewed': ['url1', 'url2']}
>>> list(dictpath(r, 'also_viewed'))
['url1', 'url2']
>>> r = {'related': [{'url': 'url1'}, {'url': 'url2'}]}
>>> list(dictpath(r, 'related:url'))
['url1', 'url2']
>>> r = {'related': [{'urls': ['url1', 'url2']}, {'urls': ['url3', 'url4']}]}
>>> list(dictpath(r, 'related:urls'))