@pohzipohzi
Forked from kmike/ipython-scrapy.ipynb
Last active July 26, 2018 05:47
kmike's ipython-scrapy proof of concept, updated for scrapy 1.5
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# IPython-Scrapy\n",
"This notebook is a minimal proof-of-concept Scrapy-IPython integration.\n",
"\n",
"To try this notebook, create a 'tmp' subfolder (in the folder 'jupyter notebook' is executed from) and run `python -m http.server` from this 'tmp' folder."
]
},
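{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell below is an optional convenience: a sketch of that setup done from inside the notebook, assuming the notebook server was started in the current working directory and that port 8000 is free."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: create ./tmp and serve it on http://127.0.0.1:8000 in the background\n",
"# (a sketch; the original instructions assume you do this manually in a terminal).\n",
"import os\n",
"import subprocess\n",
"\n",
"os.makedirs('tmp', exist_ok=True)\n",
"server = subprocess.Popen(['python', '-m', 'http.server', '8000'], cwd='tmp')"
]
},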
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Code for downloading web pages via scrapy:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import optparse\n",
"import sys\n",
"\n",
"from twisted.internet import reactor, threads, defer\n",
"from twisted.python import threadable\n",
"from w3lib.url import any_to_uri\n",
"from threading import Thread\n",
"\n",
"from scrapy.commands import ScrapyCommand\n",
"from scrapy.crawler import Crawler, CrawlerProcess\n",
"from scrapy.exceptions import IgnoreRequest\n",
"from scrapy.http import Request\n",
"from scrapy.settings import Settings\n",
"from scrapy.spiders import Spider\n",
"from scrapy.utils.datatypes import SequenceExclude\n",
"from scrapy.utils.spider import spidercls_for_request, DefaultSpider\n",
"from scrapy.utils.project import get_project_settings\n",
"\n",
"class ShellObject(object):\n",
"\n",
" def __init__(self, crawler):\n",
" self.crawler = crawler\n",
" \n",
" def fetch(self, url, spider=None, redirect=True, **kwargs):\n",
" url = any_to_uri(url)\n",
" request = Request(url, dont_filter=True, **kwargs)\n",
" if redirect:\n",
" request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))\n",
" else:\n",
" request.meta['handle_httpstatus_all'] = True\n",
" response = None\n",
" try:\n",
" response, spider = threads.blockingCallFromThread(\n",
" reactor, self._schedule, request, spider)\n",
" except IgnoreRequest:\n",
" pass\n",
" return response\n",
"\n",
" def _schedule(self, request, spider):\n",
" spider = self._open_spider(request, spider)\n",
" d = _request_deferred(request)\n",
" d.addCallback(lambda x: (x, spider))\n",
" self.crawler.engine.crawl(request, spider)\n",
" return d\n",
"\n",
" def _open_spider(self, request, spider):\n",
" spider = self.crawler.spider or self.crawler._create_spider()\n",
" self.crawler.spider = spider\n",
" self.crawler.engine.open_spider(spider, close_if_idle=False)\n",
" self.spider = spider\n",
" return spider\n",
" \n",
"class ShellCommand(ScrapyCommand):\n",
"\n",
" default_settings = {\n",
" 'KEEP_ALIVE': True,\n",
" 'LOGSTATS_INTERVAL': 0,\n",
" 'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',\n",
" }\n",
"\n",
" def add_options(self, parser):\n",
" ScrapyCommand.add_options(self, parser)\n",
" parser.add_option(\"-c\", dest=\"code\",\n",
" help=\"evaluate the code in the shell, print the result and exit\")\n",
" parser.add_option(\"--spider\", dest=\"spider\",\n",
" help=\"use this spider\")\n",
" parser.add_option(\"--no-redirect\", dest=\"no_redirect\", action=\"store_true\", \\\n",
" default=False, help=\"do not handle HTTP 3xx status codes and print response as-is\")\n",
"\n",
" def run(self, args, opts):\n",
" url = args[0]\n",
" spider_loader = self.crawler_process.spider_loader\n",
" spidercls = DefaultSpider\n",
" spidercls = spidercls_for_request(spider_loader, Request(url), spidercls, log_multiple=True)\n",
" crawler = self.crawler_process._create_crawler(spidercls)\n",
" crawler.engine = crawler._create_engine()\n",
" crawler.engine.start()\n",
" self._start_crawler_thread()\n",
" shell = ShellObject(crawler)\n",
" return shell.fetch(url=url, redirect=not opts.no_redirect)\n",
" \n",
" def _start_crawler_thread(self):\n",
" t = Thread(target=self.crawler_process.start,\n",
" kwargs={'stop_after_crawl': False})\n",
" t.daemon = True\n",
" t.start()\n",
"\n",
"def _request_deferred(request):\n",
" \n",
" request_callback = request.callback\n",
" request_errback = request.errback\n",
" \n",
" def _restore_callbacks(result):\n",
" request.callback = request_callback\n",
" request.errback = request_errback\n",
" return result\n",
"\n",
" d = defer.Deferred()\n",
" d.addBoth(_restore_callbacks)\n",
" if request.callback:\n",
" d.addCallbacks(request.callback, request.errback)\n",
"\n",
" request.callback, request.errback = d.callback, d.errback\n",
" return d\n",
"\n",
"def download(url):\n",
" settings = get_project_settings()\n",
" parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \\\n",
" conflict_handler='resolve')\n",
" cmd = ShellCommand()\n",
" settings.setdict(cmd.default_settings, priority='command')\n",
" cmd.settings = settings\n",
" cmd.add_options(parser)\n",
" opts, args = parser.parse_args(args=[url])\n",
" cmd.process_options(args, opts)\n",
" cmd.crawler_process = CrawlerProcess(settings)\n",
" return cmd.run(args, opts)"
]
},
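{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once the cell above has been run, `download()` can be called directly. The cell below is a minimal usage sketch (the URL is only an example); it fetches a page and inspects the returned `Response`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Example usage sketch: fetch a page and look at the response object.\n",
"response = download('http://example.com')\n",
"print(response.status, response.url)\n",
"print(response.xpath('//title/text()').extract_first())"
]
},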
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Code for highlighting XPaths and displaying HTML in IPython cells:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from IPython import display\n",
"from scrapy import Selector\n",
"import lxml.html\n",
"import os\n",
"\n",
"TMP_DIR = './tmp'\n",
"\n",
"def _show_in_iframe(local_url):\n",
" fname = os.path.join(TMP_DIR, 'output.html')\n",
" html = \"\"\"<html><body>\n",
" <p><input type='button' value='Do we need'> <input type='button' value='some UI controls?'></p>\n",
" <hr>\n",
" <iframe style='width:800px; height:600px;' src=\"%s\"></iframe>\n",
" </body></html>\"\"\" % local_url\n",
" display.display(display.HTML(html))\n",
" \n",
"def show_in_iframe(html):\n",
" fname = os.path.join(TMP_DIR, 'output.html')\n",
" with open(fname, 'wb') as f:\n",
" f.write(html) \n",
" _show_in_iframe('http://127.0.0.1:8000/output.html')\n",
"\n",
"def _highlight(hxs):\n",
" el = hxs.root\n",
" el.attrib['style'] = 'background-color: yellow;' + el.get('style', '') \n",
" \n",
"def show_hxs_select(hxs, xpath):\n",
" for link in hxs.xpath(xpath):\n",
" _highlight(link)\n",
" \n",
" body = lxml.html.tostring(hxs.root.getroottree())\n",
" show_in_iframe(body)\n",
"\n",
"def show_xpath(url, xpath):\n",
" response = download(url)\n",
" hxs = Selector(response)\n",
" show_hxs_select(hxs, xpath)"
]
},
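{
"cell_type": "markdown",
"metadata": {},
"source": [
"`show_xpath` can also be called by hand, without the widget defined below. A small sketch (URL and XPath are only examples; it needs the `http.server` running in `tmp`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Example usage sketch: download a page, highlight every <a> element and render it.\n",
"show_xpath('http://example.com', '//a')"
]
},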
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Usage with ipywidget"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "13436f1b85464c8f88433a3abacaab4f",
"version_major": 2,
"version_minor": 0
},
"text/html": [
"<p>Failed to display Jupyter Widget of type <code>VBox</code>.</p>\n",
"<p>\n",
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
" that the widgets JavaScript is still loading. If this message persists, it\n",
" likely means that the widgets JavaScript library is either not installed or\n",
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
" Widgets Documentation</a> for setup instructions.\n",
"</p>\n",
"<p>\n",
" If you're reading this message in another frontend (for example, a static\n",
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
" it may mean that your frontend doesn't currently support widgets.\n",
"</p>\n"
],
"text/plain": [
"VBox(children=(Text(value='http://crawlera.com', description='URL', placeholder='eg. http://crawlera.com'), Text(value='//a[contains(text(), \"i\")]', description='Selector', placeholder='eg. //a[contains(text(), \"i\")]'), Button(description='show_xpath', style=ButtonStyle())))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2018-01-13 23:10:33 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: scrapybot)\n",
"2018-01-13 23:10:33 [scrapy.utils.log] INFO: Versions: lxml 4.1.1.0, libxml2 2.9.7, cssselect 1.0.1, parsel 1.2.0, w3lib 1.18.0, Twisted 17.9.0, Python 3.5.2 (default, Nov 23 2017, 16:37:01) - [GCC 5.4.0 20160609], pyOpenSSL 17.5.0 (OpenSSL 1.1.0g 2 Nov 2017), cryptography 2.1.4, Platform Linux-4.4.0-104-generic-x86_64-with-Ubuntu-16.04-xenial\n",
"2018-01-13 23:10:33 [scrapy.crawler] INFO: Overridden settings: {'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter', 'LOGSTATS_INTERVAL': 0}\n",
"2018-01-13 23:10:33 [scrapy.middleware] INFO: Enabled extensions:\n",
"['scrapy.extensions.memusage.MemoryUsage',\n",
" 'scrapy.extensions.corestats.CoreStats',\n",
" 'scrapy.extensions.telnet.TelnetConsole']\n",
"2018-01-13 23:10:33 [scrapy.middleware] INFO: Enabled downloader middlewares:\n",
"['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',\n",
" 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',\n",
" 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',\n",
" 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',\n",
" 'scrapy.downloadermiddlewares.retry.RetryMiddleware',\n",
" 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',\n",
" 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',\n",
" 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',\n",
" 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',\n",
" 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',\n",
" 'scrapy.downloadermiddlewares.stats.DownloaderStats']\n",
"2018-01-13 23:10:33 [scrapy.middleware] INFO: Enabled spider middlewares:\n",
"['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',\n",
" 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',\n",
" 'scrapy.spidermiddlewares.referer.RefererMiddleware',\n",
" 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',\n",
" 'scrapy.spidermiddlewares.depth.DepthMiddleware']\n",
"2018-01-13 23:10:33 [scrapy.middleware] INFO: Enabled item pipelines:\n",
"[]\n",
"2018-01-13 23:10:33 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6032\n",
"Exception in thread Thread-13:\n",
"Traceback (most recent call last):\n",
" File \"/usr/lib/python3.5/threading.py\", line 914, in _bootstrap_inner\n",
" self.run()\n",
" File \"/usr/lib/python3.5/threading.py\", line 862, in run\n",
" self._target(*self._args, **self._kwargs)\n",
" File \"/home/poh/Desktop/opensrc/scrapy/scrapy/crawler.py\", line 291, in start\n",
" reactor.run(installSignalHandlers=False) # blocking call\n",
" File \"/home/poh/Desktop/opensrc/scrapy/venv/lib/python3.5/site-packages/twisted/internet/base.py\", line 1242, in run\n",
" self.startRunning(installSignalHandlers=installSignalHandlers)\n",
" File \"/home/poh/Desktop/opensrc/scrapy/venv/lib/python3.5/site-packages/twisted/internet/base.py\", line 1222, in startRunning\n",
" ReactorBase.startRunning(self)\n",
" File \"/home/poh/Desktop/opensrc/scrapy/venv/lib/python3.5/site-packages/twisted/internet/base.py\", line 728, in startRunning\n",
" raise error.ReactorAlreadyRunning()\n",
"twisted.internet.error.ReactorAlreadyRunning\n",
"\n",
"2018-01-13 23:10:33 [scrapy.core.engine] INFO: Spider opened\n",
"2018-01-13 23:10:33 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://crawlera.com/> from <GET http://crawlera.com>\n",
"2018-01-13 23:10:33 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://scrapinghub.com/crawlera/> from <GET https://crawlera.com/>\n",
"2018-01-13 23:10:35 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://scrapinghub.com/crawlera> from <GET https://scrapinghub.com/crawlera/>\n",
"2018-01-13 23:10:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://scrapinghub.com/crawlera> (referer: None)\n"
]
},
{
"data": {
"text/html": [
"<html><body>\n",
" <p><input type='button' value='Do we need'> <input type='button' value='some UI controls?'></p>\n",
" <hr>\n",
" <iframe style='width:800px; height:600px;' src=\"http://127.0.0.1:8000/output.html\"></iframe>\n",
" </body></html>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import ipywidgets as widgets\n",
"import traitlets\n",
"from IPython import display\n",
"\n",
"def display_widget():\n",
" t1 = widgets.Text(\n",
" description='URL',\n",
" placeholder='eg. http://crawlera.com',\n",
" value='http://crawlera.com',\n",
" )\n",
" t2 = widgets.Text(\n",
" description='Selector',\n",
" placeholder='eg. //a[contains(text(), \"i\")]',\n",
" value='//a[contains(text(), \"i\")]',\n",
" )\n",
" b = widgets.Button(\n",
" description='show_xpath',\n",
" )\n",
" b.on_click(lambda x: show_xpath(t1.value,t2.value))\n",
" d = widgets.VBox([t1,t2,b])\n",
" display.display(d)\n",
" \n",
"display_widget()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "venv"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}