@aes512 — forked from vatsalj/ipython-scrapy.ipynb, created March 21, 2016
Updated kmike's gist to remove deprecated parts such as BaseSpider, the item_passed signal, and crawler.install(). Works with Scrapy 0.23.0.
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#IPython-Scrapy\n",
"\n",
"This notebook is a minimal proof-of-concept Scrapy-IPython integration.\n",
"\n",
"To try this notebook, create a 'tmp' subfolder (in the folder 'ipython notebook' is executed from) and run\n",
"\n",
" python -m SimpleHTTPServer\n",
" \n",
"from this 'tmp' folder."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Code for downloading webpages via Scrapy:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function\n",
"import os\n",
"import sys\n",
"import multiprocessing\n",
"from multiprocessing.queues import Queue\n",
"import lxml.etree\n",
"import lxml.html\n",
"from scrapy import project, signals\n",
"from scrapy.spider import Spider\n",
"from scrapy.item import Item, Field\n",
"from scrapy.crawler import CrawlerProcess\n",
"from scrapy.xlib.pydispatch import dispatcher\n",
"from scrapy.utils.project import get_project_settings\n",
"from scrapy.http import Request\n",
"from scrapy.selector import XPathSelector, XmlXPathSelector, HtmlXPathSelector\n",
"\n",
"TMP_DIR = './tmp'\n",
"\n",
"class ResponseItem(Item):\n",
" response = Field()\n",
"\n",
"class ResponseSpider(Spider):\n",
" name = 'response_spider'\n",
" \n",
" def __init__(self, url):\n",
" self.url = url\n",
" super(ResponseSpider, self).__init__()\n",
" \n",
" def start_requests(self):\n",
" return [Request(self.url, self.parse, dont_filter=True)]\n",
" \n",
" def parse(self, response):\n",
" # request with callback fails to serialize - why?\n",
" req = response.request.replace(callback=None)\n",
" return ResponseItem(\n",
" response=response.replace(request=req),\n",
" )\n",
" \n",
" \n",
"class CrawlerWorker(multiprocessing.Process):\n",
" def __init__(self, result_queue, spider, settings=None):\n",
" multiprocessing.Process.__init__(self)\n",
" self.settings = settings or get_project_settings()\n",
" self.result_queue = result_queue\n",
" self.spider = spider\n",
" self.items = []\n",
" dispatcher.connect(self._item_scraped, signals.item_scraped)\n",
" \n",
" def _item_scraped(self, item):\n",
" self.items.append(item)\n",
" \n",
" def run(self):\n",
" self.crawler = CrawlerProcess(self.settings)\n",
" self.crawler = self.crawler_process.create_crawler()\n",
" self.crawler.crawl(self.spider)\n",
" self.crawler.start() \n",
" self.crawler.stop()\n",
" self.result_queue.put(self.items)\n",
" \n",
"\n",
"def _download(url):\n",
" result_queue = Queue()\n",
" spider = ResponseSpider(url)\n",
" crawler = CrawlerWorker(result_queue, spider)\n",
" crawler.start() \n",
" item = result_queue.get()[0]\n",
" result_queue.cancel_join_thread()\n",
" crawler.join()\n",
" return item['response']\n",
"\n",
"def set_base(body, base):\n",
" if '<base' not in body:\n",
" body = body.replace('<head>', '<head><base href=\"%s\">' % base)\n",
" return body\n",
"\n",
"def download(url):\n",
" \"\"\"\n",
" Download 'url' using Scrapy. Return Response.\n",
" \"\"\"\n",
" response = _download(url)\n",
" return response.replace(body=set_base(response.body, url))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
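{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick smoke test of the downloader. This is a minimal sketch, not part of the original gist: 'http://example.com' is a placeholder URL, and each call to download() spawns a fresh crawl subprocess."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# hypothetical usage sketch: fetch one page and inspect the Response\n",
"response = download('http://example.com')\n",
"print(response.status)\n",
"print(response.body[:200])"
],
"language": "python",
"metadata": {},
"outputs": []
},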
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Code for highlighting XPaths and displaying HTML in IPython cells:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from IPython import display\n",
"\n",
"def _show_in_iframe(local_url):\n",
" fname = os.path.join(TMP_DIR, 'output.html')\n",
" html = \"\"\"<html><body>\n",
" <p><input type='button' value='Do we need'> <input type='button' value='some UI controls?'></p>\n",
" <hr>\n",
" <iframe style='width:800px; height:600px;' src=\"%s\"></iframe>\n",
" </body></html>\"\"\" % local_url\n",
" display.display(display.HTML(html))\n",
"\n",
"\n",
"def show_in_iframe(html):\n",
" fname = os.path.join(TMP_DIR, 'output.html')\n",
" with open(fname, 'wb') as f: \n",
" f.write(html) \n",
" _show_in_iframe('http://127.0.0.1:8000/output.html')\n",
" \n",
"\n",
"def _highlight(hxs):\n",
" el = hxs._root\n",
" el.attrib['style'] = 'background-color: yellow;' + el.get('style', '') \n",
"\n",
"\n",
"def show_hxs_select(hxs, xpath):\n",
" for link in hxs.select(xpath):\n",
" _highlight(link)\n",
" \n",
" body = lxml.html.tostring(hxs._root.getroottree())\n",
" show_in_iframe(body)\n",
"\n",
" \n",
"def show_xpath(url, xpath):\n",
" response = download(url)\n",
" hxs = HtmlXPathSelector(response)\n",
" show_hxs_select(hxs, xpath)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 13
},
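{
"cell_type": "markdown",
"metadata": {},
"source": [
"The display helpers can also be exercised without crawling. A minimal sketch (not in the original gist), assuming the SimpleHTTPServer from the setup step is running inside './tmp':"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# hypothetical sketch: push a hand-written page through the iframe pipeline\n",
"show_in_iframe(\"<html><head></head><body><p style='background-color: yellow;'>highlighted</p></body></html>\")"
],
"language": "python",
"metadata": {},
"outputs": []
},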
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Usage example:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"show_xpath('http://crawlera.com', '//a[contains(text(), \"i\")]')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<html><body>\n",
" <p><input type='button' value='Do we need'> <input type='button' value='some UI controls?'></p>\n",
" <hr>\n",
" <iframe style='width:800px; height:600px;' src=\"http://127.0.0.1:8000/output.html\"></iframe>\n",
" </body></html>"
],
"metadata": {},
"output_type": "display_data",
"text": [
"<IPython.core.display.HTML at 0x112518890>"
]
}
],
"prompt_number": 19
},
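{
"cell_type": "markdown",
"metadata": {},
"source": [
"Any XPath expression works the same way; a hypothetical variant (assumes example.com is reachable) that highlights every link on the page:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"show_xpath('http://example.com', '//a')"
],
"language": "python",
"metadata": {},
"outputs": []
},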
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}