Last active
December 28, 2015 17:59
-
-
Save pjob/7539554 to your computer and use it in GitHub Desktop.
DataPhilly - Scrapy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "DataPhilly - Scrapy" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": "Scrapy Demo: Tutorial http://doc.scrapy.org/en/latest/intro/tutorial.html" | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 4, | |
"metadata": {}, | |
"source": "Create spider" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "!scrapy startproject tutorial", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "!ls -R tutorial", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "scrapy.cfg \u001b[34mtutorial\u001b[m\u001b[m\r\n\r\ntutorial/tutorial:\r\n__init__.py items.py pipelines.py settings.py \u001b[34mspiders\u001b[m\u001b[m\r\n\r\ntutorial/tutorial/spiders:\r\n__init__.py\r\n" | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 4, | |
"metadata": {}, | |
"source": "Create our item to store title, link, description" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "%%writefile tutorial/tutorial/items.py\n\nfrom scrapy.item import Item, Field\n\nclass DmozItem(Item):\n title = Field()\n link = Field()\n desc = Field()", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "Overwriting tutorial/tutorial/items.py\n" | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 4, | |
"metadata": {}, | |
"source": "Create our BaseSpider and start with\nhttp://www.dmoz.org/Computers/Programming/Languages/Python/Books/\nhttp://www.dmoz.org/Computers/Programming/Languages/Python/Resources/\n an" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "%%writefile tutorial/tutorial/spiders/dmoz_spider.py\n\nfrom scrapy.spider import BaseSpider\n\nclass DmozSpider(BaseSpider):\n name = \"dmoz\"\n allowed_domains = [\"dmoz.org\"]\n start_urls = [\n \"http://www.dmoz.org/Computers/Programming/Languages/Python/Books/\",\n \"http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/\"\n ]\n\n def parse(self, response):\n filename = response.url.split(\"/\")[-2]\n open(filename, 'wb').write(response.body)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "Writing tutorial/tutorial/spiders/dmoz_spider.py\n" | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 4, | |
"metadata": {}, | |
"source": "Run the project " | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "! cd tutorial/; scrapy crawl dmoz", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:15:39-0500 [scrapy] INFO: Scrapy 0.20.0 started (bot: tutorial)\r\n2013-11-18 22:15:39-0500 [scrapy] DEBUG: Optional features available: ssl, http11\r\n2013-11-18 22:15:39-0500 [scrapy] DEBUG: Overridden settings: {'NEWSPIDER_MODULE': 'tutorial.spiders', 'SPIDER_MODULES': ['tutorial.spiders'], 'BOT_NAME': 'tutorial'}\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:15:39-0500 [scrapy] DEBUG: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:15:40-0500 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats\r\n2013-11-18 22:15:40-0500 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware\r\n2013-11-18 22:15:40-0500 [scrapy] DEBUG: Enabled item pipelines: \r\n2013-11-18 22:15:40-0500 [dmoz] INFO: Spider opened\r\n2013-11-18 22:15:40-0500 [dmoz] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)\r\n2013-11-18 22:15:40-0500 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6023\r\n2013-11-18 22:15:40-0500 [scrapy] DEBUG: Web service listening on 0.0.0.0:6080\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:15:40-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/> (referer: None)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:15:40-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> (referer: None)\r\n2013-11-18 22:15:40-0500 [dmoz] INFO: Closing spider (finished)\r\n2013-11-18 22:15:40-0500 [dmoz] INFO: Dumping Scrapy stats:\r\n\t{'downloader/request_bytes': 530,\r\n\t 'downloader/request_count': 2,\r\n\t 'downloader/request_method_count/GET': 2,\r\n\t 'downloader/response_bytes': 14892,\r\n\t 'downloader/response_count': 2,\r\n\t 'downloader/response_status_count/200': 2,\r\n\t 'finish_reason': 'finished',\r\n\t 'finish_time': datetime.datetime(2013, 11, 19, 3, 15, 40, 260043),\r\n\t 'log_count/DEBUG': 8,\r\n\t 'log_count/INFO': 3,\r\n\t 'response_received_count': 2,\r\n\t 'scheduler/dequeued': 2,\r\n\t 'scheduler/dequeued/memory': 2,\r\n\t 'scheduler/enqueued': 2,\r\n\t 'scheduler/enqueued/memory': 2,\r\n\t 'start_time': datetime.datetime(2013, 11, 19, 3, 15, 40, 39752)}\r\n2013-11-18 22:15:40-0500 [dmoz] INFO: Spider closed (finished)\r\n" | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 4, | |
"metadata": {}, | |
"source": "Select title, link, and description using css selectors" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "%%writefile tutorial/tutorial/spiders/dmoz_spider.py\n\nfrom scrapy.spider import BaseSpider\nfrom scrapy.selector import Selector\n\nclass DmozSpider(BaseSpider):\n name = \"dmoz\"\n allowed_domains = [\"dmoz.org\"]\n start_urls = [\n \"http://www.dmoz.org/Computers/Programming/Languages/Python/Books/\",\n \"http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/\"\n ]\n\n def parse(self, response):\n sel = Selector(response)\n sites = sel.xpath('//ul/li')\n for site in sites:\n title = site.xpath('a/text()').extract()\n link = site.xpath('a/@href').extract()\n desc = site.xpath('text()').extract()\n print title, link, desc", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "Overwriting tutorial/tutorial/spiders/dmoz_spider.py\n" | |
} | |
], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "! cd tutorial/; scrapy crawl dmoz", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:15:45-0500 [scrapy] INFO: Scrapy 0.20.0 started (bot: tutorial)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:15:45-0500 [scrapy] DEBUG: Optional features available: ssl, http11\r\n2013-11-18 22:15:45-0500 [scrapy] DEBUG: Overridden settings: {'NEWSPIDER_MODULE': 'tutorial.spiders', 'SPIDER_MODULES': ['tutorial.spiders'], 'BOT_NAME': 'tutorial'}\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:15:45-0500 [scrapy] DEBUG: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:15:45-0500 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats\r\n2013-11-18 22:15:45-0500 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware\r\n2013-11-18 22:15:45-0500 [scrapy] DEBUG: Enabled item pipelines: \r\n2013-11-18 22:15:45-0500 [dmoz] INFO: Spider opened\r\n2013-11-18 22:15:45-0500 [dmoz] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)\r\n2013-11-18 22:15:45-0500 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6023\r\n2013-11-18 22:15:45-0500 [scrapy] DEBUG: Web service listening on 0.0.0.0:6080\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:15:45-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/> (referer: None)\r\n[u'Top'] [u'/'] [u'\\r\\n\\r\\n ']\r\n[u'Computers'] [u'/Computers/'] []\r\n[u'Programming'] [u'/Computers/Programming/'] []\r\n[u'Languages'] [u'/Computers/Programming/Languages/'] []\r\n[u'Python'] [u'/Computers/Programming/Languages/Python/'] []\r\n[] [] [u'\\r\\n ', u'\\xa0', u'\\r\\n ']\r\n[u'Computers: Programming: Resources'] [u'/Computers/Programming/Resources/'] [u'\\r\\n ', u' \\r\\n ', u'\\r\\n ']\r\n[u\"eff-bot's Daily Python URL\"] [u'http://www.pythonware.com/daily/'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - Contains links to assorted resources from the Python universe, compiled by PythonWare.\\r\\n \\r\\n ']\r\n[u'Free Python and Zope Hosting Directory'] [u'http://www.oinko.net/freepython/'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - A directory of free Python and Zope hosting providers, with reviews and ratings.\\r\\n \\r\\n ']\r\n[u\"O'Reilly Python Center\"] [u'http://oreilly.com/python/'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - Features Python books, resources, news and articles.\\r\\n \\r\\n ']\r\n[u\"Python Developer's Guide\"] [u'http://www.python.org/dev/'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - Resources for reporting bugs, accessing the Python source tree with CVS and taking part in the development of Python.\\r\\n \\r\\n ']\r\n[u'Social Bug'] [u'http://win32com.goermezer.de/'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - Scripts, examples and news about Python programming for the Windows platform.\\r\\n \\r\\n ']\r\n2013-11-18 22:15:45-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> (referer: None)\r\n[u'Top'] [u'/'] [u'\\r\\n\\r\\n ']\r\n[u'Computers'] [u'/Computers/'] []\r\n[u'Programming'] [u'/Computers/Programming/'] []\r\n[u'Languages'] [u'/Computers/Programming/Languages/'] []\r\n[u'Python'] [u'/Computers/Programming/Languages/Python/'] []\r\n[] [] [u'\\r\\n ', u'\\xa0', u'\\r\\n ']\r\n[u'Computers: Programming: Languages: Python: Resources'] [u'/Computers/Programming/Languages/Python/Resources/'] [u'\\r\\n ', u' \\r\\n ', u'\\r\\n ']\r\n[u'Computers: Programming: Languages: Ruby: Books'] [u'/Computers/Programming/Languages/Ruby/Books/'] [u'\\r\\n ', u' \\r\\n ', u'\\r\\n ']\r\n[u'German'] [u'/World/Deutsch/Computer/Programmieren/Sprachen/Python/B%C3%BCcher/'] [u'\\r\\n \\t', u'\\r\\n ', u'\\r\\n\\t\\t\\t\\t\\t']\r\n[u'Russian'] [u'/World/Russian/%D0%9A%D0%BE%D0%BC%D0%BF%D1%8C%D1%8E%D1%82%D0%B5%D1%80%D1%8B/%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5/%D0%AF%D0%B7%D1%8B%D0%BA%D0%B8/Python/%D0%9A%D0%BD%D0%B8%D0%B3%D0%B8/'] [u'\\r\\n \\t', u'\\r\\n ', u'\\r\\n\\t\\t\\t\\t\\t']\r\n[u'Core Python Programming'] [u'http://www.pearsonhighered.com/educator/academic/product/0,,0130260363,00%2Ben-USS_01DBC.html'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - By Wesley J. Chun; Prentice Hall PTR, 2001, ISBN 0130260363. For experienced developers to improve extant skills; professional level examples. Starts by introducing syntax, objects, error handling, functions, classes, built-ins. [Prentice Hall]\\r\\n \\r\\n ']\r\n[u'Data Structures and Algorithms with Object-Oriented Design Patterns in Python'] [u'http://www.brpreiss.com/books/opus7/html/book.html'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - The primary goal of this book is to promote object-oriented design using Python and to illustrate the use of the emerging object-oriented design patterns.\\r\\nA secondary goal of the book is to present mathematical tools just in time. Analysis techniques and proofs are presented as needed and in the proper context.\\r\\n \\r\\n ']\r\n[u'Dive Into Python 3'] [u'http://www.diveintopython.net/'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - By Mark Pilgrim, Guide to Python 3 and its differences from Python 2. Each chapter starts with a real code sample and explains it fully. Has a comprehensive appendix of all the syntactic and semantic changes in Python 3\\r\\n\\r\\n\\r\\n \\r\\n ']\r\n[u'Foundations of Python Network Programming'] [u'http://rhodesmill.org/brandon/2011/foundations-of-python-network-programming/'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - This book covers a wide range of topics. From raw TCP and UDP to encryption with TSL, and then to HTTP, SMTP, POP, IMAP, and ssh. It gives you a good understanding of each field and how to do everything on the network with Python.\\r\\n \\r\\n ']\r\n[u'Free Python books'] [u'http://www.techbooksforfree.com/perlpython.shtml'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - Free Python books and tutorials.\\r\\n \\r\\n ']\r\n[u'FreeTechBooks: Python Scripting Language'] [u'http://www.freetechbooks.com/python-f6.html'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - Annotated list of free online books on Python scripting language. Topics range from beginner to advanced.\\r\\n \\r\\n ']\r\n[u'How to Think Like a Computer Scientist: Learning with Python'] [u'http://greenteapress.com/thinkpython/'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - By Allen B. Downey, Jeffrey Elkner, Chris Meyers; Green Tea Press, 2002, ISBN 0971677506. Teaches general principles of programming, via Python as subject language. Thorough, in-depth approach to many basic and intermediate programming topics. Full text online and downloads: HTML, PDF, PS, LaTeX. [Free, Green Tea Press]\\r\\n \\r\\n ']\r\n[u'An Introduction to Python'] [u'http://www.network-theory.co.uk/python/intro/'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - By Guido van Rossum, Fred L. Drake, Jr.; Network Theory Ltd., 2003, ISBN 0954161769. Printed edition of official tutorial, for v2.x, from Python.org. [Network Theory, online]\\r\\n \\r\\n ']\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "[u'Learn to Program Using Python'] [u'http://www.freenetpages.co.uk/hp/alan.gauld/'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - Book by Alan Gauld with full text online. Introduction for those learning programming basics: terminology, concepts, methods to write code. Assumes no prior knowledge but basic computer skills.\\r\\n \\r\\n ']\r\n[u'Making Use of Python'] [u'http://www.wiley.com/WileyCDA/WileyTitle/productCd-0471219754.html'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - By Rashi Gupta; John Wiley and Sons, 2002, ISBN 0471219754. Covers language basics, use for CGI scripting, GUI development, network programming; shows why it is one of more sophisticated of popular scripting languages. [Wiley]\\r\\n \\r\\n ']\r\n[u'Practical Python'] [u'http://hetland.org/writing/practical-python/'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - By Magnus Lie Hetland; Apress LP, 2002, ISBN 1590590066. Readable guide to ideas most vital to new users, from basics common to high level languages, to more specific aspects, to a series of 10 ever more complex programs. [Apress]\\r\\n \\r\\n ']\r\n[u'Pro Python System Administration'] [u'http://www.sysadminpy.com/'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - By Rytis Sileika, ISBN13: 978-1-4302-2605-5, Uses real-world system administration examples like manage devices with SNMP and SOAP, build a distributed monitoring system, manage web applications and parse complex log files, monitor and manage MySQL databases.\\r\\n\\r\\n \\r\\n ']\r\n[u'Programming in Python 3 (Second Edition)'] [u'http://www.qtrac.eu/py3book.html'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - A Complete Introduction to the Python 3.\\r\\n \\r\\n ']\r\n[u'Python 2.1 Bible'] [u'http://www.wiley.com/WileyCDA/WileyTitle/productCd-0764548077.html'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - By Dave Brueck, Stephen Tanner; John Wiley and Sons, 2001, ISBN 0764548077. Full coverage, clear explanations, hands-on examples, full language reference; shows step by step how to use components, assemble them, form full-featured programs. [John Wiley and Sons]\\r\\n \\r\\n ']\r\n[u'Python 3 Object Oriented Programming'] [u'https://www.packtpub.com/python-3-object-oriented-programming/book'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - A step-by-step tutorial for OOP in Python 3, including discussion and examples of abstraction, encapsulation, information hiding, and raise, handle, define, and manipulate exceptions.\\r\\n \\r\\n ']\r\n[u'Python Language Reference Manual'] [u'http://www.network-theory.co.uk/python/language/'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - By Guido van Rossum, Fred L. Drake, Jr.; Network Theory Ltd., 2003, ISBN 0954161785. Printed edition of official language reference, for v2.x, from Python.org, describes syntax, built-in datatypes. [Network Theory, online]\\r\\n \\r\\n ']\r\n[u'Python Programming Patterns'] [u'http://www.pearsonhighered.com/educator/academic/product/0,,0130409561,00%2Ben-USS_01DBC.html'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - By Thomas W. Christopher; Prentice Hall PTR, 2002, ISBN 0130409561. Shows how to write large programs, introduces powerful design patterns that deliver high levels of robustness, scalability, reuse.\\r\\n \\r\\n ']\r\n[u'Python Programming with the Java Class Libraries: A Tutorial for Building Web and Enterprise Applications with Jython'] [u'http://www.informit.com/store/product.aspx?isbn=0201616165&redir=1'] [u'\\r\\n\\t\\t\\t\\r\\n ', u\" \\r\\n\\t\\t\\t\\r\\n - By Richard Hightower; Addison-Wesley, 2002, 0201616165. Begins with Python basics, many exercises, interactive sessions. Shows programming novices concepts and practical methods. Shows programming experts Python's abilities and ways to interface with Java APIs. [publisher website]\\r\\n \\r\\n \"]\r\n[u'Python: Visual QuickStart Guide'] [u'http://www.pearsonhighered.com/educator/academic/product/0,,0201748843,00%2Ben-USS_01DBC.html'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - By Chris Fehily; Peachpit Press, 2002, ISBN 0201748843. Task-based, step-by-step visual reference guide, many screen shots, for courses in digital graphics; Web design, scripting, development; multimedia, page layout, office tools, operating systems. [Prentice Hall]\\r\\n \\r\\n ']\r\n[u'Sams Teach Yourself Python in 24 Hours'] [u'http://www.informit.com/store/product.aspx?isbn=0672317354'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - By Ivan Van Laningham; Sams Publishing, 2000, ISBN 0672317354. Split into 24 hands-on, 1 hour lessons; steps needed to learn topic: syntax, language features, OO design and programming, GUIs (Tkinter), system administration, CGI. [Sams Publishing]\\r\\n \\r\\n ']\r\n[u'Text Processing in Python'] [u'http://gnosis.cx/TPiP/'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - By David Mertz; Addison Wesley. Book in progress, full text, ASCII format. Asks for feedback. [author website, Gnosis Software, Inc.]\\r\\n \\r\\n ']\r\n[u'XML Processing with Python'] [u'http://www.informit.com/store/product.aspx?isbn=0130211192'] [u'\\r\\n\\t\\t\\t\\r\\n ', u' \\r\\n\\t\\t\\t\\r\\n - By Sean McGrath; Prentice Hall PTR, 2000, ISBN 0130211192, has CD-ROM. Methods to build XML applications fast, Python tutorial, DOM and SAX, new Pyxie open source XML processing library. [Prentice Hall PTR]\\r\\n \\r\\n ']\r\n2013-11-18 22:15:45-0500 [dmoz] INFO: Closing spider (finished)\r\n2013-11-18 22:15:45-0500 [dmoz] INFO: Dumping Scrapy stats:\r\n\t{'downloader/request_bytes': 530,\r\n\t 'downloader/request_count': 2,\r\n\t 'downloader/request_method_count/GET': 2,\r\n\t 'downloader/response_bytes': 14892,\r\n\t 'downloader/response_count': 2,\r\n\t 'downloader/response_status_count/200': 2,\r\n\t 'finish_reason': 'finished',\r\n\t 'finish_time': datetime.datetime(2013, 11, 19, 3, 15, 45, 348844),\r\n\t 'log_count/DEBUG': 8,\r\n\t 'log_count/INFO': 3,\r\n\t 'response_received_count': 2,\r\n\t 'scheduler/dequeued': 2,\r\n\t 'scheduler/dequeued/memory': 2,\r\n\t 'scheduler/enqueued': 2,\r\n\t 'scheduler/enqueued/memory': 2,\r\n\t 'start_time': datetime.datetime(2013, 11, 19, 3, 15, 45, 143009)}\r\n2013-11-18 22:15:45-0500 [dmoz] INFO: Spider closed (finished)\r\n" | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 4, | |
"metadata": {}, | |
"source": "Clean up whitespace in description" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "%%writefile tutorial/tutorial/spiders/dmoz_spider.py\n\nfrom scrapy.spider import BaseSpider\nfrom scrapy.selector import Selector\n\nfrom tutorial.items import DmozItem\n\nclass DmozSpider(BaseSpider):\n name = \"dmoz\"\n allowed_domains = [\"dmoz.org\"]\n start_urls = [\n \"http://www.dmoz.org/Computers/Programming/Languages/Python/Books/\",\n \"http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/\"\n ]\n\n def parse(self, response):\n sel = Selector(response)\n sites = sel.xpath('//ul/li')\n items = []\n for site in sites:\n item = DmozItem()\n item['title'] = site.xpath('a/text()').extract()\n item['link'] = site.xpath('a/@href').extract()\n item['desc'] = [x.strip() for x in site.xpath('text()').extract()]\n items.append(item)\n return items", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "Overwriting tutorial/tutorial/spiders/dmoz_spider.py\n" | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "! cd tutorial/; scrapy crawl dmoz", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:15:57-0500 [scrapy] INFO: Scrapy 0.20.0 started (bot: tutorial)\r\n2013-11-18 22:15:57-0500 [scrapy] DEBUG: Optional features available: ssl, http11\r\n2013-11-18 22:15:57-0500 [scrapy] DEBUG: Overridden settings: {'NEWSPIDER_MODULE': 'tutorial.spiders', 'SPIDER_MODULES': ['tutorial.spiders'], 'BOT_NAME': 'tutorial'}\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:15:57-0500 [scrapy] DEBUG: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:15:57-0500 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:15:57-0500 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware\r\n2013-11-18 22:15:57-0500 [scrapy] DEBUG: Enabled item pipelines: \r\n2013-11-18 22:15:57-0500 [dmoz] INFO: Spider opened\r\n2013-11-18 22:15:57-0500 [dmoz] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)\r\n2013-11-18 22:15:57-0500 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6023\r\n2013-11-18 22:15:57-0500 [scrapy] DEBUG: Web service listening on 0.0.0.0:6080\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:15:57-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/> (referer: None)\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [u''], 'link': [u'/'], 'title': [u'Top']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [], 'link': [u'/Computers/'], 'title': [u'Computers']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [], 'link': [u'/Computers/Programming/'], 'title': [u'Programming']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [],\r\n\t 'link': [u'/Computers/Programming/Languages/'],\r\n\t 'title': [u'Languages']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [],\r\n\t 'link': [u'/Computers/Programming/Languages/Python/'],\r\n\t 'title': [u'Python']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [u'', u'', u''], 'link': [], 'title': []}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [u'', u'', u''],\r\n\t 'link': [u'/Computers/Programming/Resources/'],\r\n\t 'title': [u'Computers: Programming: Resources']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [u'',\r\n\t u'- Contains links to assorted resources from the Python universe, compiled by PythonWare.'],\r\n\t 'link': [u'http://www.pythonware.com/daily/'],\r\n\t 'title': [u\"eff-bot's Daily Python URL\"]}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [u'',\r\n\t u'- A directory of free Python and Zope hosting providers, with reviews and ratings.'],\r\n\t 'link': [u'http://www.oinko.net/freepython/'],\r\n\t 'title': [u'Free Python and Zope Hosting Directory']}\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [u'', u'- Features Python books, resources, news and articles.'],\r\n\t 'link': [u'http://oreilly.com/python/'],\r\n\t 'title': [u\"O'Reilly Python Center\"]}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [u'',\r\n\t u'- Resources for reporting bugs, accessing the Python source tree with CVS and taking part in the development of Python.'],\r\n\t 'link': [u'http://www.python.org/dev/'],\r\n\t 'title': [u\"Python Developer's Guide\"]}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [u'',\r\n\t u'- Scripts, examples and news about Python programming for the Windows platform.'],\r\n\t 'link': [u'http://win32com.goermezer.de/'],\r\n\t 'title': [u'Social Bug']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> (referer: None)\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u''], 'link': [u'/'], 'title': [u'Top']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [], 'link': [u'/Computers/'], 'title': [u'Computers']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [], 'link': [u'/Computers/Programming/'], 'title': [u'Programming']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [],\r\n\t 'link': [u'/Computers/Programming/Languages/'],\r\n\t 'title': [u'Languages']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [],\r\n\t 'link': [u'/Computers/Programming/Languages/Python/'],\r\n\t 'title': [u'Python']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'', u'', u''], 'link': [], 'title': []}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'', u'', u''],\r\n\t 'link': [u'/Computers/Programming/Languages/Python/Resources/'],\r\n\t 'title': [u'Computers: Programming: Languages: Python: Resources']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'', u'', u''],\r\n\t 'link': [u'/Computers/Programming/Languages/Ruby/Books/'],\r\n\t 'title': [u'Computers: Programming: Languages: Ruby: Books']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'', u'', u''],\r\n\t 'link': [u'/World/Deutsch/Computer/Programmieren/Sprachen/Python/B%C3%BCcher/'],\r\n\t 'title': [u'German']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'', u'', u''],\r\n\t 'link': [u'/World/Russian/%D0%9A%D0%BE%D0%BC%D0%BF%D1%8C%D1%8E%D1%82%D0%B5%D1%80%D1%8B/%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5/%D0%AF%D0%B7%D1%8B%D0%BA%D0%B8/Python/%D0%9A%D0%BD%D0%B8%D0%B3%D0%B8/'],\r\n\t 'title': [u'Russian']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Wesley J. Chun; Prentice Hall PTR, 2001, ISBN 0130260363. For experienced developers to improve extant skills; professional level examples. Starts by introducing syntax, objects, error handling, functions, classes, built-ins. [Prentice Hall]'],\r\n\t 'link': [u'http://www.pearsonhighered.com/educator/academic/product/0,,0130260363,00%2Ben-USS_01DBC.html'],\r\n\t 'title': [u'Core Python Programming']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- The primary goal of this book is to promote object-oriented design using Python and to illustrate the use of the emerging object-oriented design patterns.\\r\\nA secondary goal of the book is to present mathematical tools just in time. Analysis techniques and proofs are presented as needed and in the proper context.'],\r\n\t 'link': [u'http://www.brpreiss.com/books/opus7/html/book.html'],\r\n\t 'title': [u'Data Structures and Algorithms with Object-Oriented Design Patterns in Python']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Mark Pilgrim, Guide to Python 3 and its differences from Python 2. Each chapter starts with a real code sample and explains it fully. Has a comprehensive appendix of all the syntactic and semantic changes in Python 3'],\r\n\t 'link': [u'http://www.diveintopython.net/'],\r\n\t 'title': [u'Dive Into Python 3']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- This book covers a wide range of topics. From raw TCP and UDP to encryption with TSL, and then to HTTP, SMTP, POP, IMAP, and ssh. It gives you a good understanding of each field and how to do everything on the network with Python.'],\r\n\t 'link': [u'http://rhodesmill.org/brandon/2011/foundations-of-python-network-programming/'],\r\n\t 'title': [u'Foundations of Python Network Programming']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'', u'- Free Python books and tutorials.'],\r\n\t 'link': [u'http://www.techbooksforfree.com/perlpython.shtml'],\r\n\t 'title': [u'Free Python books']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- Annotated list of free online books on Python scripting language. Topics range from beginner to advanced.'],\r\n\t 'link': [u'http://www.freetechbooks.com/python-f6.html'],\r\n\t 'title': [u'FreeTechBooks: Python Scripting Language']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Allen B. Downey, Jeffrey Elkner, Chris Meyers; Green Tea Press, 2002, ISBN 0971677506. Teaches general principles of programming, via Python as subject language. Thorough, in-depth approach to many basic and intermediate programming topics. Full text online and downloads: HTML, PDF, PS, LaTeX. [Free, Green Tea Press]'],\r\n\t 'link': [u'http://greenteapress.com/thinkpython/'],\r\n\t 'title': [u'How to Think Like a Computer Scientist: Learning with Python']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Guido van Rossum, Fred L. Drake, Jr.; Network Theory Ltd., 2003, ISBN 0954161769. Printed edition of official tutorial, for v2.x, from Python.org. [Network Theory, online]'],\r\n\t 'link': [u'http://www.network-theory.co.uk/python/intro/'],\r\n\t 'title': [u'An Introduction to Python']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- Book by Alan Gauld with full text online. Introduction for those learning programming basics: terminology, concepts, methods to write code. Assumes no prior knowledge but basic computer skills.'],\r\n\t 'link': [u'http://www.freenetpages.co.uk/hp/alan.gauld/'],\r\n\t 'title': [u'Learn to Program Using Python']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Rashi Gupta; John Wiley and Sons, 2002, ISBN 0471219754. Covers language basics, use for CGI scripting, GUI development, network programming; shows why it is one of more sophisticated of popular scripting languages. [Wiley]'],\r\n\t 'link': [u'http://www.wiley.com/WileyCDA/WileyTitle/productCd-0471219754.html'],\r\n\t 'title': [u'Making Use of Python']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Magnus Lie Hetland; Apress LP, 2002, ISBN 1590590066. Readable guide to ideas most vital to new users, from basics common to high level languages, to more specific aspects, to a series of 10 ever more complex programs. [Apress]'],\r\n\t 'link': [u'http://hetland.org/writing/practical-python/'],\r\n\t 'title': [u'Practical Python']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Rytis Sileika, ISBN13: 978-1-4302-2605-5, Uses real-world system administration examples like manage devices with SNMP and SOAP, build a distributed monitoring system, manage web applications and parse complex log files, monitor and manage MySQL databases.'],\r\n\t 'link': [u'http://www.sysadminpy.com/'],\r\n\t 'title': [u'Pro Python System Administration']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'', u'- A Complete Introduction to the Python 3.'],\r\n\t 'link': [u'http://www.qtrac.eu/py3book.html'],\r\n\t 'title': [u'Programming in Python 3 (Second Edition)']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Dave Brueck, Stephen Tanner; John Wiley and Sons, 2001, ISBN 0764548077. Full coverage, clear explanations, hands-on examples, full language reference; shows step by step how to use components, assemble them, form full-featured programs. [John Wiley and Sons]'],\r\n\t 'link': [u'http://www.wiley.com/WileyCDA/WileyTitle/productCd-0764548077.html'],\r\n\t 'title': [u'Python 2.1 Bible']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- A step-by-step tutorial for OOP in Python 3, including discussion and examples of abstraction, encapsulation, information hiding, and raise, handle, define, and manipulate exceptions.'],\r\n\t 'link': [u'https://www.packtpub.com/python-3-object-oriented-programming/book'],\r\n\t 'title': [u'Python 3 Object Oriented Programming']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Guido van Rossum, Fred L. Drake, Jr.; Network Theory Ltd., 2003, ISBN 0954161785. Printed edition of official language reference, for v2.x, from Python.org, describes syntax, built-in datatypes. [Network Theory, online]'],\r\n\t 'link': [u'http://www.network-theory.co.uk/python/language/'],\r\n\t 'title': [u'Python Language Reference Manual']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Thomas W. Christopher; Prentice Hall PTR, 2002, ISBN 0130409561. Shows how to write large programs, introduces powerful design patterns that deliver high levels of robustness, scalability, reuse.'],\r\n\t 'link': [u'http://www.pearsonhighered.com/educator/academic/product/0,,0130409561,00%2Ben-USS_01DBC.html'],\r\n\t 'title': [u'Python Programming Patterns']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u\"- By Richard Hightower; Addison-Wesley, 2002, 0201616165. Begins with Python basics, many exercises, interactive sessions. Shows programming novices concepts and practical methods. Shows programming experts Python's abilities and ways to interface with Java APIs. [publisher website]\"],\r\n\t 'link': [u'http://www.informit.com/store/product.aspx?isbn=0201616165&redir=1'],\r\n\t 'title': [u'Python Programming with the Java Class Libraries: A Tutorial for Building Web and Enterprise Applications with Jython']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Chris Fehily; Peachpit Press, 2002, ISBN 0201748843. Task-based, step-by-step visual reference guide, many screen shots, for courses in digital graphics; Web design, scripting, development; multimedia, page layout, office tools, operating systems. [Prentice Hall]'],\r\n\t 'link': [u'http://www.pearsonhighered.com/educator/academic/product/0,,0201748843,00%2Ben-USS_01DBC.html'],\r\n\t 'title': [u'Python: Visual QuickStart Guide']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Ivan Van Laningham; Sams Publishing, 2000, ISBN 0672317354. Split into 24 hands-on, 1 hour lessons; steps needed to learn topic: syntax, language features, OO design and programming, GUIs (Tkinter), system administration, CGI. [Sams Publishing]'],\r\n\t 'link': [u'http://www.informit.com/store/product.aspx?isbn=0672317354'],\r\n\t 'title': [u'Sams Teach Yourself Python in 24 Hours']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By David Mertz; Addison Wesley. Book in progress, full text, ASCII format. Asks for feedback. [author website, Gnosis Software, Inc.]'],\r\n\t 'link': [u'http://gnosis.cx/TPiP/'],\r\n\t 'title': [u'Text Processing in Python']}\r\n2013-11-18 22:15:57-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Sean McGrath; Prentice Hall PTR, 2000, ISBN 0130211192, has CD-ROM. Methods to build XML applications fast, Python tutorial, DOM and SAX, new Pyxie open source XML processing library. [Prentice Hall PTR]'],\r\n\t 'link': [u'http://www.informit.com/store/product.aspx?isbn=0130211192'],\r\n\t 'title': [u'XML Processing with Python']}\r\n2013-11-18 22:15:57-0500 [dmoz] INFO: Closing spider (finished)\r\n2013-11-18 22:15:57-0500 [dmoz] INFO: Dumping Scrapy stats:\r\n\t{'downloader/request_bytes': 530,\r\n\t 'downloader/request_count': 2,\r\n\t 'downloader/request_method_count/GET': 2,\r\n\t 'downloader/response_bytes': 14892,\r\n\t 'downloader/response_count': 2,\r\n\t 'downloader/response_status_count/200': 2,\r\n\t 'finish_reason': 'finished',\r\n\t 'finish_time': datetime.datetime(2013, 11, 19, 3, 15, 57, 747642),\r\n\t 'item_scraped_count': 44,\r\n\t 'log_count/DEBUG': 52,\r\n\t 'log_count/INFO': 3,\r\n\t 'response_received_count': 2,\r\n\t 'scheduler/dequeued': 2,\r\n\t 'scheduler/dequeued/memory': 2,\r\n\t 'scheduler/enqueued': 2,\r\n\t 'scheduler/enqueued/memory': 2,\r\n\t 'start_time': datetime.datetime(2013, 11, 19, 3, 15, 57, 559107)}\r\n2013-11-18 22:15:57-0500 [dmoz] INFO: Spider closed (finished)\r\n" | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 4, | |
"metadata": {}, | |
"source": "Output as json" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "!cd tutorial/; scrapy crawl dmoz -o items.json -t json", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:15:59-0500 [scrapy] INFO: Scrapy 0.20.0 started (bot: tutorial)\r\n2013-11-18 22:15:59-0500 [scrapy] DEBUG: Optional features available: ssl, http11\r\n2013-11-18 22:15:59-0500 [scrapy] DEBUG: Overridden settings: {'NEWSPIDER_MODULE': 'tutorial.spiders', 'FEED_FORMAT': 'json', 'SPIDER_MODULES': ['tutorial.spiders'], 'FEED_URI': 'items.json', 'BOT_NAME': 'tutorial'}\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:15:59-0500 [scrapy] DEBUG: Enabled extensions: FeedExporter, LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState\r\n2013-11-18 22:15:59-0500 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:15:59-0500 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware\r\n2013-11-18 22:15:59-0500 [scrapy] DEBUG: Enabled item pipelines: \r\n2013-11-18 22:15:59-0500 [dmoz] INFO: Spider opened\r\n2013-11-18 22:15:59-0500 [dmoz] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)\r\n2013-11-18 22:15:59-0500 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6023\r\n2013-11-18 22:15:59-0500 [scrapy] DEBUG: Web service listening on 0.0.0.0:6080\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:16:00-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> (referer: None)\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u''], 'link': [u'/'], 'title': [u'Top']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [], 'link': [u'/Computers/'], 'title': [u'Computers']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [], 'link': [u'/Computers/Programming/'], 'title': [u'Programming']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [],\r\n\t 'link': [u'/Computers/Programming/Languages/'],\r\n\t 'title': [u'Languages']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [],\r\n\t 'link': [u'/Computers/Programming/Languages/Python/'],\r\n\t 'title': [u'Python']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'', u'', u''], 'link': [], 'title': []}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'', u'', u''],\r\n\t 'link': [u'/Computers/Programming/Languages/Python/Resources/'],\r\n\t 'title': [u'Computers: Programming: Languages: Python: Resources']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'', u'', u''],\r\n\t 'link': [u'/Computers/Programming/Languages/Ruby/Books/'],\r\n\t 'title': [u'Computers: Programming: Languages: Ruby: Books']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'', u'', u''],\r\n\t 'link': [u'/World/Deutsch/Computer/Programmieren/Sprachen/Python/B%C3%BCcher/'],\r\n\t 'title': [u'German']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'', u'', u''],\r\n\t 'link': [u'/World/Russian/%D0%9A%D0%BE%D0%BC%D0%BF%D1%8C%D1%8E%D1%82%D0%B5%D1%80%D1%8B/%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5/%D0%AF%D0%B7%D1%8B%D0%BA%D0%B8/Python/%D0%9A%D0%BD%D0%B8%D0%B3%D0%B8/'],\r\n\t 'title': [u'Russian']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Wesley J. Chun; Prentice Hall PTR, 2001, ISBN 0130260363. For experienced developers to improve extant skills; professional level examples. Starts by introducing syntax, objects, error handling, functions, classes, built-ins. [Prentice Hall]'],\r\n\t 'link': [u'http://www.pearsonhighered.com/educator/academic/product/0,,0130260363,00%2Ben-USS_01DBC.html'],\r\n\t 'title': [u'Core Python Programming']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- The primary goal of this book is to promote object-oriented design using Python and to illustrate the use of the emerging object-oriented design patterns.\\r\\nA secondary goal of the book is to present mathematical tools just in time. Analysis techniques and proofs are presented as needed and in the proper context.'],\r\n\t 'link': [u'http://www.brpreiss.com/books/opus7/html/book.html'],\r\n\t 'title': [u'Data Structures and Algorithms with Object-Oriented Design Patterns in Python']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Mark Pilgrim, Guide to Python 3 and its differences from Python 2. Each chapter starts with a real code sample and explains it fully. Has a comprehensive appendix of all the syntactic and semantic changes in Python 3'],\r\n\t 'link': [u'http://www.diveintopython.net/'],\r\n\t 'title': [u'Dive Into Python 3']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- This book covers a wide range of topics. From raw TCP and UDP to encryption with TSL, and then to HTTP, SMTP, POP, IMAP, and ssh. It gives you a good understanding of each field and how to do everything on the network with Python.'],\r\n\t 'link': [u'http://rhodesmill.org/brandon/2011/foundations-of-python-network-programming/'],\r\n\t 'title': [u'Foundations of Python Network Programming']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'', u'- Free Python books and tutorials.'],\r\n\t 'link': [u'http://www.techbooksforfree.com/perlpython.shtml'],\r\n\t 'title': [u'Free Python books']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- Annotated list of free online books on Python scripting language. Topics range from beginner to advanced.'],\r\n\t 'link': [u'http://www.freetechbooks.com/python-f6.html'],\r\n\t 'title': [u'FreeTechBooks: Python Scripting Language']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Allen B. Downey, Jeffrey Elkner, Chris Meyers; Green Tea Press, 2002, ISBN 0971677506. Teaches general principles of programming, via Python as subject language. Thorough, in-depth approach to many basic and intermediate programming topics. Full text online and downloads: HTML, PDF, PS, LaTeX. [Free, Green Tea Press]'],\r\n\t 'link': [u'http://greenteapress.com/thinkpython/'],\r\n\t 'title': [u'How to Think Like a Computer Scientist: Learning with Python']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Guido van Rossum, Fred L. Drake, Jr.; Network Theory Ltd., 2003, ISBN 0954161769. Printed edition of official tutorial, for v2.x, from Python.org. [Network Theory, online]'],\r\n\t 'link': [u'http://www.network-theory.co.uk/python/intro/'],\r\n\t 'title': [u'An Introduction to Python']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- Book by Alan Gauld with full text online. Introduction for those learning programming basics: terminology, concepts, methods to write code. Assumes no prior knowledge but basic computer skills.'],\r\n\t 'link': [u'http://www.freenetpages.co.uk/hp/alan.gauld/'],\r\n\t 'title': [u'Learn to Program Using Python']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Rashi Gupta; John Wiley and Sons, 2002, ISBN 0471219754. Covers language basics, use for CGI scripting, GUI development, network programming; shows why it is one of more sophisticated of popular scripting languages. [Wiley]'],\r\n\t 'link': [u'http://www.wiley.com/WileyCDA/WileyTitle/productCd-0471219754.html'],\r\n\t 'title': [u'Making Use of Python']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Magnus Lie Hetland; Apress LP, 2002, ISBN 1590590066. Readable guide to ideas most vital to new users, from basics common to high level languages, to more specific aspects, to a series of 10 ever more complex programs. [Apress]'],\r\n\t 'link': [u'http://hetland.org/writing/practical-python/'],\r\n\t 'title': [u'Practical Python']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Rytis Sileika, ISBN13: 978-1-4302-2605-5, Uses real-world system administration examples like manage devices with SNMP and SOAP, build a distributed monitoring system, manage web applications and parse complex log files, monitor and manage MySQL databases.'],\r\n\t 'link': [u'http://www.sysadminpy.com/'],\r\n\t 'title': [u'Pro Python System Administration']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'', u'- A Complete Introduction to the Python 3.'],\r\n\t 'link': [u'http://www.qtrac.eu/py3book.html'],\r\n\t 'title': [u'Programming in Python 3 (Second Edition)']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Dave Brueck, Stephen Tanner; John Wiley and Sons, 2001, ISBN 0764548077. Full coverage, clear explanations, hands-on examples, full language reference; shows step by step how to use components, assemble them, form full-featured programs. [John Wiley and Sons]'],\r\n\t 'link': [u'http://www.wiley.com/WileyCDA/WileyTitle/productCd-0764548077.html'],\r\n\t 'title': [u'Python 2.1 Bible']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- A step-by-step tutorial for OOP in Python 3, including discussion and examples of abstraction, encapsulation, information hiding, and raise, handle, define, and manipulate exceptions.'],\r\n\t 'link': [u'https://www.packtpub.com/python-3-object-oriented-programming/book'],\r\n\t 'title': [u'Python 3 Object Oriented Programming']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Guido van Rossum, Fred L. Drake, Jr.; Network Theory Ltd., 2003, ISBN 0954161785. Printed edition of official language reference, for v2.x, from Python.org, describes syntax, built-in datatypes. [Network Theory, online]'],\r\n\t 'link': [u'http://www.network-theory.co.uk/python/language/'],\r\n\t 'title': [u'Python Language Reference Manual']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Thomas W. Christopher; Prentice Hall PTR, 2002, ISBN 0130409561. Shows how to write large programs, introduces powerful design patterns that deliver high levels of robustness, scalability, reuse.'],\r\n\t 'link': [u'http://www.pearsonhighered.com/educator/academic/product/0,,0130409561,00%2Ben-USS_01DBC.html'],\r\n\t 'title': [u'Python Programming Patterns']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u\"- By Richard Hightower; Addison-Wesley, 2002, 0201616165. Begins with Python basics, many exercises, interactive sessions. Shows programming novices concepts and practical methods. Shows programming experts Python's abilities and ways to interface with Java APIs. [publisher website]\"],\r\n\t 'link': [u'http://www.informit.com/store/product.aspx?isbn=0201616165&redir=1'],\r\n\t 'title': [u'Python Programming with the Java Class Libraries: A Tutorial for Building Web and Enterprise Applications with Jython']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Chris Fehily; Peachpit Press, 2002, ISBN 0201748843. Task-based, step-by-step visual reference guide, many screen shots, for courses in digital graphics; Web design, scripting, development; multimedia, page layout, office tools, operating systems. [Prentice Hall]'],\r\n\t 'link': [u'http://www.pearsonhighered.com/educator/academic/product/0,,0201748843,00%2Ben-USS_01DBC.html'],\r\n\t 'title': [u'Python: Visual QuickStart Guide']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Ivan Van Laningham; Sams Publishing, 2000, ISBN 0672317354. Split into 24 hands-on, 1 hour lessons; steps needed to learn topic: syntax, language features, OO design and programming, GUIs (Tkinter), system administration, CGI. [Sams Publishing]'],\r\n\t 'link': [u'http://www.informit.com/store/product.aspx?isbn=0672317354'],\r\n\t 'title': [u'Sams Teach Yourself Python in 24 Hours']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By David Mertz; Addison Wesley. Book in progress, full text, ASCII format. Asks for feedback. [author website, Gnosis Software, Inc.]'],\r\n\t 'link': [u'http://gnosis.cx/TPiP/'],\r\n\t 'title': [u'Text Processing in Python']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>\r\n\t{'desc': [u'',\r\n\t u'- By Sean McGrath; Prentice Hall PTR, 2000, ISBN 0130211192, has CD-ROM. Methods to build XML applications fast, Python tutorial, DOM and SAX, new Pyxie open source XML processing library. [Prentice Hall PTR]'],\r\n\t 'link': [u'http://www.informit.com/store/product.aspx?isbn=0130211192'],\r\n\t 'title': [u'XML Processing with Python']}\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:16:00-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/> (referer: None)\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [u''], 'link': [u'/'], 'title': [u'Top']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [], 'link': [u'/Computers/'], 'title': [u'Computers']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [], 'link': [u'/Computers/Programming/'], 'title': [u'Programming']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [],\r\n\t 'link': [u'/Computers/Programming/Languages/'],\r\n\t 'title': [u'Languages']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [],\r\n\t 'link': [u'/Computers/Programming/Languages/Python/'],\r\n\t 'title': [u'Python']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [u'', u'', u''], 'link': [], 'title': []}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [u'', u'', u''],\r\n\t 'link': [u'/Computers/Programming/Resources/'],\r\n\t 'title': [u'Computers: Programming: Resources']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [u'',\r\n\t u'- Contains links to assorted resources from the Python universe, compiled by PythonWare.'],\r\n\t 'link': [u'http://www.pythonware.com/daily/'],\r\n\t 'title': [u\"eff-bot's Daily Python URL\"]}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [u'',\r\n\t u'- A directory of free Python and Zope hosting providers, with reviews and ratings.'],\r\n\t 'link': [u'http://www.oinko.net/freepython/'],\r\n\t 'title': [u'Free Python and Zope Hosting Directory']}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [u'', u'- Features Python books, resources, news and articles.'],\r\n\t 'link': [u'http://oreilly.com/python/'],\r\n\t 'title': [u\"O'Reilly Python Center\"]}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [u'',\r\n\t u'- Resources for reporting bugs, accessing the Python source tree with CVS and taking part in the development of Python.'],\r\n\t 'link': [u'http://www.python.org/dev/'],\r\n\t 'title': [u\"Python Developer's Guide\"]}\r\n2013-11-18 22:16:00-0500 [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/>\r\n\t{'desc': [u'',\r\n\t u'- Scripts, examples and news about Python programming for the Windows platform.'],\r\n\t 'link': [u'http://win32com.goermezer.de/'],\r\n\t 'title': [u'Social Bug']}\r\n2013-11-18 22:16:00-0500 [dmoz] INFO: Closing spider (finished)\r\n2013-11-18 22:16:00-0500 [dmoz] INFO: Stored json feed (44 items) in: items.json\r\n2013-11-18 22:16:00-0500 [dmoz] INFO: Dumping Scrapy stats:\r\n\t{'downloader/request_bytes': 530,\r\n\t 'downloader/request_count': 2,\r\n\t 'downloader/request_method_count/GET': 2,\r\n\t 'downloader/response_bytes': 14892,\r\n\t 'downloader/response_count': 2,\r\n\t 'downloader/response_status_count/200': 2,\r\n\t 'finish_reason': 'finished',\r\n\t 'finish_time': datetime.datetime(2013, 11, 19, 3, 16, 0, 92126),\r\n\t 'item_scraped_count': 44,\r\n\t 'log_count/DEBUG': 52,\r\n\t 'log_count/INFO': 4,\r\n\t 'response_received_count': 2,\r\n\t 'scheduler/dequeued': 2,\r\n\t 'scheduler/dequeued/memory': 2,\r\n\t 'scheduler/enqueued': 2,\r\n\t 'scheduler/enqueued/memory': 2,\r\n\t 'start_time': datetime.datetime(2013, 11, 19, 3, 15, 59, 911770)}\r\n2013-11-18 22:16:00-0500 [dmoz] INFO: Spider closed (finished)\r\n" | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "! head tutorial/items.json", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "[{\"desc\": [\"\"], \"link\": [\"/\"], \"title\": [\"Top\"]},\r\n{\"desc\": [], \"link\": [\"/Computers/\"], \"title\": [\"Computers\"]},\r\n{\"desc\": [], \"link\": [\"/Computers/Programming/\"], \"title\": [\"Programming\"]},\r\n{\"desc\": [], \"link\": [\"/Computers/Programming/Languages/\"], \"title\": [\"Languages\"]},\r\n{\"desc\": [], \"link\": [\"/Computers/Programming/Languages/Python/\"], \"title\": [\"Python\"]},\r\n{\"desc\": [\"\", \"\", \"\"], \"link\": [], \"title\": []},\r\n{\"desc\": [\"\", \"\", \"\"], \"link\": [\"/Computers/Programming/Languages/Python/Resources/\"], \"title\": [\"Computers: Programming: Languages: Python: Resources\"]},\r\n{\"desc\": [\"\", \"\", \"\"], \"link\": [\"/Computers/Programming/Languages/Ruby/Books/\"], \"title\": [\"Computers: Programming: Languages: Ruby: Books\"]},\r\n{\"desc\": [\"\", \"\", \"\"], \"link\": [\"/World/Deutsch/Computer/Programmieren/Sprachen/Python/B%C3%BCcher/\"], \"title\": [\"German\"]},\r\n{\"desc\": [\"\", \"\", \"\"], \"link\": [\"/World/Russian/%D0%9A%D0%BE%D0%BC%D0%BF%D1%8C%D1%8E%D1%82%D0%B5%D1%80%D1%8B/%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5/%D0%AF%D0%B7%D1%8B%D0%BA%D0%B8/Python/%D0%9A%D0%BD%D0%B8%D0%B3%D0%B8/\"], \"title\": [\"Russian\"]},\r\n" | |
} | |
], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 4, | |
"metadata": {}, | |
"source": "Creating a CrawlSpider that uses Rules to decide what links to crawl" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "%%writefile tutorial/tutorial/spiders/dmoz_spider.py\n\nfrom scrapy.contrib.spiders import CrawlSpider, Rule\nfrom scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor\nfrom scrapy.http import Request\nfrom scrapy.selector import Selector\n\nfrom tutorial.items import DmozItem\n\nfrom scrapy.conf import settings\nsettings.overrides['DOWNLOAD_DELAY'] = 1\n\nclass DmozSpider(CrawlSpider):\n name = \"dmoz\"\n allowed_domains = [\"dmoz.org\"]\n start_urls = ['http://www.dmoz.org/Computers/Programming/Languages']\n \n rules = (\n Rule(SgmlLinkExtractor(deny=('\\?*', )), follow=True),\n Rule(SgmlLinkExtractor(allow=('www\\.dmoz\\.org\\/Computers\\/Programming\\/Languages\\/Python', ), unique=True), follow=True),\n )\n \n def parse_python_page(self, response):\n sel = Selector(response)\n sites = sel.xpath('//ul/li')\n print response.url\n for site in sites:\n try:\n url = site.xpath('a/@href').extract()[0]\n if url.startswith('/'):\n url = 'http://www.dmoz.org' + url\n yield Request(url=url)\n except:\n pass", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "Overwriting tutorial/tutorial/spiders/dmoz_spider.py\n" | |
} | |
], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "! cd tutorial/; scrapy crawl dmoz", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:12-0500 [scrapy] INFO: Scrapy 0.20.0 started (bot: tutorial)\r\n2013-11-18 22:17:12-0500 [scrapy] DEBUG: Optional features available: ssl, http11\r\n2013-11-18 22:17:12-0500 [scrapy] DEBUG: Overridden settings: {'NEWSPIDER_MODULE': 'tutorial.spiders', 'SPIDER_MODULES': ['tutorial.spiders'], 'DOWNLOAD_DELAY': 1, 'BOT_NAME': 'tutorial'}\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:12-0500 [scrapy] DEBUG: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState\r\n2013-11-18 22:17:12-0500 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats\r\n2013-11-18 22:17:12-0500 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware\r\n2013-11-18 22:17:12-0500 [scrapy] DEBUG: Enabled item pipelines: \r\n2013-11-18 22:17:12-0500 [dmoz] INFO: Spider opened\r\n2013-11-18 22:17:12-0500 [dmoz] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)\r\n2013-11-18 22:17:12-0500 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6023\r\n2013-11-18 22:17:12-0500 [scrapy] DEBUG: Web service listening on 0.0.0.0:6080\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:13-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages> (referer: None)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:14-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/> (referer: http://www.dmoz.org/Computers/Programming/Languages)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:15-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Development_Tools/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/)\r\n2013-11-18 22:17:15-0500 [dmoz] DEBUG: Filtered duplicate request: <GET http://www.dmoz.org/Computers/Programming/Languages/Python/> - no more duplicates will be shown (see DUPEFILTER_CLASS)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:17-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Web/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:17-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/User_Groups/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:19-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:20-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Personal_Pages/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:21-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Mailing_Lists/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:23-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/FAQs,_Help,_and_Tutorials/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:24-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Conferences/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:25-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Commercial_Services/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:26-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:28-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Articles_and_Reviews/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:29-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:30-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Implementations/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:31-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Development_Tools/Foreign_Language_Interfaces/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Development_Tools/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:32-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Development_Tools/Documentation_Tools/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Development_Tools/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:34-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Development_Tools/Deployment/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Development_Tools/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:35-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Development_Tools/Integrated_Development_Environments/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Development_Tools/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:36-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Development_Tools/Performance_and_Testing/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Development_Tools/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:36-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Web/Web_Frameworks/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Web/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:38-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Web/Templating_Libraries/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Web/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:39-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Web/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Web/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:40-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Development_Tools/GUI_Builders/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Development_Tools/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:41-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Terminal_IO/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:43-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Platform_Specific/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:44-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Email/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:46-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Data_Formats/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:47-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/XML/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:48-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Text_Processing/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:49-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Scientific/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:51-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Network/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:52-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Math_and_Calculations/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:53-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/GUI/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:55-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Graphics/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:56-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Distributed_Computing/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:57-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Databases_and_Persistence/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:58-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Web/Web_Frameworks/Django/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Web/Web_Frameworks/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:17:59-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Cryptography/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:18:00-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Directories/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:18:01-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Platform_Specific/Windows/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Platform_Specific/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:18:02-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Platform_Specific/Linux/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Platform_Specific/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:18:04-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/XML/Parsers/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/XML/)\r\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "2013-11-18 22:18:04-0500 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Databases_and_Persistence/Database_API/> (referer: http://www.dmoz.org/Computers/Programming/Languages/Python/Modules/Databases_and_Persistence/)\r\n2013-11-18 22:18:05-0500 [dmoz] INFO: Closing spider (finished)\r\n2013-11-18 22:18:05-0500 [dmoz] INFO: Dumping Scrapy stats:\r\n\t{'downloader/request_bytes': 17891,\r\n\t 'downloader/request_count': 44,\r\n\t 'downloader/request_method_count/GET': 44,\r\n\t 'downloader/response_bytes': 273313,\r\n\t 'downloader/response_count': 44,\r\n\t 'downloader/response_status_count/200': 44,\r\n\t 'finish_reason': 'finished',\r\n\t 'finish_time': datetime.datetime(2013, 11, 19, 3, 18, 5, 1707),\r\n\t 'log_count/DEBUG': 51,\r\n\t 'log_count/INFO': 3,\r\n\t 'request_depth_max': 5,\r\n\t 'response_received_count': 44,\r\n\t 'scheduler/dequeued': 44,\r\n\t 'scheduler/dequeued/memory': 44,\r\n\t 'scheduler/enqueued': 44,\r\n\t 'scheduler/enqueued/memory': 44,\r\n\t 'start_time': datetime.datetime(2013, 11, 19, 3, 17, 12, 987421)}\r\n2013-11-18 22:18:05-0500 [dmoz] INFO: Spider closed (finished)\r\n" | |
} | |
], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 13 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment