scrapy
# When crawling, we often run into pages whose content is generated with JavaScript (e.g. AJAX requests, jQuery craziness), which Scrapy cannot crawl on its own. However, if you use Scrapy together with the web testing framework Selenium, you can crawl anything that is displayed in a normal web browser.
#
# Some things to note:
# You must have the Python version of Selenium RC installed, and Selenium must be set up properly, for this to work. Also, this is just a template crawler; you could get much crazier and more advanced, but I just wanted to show the basic idea. As the code stands now, you will be making two requests for any given URL: one by Scrapy and one by Selenium. There are probably ways around this so that Selenium makes the one and only request, but I did not bother to implement that, and by making two requests you get to crawl the page with Scrapy too.
#
# This is quite powerful because you now have the entire rendered DOM available to crawl, while still using all the nice crawling features in Scrapy. It makes for slower crawling of course, but depending on how much you need the rendered DOM, it might be worth the wait.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from scrapy.item import Item
from selenium import selenium
import time


class SeleniumSpider(CrawlSpider):
    name = "SeleniumSpider"
    start_urls = ["http://www.domain.com"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(r'\.html', )), callback='parse_page', follow=True),
    )

    def __init__(self):
        CrawlSpider.__init__(self)
        self.verificationErrors = []
        self.selenium = selenium("localhost", 4444, "*chrome", "http://www.domain.com")
        self.selenium.start()

    def __del__(self):
        self.selenium.stop()
        print self.verificationErrors

    def parse_page(self, response):
        item = Item()
        hxs = HtmlXPathSelector(response)

        # Do some XPath selection with Scrapy
        hxs.select('//div').extract()

        sel = self.selenium
        sel.open(response.url)

        # Wait for javascript to load in Selenium
        time.sleep(2.5)

        # Do some crawling of javascript created content with Selenium
        sel.get_text("//div")
        yield item
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: wynbennett
# date : Jun 21, 2011
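# Side note: relying on __del__ for cleanup is fragile, since there is no guarantee of when (or
# whether) it runs. A common alternative is to hook Scrapy's spider_closed signal so the Selenium
# session is stopped when the spider finishes. A minimal, untested sketch of that wiring; the
# subclass name is just an example and the signals import path varies between old Scrapy versions:
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals


class SeleniumSpiderWithCleanup(SeleniumSpider):
    def __init__(self):
        SeleniumSpider.__init__(self)
        dispatcher.connect(self.stop_selenium, signal=signals.spider_closed)

    def stop_selenium(self, spider):
        # only stop the browser when it is this spider that closed
        if spider is self:
            self.selenium.stop()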
# This is a piece of code that uses webdriver to load and render a page with Scrapy and Selenium.
#
# This work is based on the snippet [wynbennett](http://snippets.scrapy.org/users/wynbennett/) [posted here](http://snippets.scrapy.org/snippets/21/) some time ago
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from myItem.items import myItem
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
import time
import pprint


class WebDriverSpider(CrawlSpider):
    name = "WebDriverSpider"
    start_urls = ["http://yourDomain.com/yourUrl.html"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(r'\.html', ), allow_domains=('yourDomain.com', )),
             callback='parse_page', follow=False),
    )

    def __init__(self):
        CrawlSpider.__init__(self)
        self.verificationErrors = []
        # create a Firefox profile with specific add-ons and tell webdriver to load it
        profile = FirefoxProfile(profile_directory="/home/yourUser/.mozilla/firefox/selenium/")
        self.selenium = webdriver.Firefox(profile)

    def __del__(self):
        self.selenium.quit()
        print self.verificationErrors

    def parse_page(self, response):
        # normal scrapy result
        hxs = HtmlXPathSelector(response)

        # webdriver rendered page
        sel = self.selenium
        sel.get(response.url)

        if sel:
            # Wait for javascript to load in Selenium
            time.sleep(2.5)

            # Do some crawling of javascript created content with Selenium
            item = myItem()
            item['url'] = response.url
            item['title'] = hxs.select('//title/text()').extract()

            # something you can do only with webdrivers
            item['thatDiv'] = sel.find_element_by_id("thatDiv")
            yield item
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: rollsappletree
# date : Aug 25, 2011
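# Side note: a fixed time.sleep(2.5) is just a guess at how long the JavaScript takes. Selenium's
# WebDriverWait can poll for a specific element instead. A rough, untested sketch, assuming (purely
# for illustration) that the appearance of the "thatDiv" element means the page has finished rendering:
from selenium.webdriver.support.ui import WebDriverWait


def wait_for_render(driver, timeout=10):
    """Poll until the element we care about exists, or raise TimeoutException."""
    return WebDriverWait(driver, timeout).until(
        lambda d: d.find_element_by_id("thatDiv"))

# inside parse_page, instead of time.sleep(2.5):
#     wait_for_render(sel)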
# When you run the Scrapy crawler from a program, the code blocks until the Scrapy crawler is finished. This is due to how Twisted (the underlying asynchronous network library) works. This prevents using the Scrapy crawler from scripts or other code.
#
# To circumvent this issue you can run the Scrapy crawler in a thread with this code.
#
# Keep in mind that this code is mainly for illustrative purposes and far from production ready.
#
# Also the code was only tested with Scrapy 0.8, and will probably need some adjustments for newer versions (since the core API isn't stable yet), but you get the idea.
"""
Code to run Scrapy crawler in a thread - works on Scrapy 0.8
"""
import threading, Queue
from twisted.internet import reactor
from scrapy.xlib.pydispatch import dispatcher
from scrapy.core.manager import scrapymanager
from scrapy.core.engine import scrapyengine
from scrapy.core import signals
class CrawlerThread(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
self.running = False
def run(self):
self.running = True
scrapymanager.configure(control_reactor=False)
scrapymanager.start()
reactor.run(installSignalHandlers=False)
def crawl(self, *args):
if not self.running:
raise RuntimeError("CrawlerThread not running")
self._call_and_block_until_signal(signals.spider_closed, \
scrapymanager.crawl, *args)
def stop(self):
reactor.callFromThread(scrapyengine.stop)
def _call_and_block_until_signal(self, signal, f, *a, **kw):
q = Queue.Queue()
def unblock():
q.put(None)
dispatcher.connect(unblock, signal=signal)
reactor.callFromThread(f, *a, **kw)
q.get()
# Usage example below:

import os
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'myproject.settings')

from scrapy.xlib.pydispatch import dispatcher
from scrapy.core import signals
from scrapy.conf import settings
from scrapy.crawler import CrawlerThread

settings.overrides['LOG_ENABLED'] = False  # avoid log noise


def item_passed(item):
    print "Just scraped item:", item

dispatcher.connect(item_passed, signal=signals.item_passed)

crawler = CrawlerThread()
print "Starting crawler thread..."
crawler.start()
print "Crawling somedomain.com...."
crawler.crawl('somedomain.com')  # blocking call
print "Crawling anotherdomain.com..."
crawler.crawl('anotherdomain.com')  # blocking call
print "Stopping crawler thread..."
crawler.stop()
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: pablo
# date : Aug 11, 2010
# This middleware can be used to avoid re-visiting already visited items, which can be useful for speeding up the scraping of projects with immutable items, i.e. items that, once scraped, don't change.
from scrapy import log
from scrapy.http import Request
from scrapy.item import BaseItem
from scrapy.utils.request import request_fingerprint

from myproject.items import MyItem


class IgnoreVisitedItems(object):
    """Middleware to ignore re-visiting item pages if they were already visited
    before. Requests to be filtered must have the meta['filter_visited'] flag
    enabled, and may optionally define an id to use for identifying them, which
    defaults to the request fingerprint, although you'd want to use the item id,
    if you already have it beforehand, to make it more robust.
    """

    FILTER_VISITED = 'filter_visited'
    VISITED_ID = 'visited_id'
    CONTEXT_KEY = 'visited_ids'

    def process_spider_output(self, response, result, spider):
        context = getattr(spider, 'context', {})
        visited_ids = context.setdefault(self.CONTEXT_KEY, {})
        ret = []
        for x in result:
            visited = False
            if isinstance(x, Request):
                if self.FILTER_VISITED in x.meta:
                    visit_id = self._visited_id(x)
                    if visit_id in visited_ids:
                        log.msg("Ignoring already visited: %s" % x.url,
                                level=log.INFO, spider=spider)
                        visited = True
            elif isinstance(x, BaseItem):
                visit_id = self._visited_id(response.request)
                if visit_id:
                    visited_ids[visit_id] = True
                    x['visit_id'] = visit_id
                    x['visit_status'] = 'new'
            if visited:
                ret.append(MyItem(visit_id=visit_id, visit_status='old'))
            else:
                ret.append(x)
        return ret

    def _visited_id(self, request):
        return request.meta.get(self.VISITED_ID) or request_fingerprint(request)
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: pablo
# date : Aug 10, 2010
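# To actually use the middleware above, enable it in the project settings and set the
# 'filter_visited' flag on the requests that point at item pages. A hedged usage sketch:
# the module path, priority value, XPath and callback name below are made up for illustration.
#
# SPIDER_MIDDLEWARES = {
#     'myproject.middlewares.IgnoreVisitedItems': 560,
# }

from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector


def parse_listing(self, response):
    # inside a spider: flag item-page requests so that already-visited ones are
    # replaced with MyItem(visit_status='old') instead of being re-crawled
    hxs = HtmlXPathSelector(response)
    for url in hxs.select('//a[@class="item"]/@href').extract():
        yield Request(url, callback=self.parse_item,
                      meta={'filter_visited': True})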
# Standard Python library imports

# 3rd party modules
import pymongo

from scrapy import log
from scrapy.conf import settings
from scrapy.exceptions import DropItem


class MongoDBPipeline(object):
    def __init__(self):
        self.server = settings['MONGODB_SERVER']
        self.port = settings['MONGODB_PORT']
        self.db = settings['MONGODB_DB']
        self.col = settings['MONGODB_COLLECTION']
        connection = pymongo.Connection(self.server, self.port)
        db = connection[self.db]
        self.collection = db[self.col]

    def process_item(self, item, spider):
        err_msg = ''
        for field, data in item.items():
            if not data:
                err_msg += 'Missing %s of poem from %s\n' % (field, item['url'])
        if err_msg:
            raise DropItem(err_msg)
        self.collection.insert(dict(item))
        log.msg('Item written to MongoDB database %s/%s' % (self.db, self.col),
                level=log.DEBUG, spider=spider)
        return item
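# The MongoDBPipeline above reads its connection details from the project settings, so something
# along these lines needs to go in settings.py. The values are placeholders and the pipeline
# module path is an assumption; on the old Scrapy versions these snippets target, ITEM_PIPELINES
# is a list of dotted paths:

# settings.py
MONGODB_SERVER = 'localhost'
MONGODB_PORT = 27017
MONGODB_DB = 'mydb'
MONGODB_COLLECTION = 'poems'

ITEM_PIPELINES = ['myproject.pipelines.MongoDBPipeline']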
# Cannot use this to create the table, must have table already created

from twisted.enterprise import adbapi
import datetime
import MySQLdb.cursors

from scrapy import log


class SQLStorePipeline(object):
    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb', db='mydb',
                user='myuser', passwd='mypass',
                cursorclass=MySQLdb.cursors.DictCursor,
                charset='utf8', use_unicode=True)

    def process_item(self, item, spider):
        # run db query in thread pool
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        # create record if it doesn't exist.
        # this whole block runs in its own thread
        tx.execute("select * from websites where link = %s", (item['link'][0], ))
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            tx.execute(
                "insert into websites (link, created) "
                "values (%s, %s)",
                (item['link'][0], datetime.datetime.now())
            )
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def handle_error(self, e):
        log.err(e)
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: redtricycle
# date : Nov 21, 2011
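# The SQLStorePipeline above assumes the `websites` table already exists, so it has to be created
# up front. A minimal sketch of a schema inferred from the insert statement; the column types and
# sizes are assumptions, and the connection details should match the ConnectionPool arguments:

import MySQLdb


def create_websites_table():
    # run once, outside the crawl
    conn = MySQLdb.connect(db='mydb', user='myuser', passwd='mypass', charset='utf8')
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS websites (
            link VARCHAR(255) NOT NULL PRIMARY KEY,
            created DATETIME
        )
    """)
    conn.commit()
    conn.close()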
# You can use this middleware to send a random user agent with every request the spider makes.
# You can define a USER_AGENT_LIST in your settings and the spider will choose a random user agent from that list every time.
#
# You will have to disable the default user agent middleware and add this to your settings file.
#
# DOWNLOADER_MIDDLEWARES = {
# 'scraper.random_user_agent.RandomUserAgentMiddleware': 400,
# 'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
# }
from scraper.settings import USER_AGENT_LIST
import random
from scrapy import log


class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        ua = random.choice(USER_AGENT_LIST)
        if ua:
            request.headers.setdefault('User-Agent', ua)
        #log.msg('>>>> UA %s' % request.headers)
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: dushyant
# date : Sep 16, 2011
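# The middleware above expects a USER_AGENT_LIST defined in the scraper's settings module
# (scraper/settings.py in this layout). The strings below are just sample user agents:

USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 6.1; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3',
]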
# This script shows how to crawl a site without setting up a complete project.
#
# Note: `crawler.start()` can't be called more than once due to Twisted's reactor limitation.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: Rolando Espinoza La fuente
#
# Changelog:
# 24/07/2011 - updated to work with scrapy 13.0dev
# 25/08/2010 - initial version. works with scrapy 0.9

from scrapy.contrib.loader import XPathItemLoader
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider


class QuestionItem(Item):
    """Our SO Question Item"""
    title = Field()
    summary = Field()
    tags = Field()
    user = Field()
    posted = Field()
    votes = Field()
    answers = Field()
    views = Field()


class MySpider(BaseSpider):
    """Our ad-hoc spider"""
    name = "myspider"
    start_urls = ["http://stackoverflow.com/"]

    question_list_xpath = '//div[@id="content"]//div[contains(@class, "question-summary")]'

    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        for qxs in hxs.select(self.question_list_xpath):
            loader = XPathItemLoader(QuestionItem(), selector=qxs)
            loader.add_xpath('title', './/h3/a/text()')
            loader.add_xpath('summary', './/h3/a/@title')
            loader.add_xpath('tags', './/a[@rel="tag"]/text()')
            loader.add_xpath('user', './/div[@class="started"]/a[2]/text()')
            loader.add_xpath('posted', './/div[@class="started"]/a[1]/span/@title')
            loader.add_xpath('votes', './/div[@class="votes"]/div[1]/text()')
            loader.add_xpath('answers', './/div[contains(@class, "answered")]/div[1]/text()')
            loader.add_xpath('views', './/div[@class="views"]/div[1]/text()')

            yield loader.load_item()


def main():
    """Sets up the item signal and runs the spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(MySpider())

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"


if __name__ == '__main__':
    main()
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: darkrho
# date : Aug 25, 2010
# Standard Python library imports

# 3rd party imports
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

# My imports
from poetry_analysis.items import PoetryAnalysisItem

HTML_FILE_NAME = r'.+\.html'


class PoetryParser(object):
    """
    Provides a common parsing method for poems formatted this one specific way.
    """
    date_pattern = r'(\d{2} \w{3,9} \d{4})'

    def parse_poem(self, response):
        hxs = HtmlXPathSelector(response)
        item = PoetryAnalysisItem()
        # All poetry text is in pre tags
        text = hxs.select('//pre/text()').extract()
        item['text'] = ''.join(text)
        item['url'] = response.url
        # head/title contains title - a poem by author
        title_text = hxs.select('//head/title/text()').extract()[0]
        item['title'], item['author'] = title_text.split(' - ')
        item['author'] = item['author'].replace('a poem by', '')
        for key in ['title', 'author']:
            item[key] = item[key].strip()
        item['date'] = hxs.select("//p[@class='small']/text()").re(self.date_pattern)
        return item


class PoetrySpider(CrawlSpider, PoetryParser):
    name = 'example.com_poetry'
    allowed_domains = ['www.example.com']
    root_path = 'someuser/poetry/'
    start_urls = ['http://www.example.com/someuser/poetry/recent/',
                  'http://www.example.com/someuser/poetry/less_recent/']

    rules = [Rule(SgmlLinkExtractor(allow=[start_urls[0] + HTML_FILE_NAME]),
                  callback='parse_poem'),
             Rule(SgmlLinkExtractor(allow=[start_urls[1] + HTML_FILE_NAME]),
                  callback='parse_poem')]