scrapy
# Many times when crawling we run into problems where content rendered on
# the page is generated with Javascript, so Scrapy is unable to crawl it
# (e.g. ajax requests, jQuery). However, if you use Scrapy along with the
# web testing framework Selenium, you can crawl anything displayed in a
# normal web browser.
#
# Some things to note:
# You must have the Python version of Selenium RC installed, and Selenium
# must be set up properly, for this to work. Also, this is just a template
# crawler; you could get much more advanced, but I wanted to show the basic
# idea. As the code stands you will make two requests for any given url:
# one by Scrapy and one by Selenium. There are probably ways around this so
# that Selenium makes the one and only request, but I did not bother to
# implement that, and by doing two requests you get to crawl the page with
# Scrapy too (see the middleware sketch after this snippet).
#
# This is quite powerful because you now have the entire rendered DOM
# available to crawl while still using all the nice crawling features of
# Scrapy. It makes for slower crawling, of course, but depending on how
# much you need the rendered DOM it might be worth the wait.
import time

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
from selenium import selenium


class SeleniumSpider(CrawlSpider):
    name = "SeleniumSpider"
    start_urls = ["http://www.domain.com"]

    rules = (
        Rule(SgmlLinkExtractor(allow=('\.html', )), callback='parse_page', follow=True),
    )

    def __init__(self):
        CrawlSpider.__init__(self)
        self.verificationErrors = []
        self.selenium = selenium("localhost", 4444, "*chrome", "http://www.domain.com")
        self.selenium.start()

    def __del__(self):
        self.selenium.stop()
        print self.verificationErrors

    def parse_page(self, response):
        item = Item()
        hxs = HtmlXPathSelector(response)
        # Do some XPath selection with Scrapy
        hxs.select('//div').extract()

        sel = self.selenium
        sel.open(response.url)
        # Wait for javascript to load in Selenium
        time.sleep(2.5)
        # Do some crawling of javascript-created content with Selenium
        sel.get_text("//div")
        yield item
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: wynbennett
# date : Jun 21, 2011
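
# The notes above mention making Selenium do the one and only request. One
# possible way (an untested sketch, not part of the original snippet) is a
# downloader middleware that fetches flagged requests with webdriver and
# hands the rendered DOM back to Scrapy as an HtmlResponse, short-circuiting
# Scrapy's own download. The class name and the 'render_js' meta key are
# made up for illustration.

import time

from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumDownloaderMiddleware(object):

    def __init__(self):
        self.driver = webdriver.Firefox()

    def process_request(self, request, spider):
        # Only intercept requests that explicitly ask for rendering
        if not request.meta.get('render_js'):
            return None  # let Scrapy download it normally
        self.driver.get(request.url)
        time.sleep(2.5)  # crude wait for javascript, as in the spider above
        body = self.driver.page_source.encode('utf-8')
        # Returning a Response from process_request skips the download
        # handler entirely, so only Selenium ever hits the site
        return HtmlResponse(request.url, body=body, encoding='utf-8')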
# This is a piece of code that uses webdriver to load and render a page
# with Scrapy and Selenium.
#
# This work is based on the snippet that [wynbennett](http://snippets.scrapy.org/users/wynbennett/)
# [posted here](http://snippets.scrapy.org/snippets/21/) some time ago
import time

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from myItem.items import myItem
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile


class WebDriverSpider(CrawlSpider):
    name = "WebDriverSpider"
    start_urls = ["http://yourDomain.com/yourUrl.html"]

    rules = (
        Rule(SgmlLinkExtractor(allow=('\.html', ), allow_domains=('yourDomain.com', )),
             callback='parse_page', follow=False),
    )

    def __init__(self):
        CrawlSpider.__init__(self)
        self.verificationErrors = []
        # Create a Firefox profile with specific add-ons
        # and tell Firefox to load it
        profile = FirefoxProfile(profile_directory="/home/yourUser/.mozilla/firefox/selenium/")
        self.selenium = webdriver.Firefox(profile)

    def __del__(self):
        self.selenium.quit()
        print self.verificationErrors

    def parse_page(self, response):
        # Normal Scrapy result
        hxs = HtmlXPathSelector(response)
        # WebDriver-rendered page
        sel = self.selenium
        sel.get(response.url)
        if sel:
            # Wait for javascript to load in Selenium
            time.sleep(2.5)
            # Do some crawling of javascript-created content with Selenium
            item = myItem()
            item['url'] = response.url
            item['title'] = hxs.select('//title/text()').extract()
            # Something you can do only with webdriver: query the rendered DOM
            item['thatDiv'] = sel.find_element_by_id("thatDiv").text
            yield item
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: rollsappletree
# date : Aug 25, 2011
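
# The fixed time.sleep(2.5) above works but wastes time on fast pages and
# can break on slow ones. A minimal alternative sketch using selenium's
# WebDriverWait ("thatDiv" is the same placeholder element id used above):

from selenium.webdriver.support.ui import WebDriverWait


def wait_for_that_div(driver, timeout=10):
    # Polls until the element can be found, or raises TimeoutException
    return WebDriverWait(driver, timeout).until(
        lambda d: d.find_element_by_id("thatDiv"))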
# When you run the Scrapy crawler from a program, the code blocks until the
# crawler is finished, due to how Twisted (the underlying asynchronous
# network library) works. This prevents using the Scrapy crawler from
# scripts or other code.
#
# To circumvent this issue you can run the Scrapy crawler in a thread with
# this code.
#
# Keep in mind that this code is mainly for illustrative purposes and far
# from production ready.
#
# Also, the code was only tested with Scrapy 0.8 and will probably need
# some adjustments for newer versions (since the core API isn't stable
# yet), but you get the idea.
""" | |
Code to run Scrapy crawler in a thread - works on Scrapy 0.8 | |
""" | |
import threading, Queue | |
from twisted.internet import reactor | |
from scrapy.xlib.pydispatch import dispatcher | |
from scrapy.core.manager import scrapymanager | |
from scrapy.core.engine import scrapyengine | |
from scrapy.core import signals | |
class CrawlerThread(threading.Thread): | |
def __init__(self): | |
threading.Thread.__init__(self) | |
self.running = False | |
def run(self): | |
self.running = True | |
scrapymanager.configure(control_reactor=False) | |
scrapymanager.start() | |
reactor.run(installSignalHandlers=False) | |
def crawl(self, *args): | |
if not self.running: | |
raise RuntimeError("CrawlerThread not running") | |
self._call_and_block_until_signal(signals.spider_closed, \ | |
scrapymanager.crawl, *args) | |
def stop(self): | |
reactor.callFromThread(scrapyengine.stop) | |
def _call_and_block_until_signal(self, signal, f, *a, **kw): | |
q = Queue.Queue() | |
def unblock(): | |
q.put(None) | |
dispatcher.connect(unblock, signal=signal) | |
reactor.callFromThread(f, *a, **kw) | |
q.get() | |
# Usage example below:

import os
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'myproject.settings')

from scrapy.xlib.pydispatch import dispatcher
from scrapy.core import signals
from scrapy.conf import settings
from scrapy.crawler import CrawlerThread

settings.overrides['LOG_ENABLED'] = False  # avoid log noise

def item_passed(item):
    print "Just scraped item:", item

dispatcher.connect(item_passed, signal=signals.item_passed)

crawler = CrawlerThread()
print "Starting crawler thread..."
crawler.start()
print "Crawling somedomain.com...."
crawler.crawl('somedomain.com')  # blocking call
print "Crawling anotherdomain.com..."
crawler.crawl('anotherdomain.com')  # blocking call
print "Stopping crawler thread..."
crawler.stop()
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: pablo
# date : Aug 11, 2010
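
# On Scrapy releases after 0.8 the modules used above (scrapy.core.manager,
# scrapy.core.engine) no longer exist. A rough sketch of the later-API
# equivalent for running a crawl from a script (the spider name is a
# placeholder; exact details vary between versions):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('somespider')  # spider name or spider class
process.start()              # blocks until the crawl is finished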
# This middleware can be used to avoid re-visiting already-visited items,
# which can be useful for speeding up scraping in projects with immutable
# items, i.e. items that, once scraped, don't change.
from scrapy import log
from scrapy.http import Request
from scrapy.item import BaseItem
from scrapy.utils.request import request_fingerprint

from myproject.items import MyItem


class IgnoreVisitedItems(object):
    """Middleware to ignore re-visiting item pages if they were already
    visited before. Requests to be filtered must have the
    meta['filter_visited'] flag enabled, and may optionally define an id
    under meta['visited_id'] to identify them, which defaults to the
    request fingerprint. If you already know the item id beforehand, use
    it instead to make the filtering more robust.
    """
    FILTER_VISITED = 'filter_visited'
    VISITED_ID = 'visited_id'
    CONTEXT_KEY = 'visited_ids'

    def process_spider_output(self, response, result, spider):
        context = getattr(spider, 'context', {})
        visited_ids = context.setdefault(self.CONTEXT_KEY, {})
        ret = []
        for x in result:
            visited = False
            if isinstance(x, Request):
                if self.FILTER_VISITED in x.meta:
                    visit_id = self._visited_id(x)
                    if visit_id in visited_ids:
                        log.msg("Ignoring already visited: %s" % x.url,
                                level=log.INFO, spider=spider)
                        visited = True
            elif isinstance(x, BaseItem):
                visit_id = self._visited_id(response.request)
                if visit_id:
                    visited_ids[visit_id] = True
                    x['visit_id'] = visit_id
                    x['visit_status'] = 'new'
            if visited:
                ret.append(MyItem(visit_id=visit_id, visit_status='old'))
            else:
                ret.append(x)
        return ret

    def _visited_id(self, request):
        return request.meta.get(self.VISITED_ID) or request_fingerprint(request)
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: pablo
# date : Aug 10, 2010
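
# A minimal usage sketch (the dotted path and XPath are placeholders):
# enable the middleware in settings.py and flag the requests that should
# be filtered.
#
# SPIDER_MIDDLEWARES = {'myproject.middleware.IgnoreVisitedItems': 560}

from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector


def parse_index(self, response):
    """Spider callback: mark item-page requests for the middleware.

    'visited_id' is optional and defaults to the request fingerprint;
    pass a stable item id in meta instead if you already have one.
    """
    hxs = HtmlXPathSelector(response)
    for url in hxs.select('//a[@class="item"]/@href').extract():
        yield Request(url, callback=self.parse_item,
                      meta={'filter_visited': True})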
# 3rd party modules
import pymongo

from scrapy import log
from scrapy.conf import settings
from scrapy.exceptions import DropItem


class MongoDBPipeline(object):

    def __init__(self):
        self.server = settings['MONGODB_SERVER']
        self.port = settings['MONGODB_PORT']
        self.db = settings['MONGODB_DB']
        self.col = settings['MONGODB_COLLECTION']
        connection = pymongo.Connection(self.server, self.port)
        db = connection[self.db]
        self.collection = db[self.col]

    def process_item(self, item, spider):
        err_msg = ''
        for field, data in item.items():
            if not data:
                err_msg += 'Missing %s of poem from %s\n' % (field, item['url'])
        if err_msg:
            raise DropItem(err_msg)
        self.collection.insert(dict(item))
        log.msg('Item written to MongoDB database %s/%s' % (self.db, self.col),
                level=log.DEBUG, spider=spider)
        return item
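
# The pipeline above reads its connection details from the project settings;
# a minimal settings.py sketch (values are examples, and the dotted pipeline
# path depends on your project layout):

MONGODB_SERVER = 'localhost'
MONGODB_PORT = 27017
MONGODB_DB = 'scrapy'
MONGODB_COLLECTION = 'items'

ITEM_PIPELINES = ['myproject.pipelines.MongoDBPipeline']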
# Cannot use this to create the table; the table must already exist.
import datetime

import MySQLdb.cursors
from twisted.enterprise import adbapi

from scrapy import log


class SQLStorePipeline(object):

    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb', db='mydb',
                user='myuser', passwd='mypass',
                cursorclass=MySQLdb.cursors.DictCursor,
                charset='utf8', use_unicode=True)

    def process_item(self, item, spider):
        # run db query in thread pool
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        # create the record if it doesn't exist;
        # this whole block runs in its own thread
        tx.execute("select * from websites where link = %s", (item['link'][0], ))
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            tx.execute(
                "insert into websites (link, created) "
                "values (%s, %s)",
                (item['link'][0], datetime.datetime.now()))
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def handle_error(self, e):
        log.err(e)
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: redtricycle
# date : Nov 21, 2011
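
# The pipeline assumes the `websites` table already exists. A minimal
# schema matching the two columns used above (the types are a reasonable
# guess, not part of the original snippet):
#
#   CREATE TABLE websites (
#       link    VARCHAR(255) NOT NULL PRIMARY KEY,
#       created DATETIME NOT NULL
#   ) DEFAULT CHARSET=utf8;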
# You can use this middleware to have a random user agent for every request
# the spider makes.
# Define a USER_AGENT_LIST in your settings and the spider will choose a
# random user agent from that list every time.
#
# You will have to disable the default user agent middleware and add this
# to your settings file:
#
# DOWNLOADER_MIDDLEWARES = {
#     'scraper.random_user_agent.RandomUserAgentMiddleware': 400,
#     'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
# }
import random

from scraper.settings import USER_AGENT_LIST

from scrapy import log


class RandomUserAgentMiddleware(object):

    def process_request(self, request, spider):
        ua = random.choice(USER_AGENT_LIST)
        if ua:
            request.headers.setdefault('User-Agent', ua)
        #log.msg('>>>> UA %s' % request.headers)
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: dushyant
# date : Sep 16, 2011
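
# The middleware expects a USER_AGENT_LIST in your settings module; a
# minimal example (the strings below are just illustrative):

USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 6.1; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.106 Safari/535.2',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.52.7 (KHTML, like Gecko) Version/5.1.2 Safari/534.52.7',
]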
# This script shows how to crawl a site without setting up a complete
# project.
#
# Note: `crawler.start()` can't be called more than once due to Twisted's
# reactor limitation (the reactor cannot be restarted).
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: Rolando Espinoza La fuente
#
# Changelog:
# 24/07/2011 - updated to work with scrapy 0.13dev
# 25/08/2010 - initial version. works with scrapy 0.9

from scrapy.contrib.loader import XPathItemLoader
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider


class QuestionItem(Item):
    """Our SO Question Item"""
    title = Field()
    summary = Field()
    tags = Field()
    user = Field()
    posted = Field()
    votes = Field()
    answers = Field()
    views = Field()


class MySpider(BaseSpider):
    """Our ad-hoc spider"""
    name = "myspider"
    start_urls = ["http://stackoverflow.com/"]

    question_list_xpath = '//div[@id="content"]//div[contains(@class, "question-summary")]'

    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        for qxs in hxs.select(self.question_list_xpath):
            loader = XPathItemLoader(QuestionItem(), selector=qxs)
            loader.add_xpath('title', './/h3/a/text()')
            loader.add_xpath('summary', './/h3/a/@title')
            loader.add_xpath('tags', './/a[@rel="tag"]/text()')
            loader.add_xpath('user', './/div[@class="started"]/a[2]/text()')
            loader.add_xpath('posted', './/div[@class="started"]/a[1]/span/@title')
            loader.add_xpath('votes', './/div[@class="votes"]/div[1]/text()')
            loader.add_xpath('answers', './/div[contains(@class, "answered")]/div[1]/text()')
            loader.add_xpath('views', './/div[@class="views"]/div[1]/text()')

            yield loader.load_item()


def main():
    """Set up the item signal and run the spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(MySpider())

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"


if __name__ == '__main__':
    main()
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: darkrho
# date : Aug 25, 2010
# 3rd party imports
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

# My imports
from poetry_analysis.items import PoetryAnalysisItem

HTML_FILE_NAME = r'.+\.html'


class PoetryParser(object):
    """
    Provides a common parsing method for poems formatted this one specific way.
    """
    date_pattern = r'(\d{2} \w{3,9} \d{4})'

    def parse_poem(self, response):
        hxs = HtmlXPathSelector(response)
        item = PoetryAnalysisItem()
        # All poetry text is in pre tags
        text = hxs.select('//pre/text()').extract()
        item['text'] = ''.join(text)
        item['url'] = response.url
        # head/title contains: title - a poem by author
        title_text = hxs.select('//head/title/text()').extract()[0]
        item['title'], item['author'] = title_text.split(' - ')
        item['author'] = item['author'].replace('a poem by', '')
        for key in ['title', 'author']:
            item[key] = item[key].strip()
        item['date'] = hxs.select("//p[@class='small']/text()").re(self.date_pattern)
        return item


class PoetrySpider(CrawlSpider, PoetryParser):
    name = 'example.com_poetry'
    allowed_domains = ['www.example.com']
    root_path = 'someuser/poetry/'

    start_urls = ['http://www.example.com/someuser/poetry/recent/',
                  'http://www.example.com/someuser/poetry/less_recent/']

    rules = [Rule(SgmlLinkExtractor(allow=[start_urls[0] + HTML_FILE_NAME]),
                  callback='parse_poem'),
             Rule(SgmlLinkExtractor(allow=[start_urls[1] + HTML_FILE_NAME]),
                  callback='parse_poem')]