Skip to content

Instantly share code, notes, and snippets.

Last active May 16, 2021 11:34
Show Gist options
  • Save boatgm/4451253 to your computer and use it in GitHub Desktop.
Save boatgm/4451253 to your computer and use it in GitHub Desktop.
scrapy examples
from scrapy.spider import BaseSpider
class MindhacksSpider(BaseSpider):
    """Bare-bones spider skeleton whose parser produces nothing."""

    # Both values are blank placeholders in this example.
    domain_name = ""
    start_urls = [""]

    def parse(self, response):
        """Ignore ``response`` and return an empty result list."""
        return list()


# Module-level instance of the spider.
SPIDER = MindhacksSpider()
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.http import Request
from myproject.items import MyItem
class MySpider(BaseSpider):
    """Spider that emits one item per ``<h3>`` and follows every link.

    Each response is scanned for ``<h3>`` elements, which become ``MyItem``
    objects, and for ``<a href>`` values, which become new requests handled
    by the same ``parse`` callback.
    """

    name = ''
    allowed_domains = ['']
    # NOTE(review): the URL entries are blank placeholders -- fill in the
    # real start URLs before running this spider.
    start_urls = ['']

    def parse(self, response):
        """Yield a ``MyItem`` per heading and a ``Request`` per link.

        Args:
            response: the downloaded page to scan.

        Yields:
            ``MyItem(title=...)`` for every extracted ``//h3`` node, then a
            ``Request`` (with this method as its callback) for every
            extracted ``//a/@href`` value.
        """
        hxs = HtmlXPathSelector(response)

        # NOTE(review): assumes the selector's XPath method is ``select``,
        # matching the ``HtmlXPathSelector`` import -- confirm against the
        # installed Scrapy version.
        for heading in hxs.select('//h3').extract():
            yield MyItem(title=heading)

        for href in hxs.select('//a/@href').extract():
            yield Request(href, callback=self.parse)
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
class MySpider(CrawlSpider):
    """Rule-driven spider: follows category pages and scrapes item pages."""

    name = ''
    allowed_domains = ['']
    start_urls = ['']

    rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        Rule(SgmlLinkExtractor(allow=(r'category\.php', ), deny=(r'subsection\.php', ))),

        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        Rule(SgmlLinkExtractor(allow=(r'item\.php', )), callback='parse_item'),
    )

    def parse_item(self, response):
        """Build an item from the id, name and description table cells.

        Args:
            response: the downloaded item page.

        Returns:
            The populated item. ``id`` holds the regex captures from the
            ``item_id`` cell; ``name`` and ``description`` hold the extracted
            text nodes of their cells.
        """
        self.log('Hi, this is an item page! %s' % response.url)

        hxs = HtmlXPathSelector(response)
        # NOTE(review): a bare ``Item`` declares no fields; confirm that key
        # assignment is accepted here, or switch to a project item class
        # that declares 'id', 'name' and 'description'.
        item = Item()
        # NOTE(review): assumes the selector's XPath method is ``select``,
        # matching the ``HtmlXPathSelector`` import -- confirm.
        item['id'] = hxs.select('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
        item['name'] = hxs.select('//td[@id="item_name"]/text()').extract()
        item['description'] = hxs.select('//td[@id="item_description"]/text()').extract()
        return item
from scrapy import log
from scrapy.contrib.spiders import XMLFeedSpider
from myproject.items import TestItem
class MySpider(XMLFeedSpider):
    """XML feed spider that turns each ``<item>`` node into a ``TestItem``."""

    name = ''
    allowed_domains = ['']
    start_urls = ['']
    iterator = 'iternodes'  # This is actually unnecessary, since it's the default value
    itertag = 'item'

    def parse_node(self, response, node):
        """Log the node and copy its id, name and description into an item.

        Args:
            response: the feed response the node came from.
            node: selector for one ``itertag`` element of the feed.

        Returns:
            A ``TestItem`` whose fields hold the extracted ``@id``, ``name``
            and ``description`` results for this node.
        """
        log.msg('Hi, this is a <%s> node!: %s' % (self.itertag, ''.join(node.extract())))

        # ``TestItem`` is the item class this snippet imports; the bare
        # ``Item`` name is not imported here.
        item = TestItem()
        # NOTE(review): assumes the node selector's XPath method is
        # ``select`` -- confirm against the installed Scrapy version.
        item['id'] = node.select('@id').extract()
        item['name'] = node.select('name').extract()
        item['description'] = node.select('description').extract()
        return item
from scrapy import log
from scrapy.contrib.spiders import CSVFeedSpider
from myproject.items import TestItem
class MySpider(CSVFeedSpider):
    """CSV feed spider that converts every row into a ``TestItem``."""

    name = ''
    allowed_domains = ['']
    start_urls = ['']
    # Feed settings: ';' as the separator and three named columns.
    delimiter = ';'
    headers = ['id', 'name', 'description']

    def parse_row(self, response, row):
        """Log ``row`` and copy its three columns into a new ``TestItem``.

        Args:
            response: the feed response the row came from.
            row: mapping indexed by the column names in ``headers``.

        Returns:
            The populated ``TestItem``.
        """
        log.msg('Hi, this is a row!: %r' % row)

        record = TestItem()
        # Same three fields, copied in the same order.
        for column in ('id', 'name', 'description'):
            record[column] = row[column]
        return record
def parse(self, response):
    """Collect follow-up requests from a blog index page.

    Queues one request per post link (``//h1/a/@href``), to be handled by
    ``parse_post``, plus a request for the next index page when the pager
    contains a link whose text is the "»" character.

    Args:
        response: the downloaded index page.

    Returns:
        A list of requests to schedule.
    """
    items = []
    hxs = HtmlXPathSelector(response)

    # NOTE(review): the request-building calls below are a reconstruction.
    # They assume ``self`` is a spider exposing ``make_requests_from_url``
    # and that requests support ``replace(callback=...)`` -- confirm
    # against the enclosing class and the installed Scrapy version.
    posts = hxs.x('//h1/a/@href').extract()
    items.extend([self.make_requests_from_url(url).replace(callback=self.parse_post)
                  for url in posts])

    page_links = hxs.x('//div[@class="wp-pagenavi"]/a[not(@title)]')
    for link in page_links:
        # Pager links without a text node (or without an href) are skipped
        # instead of raising IndexError.
        label = link.x('text()').extract()
        if label and label[0] == u'\xbb':  # u'\xbb' is the "»" character
            hrefs = link.x('@href').extract()
            if hrefs:
                items.append(self.make_requests_from_url(hrefs[0]))
    return items
def parse_post(self, response):
    """Wrap a fetched post page in a single-element list of items.

    Args:
        response: the downloaded post page.

    Returns:
        A one-element list holding a ``BlogCrawlItem`` whose ``url`` is the
        response URL as unicode and whose ``raw`` is the unicode body.
    """
    post = BlogCrawlItem()
    # Python 2 code: ``unicode`` is the builtin text type here.
    post.url = unicode(response.url)
    post.raw = response.body_as_unicode()
    return [post]
class BlogCrawlItem(ScrapedItem):
    """Scraped blog post record identified by its URL."""

    def __init__(self):
        """Start with a blank ``url``."""
        self.url = ''

    def __str__(self):
        """Return a short label that shows only the URL."""
        return 'BlogCrawlItem(url: %s)' % self.url
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment