@raphapassini
Created December 5, 2017 17:30
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from scrapy import signals


class G1Spider(scrapy.Spider):
    """Feeds start URLs one at a time by scheduling the next one from the
    spider_idle signal instead of queueing them all up front."""

    name = 'g1'
    allowed_domains = ['g1.globo.com']
    start_urls = [
        'https://g1.globo.com/mg/centro-oeste/noticia/criminosos-cercam-quartel-em-pompeu-e-policiais-sao-mortos.ghtml',
        'http://g1.globo.com/minas-gerais/interatividade/enquete/2017/12/4/escolha-o-tema-da-entrevista-no-estudio-do-mgtv-para-quinta-feira-f0445236-d8ee-11e7-851f-0242ac110003.html',
        'https://g1.globo.com/mg/zona-da-mata/noticia/trecho-interditado-da-br-116-na-zona-da-mata-deve-ser-liberado-ate-o-final-de-semana.ghtml',
    ]
    counter = 0

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Connect the spider_idle signal so the spider can schedule the
        # next URL whenever the scheduler runs dry.
        spider = super(G1Spider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def spider_idle(self, spider):
        self.logger.info('Spider is IDLE')
        if not self.start_urls:
            return  # nothing left to crawl; let the spider close
        self.counter += 1
        for req in self.start_requests():
            # Scheduling a request from the idle handler keeps the
            # spider alive for another round.
            self.crawler.engine.crawl(req, self)

    def start_requests(self):
        # Pop a single URL per call instead of yielding the whole list.
        self.logger.info('Sending URL #{}'.format(self.counter))
        url = self.start_urls.pop()
        yield Request(url, meta={'idx': self.counter})

    def parse(self, response):
        self.logger.info('Received the response #{}'.format(
            response.meta['idx']))
        # dont_filter=True lets this off-site request through both the
        # dupe filter and the offsite middleware.
        yield Request(
            'http://python.org', dont_filter=True,
            callback=self.parse_step2, meta={'idx': response.meta['idx']})

    def parse_step2(self, response):
        self.logger.info('Reached step 2 for URL #{}'.format(
            response.meta['idx']))
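

if __name__ == '__main__':
    # A minimal sketch of running this spider standalone, assuming Scrapy is
    # installed; CrawlerProcess is standard Scrapy, but this block is not
    # part of the original gist.
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(G1Spider)
    process.start()  # blocks until the URL list is drained and the spider closes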