Skip to content

Instantly share code, notes, and snippets.

@yashrsharma44
Created July 31, 2018 14:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save yashrsharma44/4b82ef91c5512a0d711d9855e3e19e81 to your computer and use it in GitHub Desktop.
Sample spider for running the new asyncio support in scrapy
import scrapy
from scrapy.Fetch import Fetch
import asyncio
import aiohttp
class QuotesSpider(scrapy.Spider):
    """Sample spider exercising the experimental asyncio support in Scrapy.

    ``start_requests`` and the callbacks are ``async def``: ``parse`` awaits an
    aiohttp request and Scrapy's experimental ``Fetch`` coroutine directly,
    instead of yielding everything back to the scheduler.
    """

    name = "quotes"

    async def start_requests(self):
        # Two pages of the quotes demo site; each response is handled by parse().
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    async def parse(self, response):
        # Follow-up links: the last two @href values on the page.
        # Extract once instead of running the identical XPath query twice.
        hrefs = response.xpath('//@href').extract()
        links = [hrefs[-1], hrefs[-2]]
        print("Started the aiohttp module!!")
        # NOTE(review): verify_ssl= is deprecated in aiohttp 3.x; the supported
        # spelling is TCPConnector(ssl=False) — equivalent behavior.
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            print("Inside the aiohttp Client Session!!")
            # NOTE(review): self.fetch is not defined on this spider —
            # presumably a helper from the original experiment; confirm.
            html = await self.fetch(session, 'https://python-forum.io/Thread-Exploring-async-await-without-knowing-how-they-work-ahead-of-time?pid=17292')
            print(html)
        print("Completed the aiohttp!!")
        spider = response.spider  # One has to get spider and crawler with response, in order to use Fetch. Will work on updating this!
        crawler = response.crawler
        for link in links:
            res = await Fetch(url=link, crawler=crawler, spider=spider)  # You can use yield scrapy.Request(...), for using a callback
            print("Before the asyncio.sleep!!")
            await asyncio.sleep(5)
            print("___RESPONSE___and link {!r}__________________________________________________________{!r}".format(link,res))
        print("---------------------------END OF PARSE------------------------------------------------")

    async def parse2(self, response):
        # Persist the raw response body to quotes-<page>.html for inspection.
        page = response.url.split("/")[-2]
        print("/////////////////////-----------IN PARSE 2----------------------------//////////////////////")
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
        print("----END OF PARSE2 ------------")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment