Skip to content

Instantly share code, notes, and snippets.

@juanriaza
Created November 2, 2015 14:03
Show Gist options
  • Save juanriaza/5b5f66b864313c2902d0 to your computer and use it in GitHub Desktop.
Save juanriaza/5b5f66b864313c2902d0 to your computer and use it in GitHub Desktop.
import json
import os
import urllib

try:  # Python 3
    from urllib.parse import quote_plus, urlsplit
except ImportError:  # Python 2
    from urllib import quote_plus
    from urlparse import urlsplit

import scrapy
class ExampleSpider(scrapy.Spider):
    """Scrape habrahabr.ru posts and pair each one with a downloaded image.

    Callback chain: ``parse`` -> ``habrapost`` -> ``image_search`` ->
    ``image``.  The partially built post dict travels between callbacks in
    ``Request.meta['post']``.

    Yielded items are dicts with the keys ``url``, ``title`` and ``image``.
    ``image`` is the path of the saved file, or ``None`` when the image
    search returned nothing usable.
    """

    name = 'habrahabr.ru'
    start_urls = ['http://habrahabr.ru/']

    # Directory (relative to the working directory) where images are saved.
    images_dir = 'images'
    # File name used when the image URL's path has no usable last segment.
    fallback_image_name = 'image'

    def parse(self, response):
        """Yield one request per post link found on the listing page."""
        hrefs = response.xpath(
            '//h1[@class="title"]'
            '/a[@class="post_title"]/@href').extract()
        for href in hrefs:
            # Request() rejects URLs without a scheme, so resolve relative
            # hrefs against the page URL before building the request.
            yield scrapy.Request(
                response.urljoin(href), callback=self.habrapost)

    def habrapost(self, response):
        """Read the post title and request an image search for it.

        Posts whose title cannot be located are logged and skipped.
        """
        title = response.xpath(
            '//h1/span[@class="post_title"]//text()').extract_first()
        if title is None:
            # extract_first() returns None when the XPath matches nothing;
            # there is nothing to search for in that case.
            self.logger.warning(
                'No post title found on %s; skipping', response.url)
            return

        post = {
            'url': response.url,
            'title': title,
        }
        # Encode explicitly: Python 2's quote_plus fails on non-ASCII
        # unicode input, and Python 3's accepts bytes as well as str.
        query = quote_plus(title.encode('utf-8'))
        search_url = 'https://ajax.googleapis.com/ajax/services/search/' \
                     'images?v=1.0&q={}'.format(query)
        yield scrapy.Request(
            search_url,
            meta={'post': post},
            callback=self.image_search)

    def image_search(self, response):
        """Request the first image from the search response.

        When the response holds no usable result the post is yielded right
        away with ``image`` set to ``None`` instead of being dropped.
        """
        post = response.meta['post']
        image_url = self._first_image_url(response)
        if image_url is None:
            self.logger.warning(
                'No image result for post %s', post.get('url'))
            post['image'] = None
            yield post
            return

        yield scrapy.Request(
            image_url,
            meta={'post': post},
            callback=self.image)

    def _first_image_url(self, response):
        """Return the ``url`` of the first search result, or ``None``.

        ``None`` covers every malformed shape: a body that is not valid
        UTF-8 JSON, a missing or null ``responseData``, an empty
        ``results`` list, or a first result without a ``url``.
        """
        try:
            # UnicodeDecodeError and JSON decode errors are both ValueError.
            payload = json.loads(response.body.decode('utf-8'))
        except ValueError:
            return None

        data = payload.get('responseData') \
            if isinstance(payload, dict) else None
        results = data.get('results') if isinstance(data, dict) else None
        if not isinstance(results, list) or not results:
            return None

        first = results[0]
        if not isinstance(first, dict):
            return None
        return first.get('url') or None

    def image(self, response):
        """Save the downloaded image and yield the completed post."""
        # Take the name from the URL *path* only, so a query string or
        # fragment does not end up in the file name.
        filename = os.path.basename(urlsplit(response.url).path)
        if filename in ('', '.', '..'):
            # URL path ended in '/' or a dot segment: no usable name.
            filename = self.fallback_image_name

        if not os.path.isdir(self.images_dir):
            os.makedirs(self.images_dir)

        path = '%s/%s' % (self.images_dir, filename)
        with open(path, 'wb') as f:
            f.write(response.body)

        post = response.meta['post']
        post['image'] = path
        yield post
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment