Skip to content

Instantly share code, notes, and snippets.

@iwatake2222
Last active December 6, 2017 04:40
Show Gist options
  • Save iwatake2222/872467b79c8799db69dbd7b0755e4490 to your computer and use it in GitHub Desktop.
Python + Scrapyで画像を巡回取得する ref: https://qiita.com/take-iwiw/items/be292150316a3ca033eb
# Create a new Scrapy project named "test_scrapy".
scrapy startproject test_scrapy
cd test_scrapy
# Generate a spider skeleton "save_yahoo_image" restricted to news.yahoo.co.jp.
scrapy genspider save_yahoo_image news.yahoo.co.jp
# Run the spider.
scrapy crawl save_yahoo_image
# -*- coding: utf-8 -*-
import scrapy
from scrapy.item import Item, Field
class ImageItem(Item):
    """Item carrying the image URLs scraped from one page.

    ``image_urls`` and ``images`` are the standard input/output field
    names expected by Scrapy's ImagesPipeline; ``image_directory_name``
    is read by the custom pipeline to pick the storage sub-directory.
    """

    # Sub-directory (under IMAGES_STORE/full/) the pipeline saves into.
    image_directory_name = Field()
    # Absolute URLs of the images to download (ImagesPipeline input).
    image_urls = Field()
    # Download results filled in by the pipeline (ImagesPipeline output).
    images = Field()
# -*- coding: utf-8 -*-
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.misc import md5sum
# refer: https://stackoverflow.com/questions/31779995/how-to-give-custom-name-to-images-when-downloading-through-scrapy
class MyImagesPipeline(ImagesPipeline):
    """ImagesPipeline variant that stores each image under
    ``full/<image_directory_name>/<original basename>`` instead of the
    default checksum-based filename.
    """

    def get_media_requests(self, item, info):
        """Yield one download Request per URL in ``item['image_urls']``,
        passing the target directory name along via ``request.meta``."""
        for image_url in item['image_urls']:
            yield scrapy.Request(
                image_url,
                meta={'image_directory_name': item["image_directory_name"]})

    def image_downloaded(self, response, request, info):
        """Persist the downloaded image(s) and return the md5 checksum.

        Overrides the parent implementation only to change the storage
        path: the file keeps its original basename, grouped under the
        directory name carried in ``request.meta``.
        """
        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            # Fix: use the public ``request.url`` attribute; the original
            # read the private ``request._url``, which is an internal
            # Scrapy detail and may break between versions.
            filename = request.url.rsplit("/", 1)[1]
            path = 'full/%s/%s' % (response.meta['image_directory_name'], filename)
            self.store.persist_file(
                path, buf, info,
                meta={'width': width, 'height': height})
        return checksum
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from test_scrapy.items import ImageItem
class SaveYahooImageSpider(CrawlSpider):
    """Crawl news.yahoo.co.jp following every internal link and emit an
    ImageItem per page listing all ``<img src>`` URLs found on it.
    """

    name = 'save_yahoo_image'
    allowed_domains = ["news.yahoo.co.jp"]
    start_urls = ["https://news.yahoo.co.jp"]
    rules = (
        # Empty allow tuple = follow every link (allowed_domains still filters).
        Rule(LinkExtractor(allow=()), callback="parse_page", follow=True),
    )

    def parse_page(self, response):
        """Collect every image URL on the page into an ImageItem."""
        print("\n>>> Parse " + response.url + " <<<")
        item = ImageItem()
        # Directory name = host part of the start URL, e.g. "news.yahoo.co.jp".
        item["image_directory_name"] = self.start_urls[0].rsplit("/", 1)[1]
        item["image_urls"] = []
        for image_url in response.xpath("//img/@src").extract():
            # Fix: resolve relative src values with response.urljoin().
            # The original tested `"http" not in image_url` (a substring
            # check that misclassifies relative URLs merely containing
            # "http") and joined with rsplit, which breaks for
            # root-relative paths like "/images/x.png".
            item["image_urls"].append(response.urljoin(image_url))
        return item
# Route scraped items through the custom pipeline (lower number = earlier in the chain).
ITEM_PIPELINES = {'test_scrapy.pipelines.MyImagesPipeline': 1}
# ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
# Root directory where ImagesPipeline stores downloaded files.
IMAGES_STORE = './savedImages'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment