Skip to content

Instantly share code, notes, and snippets.

@briehanlombaard
Created August 25, 2016 08:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save briehanlombaard/b19885f99b9ee26363070fd9883695b9 to your computer and use it in GitHub Desktop.
Save briehanlombaard/b19885f99b9ee26363070fd9883695b9 to your computer and use it in GitHub Desktop.
import scrapy
from io import BytesIO
from PIL import Image, ImageOps
from scrapy.pipelines.images import ImagesPipeline
class ArticleItem(scrapy.Item):
    """Scraped article item that carries image URLs and image results."""

    # URLs of the images to fetch; the spider's ``parse`` fills this in.
    image_urls = scrapy.Field()

    # Image results. Presumably populated by the images pipeline (Scrapy's
    # default result field is ``images``) -- the spider never sets it itself.
    images = scrapy.Field()
class ArticleImagesPipeline(ImagesPipeline):
    """Images pipeline whose resized outputs match the requested size exactly."""

    def convert_image(self, image, size=None, response_body=None):
        """Convert ``image`` to RGB, optionally fit it to ``size``, encode as JPEG.

        Args:
            image: The PIL image to convert.
            size: Optional ``(width, height)`` tuple. When given, the image is
                scaled and cropped with ``ImageOps.fit`` so the result has
                exactly these dimensions.
            response_body: Accepted only so the override matches the
                ``convert_image`` signature used by newer Scrapy releases,
                which pass the already-downloaded bytes. It is deliberately
                unused: every image is re-encoded by this method.

        Returns:
            A ``(image, buf)`` tuple: the converted PIL image and a ``BytesIO``
            holding its JPEG encoding. The buffer is returned as written (not
            rewound).
        """
        if image.format == 'PNG' and image.mode == 'RGBA':
            # JPEG has no alpha channel, so composite the picture onto a
            # white canvas, using the image itself as the paste mask.
            background = Image.new('RGBA', image.size, (255, 255, 255))
            background.paste(image, image)
            image = background.convert('RGB')
        elif image.mode != 'RGB':
            image = image.convert('RGB')

        if size:
            # Use ImageOps.fit instead of image.thumbnail so that we get an
            # image with the exact dimensions given by `size`.
            # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the
            # same filter under its current name.
            image = ImageOps.fit(image, size, Image.LANCZOS)

        buf = BytesIO()
        image.save(buf, 'JPEG')
        return image, buf
class ArticleSpider(scrapy.Spider):
    """Spider that fetches one fixed article page and emits one image item."""

    name = 'article'

    # Per-spider settings: enable the custom images pipeline, store files
    # under ``articles`` and request two thumbnail sizes.
    # NOTE(review): the pipeline path presumes this module is importable as
    # ``spider`` -- confirm against the actual file name.
    custom_settings = dict(
        ITEM_PIPELINES={'spider.ArticleImagesPipeline': 100},
        IMAGES_STORE='articles',
        IMAGES_THUMBS=dict(small=(290, 150), medium=(610, 320)),
        AUTOTHROTTLE_ENABLED=True,
    )

    def start_requests(self):
        """Yield the single request for the hard-coded article URL."""
        article_url = u'http://www.latimes.com/travel/la-trb-ellis-island-video-20160621-premiumvideo.html'
        yield scrapy.Request(article_url)

    def parse(self, response):
        """Yield one ``ArticleItem`` pointing at a fixed image URL.

        The ``response`` content is not inspected; the image URL is hard-coded.
        """
        image_url = 'http://www.trbimg.com/img-57699896/turbine/la-trb-ellis-island-video-20160621'
        item = ArticleItem(image_urls=[image_url])
        yield item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment