Skip to content

Instantly share code, notes, and snippets.

@rafikahmed
Created May 19, 2020 20:45
Show Gist options
  • Save rafikahmed/c2c4524178b89985ee19085a3d800c4a to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import unicodedata
import re
import scrapy
from scrapy.loader.processors import TakeFirst, MapCompose
def slugify(value, allow_unicode=False):
    """Normalize *value* into a URL/filename-safe slug.

    Lowercases the text, collapses runs of whitespace and hyphens into a
    single hyphen, and drops every character that is not alphanumeric, an
    underscore, or a hyphen. When ``allow_unicode`` is False the text is
    first transliterated to ASCII (characters with no ASCII decomposition
    are dropped). Leading/trailing whitespace is stripped.
    """
    text = str(value)
    if allow_unicode:
        text = unicodedata.normalize('NFKC', text)
    else:
        # Decompose accents, then drop anything that isn't plain ASCII.
        text = unicodedata.normalize('NFKD', text)
        text = text.encode('ascii', 'ignore').decode('ascii')
    text = re.sub(r'[^\w\s-]', '', text.lower()).strip()
    return re.sub(r'[-\s]+', '-', text)
class ScrapeimagesItem(scrapy.Item):
    """Item carrying a book's cover-image URLs and its slugified name."""

    # Standard ImagesPipeline fields: URLs to download, and the results.
    image_urls = scrapy.Field()
    images = scrapy.Field()
    # Book title; slugified on the way in, first extracted value kept.
    book_name = scrapy.Field(
        output_processor=TakeFirst(),
        input_processor=MapCompose(slugify),
    )
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline
import hashlib
class ScrapeimagesPipeline(ImagesPipeline):
    """Download cover images, naming each file after the item's slugified book name.

    ``get_media_requests`` forwards the item's ``book_name`` through
    ``request.meta`` so ``file_path`` can use it as the on-disk filename.
    """

    def get_media_requests(self, item, info):
        # Attach the (already slugified) book name to every image request.
        book_name = item.get('book_name')
        return [
            Request(url, meta={'BookName': book_name})
            for url in item.get(self.images_urls_field, [])
        ]

    # NOTE: the copy-pasted deprecation shim that used to live here touched
    # self.file_key / self.image_key, attributes removed from modern Scrapy's
    # ImagesPipeline -- accessing them raised AttributeError. It is dropped.
    # The keyword-only `item` parameter keeps compatibility with newer Scrapy,
    # which passes it; older callers that omit it still work.
    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the store-relative path for a downloaded image."""
        book_name = request.meta.get('BookName')
        if not book_name:
            # No book name on the item: fall back to a stable URL hash so the
            # download does not crash (the original did .replace on None).
            book_name = hashlib.sha1(request.url.encode('utf-8')).hexdigest()
        # Strip ':' -- it is illegal in Windows filenames (IMAGES_STORE below
        # points at a Windows path).
        filename = book_name.replace(':', '')
        return 'full/%s.jpg' % filename
# Scrapy project settings for the image-download pipeline.
ITEM_PIPELINES = {
    '<project_name>.pipelines.ScrapeimagesPipeline': 1,  # replace <project_name> with your project name
}
# Directory where ImagesPipeline stores downloaded files.
IMAGES_STORE = 'C:\\Users\\username\\Desktop'  # replace username with your username
# Fix: Scrapy reads the all-caps DOWNLOAD_TIMEOUT setting; the original
# 'Download_TIMEOUT' name was silently ignored.
DOWNLOAD_TIMEOUT = 1200
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment