Last active
May 9, 2022 21:08
-
-
Save crioux/06af09ed86d3be57dcb5ce7a33156e0c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# log into pixabay and store your cookies in cookies.txt in the following format: | |
# | |
# __cfduid=xxx | |
# cf_use_ob=0 | |
# client_width=3423 | |
# csrftoken=xxx | |
# g_rated= | |
# is_human=1 | |
# sessionid="xxxx" | |
# | |
# then run: | |
# scrapy runspider pixabay-scraper.py -a search=something | |
# or just: | |
# scrapy runspider pixabay-scraper.py | |
# to search for cat pictures specifically | |
import os
import sys

import scrapy
import scrapy.exceptions
class PixabayScraper(scrapy.Spider):
    """Crawl pixabay.com search-result pages and download full-size images.

    Requires a logged-in session: cookies are read from ``cookies.txt``
    (one ``name=value`` pair per line) in the current working directory.
    Images are saved to the working directory as ``<id>.jpg``; files that
    already exist are skipped.
    """

    name = 'pixabay'

    # Crawl slowly and breadth-first (FIFO queues, single request at a
    # time) to stay under pixabay's rate limiting / captcha as long as
    # possible.
    custom_settings = {
        'CONCURRENT_REQUESTS': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'DEPTH_PRIORITY': 1,
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
    }

    def __init__(self, name=None, search='cat', **kwargs):
        """Build the start URL for *search* and load session cookies.

        :param name: spider name passed through to ``scrapy.Spider``.
        :param search: search term (``-a search=...`` on the command line).
        :raises FileNotFoundError: if ``cookies.txt`` is missing.
        """
        self.start_urls = ["https://pixabay.com/photos/search/{}".format(search)]
        super(PixabayScraper, self).__init__(name, **kwargs)
        # Fix: instance attribute instead of the original mutable CLASS
        # attribute, which would have been shared by all instances.
        self.cookies = {}
        with open("cookies.txt") as f:
            for line in f:
                # Fix: split on the FIRST '=' only, so cookie values that
                # themselves contain '=' (e.g. base64 tokens) survive; the
                # original split("=") + len==2 check silently dropped them.
                key, sep, value = line.partition("=")
                if sep:
                    self.cookies[key.strip()] = value.strip()

    def parse(self, response):
        """Parse one search-result page.

        Yields a download request for every not-yet-saved image on the
        page, then follows the next pagination page indefinitely (the
        spider stops when a captcha page is served — see ``download``).
        """
        for item in response.css('div.item'):
            imgurl = item.css('a img::attr("data-lazy")').get()
            if not imgurl:
                continue
            # Thumbnail URLs look like .../<id>_640.jpg; the full-size
            # download endpoint wants plain <id>.jpg.
            name = imgurl.split('/')[-1].split('_')[0] + ".jpg"
            if os.path.exists(name):
                print("skipping: {}".format(name))
                continue
            downloadurl = "https://pixabay.com/images/download/" + name + "?attachment"
            yield response.follow(downloadurl, self.download, cookies=self.cookies)
        # Pagination uses a ?pagi=N query parameter; absent means page 1.
        urlsplit = response.request.url.split("?")
        if len(urlsplit) == 2:
            pagi = int(urlsplit[1].split("=")[1]) + 1
        else:
            pagi = 2
        nexturl = urlsplit[0] + "?pagi={}".format(pagi)
        yield response.follow(nexturl, self.parse, cookies=self.cookies)

    def download(self, response):
        """Write the downloaded image body to disk.

        :raises scrapy.exceptions.CloseSpider: when pixabay serves a
            captcha page instead of an image — the operator must solve
            it in a browser before restarting the crawl.
        """
        # Fix: compare bytes directly; the original decoded the whole
        # body as latin1 first, which is byte-transparent and therefore
        # an equivalent but wasteful extra pass.
        if b"recaptcha" in response.body:
            raise scrapy.exceptions.CloseSpider(reason='time to solve the captcha')
        path = response.url.split('/')[-1].split('?')[0]
        with open(path, 'wb') as f:
            f.write(response.body)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
What is the "response" parameter here?