Skip to content

Instantly share code, notes, and snippets.

@crioux
Last active May 9, 2022 21:08
Show Gist options
  • Save crioux/06af09ed86d3be57dcb5ce7a33156e0c to your computer and use it in GitHub Desktop.
# log into pixabay and store your cookies in cookies.txt in the following format:
#
# __cfduid=xxx
# cf_use_ob=0
# client_width=3423
# csrftoken=xxx
# g_rated=
# is_human=1
# sessionid="xxxx"
#
# then run:
# scrapy runspider pixabay-scraper.py -a search=something
# or just:
# scrapy runspider pixabay-scraper.py
# to search for cat pictures specifically
import scrapy
import os
import sys
class PixabayScraper(scrapy.Spider):
    """Scrape full-size images from pixabay.com search results.

    Requires a logged-in session: cookies are read from ``cookies.txt``
    (one ``name=value`` pair per line — see the usage comment at the top
    of this file) and attached to every request so the download endpoint
    accepts us as an authenticated, non-bot client.
    """

    name = 'pixabay'

    # Crawl politely: one request at a time with a 1 s delay, and FIFO
    # queues so result pages are processed in discovery order.
    custom_settings = {
        'CONCURRENT_REQUESTS': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'DEPTH_PRIORITY': 1,
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
    }

    def __init__(self, name=None, search='cat', **kwargs):
        """Build the search start URL and load session cookies.

        :param name: optional spider name override (Scrapy convention).
        :param search: search term; defaults to cat pictures.
        :raises FileNotFoundError: if ``cookies.txt`` is missing.
        """
        super(PixabayScraper, self).__init__(name, **kwargs)
        self.start_urls = ["https://pixabay.com/photos/search/{}".format(search)]
        # Instance attribute — the original used a class-level dict, which
        # is shared mutable state leaking between spider instances.
        self.cookies = {}
        with open("cookies.txt", encoding="utf-8") as f:
            for line in f:
                # Split on the FIRST '=' only: cookie values such as a
                # quoted/base64 session id may themselves contain '='
                # (a plain split would have silently dropped them).
                key, sep, value = line.partition("=")
                if sep:
                    self.cookies[key.strip()] = value.strip()

    def parse(self, response):
        """Yield a download request per image on a results page, then
        follow the next page of results."""
        for item in response.css('div.item'):
            imgurl = item.css('a img::attr("data-lazy")').get()
            if not imgurl:
                continue
            # Thumbnail URLs look like .../<id>_640.jpg; the download
            # endpoint wants the bare <id>.jpg.
            name = imgurl.split('/')[-1].split('_')[0] + ".jpg"
            if os.path.exists(name):
                print("skipping: {}".format(name))
                continue
            downloadurl = "https://pixabay.com/images/download/" + name + "?attachment"
            yield response.follow(downloadurl, self.download, cookies=self.cookies)
        # Paginate: bump the existing ?pagi=N counter, or start at page 2
        # when the current URL has no query string.
        urlsplit = response.request.url.split("?")
        if len(urlsplit) == 2:
            pagi = int(urlsplit[1].split("=")[1]) + 1
        else:
            pagi = 2
        nexturl = urlsplit[0] + "?pagi={}".format(pagi)
        yield response.follow(nexturl, self.parse, cookies=self.cookies)

    def download(self, response):
        """Write the image body to disk, or stop the crawl if pixabay
        served a captcha page instead of image bytes."""
        # Local import: `import scrapy` alone does not guarantee the
        # `scrapy.exceptions` submodule is bound as an attribute, so the
        # original `scrapy.exceptions.CloseSpider` could AttributeError.
        from scrapy.exceptions import CloseSpider
        if "recaptcha" in response.body.decode("latin1"):
            raise CloseSpider(reason='time to solve the captcha')
        path = response.url.split('/')[-1].split('?')[0]
        with open(path, 'wb') as f:
            f.write(response.body)
@Assassins231
Copy link

what is "response" parameter here?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment