Scrape images from zerochan.net

from pprint import pprint
import requests
from bs4 import BeautifulSoup

def searchShow(showName):
    """
    When you use their search bar, it sends your query to this suggest
    endpoint to list some possible choices. The first one is usually the one
    we want, so we'll return that.
    It will look something like this:
    Megumin|Character|Kono Subarashii Sekai ni Shukufuku wo!
    """
    url = 'https://www.zerochan.net/suggest?q='
    url += showName.strip().lower().replace(' ', '+')
    url += '&limit=10'
    r = requests.get(url)
    results = r.text.strip().split('\n')
    if not results or not results[0]:
        return ''  # no suggestions came back for this query
    return results[0]
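
# Example (assuming zerochan's suggest endpoint still responds in the
# pipe-delimited format described in the docstring above):
#   searchShow('megumin')
#   -> 'Megumin|Character|Kono Subarashii Sekai ni Shukufuku wo!'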

def btw(inputText, leftText, rightText):
    """Utility function to get the text between two other pieces of text."""
    if leftText is None:
        return inputText.split(rightText, 1)[0]
    elif rightText is None:
        return inputText.split(leftText, 1)[1]
    else:
        return inputText.split(leftText, 1)[1].split(rightText, 1)[0]
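
# A few illustrative calls (hypothetical inputs, mirroring how scrapeShow
# uses btw on the pagination text below):
#   btw('page 1 of 42\t', 'of ', '\t')  -> '42'
#   btw('page 1 of 42', 'of ', None)    -> '42'
#   btw('page 1 of 42', None, ' of')    -> 'page 1'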

def scrapeShow(showEntry):
    """Now that we know we have our search properly formatted, go to the page
    for that search and return a list of its image urls. We'll only worry
    about the first 10 pages, or fewer if it doesn't have that many.
    This number can be changed in this line:
        maxPage = min([10, maxPage])
    """
    url = 'https://www.zerochan.net/'
    url += showEntry.split('|', 1)[0].replace(' ', '+')
    url += '?p='
    urls = []
    i = 1
    maxPage = 10
    print(showEntry)
    print('\tpage 1/x')
    while i <= maxPage:
        r = requests.get(url + str(i))
        soup = BeautifulSoup(r.content, "html.parser")
        images = soup.select("li a img")
        if i == 1:
            try:
                # the pagination text contains '... of <maxPage>\t', so grab
                # the number between those two markers.
                maxPage = int(btw(soup.select('p.pagination')[0].text, 'of ', '\t'))
                maxPage = min([10, maxPage])  # change if you want more photos
            except (IndexError, ValueError):
                print('Possible error. Maybe there is only one page.')
                maxPage = 1
        else:
            print('\tpage %d/%d' % (i, maxPage))
        if len(images) == 0:
            break  # we've reached the start of the members-only pages.
        for item in images:
            imgurl = item.get('src')
            # delete the second replace call if you want the thumbnails.
            imgurl = imgurl.replace("s3", "static").replace(".240.", ".full.")
            urls.append(imgurl)
        print('\t\t%d' % len(urls))
        i += 1
    return urls
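
# Example usage (network access assumed; this is also where the pprint
# import is handy for eyeballing results):
#   urls = scrapeShow('Megumin|Character|Kono Subarashii Sekai ni Shukufuku wo!')
#   pprint(urls[:3])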
searches = ['megumin', 'non non biyori', 'kuroneko', 'yuru yuri', 'flip flappers', 'gabriel dropout', 'kiniro mosaic', 'seitokai no ichizon', 'relife', 'new game']
urls = []
for s in searches:
    urls.extend(scrapeShow(searchShow(s)))
urls = list(set(urls))

# I recommend downloading the images with aria2. From the command line:
#   $ mkdir images
#   $ aria2c -x16 -i links.txt -d images/
with open('links.txt', 'w') as f:
    for u in urls:
        f.write(u + '\n')
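
# If you can't use aria2, here's a minimal pure-Python fallback: it fetches
# each url with requests and writes the bytes to disk. It downloads
# sequentially, so it's far slower than aria2's 16 parallel connections.
# downloadAll() is a hypothetical helper sketched here, not part of the
# original workflow; it is defined but never called.
import os

def downloadAll(urls, directory='images'):
    """Save each url under directory, named by its last path segment."""
    os.makedirs(directory, exist_ok=True)
    for u in urls:
        name = u.rsplit('/', 1)[-1]
        r = requests.get(u)
        if r.ok:  # skip urls that 404 or otherwise fail
            with open(os.path.join(directory, name), 'wb') as img:
                img.write(r.content)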