@Manwholikespie
Created October 9, 2017 20:54
Scrape images from zerochan.net
from pprint import pprint
import requests
from bs4 import BeautifulSoup
def searchShow(showName):
    """
    When you use their search bar, it sends your query to this suggest
    endpoint to list some possible choices. The first one is usually the one
    we want, so we'll return that.

    It will look something like this:
        Megumin|Character|Kono Subarashii Sekai ni Shukufuku wo!
    """
    url = 'https://www.zerochan.net/suggest?q='
    url += showName.strip().lower().replace(' ', '+')
    url += '&limit=10'

    r = requests.get(url)
    results = r.text.strip().split('\n')

    # An empty response body means the site had no suggestions for this query.
    if not results or not results[0]:
        return []
    return results[0]
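
# Illustrative call (the return shape is the one shown in the docstring above):
#   searchShow('megumin')
#   -> 'Megumin|Character|Kono Subarashii Sekai ni Shukufuku wo!'
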
def btw(inputText, leftText, rightText):
    """Utility function to get the text between two other pieces of text."""
    if leftText is None:
        return inputText.split(rightText, 1)[0]
    elif rightText is None:
        return inputText.split(leftText, 1)[1]
    else:
        return inputText.split(leftText, 1)[1].split(rightText, 1)[0]
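
# Quick sanity checks for btw() (illustrative additions, not in the original
# gist). Passing None for either delimiter just splits on the other one.
assert btw('page 1 of 42\tnext', 'of ', '\t') == '42'
assert btw('hello world', None, ' ') == 'hello'
assert btw('hello world', 'hello ', None) == 'world'
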
def scrapeShow(showEntry):
    """Now that we know we have our search properly formatted, go ahead and
    visit the page for that search, and return a list of its image urls. We'll
    only worry about the first 10 pages, or fewer if it doesn't have that many.
    This number can be changed in this line:
        maxPage = min([10, maxPage])
    """
    url = 'https://www.zerochan.net/'
    url += showEntry.split('|', 1)[0].replace(' ', '+')
    url += '?p='

    urls = []
    i = 1
    maxPage = 10

    print(showEntry)
    print('\tpage 1/x')

    while i <= maxPage:
        r = requests.get(url + str(i))
        soup = BeautifulSoup(r.content, "html.parser")
        images = soup.select("li a img")

        if i == 1:
            # Read the real page count out of the pagination text on page 1.
            try:
                maxPage = int(btw(soup.select('p.pagination')[0].text, 'of ', '\t'))
                maxPage = min([10, maxPage])  # change if you want more photos
            except (IndexError, ValueError):
                print('Possible error. Maybe there is only one page.')
                maxPage = 1
        else:
            print('\tpage %d/%d' % (i, maxPage))

        if len(images) == 0:
            break  # we've reached the start of the members-only pages.

        for item in images:
            imgurl = item.get('src')
            # Delete the second replace call if you want the thumbnails instead.
            imgurl = imgurl.replace("s3", "static").replace(".240.", ".full.")
            urls.append(imgurl)

        print('\t\t%d' % len(urls))
        i += 1

    return urls
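
# Illustrative output shape (an assumption based on the replace() calls above,
# not something the site guarantees): thumbnail links served from the "s3"
# host with ".240." in the filename get rewritten to full-size links, e.g.
#   https://s3.zerochan.net/Megumin.240.1234567.jpg
#   -> https://static.zerochan.net/Megumin.full.1234567.jpg
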
searches = ['megumin', 'non non biyori', 'kuroneko', 'yuru yuri', 'flip flappers', 'gabriel dropout', 'kiniro mosaic', 'seitokai no ichizon', 'relife', 'new game']
urls = []
for s in searches:
    entry = searchShow(s)
    if not entry:
        continue  # no suggestion found for this query, skip it
    urls.extend(scrapeShow(entry))
urls = list(set(urls))  # drop duplicate links
# I recommend downloading the images with aria2. From the commandline,
# $ mkdir images
# $ aria2c -x16 -i links.txt -d images/
with open('links.txt', 'w') as f:
    for u in urls:
        f.write(u + '\n')
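
# A minimal pure-Python fallback for the aria2 step above, in case you don't
# want to install aria2. This is a sketch, not part of the original gist, and
# it assumes Python 3 (for os.makedirs(exist_ok=True)): it simply streams each
# link in links.txt into an images/ directory with requests.
import os

def downloadLinks(linksPath='links.txt', outDir='images'):
    os.makedirs(outDir, exist_ok=True)
    with open(linksPath) as fh:
        links = [line.strip() for line in fh if line.strip()]
    for link in links:
        name = link.rsplit('/', 1)[-1]  # use the filename part of the URL
        resp = requests.get(link, stream=True)
        if resp.status_code != 200:
            print('skipping %s (HTTP %d)' % (link, resp.status_code))
            continue
        with open(os.path.join(outDir, name), 'wb') as out:
            for chunk in resp.iter_content(chunk_size=65536):
                out.write(chunk)
        print('saved %s' % name)

# downloadLinks()  # uncomment to download without aria2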