Scrape images from zerochan.net
import requests
from bs4 import BeautifulSoup

def searchShow(showName):
    """
    When you use their search bar, it sends your query to this endpoint to list
    some possible choices. The first one is usually the one we want, so we'll
    return that.
    It will look something like this:
    Megumin|Character|Kono Subarashii Sekai ni Shukufuku wo!
    """
    url = 'https://www.zerochan.net/suggest?q='
    url += showName.strip().lower().replace(' ', '+')
    url += '&limit=10'
    r = requests.get(url)
    results = r.text.strip().split('\n')
    if len(results) == 0:
        return []
    else:
        return results[0]
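# Example (illustrative, based on the docstring above): searchShow('megumin')
# should return something like
# 'Megumin|Character|Kono Subarashii Sekai ni Shukufuku wo!';
# scrapeShow() only uses the part before the first '|'.
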
def btw(inputText, leftText, rightText):
    """Utility function to get the text between two other pieces of text."""
    if leftText is None:
        return inputText.split(rightText, 1)[0]
    elif rightText is None:
        return inputText.split(leftText, 1)[1]
    else:
        return inputText.split(leftText, 1)[1].split(rightText, 1)[0]
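# Example: btw('1 of 12\tNext', 'of ', '\t') returns '12', which is how
# scrapeShow() pulls the page count out of the pagination text.
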
def scrapeShow(showEntry):
    """Now that we know our search is properly formatted, go to the page for
    that search and return a list of its image urls. We'll only worry about
    the first 10 pages, or fewer if it doesn't have that many.
    This number can be changed in this line:
    maxPage = min([10, maxPage])
    """
    url = 'https://www.zerochan.net/'
    url += showEntry.split('|', 1)[0].replace(' ', '+')
    url += '?p='
    urls = []
    i = 1
    maxPage = 10
    print(showEntry)
    print('\tpage 1/x')
    while i <= maxPage:
        r = requests.get(url + str(i))
        soup = BeautifulSoup(r.content, "html.parser")
        images = soup.select("li a img")
        if i == 1:
            try:
                maxPage = int(btw(soup.select('p.pagination')[0].text, 'of ', '\t'))
                maxPage = min([10, maxPage])  # change if you want more photos
            except (IndexError, ValueError):
                print('Possible error. Maybe there is only one page.')
                maxPage = 1
        else:
            print('\tpage %d/%d' % (i, maxPage))
        if len(images) == 0:
            break  # we've reached the start of the members-only pages.
        for item in images:
            imgurl = item.get('src')
            # delete the second replace call if you want the thumbnails.
            imgurl = imgurl.replace("s3", "static").replace(".240.", ".full.")
            urls.append(imgurl)
        print('\t\t%d' % len(urls))
        i += 1
    return urls
searches = ['megumin', 'non non biyori', 'kuroneko', 'yuru yuri', 'flip flappers', 'gabriel dropout', 'kiniro mosaic', 'seitokai no ichizon', 'relife', 'new game']

urls = []
for s in searches:
    urls.extend(scrapeShow(searchShow(s)))
urls = list(set(urls))  # drop duplicate links

# I recommend downloading the images with aria2. From the command line:
# $ mkdir images
# $ aria2c -x16 -i links.txt -d images/
with open('links.txt', 'w') as f:
    for u in urls:
        f.write(u + '\n')
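If you would rather not install aria2, a plain-Python download loop along these lines should also work. This is only a sketch: it assumes links.txt was written as above, saves into an images/ directory, and names each file after the last path segment of its URL.

import os
import requests

os.makedirs('images', exist_ok=True)
with open('links.txt') as f:
    links = [line.strip() for line in f if line.strip()]
for link in links:
    # name the local file after the last segment of the url.
    path = os.path.join('images', link.rsplit('/', 1)[-1])
    try:
        r = requests.get(link, timeout=30)
        r.raise_for_status()
        with open(path, 'wb') as out:
            out.write(r.content)
    except requests.RequestException as e:
        print('failed: %s (%s)' % (link, e))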