Scrape images from zerochan.net
import requests
from bs4 import BeautifulSoup

def searchShow(showName):
    """
    When you use their search bar, it sends your query to this endpoint to list
    some possible choices. The first one is usually the one we want, so we'll
    return that.
    It will look something like this:
    Megumin|Character|Kono Subarashii Sekai ni Shukufuku wo!
    """
    url = 'https://www.zerochan.net/suggest?q='
    url += showName.strip().lower().replace(' ', '+')
    url += '&limit=10'
    r = requests.get(url)
    results = r.text.strip().split('\n')
    if len(results) == 0:
        return []
    else:
        return results[0]
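# Example (illustrative, based on the docstring above): searchShow('megumin')
# should return something like
# 'Megumin|Character|Kono Subarashii Sekai ni Shukufuku wo!';
# scrapeShow() only uses the part before the first '|'.
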
def btw(inputText, leftText, rightText):
    """Utility function to get the text between two other pieces of text."""
    if leftText is None:
        return inputText.split(rightText, 1)[0]
    elif rightText is None:
        return inputText.split(leftText, 1)[1]
    else:
        return inputText.split(leftText, 1)[1].split(rightText, 1)[0]
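# Example: btw('1 of 12\tNext', 'of ', '\t') returns '12', which is how
# scrapeShow() pulls the page count out of the pagination text.
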
def scrapeShow(showEntry):
    """Now that we know our search is properly formatted, go to the page for
    that search and return a list of its image urls. We'll only worry about
    the first 10 pages, or fewer if it doesn't have that many.
    This number can be changed in this line:
    maxPage = min([10, maxPage])
    """
    url = 'https://www.zerochan.net/'
    url += showEntry.split('|', 1)[0].replace(' ', '+')
    url += '?p='
    urls = []
    i = 1
    maxPage = 10
    print(showEntry)
    print('\tpage 1/x')
    while i <= maxPage:
        r = requests.get(url + str(i))
        soup = BeautifulSoup(r.content, "html.parser")
        images = soup.select("li a img")
        if i == 1:
            try:
                maxPage = int(btw(soup.select('p.pagination')[0].text, 'of ', '\t'))
                maxPage = min([10, maxPage])  # change if you want more photos
            except (IndexError, ValueError):
                print('Possible error. Maybe there is only one page.')
                maxPage = 1
        else:
            print('\tpage %d/%d' % (i, maxPage))
        if len(images) == 0:
            break  # we've reached the start of the members-only pages.
        for item in images:
            imgurl = item.get('src')
            # delete the second replace call if you want the thumbnails.
            imgurl = imgurl.replace("s3", "static").replace(".240.", ".full.")
            urls.append(imgurl)
        print('\t\t%d' % len(urls))
        i += 1
    return urls
searches = ['megumin', 'non non biyori', 'kuroneko', 'yuru yuri', 'flip flappers', 'gabriel dropout', 'kiniro mosaic', 'seitokai no ichizon', 'relife', 'new game']

urls = []
for s in searches:
    urls.extend(scrapeShow(searchShow(s)))
urls = list(set(urls))  # drop duplicate links

# I recommend downloading the images with aria2. From the command line:
# $ mkdir images
# $ aria2c -x16 -i links.txt -d images/
with open('links.txt', 'w') as f:
    for u in urls:
        f.write(u + '\n')
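If you would rather not install aria2, a plain-Python download loop along these lines should also work. This is only a sketch: it assumes links.txt was written as above, saves into an images/ directory, and names each file after the last path segment of its URL.

import os
import requests

os.makedirs('images', exist_ok=True)
with open('links.txt') as f:
    links = [line.strip() for line in f if line.strip()]
for link in links:
    # name the local file after the last segment of the url.
    path = os.path.join('images', link.rsplit('/', 1)[-1])
    try:
        r = requests.get(link, timeout=30)
        r.raise_for_status()
        with open(path, 'wb') as out:
            out.write(r.content)
    except requests.RequestException as e:
        print('failed: %s (%s)' % (link, e))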