Python's slowly growing on me...
# coding=utf-8
# Mangatown Scraper 1.2
# Henry (chocolatkey) 2017
import argparse
import logging
import os
import re
import sys
import threading
import time
import traceback
import urllib.request

from bs4 import BeautifulSoup
from fake_useragent import UserAgent

ua = UserAgent()
logging.basicConfig(filename='scraper.log', filemode='w', level=logging.DEBUG)  # Lower the level if needed. Be careful: filemode='w' overwrites the previous log
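# Usage sketch (mangatownscraper.py is the filename used in the MODE_HELP example below; use whatever name you saved this gist under):
#   python mangatownscraper.py              # scrape the whole directory, starting from page 1
#   python mangatownscraper.py --begin 5    # resume from directory page 5
# Images are saved to ./<manga-stub>/<chapter-number>/ and a debug log is written to scraper.log.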
kmp = "http://www.mangatown.com/"  # Base site URL
nmangas = 0  # Number of manga/manhua/manhwa titles
nchaps = 0  # Number of chapters
nlchaps = 0  # Number of licensed chapters
npics = 0  # Number of pictures (pages)
err1 = 0  # Lvl1 errors: probably a timed-out connection to the server, usually not the scraper's fault
err2 = 0  # Lvl2 errors: shouldn't happen; something went wrong in the chapter worker thread
err3 = 0  # Lvl3 errors: the scraper can't make sense of the chapter page (no <div class="mangaread-main">). Actually occurs, will look into why soon.
def stats(nmangas, nchaps, nlchaps, npics, err1, err2, err3):  # yeah I know it's kind of silly passing the vars, do what u want
    return "--------------------\nTotal stats: \nMangas: " + str(nmangas) + \
        "\nChapters: " + str(nchaps) + \
        "\nLicensed Chapters: " + str(nlchaps) + " (Add ~" + str(nlchaps*16) + " pics)" + \
        "\nPictures: " + str(npics) + \
        "\nLvl1 Errors: " + str(err1) + \
        "\nLvl2 Errors: " + str(err2) + \
        "\nLvl3 Errors: " + str(err3)
def urrq(url):  # Fetch a URL with a spoofed User-Agent and return the parsed soup
    urlrq = urllib.request.Request(
        url,
        data=None,
        headers={
            'User-Agent': ua.google
        }
    )
    return BeautifulSoup(urllib.request.urlopen(urlrq).read().decode('utf-8'), 'html.parser')
def pagedownloader(link, lb, number):  # Download a single page image of a chapter
    global err3
    global npics
    try:
        soupinst = urrq(link + str(number) + ".html")
        mmain = soupinst.find("img", {"id": "image"})
        if mmain is not None:
            npics += 1
            url = mmain.get("src")
            filename = str(url.split('/')[-1]).split('?')[0]
            fileloc = lb + os.sep + filename
            if os.path.isfile(fileloc):
                logging.debug("Pic exists: " + filename)
            else:
                logging.debug("Retrieving " + filename)
                try:
                    urllib.request.urlretrieve(url, fileloc)  # save img
                except Exception as e:
                    logging.error(str(e))
                time.sleep(0.25)
        else:
            logging.error("No image @" + link + str(number) + ".html")
            err3 += 1
    except Exception as err:  # Means we probably sent too many requests
        print(link + str(number) + ".html" + " cooldown" + "\n" + str(err))
        time.sleep(1)  # Cooldown
        pagedownloader(link, lb, number)  # Try again
        #os._exit(1)
def chapworker(link, lb):  # Individual chapter thread worker
    global npics
    global nlchaps
    global err2
    global err3
    try:
        soup = urrq(link)
        pagecount = soup.find("div", {"class": "page_select"}).find("select", {"onchange": "javascript:location.href=this.value;"}).find_all("option")  # pages
        lastpage = int(pagecount[-1].text)  # Get last page
        for number in range(1, lastpage):
            time.sleep(0.05)
            pagedownloader(link, lb, number)
    except Exception as e:
        logging.error(str(e))
        err2 += 1
        time.sleep(1)
        chapworker(link, lb)
def main():
    global kmp
    global nmangas
    global nchaps
    global nlchaps
    global npics
    global err1
    global err2
    global err3
    DESCRIPTION = '''Scrape Mangatown for comics, chapters, pages and info, then download, check or add info to the collection'''
    MODE_HELP = '''Running mode. Possible options you can combine: s (scrape), c (check), i (get info), o (overwrite).
Example: mangatownscraper.py --mode sio'''
    BEGIN_HELP = '''Directory page to start at'''
    firstnum = 1
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    #parser.add_argument('files', nargs='+', help=FILES_HELP)
    #parser.add_argument('-m', '--mode', dest='mode', help=MODE_HELP, required=True)  # TODO: work with mode
    parser.add_argument('-b', '--begin', dest='begin', help=BEGIN_HELP)
    options = parser.parse_args()
    if options.begin is not None:
        firstnum = int(options.begin)
        print("Starting from page " + str(firstnum))
    soup = urrq(kmp + "directory/")
    pages = soup.find("div", {"class": "next-page"}).find_all(href=re.compile(r"/directory/[A-Za-z0-9-_/.]+"), class_=False, id=False)
    lastnum = int(re.match(r".+/(\d+)\.htm$", pages[-1].get('href')).group(1))  # Get last directory page
    print(str(lastnum) + " pages to go through")
    time.sleep(1)
    for number in range(firstnum, lastnum):  # directory pages
        logging.info("=======================\nPage: " + str(number))
        soup = urrq(kmp + "directory/" + str(number) + ".htm")
        for mng in soup.find("ul", {"class": "manga_pic_list"}).find_all("li"):  # mangas
            link = mng.find("a", {"class": "manga_cover"})
            mtitle = str(link.get('title')).strip().encode('utf-8')
            mstub = str(re.match(r".+/manga/([A-Za-z0-9-_]+)/$", link.get('href')).group(1))
            logging.info(mtitle + b" (" + mstub.encode('utf-8') + b")")
            if not os.path.exists(mstub):
                os.makedirs(mstub)
            print(b"=====" + mtitle + b"=====")
            nmangas += 1
            try:
                soup = urrq(link.get('href'))
                threads = []
                chaptertable = soup.find("ul", {"class": "chapter_list"}).find_all("a")
                for link in chaptertable:  # chapters
                    chaptername = " ".join(str(link.find(text=True)).replace("&nbsp;", "").split())
                    chapternum = re.findall(re.compile(r'.+ (\d+\.*\d*)$'), chaptername)[0]
                    logging.debug(b"\nChapter: " + chaptername.encode('utf-8') + b" (" + chapternum.encode('utf-8') + b")\n")
                    chapterpath = mstub + os.sep + chapternum
                    if not os.path.exists(chapterpath):
                        os.makedirs(chapterpath)
                        nchaps += 1
                        #chapworker(link.get('href'), chapterpath)
                        t = threading.Thread(target=chapworker, args=(link.get('href'), chapterpath,))
                        threads.append(t)
                        t.start()
                        time.sleep(0.1)  # beeee careful!
                    else:
                        logging.info("Chapter already exists!")
                        print("Chapter already exists!")
                for x in threads:
                    x.join()
            except Exception as e:
                logging.error(str(e))
                print(traceback.format_exc())
                err1 += 1
                time.sleep(1)
            print("\n" + stats(nmangas, nchaps, nlchaps, npics, err1, err2, err3))  # Print stats after scraping each manga for verbose console output
            time.sleep(1)  # Give it a rest
    logging.info(stats(nmangas, nchaps, nlchaps, npics, err1, err2, err3))  # Log final stats
if __name__ == '__main__':
    sys.exit(main())