Skip to content

Instantly share code, notes, and snippets.

@TestItOnlyOnce
Last active March 1, 2019 18:02
Show Gist options
  • Save TestItOnlyOnce/12109586620e650f1d70ec201feab19f to your computer and use it in GitHub Desktop.
Save TestItOnlyOnce/12109586620e650f1d70ec201feab19f to your computer and use it in GitHub Desktop.
HTML scraper for wuxiaworld novels (Python)
import os, subprocess, sys, requests, re
from time import sleep
from collections import OrderedDict
from os.path import expanduser
from bs4 import BeautifulSoup as bs
from natsort import humansorted, natsorted
# Root of the site every relative chapter/novel URL is joined onto.
BASE_URL = "https://www.wuxiaworld.com"
# Desktop-browser User-Agent; the site may serve different markup to bots.
HEADERS = {'User-Agent': 'Mozilla/5.0 (rv:66.0) Gecko/20100101 Firefox/66.0'}
# Map of CLI shorthand -> novel slug as it appears in wuxiaworld URLs
# (e.g. "pw" -> https://www.wuxiaworld.com/novel/perfect-world).
abbreviation = {
"pw": "perfect-world",
"awe": "a-will-eternal",
"atg": "against-the-gods",
"sotr": "sovereign-of-the-three-realms",
"te": "talisman-emperor",
"mw": "martial-world",
"hjc": "heavenly-jewel-change",
"usaw": "upgrade-specialist-in-another-world",
"ige": "imperial-god-emperor"
}
def get_chapters(long_name: str) -> "list[tuple[str, str]]":
    """Fetch the novel's table of contents from its overview page.

    :param long_name: novel slug as used in wuxiaworld URLs
    :return: list of (href, link-text) tuples, one per chapter link
    """
    dom = bs(download_html("/novel/{}".format(long_name)), "html5lib")
    links = []
    for item in dom.findAll("li", class_="chapter-item"):
        anchor = item.findNext("a")
        links.append((anchor["href"], anchor.get_text()))
    return links
def download_html(url: str) -> str:
    """Download *url* (relative to BASE_URL) and return the body as text.

    Network errors from ``requests`` propagate to the caller.  The original
    wrapped the call in a bare ``try: ... except: raise``, which is a no-op;
    it has been removed without changing behavior.
    """
    response = requests.get(BASE_URL + url, headers=HEADERS)
    return response.content.decode("utf-8")
def slprint(text: str):
    """Status-line print: overwrite the current console line with *text*,
    truncated/padded to 77 columns, and flush immediately."""
    print("\r" + text[:77].ljust(77), end="", flush=True)
def download_chapter(url: str) -> "tuple[str, str, str]":
    """Download one chapter page and clean it up for offline reading.

    :param url: chapter URL relative to BASE_URL
    :return: (chapter_html, chapter_number, next_chapter_url), or
             (None, None, None) when the chapter body is missing or there is
             no "next" link (i.e. the page is only a preview chapter).
    """
    slprint("url: " + (('..' + url[7:70]) if len(url) > 63 else url))
    dom = bs(download_html(url), "html5lib")
    dom.encode(formatter="html5")
    content = dom.select_one(".p-15 .fr-view")
    if not content:
        return None, None, None
    next_ch = dom.select_one("li.next a")
    if next_ch is not None:
        next_ch = next_ch["href"]
    else:
        return None, None, None  # don't return preview chapters
    # Embedded <script> tags are never wanted in the saved chapter.
    for s in content.findAll("script"):
        if s.parent is not None:
            s.decompose()
    ch_name = dom.select('h4')[1].get_text()
    # Keep only the chapter number -- the title text may contain spoilers.
    ch_name = re.sub(r"(?:Chapter)?\s*([\d\.]+)[\s:]-?\s.+", '\\1', ch_name)
    # Strip chapter-title paragraphs duplicated inside the body.
    # NOTE(review): if p0 is decomposed, the second select('p')[1] below
    # refers to what was originally the *third* paragraph.  This matches the
    # original code's behavior and is preserved deliberately.
    p0 = content.select('p')[0]
    if re.search(r"(?:Chapter)?\s*[\d\.]+", p0.get_text()) is not None:
        p0.decompose()
    p1 = content.select('p')[1]
    if re.search(r"(?:Chapter)?\s*[\d\.]+", p1.get_text()) is not None:
        p1.decompose()
    ch = str(content.encode_contents(), 'utf-8')
    ch = re.sub(r"</?(a|hr|div|span)[^>]*>", '', ch)      # strip unnecessary tags
    ch = re.sub(r"\s*(style|dir|id)=\"[^\"]*\"", '', ch)  # strip attributes
    ch = re.sub(r"\s*Previous Chapter\s*", '', ch)
    ch = re.sub(r"\s*Next Chapter\s*", '', ch)
    ch = re.sub(r"<p>\s*(<br\s?/?>)?\s*</p>", '', ch)     # strip empty paragraphs
    # Normalise typographic punctuation to plain ASCII equivalents.
    ch = re.sub("[”“]", '"', ch)
    ch = re.sub("[‘’]", "'", ch)
    ch = re.sub("\xa0", ' ', ch)
    ch = re.sub("—", '-', ch)
    #ch = re.sub("…", "...", ch)
    return ch, ch_name, next_ch
def download(short_name: str, long_name: str, first_chapter: int, chapter_count: int = 0, enable_header: bool = True, enable_footer: bool = False):
    """Download a range of chapters and write them to one styled HTML file.

    :param short_name: abbreviation used only for console output
    :param long_name: novel slug used in URLs and the output filename
    :param first_chapter: index of the first chapter to download
    :param chapter_count: number of chapters (0 = everything after first_chapter)
    :param enable_header: emit an <h3> chapter heading before each chapter
    :param enable_footer: emit a Kindle page-break after each chapter
    """
    chapters = OrderedDict()
    slprint("getting chapter list...")
    # The original fetched the chapter list five times unconditionally;
    # treat it as a retry loop and stop as soon as a non-empty list arrives.
    chs = []
    for _ in range(5):
        chs = get_chapters(long_name)
        if chs:
            break
        sleep(1)  # brief back-off before retrying
    for ch in chs:
        m = re.search(r"chapter-([0-9]+)-?([0-9]*)", ch[0])
        try:
            ch_num = m.group(1)
            if m.group(2):
                # Sub-chapter suffix, e.g. chapter-12-5 -> 12.5
                ch_num += "." + str(int(m.group(2)))
            chapters[float(ch_num)] = ch
        except (AttributeError, ValueError):
            pass  # link didn't match the chapter-URL pattern; skip it
    if chapter_count == 0:
        chapter_count = len(chapters) - first_chapter
    # Order chapters naturally by their href so 2 sorts before 10.
    chapters = OrderedDict(natsorted(chapters.items(), key=lambda ch: ch[1][0]))
    chslen = len(chapters)
    sys.stdout.write("\r")
    sys.stdout.flush()
    print("novel:", long_name, "(" + short_name + ")", "chapters:", str(first_chapter) +
          ".." + str(first_chapter + chapter_count), "total:", chapter_count)
    i = chapter_count
    j = first_chapter
    # Chapters are keyed by float chapter number; int first_chapter matches
    # via numeric equality (e.g. chapters[0] hits key 0.0).
    nextch = chapters[first_chapter][0]
    header = ""
    footer = "\n\n<br/><mbp:pagebreak>\n\n" if enable_footer else "\n"
    out = ""
    # Follow the per-page "next" links until the requested count is reached
    # or a preview/missing chapter ends the chain.
    while nextch is not None and i > 0 and j < chslen:
        if chapter_count > 0:
            i -= 1
        j += 1
        text, ch_name, nextch = download_chapter(nextch)
        if text is not None:
            if enable_header:
                header = "\n<h3>" + ch_name + "</h3>"
            out += (header + text + footer)
    slprint("completed...")
    out = "<!DOCTYPE html>\n<head>\n\t<meta charset=\"UTF-8\">\n\t<style>body {font: normal normal 400 15.5px/22.16px \"Open Sans\"; background:#222; color:#aaa; padding:0 20px;} h3 {color:#222;}</style>\n</head>\n\n<body>\n\n" + out + "\n\n</body>\n\n</html>"
    path = long_name + "-" + str(first_chapter) + "-" + str(first_chapter + chapter_count - 1) + ".html"
    # ``with`` guarantees the handle is closed (the original leaked it).
    # xmlcharrefreplace keeps the output pure ASCII for picky e-readers.
    with open(path, "wb") as file:
        file.write(out.encode("ascii", "xmlcharrefreplace"))
    #subprocess.call([expanduser("~") + "/kindlegen", path, "-c1"])
    #os.unlink(path)
if __name__ == "__main__":
    arg_len = len(sys.argv)
    if arg_len < 2:
        print("Usage: ")
        print(" python wuxscrap.py <NOVEL_ABBR> [FIRST_CHAPTER] [LAST_CHAPTER]")
        sys.exit(1)
    short_name = sys.argv[1]
    # Fail with a readable message instead of a raw KeyError on a typo.
    if short_name not in abbreviation:
        print("Unknown novel abbreviation:", short_name)
        print("Known abbreviations:", ", ".join(sorted(abbreviation)))
        sys.exit(1)
    long_name = abbreviation[short_name]
    first_chapter = 0 if arg_len < 3 else int(sys.argv[2])
    # LAST_CHAPTER on the command line is converted to a count.
    ch_count = 0 if arg_len < 4 else (int(sys.argv[3]) - first_chapter)
    download(short_name, long_name, first_chapter, ch_count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment