Skip to content

Instantly share code, notes, and snippets.

@TestItOnlyOnce
Last active March 1, 2019 18:02
Show Gist options
  • Save TestItOnlyOnce/12109586620e650f1d70ec201feab19f to your computer and use it in GitHub Desktop.
Save TestItOnlyOnce/12109586620e650f1d70ec201feab19f to your computer and use it in GitHub Desktop.
HTML scraper for wuxiaworld novels (Python)
import os, subprocess, sys, requests, re
from time import sleep
from collections import OrderedDict
from os.path import expanduser
from bs4 import BeautifulSoup as bs
from natsort import humansorted, natsorted
# Root of the site every relative chapter/novel URL is joined onto.
BASE_URL = "https://www.wuxiaworld.com"
# Desktop-browser User-Agent; the site may serve different markup to bots.
HEADERS = {'User-Agent': 'Mozilla/5.0 (rv:66.0) Gecko/20100101 Firefox/66.0'}
# Map of CLI shorthand -> novel slug as it appears in wuxiaworld URLs
# (e.g. "pw" -> https://www.wuxiaworld.com/novel/perfect-world).
abbreviation = {
"pw": "perfect-world",
"awe": "a-will-eternal",
"atg": "against-the-gods",
"sotr": "sovereign-of-the-three-realms",
"te": "talisman-emperor",
"mw": "martial-world",
"hjc": "heavenly-jewel-change",
"usaw": "upgrade-specialist-in-another-world",
"ige": "imperial-god-emperor"
}
def get_chapters(long_name: str) -> "list[tuple[str, str]]":
    """Fetch the novel's table of contents from its overview page.

    :param long_name: novel slug as used in wuxiaworld URLs
    :return: list of (href, link-text) tuples, one per chapter link
    """
    dom = bs(download_html("/novel/{}".format(long_name)), "html5lib")
    links = []
    for item in dom.findAll("li", class_="chapter-item"):
        anchor = item.findNext("a")
        links.append((anchor["href"], anchor.get_text()))
    return links
def download_html(url: str) -> str:
    """Download *url* (relative to BASE_URL) and return the body as text.

    Network errors from ``requests`` propagate to the caller.  The original
    wrapped the call in a bare ``try: ... except: raise``, which is a no-op;
    it has been removed without changing behavior.
    """
    response = requests.get(BASE_URL + url, headers=HEADERS)
    return response.content.decode("utf-8")
def slprint(text: str):
    """Status-line print: overwrite the current console line with *text*,
    truncated/padded to 77 columns, and flush immediately."""
    print("\r" + text[:77].ljust(77), end="", flush=True)
def download_chapter(url: str) -> "tuple[str, str, str]":
    """Download one chapter page and clean it up for offline reading.

    :param url: chapter URL relative to BASE_URL
    :return: (chapter_html, chapter_number, next_chapter_url), or
             (None, None, None) when the chapter body is missing or there is
             no "next" link (i.e. the page is only a preview chapter).
    """
    slprint("url: " + (('..' + url[7:70]) if len(url) > 63 else url))
    dom = bs(download_html(url), "html5lib")
    dom.encode(formatter="html5")
    content = dom.select_one(".p-15 .fr-view")
    if not content:
        return None, None, None
    next_ch = dom.select_one("li.next a")
    if next_ch is not None:
        next_ch = next_ch["href"]
    else:
        return None, None, None  # don't return preview chapters
    # Embedded <script> tags are never wanted in the saved chapter.
    for s in content.findAll("script"):
        if s.parent is not None:
            s.decompose()
    ch_name = dom.select('h4')[1].get_text()
    # Keep only the chapter number -- the title text may contain spoilers.
    ch_name = re.sub(r"(?:Chapter)?\s*([\d\.]+)[\s:]-?\s.+", '\\1', ch_name)
    # Strip chapter-title paragraphs duplicated inside the body.
    # NOTE(review): if p0 is decomposed, the second select('p')[1] below
    # refers to what was originally the *third* paragraph.  This matches the
    # original code's behavior and is preserved deliberately.
    p0 = content.select('p')[0]
    if re.search(r"(?:Chapter)?\s*[\d\.]+", p0.get_text()) is not None:
        p0.decompose()
    p1 = content.select('p')[1]
    if re.search(r"(?:Chapter)?\s*[\d\.]+", p1.get_text()) is not None:
        p1.decompose()
    ch = str(content.encode_contents(), 'utf-8')
    ch = re.sub(r"</?(a|hr|div|span)[^>]*>", '', ch)      # strip unnecessary tags
    ch = re.sub(r"\s*(style|dir|id)=\"[^\"]*\"", '', ch)  # strip attributes
    ch = re.sub(r"\s*Previous Chapter\s*", '', ch)
    ch = re.sub(r"\s*Next Chapter\s*", '', ch)
    ch = re.sub(r"<p>\s*(<br\s?/?>)?\s*</p>", '', ch)     # strip empty paragraphs
    # Normalise typographic punctuation to plain ASCII equivalents.
    ch = re.sub("[”“]", '"', ch)
    ch = re.sub("[‘’]", "'", ch)
    ch = re.sub("\xa0", ' ', ch)
    ch = re.sub("—", '-', ch)
    #ch = re.sub("…", "...", ch)
    return ch, ch_name, next_ch
def download(short_name: str, long_name: str, first_chapter: int, chapter_count: int = 0, enable_header: bool = True, enable_footer: bool = False):
    """Download a range of chapters and write them to one styled HTML file.

    :param short_name: abbreviation used only for console output
    :param long_name: novel slug used in URLs and the output filename
    :param first_chapter: index of the first chapter to download
    :param chapter_count: number of chapters (0 = everything after first_chapter)
    :param enable_header: emit an <h3> chapter heading before each chapter
    :param enable_footer: emit a Kindle page-break after each chapter
    """
    chapters = OrderedDict()
    slprint("getting chapter list...")
    # The original fetched the chapter list five times unconditionally;
    # treat it as a retry loop and stop as soon as a non-empty list arrives.
    chs = []
    for _ in range(5):
        chs = get_chapters(long_name)
        if chs:
            break
        sleep(1)  # brief back-off before retrying
    for ch in chs:
        m = re.search(r"chapter-([0-9]+)-?([0-9]*)", ch[0])
        try:
            ch_num = m.group(1)
            if m.group(2):
                # Sub-chapter suffix, e.g. chapter-12-5 -> 12.5
                ch_num += "." + str(int(m.group(2)))
            chapters[float(ch_num)] = ch
        except (AttributeError, ValueError):
            pass  # link didn't match the chapter-URL pattern; skip it
    if chapter_count == 0:
        chapter_count = len(chapters) - first_chapter
    # Order chapters naturally by their href so 2 sorts before 10.
    chapters = OrderedDict(natsorted(chapters.items(), key=lambda ch: ch[1][0]))
    chslen = len(chapters)
    sys.stdout.write("\r")
    sys.stdout.flush()
    print("novel:", long_name, "(" + short_name + ")", "chapters:", str(first_chapter) +
          ".." + str(first_chapter + chapter_count), "total:", chapter_count)
    i = chapter_count
    j = first_chapter
    # Chapters are keyed by float chapter number; int first_chapter matches
    # via numeric equality (e.g. chapters[0] hits key 0.0).
    nextch = chapters[first_chapter][0]
    header = ""
    footer = "\n\n<br/><mbp:pagebreak>\n\n" if enable_footer else "\n"
    out = ""
    # Follow the per-page "next" links until the requested count is reached
    # or a preview/missing chapter ends the chain.
    while nextch is not None and i > 0 and j < chslen:
        if chapter_count > 0:
            i -= 1
        j += 1
        text, ch_name, nextch = download_chapter(nextch)
        if text is not None:
            if enable_header:
                header = "\n<h3>" + ch_name + "</h3>"
            out += (header + text + footer)
    slprint("completed...")
    out = "<!DOCTYPE html>\n<head>\n\t<meta charset=\"UTF-8\">\n\t<style>body {font: normal normal 400 15.5px/22.16px \"Open Sans\"; background:#222; color:#aaa; padding:0 20px;} h3 {color:#222;}</style>\n</head>\n\n<body>\n\n" + out + "\n\n</body>\n\n</html>"
    path = long_name + "-" + str(first_chapter) + "-" + str(first_chapter + chapter_count - 1) + ".html"
    # ``with`` guarantees the handle is closed (the original leaked it).
    # xmlcharrefreplace keeps the output pure ASCII for picky e-readers.
    with open(path, "wb") as file:
        file.write(out.encode("ascii", "xmlcharrefreplace"))
    #subprocess.call([expanduser("~") + "/kindlegen", path, "-c1"])
    #os.unlink(path)
if __name__ == "__main__":
    arg_len = len(sys.argv)
    if arg_len < 2:
        print("Usage: ")
        print(" python wuxscrap.py <NOVEL_ABBR> [FIRST_CHAPTER] [LAST_CHAPTER]")
        sys.exit(1)
    short_name = sys.argv[1]
    # Fail with a readable message instead of a raw KeyError on a typo.
    if short_name not in abbreviation:
        print("Unknown novel abbreviation:", short_name)
        print("Known abbreviations:", ", ".join(sorted(abbreviation)))
        sys.exit(1)
    long_name = abbreviation[short_name]
    first_chapter = 0 if arg_len < 3 else int(sys.argv[2])
    # LAST_CHAPTER on the command line is converted to a count.
    ch_count = 0 if arg_len < 4 else (int(sys.argv[3]) - first_chapter)
    download(short_name, long_name, first_chapter, ch_count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment