Created
November 23, 2018 04:41
-
-
Save eternal-flame-AD/b106ee648fde2954511747ad206fe0d4 to your computer and use it in GitHub Desktop.
Convert bilibili novels (single articles or whole series) into HTML files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
from xml.sax.saxutils import escape | |
import bs4 | |
import requests | |
class BiliArticle():
    """A bilibili article held as a title, an author and an HTML body fragment.

    Attributes:
        title: Article title as plain (unescaped) text.
        author: Author name as plain (unescaped) text.
        main_markup: HTML markup of the article body; inserted verbatim
            into ``<body>`` by :meth:`format_markup`.
    """

    # Seconds to wait on the network before giving up; without a timeout
    # ``requests.get`` can block forever on a stalled connection.
    _REQUEST_TIMEOUT = 30

    def __init__(self, title: str, author: str, main_markup: str = ""):
        self.title, self.author, self.main_markup = title, author, main_markup

    @staticmethod
    def from_bili_cv(id: str):
        """Download article ``cv<id>`` from bilibili and parse it.

        Args:
            id: The numeric part of the article id (the ``N`` in ``cvN``).
                The parameter name shadows the builtin ``id`` but is kept
                so keyword callers continue to work.

        Returns:
            A ``BiliArticle`` whose title comes from ``h1.title``, author
            from ``a.author-name`` and body from ``div.article-holder``.

        Raises:
            requests.RequestException: On network failure, timeout or a
                non-2xx HTTP status.
            AttributeError: If the page lacks one of the expected elements
                (``soup.find`` returns ``None`` in that case).
        """
        url = f"https://www.bilibili.com/read/cv{id}"
        response = requests.get(url, timeout=BiliArticle._REQUEST_TIMEOUT)
        # Fail on HTTP errors here instead of parsing an error page and
        # crashing later with an unrelated-looking AttributeError.
        response.raise_for_status()
        soup = bs4.BeautifulSoup(response.text, "html5lib")
        author = soup.find("a", class_="author-name").get_text(strip=True)
        title = soup.find("h1", class_="title").get_text(strip=True)
        main_markup = soup.find("div", class_="article-holder").prettify()
        return BiliArticle(title, author, main_markup)

    @staticmethod
    def merge(new_title: str, new_author: str, alist):
        """Concatenate the bodies of several articles into one article.

        Args:
            new_title: Title of the merged article.
            new_author: Author of the merged article.
            alist: Iterable of ``BiliArticle``; bodies are joined in
                iteration order with no separator.

        Returns:
            A new ``BiliArticle``; the input articles are not modified.
        """
        return BiliArticle(new_title, new_author,
                           "".join(article.main_markup for article in alist))

    @staticmethod
    def _escape_attr(value: str) -> str:
        """Escape *value* for use inside a quoted HTML attribute."""
        # ``escape`` handles &, < and > on its own; quotes must be listed
        # explicitly or a '"' in the value would terminate the attribute.
        return escape(
            value, entities={
                "\"": "&quot;",
                "'": "&apos;",
            })

    def format_markup(self):
        """Render the article as a complete, pretty-printed HTML document.

        The title and author are escaped before being interpolated;
        ``main_markup`` is inserted as-is because it is already HTML.

        Returns:
            The document as a string, as produced by ``prettify()`` after
            re-parsing with html5lib.
        """
        author = self._escape_attr(self.author)
        soup = bs4.BeautifulSoup(
            f'<html><head><title>{escape(self.title)}</title><meta name="author" content="{author}"></meta></head><body>{self.main_markup}</body></html>',
            "html5lib")
        return soup.prettify()
if __name__ == "__main__": | |
if sys.argv[1] == "cv": | |
cv_id = sys.argv[2] | |
cv = BiliArticle.from_bili_cv(cv_id) | |
markup = cv.format_markup() | |
with open(f"result/{cv.title}.html", "w") as f: | |
f.write(markup) | |
elif sys.argv[1] == "series": | |
series_id = sys.argv[2] | |
url = f"https://api.bilibili.com/x/article/list/articles?id={series_id}&jsonp=jsonp" | |
res = requests.get(url).json() | |
list_name = res["data"]["list"]["name"] | |
list_author = res["data"]["author"]["name"] | |
os.makedirs(f"result/{list_name}/", exist_ok=True) | |
cvs = [] | |
for article in res["data"]["articles"]: | |
cv_id = article["id"] | |
cv_title = article["title"] | |
print(cv_id, cv_title) | |
cv = BiliArticle.from_bili_cv(cv_id) | |
cvs.append(cv) | |
with open(f"result/{list_name}/{cv_title}.html", "w") as f: | |
f.write(cv.format_markup()) | |
with open(f"result/{list_name}/merge.html", "w") as f: | |
f.write( | |
BiliArticle.merge(list_name, list_author, cvs).format_markup()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment