from tqdm import tqdm
import pandas as pd
import requests
import time
import gzip
import json
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0"}
cookie = {'over18': 'yes'}
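# The over18 cookie passes the age-verification gate on
# novel18.syosetu.com; regular ncode.syosetu.com pages ignore it.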

# Fetch interval between requests (seconds)
interval = 5

# Load the word list used for the typo check (the xlsx needs a "word" column)
df = pd.read_excel("誤字チェックリスト.xlsx")
word_list = df["word"].values.tolist()
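
# If you don't have a checklist yet, a minimal one can be created like
# this (hypothetical example words; only the "word" column is required):
#   pd.DataFrame({"word": ["ゆう", "づつ"]}).to_excel(
#       "誤字チェックリスト.xlsx", index=False)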

writer_id = input("Enter a Narou user ID or xid: ")

novel_date = []
url_list = []
# Fetch the author's list of works from the Narou API
def get_ncode_list():
    if "x" not in writer_id:
        payload = {'out': 'json', 'gzip': 5, 'lim': 500, 'userid': writer_id}
        res = requests.get('https://api.syosetu.com/novelapi/api/', params=payload).content
        page_url = "https://ncode.syosetu.com/"
    else:
        payload = {'out': 'json', 'gzip': 5, 'lim': 500, 'xid': writer_id}
        res = requests.get('https://api.syosetu.com/novel18api/api/', params=payload).content
        page_url = "https://novel18.syosetu.com/"
    r = gzip.decompress(res).decode("utf-8")

    # Collect ncode, episode count and novel type for each work; the API's
    # leading allcount record lacks these keys and is skipped via KeyError
    for data in json.loads(r):
        try:
            temp = [data['ncode'], data['general_all_no'], data['novel_type']]
            novel_date.append(temp)
        except KeyError:
            pass

    # Build the URL list: short stories (novel_type == 2) are one page,
    # serials get one URL per episode
    for i in range(len(novel_date)):
        if novel_date[i][2] == 2:
            url = page_url + "%s/" % novel_date[i][0]
            url_list.append(url)
        else:
            for cnt in range(1, novel_date[i][1] + 1):
                url = page_url + "%s/%s/" % (novel_date[i][0], cnt)
                url_list.append(url)
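
# For a serial with three posted episodes and a (hypothetical) ncode
# N0000AA, the loop above produces:
#   https://ncode.syosetu.com/N0000AA/1/
#   https://ncode.syosetu.com/N0000AA/2/
#   https://ncode.syosetu.com/N0000AA/3/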

# Scrape each work page and accumulate its body text
def get_text_all():
    global html_text
    html_text = ""
    for url in tqdm(url_list):
        try:
            response = requests.get(url=url, headers=headers, cookies=cookie)
        except requests.exceptions.RequestException:
            # On a network error, back off for a minute and retry once
            time.sleep(60)
            response = requests.get(url=url, headers=headers, cookies=cookie)
        html = response.content
        soup = BeautifulSoup(html, "lxml")
        novel_view = soup.find_all("div", id="novel_honbun")[0].text
        html_text = html_text + novel_view + "<a href='%s' target='_blank'>%s</a>" % (url, url)
        time.sleep(interval)
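
# Note (assumption): "novel_honbun" is the id of the div that held the
# chapter body in the Narou page layout at the time this gist was written;
# if the site markup changes, the selector above will need updating.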

# Generate the HTML report
def gen_html():
    # Open the document
    all_text = "<html>" + html_text
    # Highlight every word from the checklist in red
    # (the scraped text is inserted without HTML-escaping)
    for w in word_list:
        all_text = all_text.replace("%s" % w, "<font size='5' color='#ff0000'><b>%s</b></font>" % w)
    # Close the document
    all_text = all_text + "</html>"
    # Write the report, named after the writer ID
    with open('%s.html' % writer_id, mode='w', encoding='UTF-8', errors='ignore') as f:
        f.write(all_text)

get_ncode_list()
get_text_all()
gen_html()
print("Finished.")
input("Press any key to exit")
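
# Environment sketch (assumption, not part of the original gist): the
# script needs tqdm, pandas, requests, beautifulsoup4, lxml and an Excel
# engine for pandas.read_excel (e.g. openpyxl):
#   pip install tqdm pandas requests beautifulsoup4 lxml openpyxl
# Run it with the checklist xlsx in the working directory.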