@karupoimou
Last active December 26, 2019 08:25
from tqdm import tqdm
import pandas as pd
import requests
import time
import gzip
import json
from bs4 import BeautifulSoup
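
# Requires: tqdm, pandas, requests, beautifulsoup4, lxml, and an Excel engine
# for pandas.read_excel (e.g. openpyxl or xlrd); this dependency list is
# inferred from the imports above rather than stated in the original gist.
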
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0"}
cookie = {'over18': 'yes'}
# fetch interval between page requests (seconds)
interval = 5

# load the word list used for the typo check
df = pd.read_excel("誤字チェックリスト.xlsx")
word_list = df["word"].values.tolist()

# prompt for the author's Narou user ID (or xid for the 18+ site)
writer_id = input("なろうIDまたはxidを入力してください")

novel_date = []
url_list = []
# fetch the author's work list from the Narou API
def get_ncode_list():
    if "x" not in writer_id:
        payload = {'out': 'json', 'gzip': 5, 'lim': 500, 'userid': writer_id}
        res = requests.get('https://api.syosetu.com/novelapi/api/', params=payload).content
        page_url = "https://ncode.syosetu.com/"
    else:
        payload = {'out': 'json', 'gzip': 5, 'lim': 500, 'xid': writer_id}
        res = requests.get('https://api.syosetu.com/novel18api/api/', params=payload).content
        page_url = "https://novel18.syosetu.com/"

    r = gzip.decompress(res).decode("utf-8")

    # collect ncode, total episode count and novel type for each work;
    # the first element of the API response only carries the hit count, so the
    # KeyError on the missing 'ncode' key skips it
    for data in json.loads(r):
        try:
            temp = []
            temp.append(data['ncode'])
            temp.append(data['general_all_no'])
            temp.append(data['novel_type'])
            novel_date.append(temp)
        except KeyError:
            pass

    # build the list of page URLs: novel_type 2 is a one-shot (single page),
    # otherwise add one URL per episode
    for i in range(len(novel_date)):
        if novel_date[i][2] == 2:
            url = page_url + "%s/" % novel_date[i][0]
            url_list.append(url)
        else:
            for cnt in range(novel_date[i][1]):
                cnt = cnt + 1
                url = page_url + "%s/%s/" % (novel_date[i][0], cnt)
                url_list.append(url)
# scrape the body text of every work page
def get_text_all():
    global html_text
    html_text = ""

    for url in tqdm(url_list):
        try:
            response = requests.get(url=url, headers=headers, cookies=cookie)
        except requests.exceptions.RequestException:
            # on a connection error, wait a minute and retry once
            time.sleep(60)
            response = requests.get(url=url, headers=headers, cookies=cookie)

        html = response.content
        soup = BeautifulSoup(html, "lxml")
        novel_view = soup.find_all("div", id="novel_honbun")[0].text
        html_text = html_text + novel_view + "<a href='%s' target='_blank'>%s</a>" % (url, url)

        time.sleep(interval)
# build and write the HTML report
def gen_html():
    # open the HTML document
    all_text = "<html>" + html_text

    # highlight every word from the check list in the collected text
    for w in word_list:
        all_text = all_text.replace("%s" % w, "<font size='5' color='#ff0000'><b>%s</b></font>" % w)

    # close the HTML document
    all_text = all_text + "</html>"

    # write the result next to the script, named after the writer ID
    with open('%s.html' % writer_id, mode='w', encoding='UTF-8', errors='ignore') as f:
        f.write(all_text)


get_ncode_list()
get_text_all()
gen_html()

print("完了しました")  # "Finished"
input("何かキーを押すと終了します")  # "Press any key to exit"
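
# ---------------------------------------------------------------------------
# Note: the script expects an Excel file named 誤字チェックリスト.xlsx in the
# working directory, with a column headed "word" listing the strings to
# highlight. A minimal sketch for creating such a file (the sample words below
# are placeholders, not part of the original gist):
#
#   import pandas as pd
#   pd.DataFrame({"word": ["シュミレーション", "こんにちわ"]}).to_excel(
#       "誤字チェックリスト.xlsx", index=False)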