Last active
July 13, 2021 10:02
-
-
Save t-okkn/6156cc81eeaa4bffc12a31df846ee838 to your computer and use it in GitHub Desktop.
「小説を読もう」のある小説全編を一つのファイルにしてダウンロードします(R18対応)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
import requests | |
from requests.adapters import HTTPAdapter | |
from requests.cookies import create_cookie | |
from bs4 import BeautifulSoup | |
import sys, os, argparse | |
HOST = 'https://ncode.syosetu.com' | |
class Scraper(object):
    """Downloads a whole novel from syosetu.com into one text file.

    The HTTP session carries an ``over18=yes`` cookie (so R18 works are
    reachable) and a browser-like User-Agent built by ``get_user_agent()``.
    """

    def __init__(self, ncode, savepath=''):
        self._ncode = ncode
        self._user_agent = get_user_agent()
        self._session = requests.Session()
        # Single host, sequential requests -> one pooled connection suffices.
        adapter = HTTPAdapter(pool_connections=1, pool_maxsize=1)
        self._session.mount('https://', adapter)
        cookie_obj = create_cookie(
            domain='.syosetu.com', name='over18', value='yes', path='/')
        self._session.cookies.set_cookie(cookie_obj)
        self._session.headers.update({'User-Agent': self._user_agent})
        # Fall back to this script's directory when savepath is missing/invalid.
        if savepath and os.path.isdir(savepath):
            self._savepath = savepath
        else:
            self._savepath = os.path.dirname(os.path.abspath(__file__))

    def __enter__(self):
        # BUG FIX: the context-manager protocol requires __enter__ too;
        # without it ``with Scraper(...) as s:`` raised AttributeError.
        return self

    def __exit__(self, exc_type=None, exc_value=None, traceback=None):
        # BUG FIX: __exit__ must accept the three exception arguments.
        # Defaults keep any existing direct ``obj.__exit__()`` calls working.
        self._session.close()

    @property
    def session(self):
        """The underlying requests.Session (None only if cleared externally)."""
        # Original branched on ``is None`` and returned None in the else
        # branch — both paths reduce to returning the attribute itself.
        return self._session

    def get_text(self):
        """Fetch every episode of the novel and append them to a single file.

        Stops at the first episode number that does not resolve (404 or
        request failure), which marks the end of the novel.
        """
        main_url = '{}/{}/'.format(HOST, self._ncode)
        main_soup = self.__get_parser(main_url)
        if main_soup is None:
            print('【ERROR】指定したNコード({})の小説は存在しません'.format(self._ncode))
            self.__close()
            return
        title = self.__get_title(main_soup)
        path = os.path.join(self._savepath, '[{}]_{}.txt'.format(self._ncode, title))
        # Start from scratch; episodes are appended one by one below.
        if os.path.isfile(path):
            os.remove(path)
        count = 1
        p2 = '\n' * 2
        p10 = '\n' * 10
        print('start : [{}] {}'.format(self._ncode, title))
        while True:
            url = '{}/{}/{}'.format(HOST, self._ncode, count)
            soup = self.__get_parser(url)
            if soup is not None:
                subtitle = soup.find('p', {'class': 'novel_subtitle'}).get_text()
                honbun = soup.find('div', {'class': 'novel_view', 'id': 'novel_honbun'}).get_text()
                with open(path, mode='a', encoding='utf-8') as f:
                    f.write('-----{}----- {}{}{}{}'.format(count, subtitle, p2, honbun, p10))
                print(' |- {}話完了'.format(count))
                count += 1
            else:
                print('done : [{}] {}'.format(self._ncode, title))
                self.__close()
                break

    def __close(self):
        self._session.close()

    def __get_parser(self, url):
        """Return a BeautifulSoup for *url*, or None on non-200 / request error."""
        try:
            # Response as context manager guarantees the connection is
            # released even when parsing raises (original leaked it then).
            with self._session.get(url) as r:
                if r.status_code == 200:
                    return BeautifulSoup(r.text, "html.parser")
                return None
        except requests.RequestException:
            # BUG FIX: the original bare ``except:`` swallowed every
            # exception (even KeyboardInterrupt); only request failures
            # should be interpreted as "page does not exist".
            return None

    def __get_title(self, soup):
        """Extract the <title> text, stripped of surrounding whitespace."""
        return soup.find('title').get_text().strip()
def get_user_agent():
    """Build a Chrome-like User-Agent string.

    Tries to look up the current stable Chrome version from
    omahaproxy.appspot.com; on any failure it falls back to a generic
    Windows UA with version ``0.0.0.0``, so this function never raises.

    NOTE(review): omahaproxy has been discontinued by Google, so the
    lookup is expected to fail nowadays and the fallback is what you get
    — consider switching to https://versionhistory.googleapis.com/.

    Returns:
        str: a full ``Mozilla/5.0 ... Chrome/x Safari/y`` UA string.
    """
    url = 'https://omahaproxy.appspot.com/all.json'
    base = 'Mozilla/5.0 '
    platform = '(Windows NT 10.0; Win64; x64) '
    chrome_ver = '0.0.0.0'
    webkit_ver = '537.36'  # the WebKit token has been frozen at this value
    try:
        r = requests.get(url)
        if r.status_code == 200:
            json_data = r.json()
            versions = (inner for dic in json_data for inner in dic['versions'])
            stable = [(d['os'], d['current_version'])
                      for d in versions if d['channel'] == 'stable']
            # BUG FIX: the original named these locals ``os`` and ``re``,
            # shadowing the imported os module (and the re module name)
            # inside this scope.
            os_names = [name for name, _ in stable]
            if 'win64' in os_names:
                chrome_ver = [ver for name, ver in stable if name == 'win64'][0]
            elif 'linux' in os_names:
                chrome_ver = [ver for name, ver in stable if name == 'linux'][0]
                platform = '(X11; Linux x86_64) '
        r.close()
    except Exception:
        # Best effort by design: any failure (network, JSON shape, missing
        # keys) falls through to the static fallback values above.
        # (Narrowed from a bare ``except:`` so Ctrl-C still works.)
        pass
    webkit = 'AppleWebKit/{} (KHTML, like Gecko) '.format(webkit_ver)
    chrome = 'Chrome/{} Safari/{}'.format(chrome_ver, webkit_ver)
    return base + platform + webkit + chrome
def main():
    """Parse command-line arguments and download each requested novel.

    Exits with status 1 when the save path is not an existing directory;
    malformed N-codes are reported and skipped rather than aborting.
    """
    description = '「小説家になろう」から小説をダウンロードします'
    parser = argparse.ArgumentParser(description=description)
    default_dir = os.path.dirname(os.path.abspath(__file__))
    parser.add_argument('ncode', metavar='NCODE', nargs='+',
                        help='小説のNコードを指定してください')
    parser.add_argument('-p', '--path', metavar='Path', default=default_dir,
                        help='小説の保存先を指定します(default: %(default)s)')
    args = parser.parse_args()

    # Guard clause: refuse to continue with an unusable destination.
    if args.path and not os.path.isdir(args.path):
        print('【ERROR】存在しないディレクトリか、Pathではありません\n')
        sys.exit(1)

    for ncode in args.ncode:
        if not ncode:
            print('【ERROR】Nコードが空文字です\n')
        elif not ncode.startswith('n'):
            print('【ERROR】Nコードの値が不正です\n')
        else:
            Scraper(ncode, args.path).get_text()
if __name__ == '__main__':
    # Script entry point: download every requested novel, then exit 0.
    # (main() itself exits with 1 on an invalid save path.)
    main()
    sys.exit(0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment