Skip to content

Instantly share code, notes, and snippets.

@t-okkn
Last active July 13, 2021 10:02
Show Gist options
  • Save t-okkn/6156cc81eeaa4bffc12a31df846ee838 to your computer and use it in GitHub Desktop.
Save t-okkn/6156cc81eeaa4bffc12a31df846ee838 to your computer and use it in GitHub Desktop.
「小説を読もう」のある小説全編を一つのファイルにしてダウンロードします(R18対応)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
from requests.adapters import HTTPAdapter
from requests.cookies import create_cookie
from bs4 import BeautifulSoup
import sys, os, argparse
# Base URL of the novel site. The table-of-contents page is requested as
# HOST/<ncode>/ and each chapter as HOST/<ncode>/<chapter number>.
HOST = 'https://ncode.syosetu.com'
class Scraper(object):
    """Download every chapter of one novel from HOST into a single text file.

    A ``requests.Session`` is created with an ``over18=yes`` cookie for
    ``.syosetu.com`` and the User-Agent returned by ``get_user_agent()``.
    The instance can be used as a context manager; leaving the ``with``
    block closes the session.
    """

    # Seconds to wait for the server before a request counts as failed.
    # Without a timeout a stalled connection would block forever.
    _TIMEOUT = 30

    def __init__(self, ncode, savepath=''):
        """Prepare the HTTP session and resolve the output directory.

        Args:
            ncode: Novel code used to build the page URLs.
            savepath: Directory for the output file. When empty or not an
                existing directory, the directory of this script is used.
        """
        self._ncode = ncode
        self._user_agent = get_user_agent()
        self._session = requests.Session()

        # A single pooled connection is enough: pages are fetched one by one.
        adapter = HTTPAdapter(pool_connections=1, pool_maxsize=1)
        self._session.mount('https://', adapter)

        cookie_obj = create_cookie(
            domain='.syosetu.com', name='over18', value='yes', path='/')
        self._session.cookies.set_cookie(cookie_obj)
        self._session.headers.update({'User-Agent': self._user_agent})

        if savepath and os.path.isdir(savepath):
            self._savepath = savepath
        else:
            self._savepath = os.path.dirname(os.path.abspath(__file__))

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type=None, exc_value=None, traceback=None):
        """Close the session; exceptions from the ``with`` body propagate.

        The arguments default to ``None`` so that a direct ``__exit__()``
        call without arguments keeps working.
        """
        self._session.close()

    @property
    def session(self):
        """The underlying ``requests.Session`` (or ``None`` if unset)."""
        return self._session

    def get_text(self):
        """Fetch all chapters and append them to ``[<ncode>]_<title>.txt``.

        An existing file with the same name is removed first. Chapters are
        requested as 1, 2, 3, ... until a page cannot be fetched. Progress
        and errors are printed to stdout. The session is closed when the
        method returns, including when it exits through an exception.
        """
        p2 = '\n' * 2
        p10 = '\n' * 10

        try:
            main_url = '{}/{}/'.format(HOST, self._ncode)
            main_soup = self.__get_parser(main_url)
            if main_soup is None:
                print('【ERROR】指定したNコード({})の小説は存在しません'.format(self._ncode))
                return

            title = self.__get_title(main_soup)
            # The page title is arbitrary text; neutralise characters that
            # cannot appear in a file name so open() does not fail.
            filename = self.__sanitize_filename(
                '[{}]_{}.txt'.format(self._ncode, title))
            path = os.path.join(self._savepath, filename)
            if os.path.isfile(path):
                os.remove(path)

            print('start : [{}] {}'.format(self._ncode, title))
            count = 1
            while True:
                url = '{}/{}/{}'.format(HOST, self._ncode, count)
                soup = self.__get_parser(url)
                if soup is None:
                    # No such page (or the request failed): end of the novel.
                    break

                subtitle = soup.find('p', {'class': 'novel_subtitle'})
                honbun = soup.find(
                    'div', {'class': 'novel_view', 'id': 'novel_honbun'})
                if subtitle is None or honbun is None:
                    # find() returns None when the element is missing; stop
                    # with a message instead of an AttributeError traceback.
                    print('【ERROR】{}話の本文を取得できませんでした'.format(count))
                    break

                with open(path, mode='a', encoding='utf-8') as f:
                    f.write('-----{}----- {}{}{}{}'.format(
                        count, subtitle.get_text(), p2, honbun.get_text(), p10))
                print(' |- {}話完了'.format(count))
                count += 1

            print('done : [{}] {}'.format(self._ncode, title))
        finally:
            self.__close()

    def __close(self):
        """Close the HTTP session."""
        self._session.close()

    def __get_parser(self, url):
        """Fetch ``url`` and return its parsed HTML.

        Returns:
            A ``BeautifulSoup`` object when the response status is 200,
            otherwise ``None``. A failed request (connection error, timeout)
            is reported on stdout and also yields ``None``.
        """
        try:
            r = self._session.get(url, timeout=self._TIMEOUT)
        except requests.RequestException as e:
            print('【ERROR】通信に失敗しました: {} ({})'.format(url, e))
            return None

        try:
            if r.status_code != 200:
                return None
            return BeautifulSoup(r.text, "html.parser")
        finally:
            r.close()

    def __get_title(self, soup):
        """Return the stripped text of the page's <title>, or '' if absent."""
        node = soup.find('title')
        if node is None:
            return ''
        return node.get_text().strip()

    @staticmethod
    def __sanitize_filename(name):
        """Replace characters that are not allowed in a file name with '_'.

        '/' and NUL are always replaced; on Windows the additional reserved
        characters are replaced as well, so names that were already valid on
        the current platform are left untouched.
        """
        forbidden = '/\0'
        if os.name == 'nt':
            forbidden += '\\:*?"<>|'
        return name.translate(str.maketrans({ch: '_' for ch in forbidden}))
def get_user_agent():
    """Build a Chrome-like User-Agent string.

    The current stable Chrome version is looked up from the omahaproxy JSON
    feed. The Windows 64-bit entry is preferred; if only a Linux entry is
    present, the platform token is switched to Linux as well. The lookup is
    best-effort: on any failure the defaults (Windows platform token and
    version ``0.0.0.0``) are used.

    Returns:
        str: ``Mozilla/5.0 <platform> AppleWebKit/537.36 (KHTML, like Gecko)
        Chrome/<version> Safari/537.36``.
    """
    url = 'https://omahaproxy.appspot.com/all.json'
    base = 'Mozilla/5.0 '
    platform_token = '(Windows NT 10.0; Win64; x64) '
    chrome_ver = '0.0.0.0'
    webkit_ver = '537.36'  # not change?

    try:
        # Bounded wait so an unreachable service cannot hang start-up.
        r = requests.get(url, timeout=10)
        try:
            if r.status_code == 200:
                # Map OS name -> first stable version listed for that OS.
                stable = {}
                for entry in r.json():
                    for version in entry['versions']:
                        if version['channel'] == 'stable':
                            stable.setdefault(
                                version['os'], version['current_version'])

                if 'win64' in stable:
                    chrome_ver = stable['win64']
                elif 'linux' in stable:
                    chrome_ver = stable['linux']
                    platform_token = '(X11; Linux x86_64) '
        finally:
            r.close()
    except Exception:
        # Deliberate best-effort: network, HTTP or JSON-shape problems fall
        # back to the defaults. ``Exception`` (not a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        pass

    webkit = 'AppleWebKit/{} (KHTML, like Gecko) '.format(webkit_ver)
    chrome = 'Chrome/{} Safari/{}'.format(chrome_ver, webkit_ver)
    return base + platform_token + webkit + chrome
def main():
    """Parse the command line and download each requested novel.

    Positional arguments are one or more N-codes; ``-p/--path`` selects the
    output directory (default: the directory containing this script). The
    process exits with status 1 when the given path is not a directory.
    N-codes that are empty or do not start with 'n' are reported and skipped.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))

    arg_parser = argparse.ArgumentParser(
        description='「小説家になろう」から小説をダウンロードします')
    arg_parser.add_argument(
        'ncode', metavar='NCODE', nargs='+',
        help='小説のNコードを指定してください')
    arg_parser.add_argument(
        '-p', '--path', metavar='Path', default=script_dir,
        help='小説の保存先を指定します(default: %(default)s)')
    options = arg_parser.parse_args()

    # Guard clause: an empty path is accepted, anything else must be a directory.
    path_is_usable = (not options.path) or os.path.isdir(options.path)
    if not path_is_usable:
        print('【ERROR】存在しないディレクトリか、Pathではありません\n')
        sys.exit(1)

    for code in options.ncode:
        if not code:
            print('【ERROR】Nコードが空文字です\n')
        elif not code.startswith('n'):
            print('【ERROR】Nコードの値が不正です\n')
        else:
            Scraper(code, options.path).get_text()
# Script entry point: run the downloader, then exit with status 0.
if __name__ == '__main__':
    main()
    raise SystemExit(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment