t-okkn/syosetu-downloader.py

## syosetu-downloader.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import requests
from requests.adapters import HTTPAdapter
from requests.cookies import create_cookie
from bs4 import BeautifulSoup
import sys, os, argparse

# Base URL of the novel site; pages are requested as HOST/<ncode>/ (index)
# and HOST/<ncode>/<chapter number> (individual chapters).
HOST = 'https://ncode.syosetu.com'

class Scraper(object):
    """Download all chapters of one novel from HOST into a single text file.

    The instance owns a ``requests.Session`` that carries an ``over18=yes``
    cookie for ``.syosetu.com`` (presumably to get past the age-confirmation
    page -- confirm against the site) and a browser-like User-Agent header.
    ``get_text()`` closes that session when it finishes, so an instance is
    meant for a single download.  The class also works as a context manager,
    which closes the session on exit.
    """

    # Seconds to wait for the server before a request is abandoned; without
    # a timeout a stalled connection would block the download forever.
    _TIMEOUT = 30

    # Translation table mapping path separators, the punctuation Windows
    # rejects in file names, and ASCII control characters to '_'.
    _UNSAFE_FILENAME_CHARS = str.maketrans(
        {c: '_' for c in '\\/:*?"<>|' + ''.join(map(chr, range(32)))})

    def __init__(self, ncode, savepath=''):
        """Prepare an HTTP session for downloading the novel ``ncode``.

        Args:
            ncode: Novel identifier used as the URL path segment after HOST.
            savepath: Directory for the output file.  When empty or not an
                existing directory, the directory containing this script is
                used instead.
        """
        self._ncode = ncode
        self._user_agent = get_user_agent()

        self._session = requests.Session()
        # A single pooled connection is enough: pages are fetched one by one.
        adapter = HTTPAdapter(pool_connections=1, pool_maxsize=1)
        self._session.mount('https://', adapter)

        cookie_obj = create_cookie(
                domain='.syosetu.com', name='over18', value='yes', path='/')
        self._session.cookies.set_cookie(cookie_obj)
        self._session.headers.update({'User-Agent': self._user_agent})

        if savepath and os.path.isdir(savepath):
            self._savepath = savepath
        else:
            self._savepath = os.path.dirname(os.path.abspath(__file__))

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, *exc_info):
        """Close the HTTP session; exceptions are never suppressed."""
        self._session.close()

    @property
    def session(self):
        """The underlying ``requests.Session`` (or None if it was unset)."""
        return self._session

    def get_text(self):
        """Download every chapter and append it to ``[<ncode>]_<title>.txt``.

        Chapters are requested as HOST/<ncode>/1, /2, ... until a page cannot
        be loaded; a failed request is therefore indistinguishable from the
        end of the novel.  An existing output file is removed first.  Progress
        and errors are printed to stdout.  The session is always closed before
        returning, even when an exception propagates.
        """
        try:
            main_soup = self.__get_parser('{}/{}/'.format(HOST, self._ncode))

            if main_soup is None:
                print('【ERROR】指定したNコード（{}）の小説は存在しません'.format(self._ncode))
                return

            title = self.__get_title(main_soup)
            # The title comes from the remote page, so it must not be able to
            # inject path separators or characters the filesystem rejects.
            filename = self._sanitize_filename(
                '[{}]_{}.txt'.format(self._ncode, title))
            path = os.path.join(self._savepath, filename)

            if os.path.isfile(path):
                os.remove(path)

            print('start : [{}] {}'.format(self._ncode, title))

            count = 1
            while True:
                soup = self.__get_parser(
                    '{}/{}/{}'.format(HOST, self._ncode, count))

                if soup is None:
                    print('done : [{}] {}'.format(self._ncode, title))
                    break

                subtitle = soup.find('p', {'class': 'novel_subtitle'})
                honbun = soup.find(
                    'div', {'class': 'novel_view', 'id': 'novel_honbun'})

                if subtitle is None or honbun is None:
                    # The page loaded but lacks the expected elements; stop
                    # with a message instead of crashing on None.get_text().
                    print('【ERROR】{}話の本文を解析できませんでした'.format(count))
                    break

                # Appending per chapter keeps already-downloaded chapters on
                # disk if a later request fails.
                with open(path, mode='a', encoding='utf-8') as f:
                    f.write('-----{}----- {}{}{}{}'.format(
                        count, subtitle.get_text(), '\n' * 2,
                        honbun.get_text(), '\n' * 10))

                print(' |- {}話完了'.format(count))
                count += 1
        finally:
            self.__close()

    @staticmethod
    def _sanitize_filename(name):
        """Return ``name`` with characters unsafe in file names replaced by '_'."""
        return name.translate(Scraper._UNSAFE_FILENAME_CHARS)

    def __close(self):
        """Close the HTTP session."""
        self._session.close()

    def __get_parser(self, url):
        """Fetch ``url`` and return it parsed as HTML.

        Returns:
            A ``BeautifulSoup`` object when the server answers 200, otherwise
            None (non-200 status, timeout, or any other request failure).
        """
        try:
            r = self._session.get(url, timeout=self._TIMEOUT)
        except requests.RequestException:
            # Only request failures are swallowed, so Ctrl-C still aborts.
            return None

        try:
            if r.status_code != 200:
                return None
            return BeautifulSoup(r.text, "html.parser")
        finally:
            r.close()

    def __get_title(self, soup):
        """Return the stripped text of the page's <title>, or '' if absent."""
        tag = soup.find('title')
        if tag is None:
            return ''
        return tag.get_text().strip()


def get_user_agent():
    """Build a Chrome-style User-Agent string.

    The stable Chrome version is looked up from the omahaproxy JSON feed.
    The code expects a list of objects, each with a ``versions`` list whose
    entries have ``os``, ``channel`` and ``current_version`` keys.  The
    ``win64`` entry is preferred; ``linux`` is used (with a Linux platform
    token) when no ``win64`` entry exists.

    The lookup is best-effort: on a non-200 answer, a request failure, a
    timeout, or an unexpected JSON shape the function falls back to the
    Windows platform token and the placeholder version ``0.0.0.0``.

    Returns:
        The User-Agent header value as a string.
    """
    url = 'https://omahaproxy.appspot.com/all.json'
    base = 'Mozilla/5.0 '
    platform = '(Windows NT 10.0; Win64; x64) '

    chrome_ver = '0.0.0.0'
    webkit_ver = '537.36'  # not change?

    try:
        # Bounded wait: this lookup must never stall the actual download.
        r = requests.get(url, timeout=10)
        try:
            if r.status_code == 200:
                # First stable version seen per OS name.
                stable = {}
                for entry in r.json():
                    for ver in entry['versions']:
                        if ver['channel'] == 'stable':
                            stable.setdefault(ver['os'], ver['current_version'])

                if 'win64' in stable:
                    chrome_ver = stable['win64']
                elif 'linux' in stable:
                    chrome_ver = stable['linux']
                    platform = '(X11; Linux x86_64) '
        finally:
            r.close()

    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Deliberate fallback: keep the defaults set above.
        pass

    webkit = 'AppleWebKit/{} (KHTML, like Gecko) '.format(webkit_ver)
    chrome = 'Chrome/{} Safari/{}'.format(chrome_ver, webkit_ver)

    return base + platform + webkit + chrome


def main():
    """Parse the command line and download every requested novel.

    Positional arguments are one or more N-codes; ``-p/--path`` selects the
    output directory and defaults to the directory containing this script.
    Exits with status 1 when the given path is not an existing directory.
    An N-code that is empty or does not start with ``n`` is reported and
    skipped; the remaining codes are still processed.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))

    parser = argparse.ArgumentParser(
        description='「小説家になろう」から小説をダウンロードします')
    parser.add_argument('ncode', metavar='NCODE', nargs='+',
                        help='小説のNコードを指定してください')
    parser.add_argument('-p', '--path', metavar='Path', default=script_dir,
                        help='小説の保存先を指定します（default: %(default)s）')
    options = parser.parse_args()

    # Guard clause: refuse to start when the target directory is unusable.
    if options.path and not os.path.isdir(options.path):
        print('【ERROR】存在しないディレクトリか、Pathではありません\n')
        sys.exit(1)

    for code in options.ncode:
        if not code:
            print('【ERROR】Nコードが空文字です\n')
        elif code[0] != 'n':
            print('【ERROR】Nコードの値が不正です\n')
        else:
            Scraper(code, options.path).get_text()


# Script entry point: run the CLI only when executed directly (not on import),
# then exit with an explicit success status once main() returns.
if __name__ == '__main__':
    main()
    sys.exit(0)
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	import requests
	from requests.adapters import HTTPAdapter
	from requests.cookies import create_cookie
	from bs4 import BeautifulSoup
	import sys, os, argparse

	# Base URL of the novel site; pages are requested as HOST/<ncode>/ (index)
	# and HOST/<ncode>/<chapter number> (individual chapters).
	HOST = 'https://ncode.syosetu.com'

	class Scraper(object):
	    """Download all chapters of one novel from HOST into a single text file.

	    The instance owns a ``requests.Session`` that carries an ``over18=yes``
	    cookie for ``.syosetu.com`` (presumably to get past the age-confirmation
	    page -- confirm against the site) and a browser-like User-Agent header.
	    ``get_text()`` closes that session when it finishes, so an instance is
	    meant for a single download.  The class also works as a context manager,
	    which closes the session on exit.
	    """

	    # Seconds to wait for the server before a request is abandoned; without
	    # a timeout a stalled connection would block the download forever.
	    _TIMEOUT = 30

	    # Translation table mapping path separators, the punctuation Windows
	    # rejects in file names, and ASCII control characters to '_'.
	    _UNSAFE_FILENAME_CHARS = str.maketrans(
	        {c: '_' for c in '\\/:*?"<>|' + ''.join(map(chr, range(32)))})

	    def __init__(self, ncode, savepath=''):
	        """Prepare an HTTP session for downloading the novel ``ncode``.

	        Args:
	            ncode: Novel identifier used as the URL path segment after HOST.
	            savepath: Directory for the output file.  When empty or not an
	                existing directory, the directory containing this script is
	                used instead.
	        """
	        self._ncode = ncode
	        self._user_agent = get_user_agent()

	        self._session = requests.Session()
	        # A single pooled connection is enough: pages are fetched one by one.
	        adapter = HTTPAdapter(pool_connections=1, pool_maxsize=1)
	        self._session.mount('https://', adapter)

	        cookie_obj = create_cookie(
	                domain='.syosetu.com', name='over18', value='yes', path='/')
	        self._session.cookies.set_cookie(cookie_obj)
	        self._session.headers.update({'User-Agent': self._user_agent})

	        if savepath and os.path.isdir(savepath):
	            self._savepath = savepath
	        else:
	            self._savepath = os.path.dirname(os.path.abspath(__file__))

	    def __enter__(self):
	        """Return the scraper itself so it can be used in a ``with`` block."""
	        return self

	    def __exit__(self, *exc_info):
	        """Close the HTTP session; exceptions are never suppressed."""
	        self._session.close()

	    @property
	    def session(self):
	        """The underlying ``requests.Session`` (or None if it was unset)."""
	        return self._session

	    def get_text(self):
	        """Download every chapter and append it to ``[<ncode>]_<title>.txt``.

	        Chapters are requested as HOST/<ncode>/1, /2, ... until a page cannot
	        be loaded; a failed request is therefore indistinguishable from the
	        end of the novel.  An existing output file is removed first.  Progress
	        and errors are printed to stdout.  The session is always closed before
	        returning, even when an exception propagates.
	        """
	        try:
	            main_soup = self.__get_parser('{}/{}/'.format(HOST, self._ncode))

	            if main_soup is None:
	                print('【ERROR】指定したNコード（{}）の小説は存在しません'.format(self._ncode))
	                return

	            title = self.__get_title(main_soup)
	            # The title comes from the remote page, so it must not be able to
	            # inject path separators or characters the filesystem rejects.
	            filename = self._sanitize_filename(
	                '[{}]_{}.txt'.format(self._ncode, title))
	            path = os.path.join(self._savepath, filename)

	            if os.path.isfile(path):
	                os.remove(path)

	            print('start : [{}] {}'.format(self._ncode, title))

	            count = 1
	            while True:
	                soup = self.__get_parser(
	                    '{}/{}/{}'.format(HOST, self._ncode, count))

	                if soup is None:
	                    print('done : [{}] {}'.format(self._ncode, title))
	                    break

	                subtitle = soup.find('p', {'class': 'novel_subtitle'})
	                honbun = soup.find(
	                    'div', {'class': 'novel_view', 'id': 'novel_honbun'})

	                if subtitle is None or honbun is None:
	                    # The page loaded but lacks the expected elements; stop
	                    # with a message instead of crashing on None.get_text().
	                    print('【ERROR】{}話の本文を解析できませんでした'.format(count))
	                    break

	                # Appending per chapter keeps already-downloaded chapters on
	                # disk if a later request fails.
	                with open(path, mode='a', encoding='utf-8') as f:
	                    f.write('-----{}----- {}{}{}{}'.format(
	                        count, subtitle.get_text(), '\n' * 2,
	                        honbun.get_text(), '\n' * 10))

	                print(' |- {}話完了'.format(count))
	                count += 1
	        finally:
	            self.__close()

	    @staticmethod
	    def _sanitize_filename(name):
	        """Return ``name`` with characters unsafe in file names replaced by '_'."""
	        return name.translate(Scraper._UNSAFE_FILENAME_CHARS)

	    def __close(self):
	        """Close the HTTP session."""
	        self._session.close()

	    def __get_parser(self, url):
	        """Fetch ``url`` and return it parsed as HTML.

	        Returns:
	            A ``BeautifulSoup`` object when the server answers 200, otherwise
	            None (non-200 status, timeout, or any other request failure).
	        """
	        try:
	            r = self._session.get(url, timeout=self._TIMEOUT)
	        except requests.RequestException:
	            # Only request failures are swallowed, so Ctrl-C still aborts.
	            return None

	        try:
	            if r.status_code != 200:
	                return None
	            return BeautifulSoup(r.text, "html.parser")
	        finally:
	            r.close()

	    def __get_title(self, soup):
	        """Return the stripped text of the page's <title>, or '' if absent."""
	        tag = soup.find('title')
	        if tag is None:
	            return ''
	        return tag.get_text().strip()


	def get_user_agent():
	    """Build a Chrome-style User-Agent string.

	    The stable Chrome version is looked up from the omahaproxy JSON feed.
	    The code expects a list of objects, each with a ``versions`` list whose
	    entries have ``os``, ``channel`` and ``current_version`` keys.  The
	    ``win64`` entry is preferred; ``linux`` is used (with a Linux platform
	    token) when no ``win64`` entry exists.

	    The lookup is best-effort: on a non-200 answer, a request failure, a
	    timeout, or an unexpected JSON shape the function falls back to the
	    Windows platform token and the placeholder version ``0.0.0.0``.

	    Returns:
	        The User-Agent header value as a string.
	    """
	    url = 'https://omahaproxy.appspot.com/all.json'
	    base = 'Mozilla/5.0 '
	    platform = '(Windows NT 10.0; Win64; x64) '

	    chrome_ver = '0.0.0.0'
	    webkit_ver = '537.36'  # not change?

	    try:
	        # Bounded wait: this lookup must never stall the actual download.
	        r = requests.get(url, timeout=10)
	        try:
	            if r.status_code == 200:
	                # First stable version seen per OS name.
	                stable = {}
	                for entry in r.json():
	                    for ver in entry['versions']:
	                        if ver['channel'] == 'stable':
	                            stable.setdefault(ver['os'], ver['current_version'])

	                if 'win64' in stable:
	                    chrome_ver = stable['win64']
	                elif 'linux' in stable:
	                    chrome_ver = stable['linux']
	                    platform = '(X11; Linux x86_64) '
	        finally:
	            r.close()

	    except (requests.RequestException, ValueError, KeyError, TypeError):
	        # Deliberate fallback: keep the defaults set above.
	        pass

	    webkit = 'AppleWebKit/{} (KHTML, like Gecko) '.format(webkit_ver)
	    chrome = 'Chrome/{} Safari/{}'.format(chrome_ver, webkit_ver)

	    return base + platform + webkit + chrome


	def main():
	    """Parse the command line and download every requested novel.

	    Positional arguments are one or more N-codes; ``-p/--path`` selects the
	    output directory and defaults to the directory containing this script.
	    Exits with status 1 when the given path is not an existing directory.
	    An N-code that is empty or does not start with ``n`` is reported and
	    skipped; the remaining codes are still processed.
	    """
	    d = '「小説家になろう」から小説をダウンロードします'
	    parser = argparse.ArgumentParser(description=d)

	    file_path = os.path.dirname(os.path.abspath(__file__))
	    parser.add_argument('ncode', metavar='NCODE', nargs='+',
	                        help='小説のNコードを指定してください')
	    parser.add_argument('-p', '--path', metavar='Path', default=file_path,
	                        help='小説の保存先を指定します（default: %(default)s）')

	    args = parser.parse_args()

	    # Refuse to start when the target directory is unusable.
	    if args.path and (not os.path.isdir(args.path)):
	        print('【ERROR】存在しないディレクトリか、Pathではありません\n')
	        sys.exit(1)

	    for ncode in args.ncode:
	        if not ncode:
	            print('【ERROR】Nコードが空文字です\n')
	            continue

	        if ncode[0] != 'n':
	            print('【ERROR】Nコードの値が不正です\n')
	            continue

	        s = Scraper(ncode, args.path)
	        s.get_text()


	# Script entry point: run the CLI only when executed directly (not on
	# import), then exit with an explicit success status once main() returns.
	if __name__ == '__main__':
	    main()
	    sys.exit(0)