# Fetch info on published works announced on the R18 site
# (gist by @karupoimou, last active July 16, 2019)
import re
import requests
import time
from bs4 import BeautifulSoup
import pandas as pd
# Interval between requests, in seconds
interval = 1
# Output file name
file_name = "R-18書籍化情報.xlsx"
# Number of listing pages (check the site and enter by hand; it may be easier to
# overshoot and drop the duplicates afterwards; see preview_page() below for a
# quick page check and drop_duplicates_from_output() at the bottom)
all_page_num = 143
# For pandas: accumulated rows and output column names
all_list = []
columns_name = ["書籍名", "出版形態", "著者名", "発売日", "出版社名", "レーベル名",
                "ISBN", "価格", "購入方法", "イラストレータ名", "ユーザー名", "XID",
                "ノクタ掲載", "ムーン掲載", "ミッド掲載", "電子書籍の有無",
                "コミックの記載", "記事のurl", "作者マイページ"]
# requests settings: the site requires a User-Agent header
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0"}
cookie = {'over18': 'yes'}  # cookie for the X (R-18) site
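
# ------------------------------------------------------------------
# Illustrative helper, added here as a sketch (not part of the
# original gist): fetch a single listing page and report how many
# article bodies it contains, which helps verify all_page_num and
# the "contentbody" selector before the full scrape. It reuses the
# headers/cookie defined above; the name preview_page is a
# hypothetical choice.
# ------------------------------------------------------------------
def preview_page(page):
    url = "https://blog.nightlantern.net/index.php?catid=2&page=%s" % page
    response = requests.get(url=url, headers=headers, cookies=cookie)
    response.raise_for_status()  # fail fast on HTTP errors
    soup = BeautifulSoup(response.content, "lxml")
    print(page, len(soup.find_all("div", class_="contentbody")))
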
def get_data():
    for page in range(1, all_page_num + 1):
        url = "https://blog.nightlantern.net/index.php?catid=2&page=%s" % page
        print(url)
        response = requests.get(url=url, headers=headers, cookies=cookie)
        html = response.content
        soup = BeautifulSoup(html, "lxml")
        sp2 = soup.find_all("div", class_="contentbody")
        for num in range(len(sp2)):
            # URL of the article
            get_url = soup.find_all("h2")[num].find("a").get("href")
            url = "https://blog.nightlantern.net/%s" % get_url
            # split the article body into lines
            contentbody = sp2[num].text.split("\n")
            # fields parsed from the article body (flags default to 0)
            book_title = ""
            book_type = ""
            author_name = ""
            publish_date = ""
            publisher = ""
            label = ""
            ISBN = ""
            price = ""
            sales_channel = ""
            illustrator = ""
            user_name = ""
            xid = ""
            is_noc = 0
            is_moon = 0
            is_mid = 0
            is_densi = 0
            is_comic = 0
            for text in contentbody:
                # strip the label prefix (e.g. "・書籍名") and the trailing character
                if "・書籍名" in text:
                    book_title = text[5:-1]
                if "・作品名" in text:
                    book_title = text[5:-1]
                if "の紹介です。" in text:
                    if "電子" in text:
                        book_type = "電子書籍"
                    else:
                        book_type = "書籍"
                if "・作者名" in text:
                    author_name = text[5:-1]
                if "・発売日" in text:
                    publish_date = text[5:-1]
                if "・出版社" in text:
                    publisher = text[5:-1]
                if "・レーベル" in text:
                    label = text[6:-1]
                if "・ISBN" in text:
                    ISBN_pre = text[6:-1]
                    ISBN = int(re.sub(r"\D", "", ISBN_pre))  # keep digits only
                if "・販売価格" in text:
                    price = text[6:-1]
                if "・購入方法" in text:
                    sales_channel = text[6:-1]
                if "・イラストレータ名" in text:
                    illustrator = text[10:-1]
                # flags: which sites/editions the article mentions
                if "ノクタ" in text:
                    is_noc = 1
                if "ムーン" in text:
                    is_moon = 1
                if "ミッド" in text:
                    is_mid = 1
                if "電子版" in text:
                    is_densi = 1
                if "電子書籍" in text:
                    is_densi = 1
                if "コミック" in text:
                    is_comic = 1
                if "コミカラ" in text:
                    is_comic = 1
            # user name
            user_name = sp2[num].find("a").text
            # XID (user id embedded in the linked URL)
            xid_url = sp2[num].find("a").get("href")
            index1 = xid_url.find("x", 20, -1)
            xid = xid_url[index1:-1]
            # author's mypage URL
            mypage_url = "https://xmypage.syosetu.com/%s/" % xid
            # assemble one row, in the same order as columns_name
            temp_list = [
                book_title, book_type, author_name, publish_date,
                publisher, label, ISBN, price, sales_channel,
                illustrator, user_name, xid, is_noc, is_moon,
                is_mid, is_densi, is_comic, url, mypage_url,
            ]
            all_list.append(temp_list)
        time.sleep(interval)
def save_data():
    # store the rows in a pandas DataFrame and write them to Excel
    df = pd.DataFrame(all_list, columns=columns_name)
    df.to_excel(file_name, sheet_name="Sheet1")

# run the tasks
t1 = time.time()
get_data()
save_data()
print("Elapsed time: %s" % str(time.time() - t1))