Last active
July 16, 2019 10:46
-
-
Save karupoimou/bcf5a0a1b51c3072f131cfc03e5492a3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# R18サイト出版作品紹介情報取得 — scrape book-release announcement info from the R-18 blog
import re | |
import requests | |
import time | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
# --- scraper configuration -------------------------------------------------

# Seconds to sleep between page fetches (politeness delay).
interval = 1

# Name of the Excel workbook to write.
file_name = "R-18書籍化情報.xlsx"

# Number of listing pages to crawl (look it up on the site and type it in;
# over-estimating and deleting duplicate rows afterwards may be easier).
all_page_num = 143

# Accumulator for scraped rows and the column headers for the DataFrame.
all_list = []
columns_name = ["書籍名", "出版形態", "著者名", "発売日", "出版社名", "レーベル名", "ISBN", "価格", "購入方法", "イラストレータ名", "ユーザー名", "XID", "ノクタ掲載", "ムーン掲載", "ミッド掲載", "電子書籍の有無", "コミックの記載", "記事のurl", "作者マイページ"]

# requests setup: a User-Agent header is required by the site.
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0"}
cookie = {'over18': 'yes'}  # age-gate cookie for the X (R-18) site
def _parse_entry_fields(lines):
    """Parse one article body (already split on newlines) into 15 fields.

    Returns a list of
    [book_title, book_type, author_name, publish_date, publisher, label,
     isbn, price, sales_channel, illustrator,
     is_noc, is_moon, is_mid, is_densi, is_comic]
    where the first ten are strings (isbn becomes an int when digits are
    found) and the last five are 0/1 flags.

    NOTE(review): the hard-coded slices like text[5:-1] assume a fixed
    "・<label>:" prefix and one trailing character (presumably "\r" left by
    splitting CRLF content on "\n") — TODO confirm against live pages.
    """
    book_title = book_type = author_name = publish_date = publisher = ""
    label = isbn = price = sales_channel = illustrator = ""
    is_noc = is_moon = is_mid = is_densi = is_comic = 0

    for text in lines:
        # "・書籍名" (book title) and "・作品名" (work title) share a layout.
        if "・書籍名" in text or "・作品名" in text:
            book_title = text[5:-1]
        if "の紹介です。" in text:
            # The intro sentence tells us whether this is an e-book release.
            book_type = "電子書籍" if "電子" in text else "書籍"
        if "・作者名" in text:
            author_name = text[5:-1]
        if "・発売日" in text:
            publish_date = text[5:-1]
        if "・出版社" in text:
            publisher = text[5:-1]
        if "・レーベル" in text:
            label = text[6:-1]
        if "・ISBN" in text:
            # Keep only the digits. Guard against int("") raising
            # ValueError when an ISBN line contains no digits at all
            # (the original crashed here).
            digits = re.sub(r"\D", "", text[6:-1])
            if digits:
                isbn = int(digits)
        if "・販売価格" in text:
            price = text[6:-1]
        if "・購入方法" in text:
            sales_channel = text[6:-1]
        if "・イラストレータ名" in text:
            illustrator = text[10:-1]
        # Site-of-origin flags (Nocturne / Moonlight / Midnight).
        if "ノクタ" in text:
            is_noc = 1
        if "ムーン" in text:
            is_moon = 1
        if "ミッド" in text:
            is_mid = 1
        # E-book mentioned under either wording.
        if "電子版" in text or "電子書籍" in text:
            is_densi = 1
        # Comic adaptation mentioned (full word or "コミカライズ" prefix).
        if "コミック" in text or "コミカラ" in text:
            is_comic = 1

    return [book_title, book_type, author_name, publish_date, publisher,
            label, isbn, price, sales_channel, illustrator,
            is_noc, is_moon, is_mid, is_densi, is_comic]


def get_data():
    """Crawl every listing page and append one row per article to all_list.

    Row layout matches ``columns_name``:
    title, type, author, date, publisher, label, ISBN, price, channel,
    illustrator, user name, XID, 3 site flags, e-book flag, comic flag,
    article URL, author my-page URL.

    Side effects: network requests, printing each page URL, mutating the
    module-level ``all_list``, and sleeping ``interval`` seconds per page.
    """
    # Pages are 1-based; range(1, n+1) replaces the manual page=page+1.
    for page in range(1, all_page_num + 1):
        page_url = "https://blog.nightlantern.net/index.php?catid=2&page=%s" % page
        print(page_url)
        response = requests.get(url=page_url, headers=headers, cookies=cookie)
        soup = BeautifulSoup(response.content, "lxml")

        bodies = soup.find_all("div", class_="contentbody")
        # Hoisted out of the loop: the original recomputed find_all("h2")
        # for every article, which is accidentally quadratic per page.
        # NOTE(review): assumes h2 headings and contentbody divs pair up
        # 1:1 in document order — TODO confirm against the page markup.
        headings = soup.find_all("h2")

        for num, body in enumerate(bodies):
            # URL of the article itself, taken from the matching heading.
            article_url = "https://blog.nightlantern.net/%s" % \
                headings[num].find("a").get("href")

            fields = _parse_entry_fields(body.text.split("\n"))

            # User name and XID come from the first anchor in the body.
            anchor = body.find("a")
            user_name = anchor.text
            xid_url = anchor.get("href")
            # str.find returns -1 when no "x" is found past position 20,
            # which yields an empty xid — same behavior as the original.
            xid = xid_url[xid_url.find("x", 20, -1):-1]
            mypage_url = "https://xmypage.syosetu.com/%s/" % xid

            all_list.append(fields[:10] + [user_name, xid] +
                            fields[10:] + [article_url, mypage_url])

        time.sleep(interval)
def save_date():
    """Write every collected row to the output Excel workbook.

    Builds a pandas DataFrame from the module-level ``all_list`` using the
    ``columns_name`` headers and saves it to ``file_name``.

    NOTE(review): the name looks like a typo for ``save_data`` — kept as-is
    because the call site at the bottom of the script depends on it.
    """
    frame = pd.DataFrame(all_list, columns=columns_name)
    frame.to_excel(file_name, sheet_name="Sheet1")
def main():
    """Run the scraper end-to-end and report elapsed wall-clock time."""
    start = time.time()
    get_data()
    save_date()
    # User-facing output: "処理時間" = elapsed time; format kept verbatim.
    print("処理時間%s" % str(time.time() - start))


# Guard the entry point so importing this module does not trigger a full
# network scrape (the original ran everything at import time).
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment