Last active
October 2, 2019 12:46
-
-
Save karupoimou/59ace5718f25e8948b2964acab4d42fe to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fetch book-publication (書籍化) information from "Shousetsuka ni Narou" -- updated 2019-10-02
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Delay, in seconds, passed to time.sleep() between requests.
interval = 1

# Name of the Excel file that is written at the end.
# NOTE(review): the name says 2018 while the file header says 2019-10-02 --
# confirm which date is intended before relying on it.
file_name = "なろう書籍化情報2018_10_02.xlsx"

# Number of listing pages to crawl (look it up and enter it by hand; it may be
# easier to overshoot and remove the duplicated rows afterwards).
all_page_num = 183

# requests settings: a User-Agent header must be set.
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0"}
cookie = {'over18': 'yes'}  # cookie for the "X" sites (over-18 confirmation)

# Output accumulator: one list per book, in the order given by columns_name.
all_list = []
columns_name = [
    "書籍名", "著者", "イラストレーター", "出版社", "レーベル", "出版形態",
    "発売日", "発売年", "ISBN", "「電子」の記載", "「コミック」の記載",
    "ユーザID", "ユーザ名", "作者マイページ",
    "書報ページURL", "アマゾンURL",
]
# Scraping
def _fetch_soup(url, timeout=30):
    """Download *url* and return it parsed with lxml, or None if the request failed."""
    try:
        response = requests.get(url=url, headers=headers, cookies=cookie, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Report and skip instead of aborting a crawl of many pages.
        print("skipped %s: %s" % (url, exc))
        return None
    return BeautifulSoup(response.content, "lxml")


def _cell_text(soup, header):
    """Return the text of the <td> following the <th> whose string is *header*, or ""."""
    th = soup.find("th", string=header)
    if th is None:
        return ""
    td = th.find_next("td")
    return td.text if td is not None else ""


def _to_int(text, default):
    """Return int(text), or *default* when *text* is not a plain integer."""
    try:
        return int(text)
    except ValueError:
        return default


def _parse_author(soup):
    """Return (author, illustrator) taken from the author line of a detail page."""
    node = soup.find("div", class_="p-syuppan-detail__info-author")
    if node is None:
        return "", ""
    text = node.text

    author_end = text.find("(著)")
    # The first character is dropped when the "(著)" marker is present
    # (presumably a leading newline -- confirm against the live markup).
    author = text[1:author_end] if author_end != -1 else text

    illustrator = ""
    illust_start = text.find("(イラスト")
    if illust_start != -1:
        # The illustrator name sits between the last "," (plus one more
        # character) and the "(イラスト" marker.
        separator = text[:illust_start].rfind(",")
        illustrator = text[separator + 2:illust_start]
    return author, illustrator


def _parse_detail(soup, title, detail_url):
    """Build one output row (ordered like columns_name) from a parsed detail page."""
    book_author, illustration = _parse_author(soup)

    # Publisher, label, release date and ISBN are read only when their header
    # appears in the first "c-table" table.
    table = soup.find("table", class_="c-table")
    table_text = table.text if table is not None else ""

    publisher = ""
    label = ""
    publish_date = ""
    publish_year = ""
    isbn = ""
    if "出版社" in table_text:
        # Drop the first and last character of the cell text.
        publisher = _cell_text(soup, "出版社")[1:-1]
    if "レーベル" in table_text:
        label = _cell_text(soup, "レーベル")
    if "発売日" in table_text:
        publish_date = _cell_text(soup, "発売日")
        publish_year = _to_int(publish_date[:4], "")
    if "ISBN" in table_text:
        isbn_text = _cell_text(soup, "ISBN")
        # Keep the raw text when the ISBN is not purely numeric.
        isbn = _to_int(isbn_text, isbn_text)

    # Publication format
    binding = soup.find("span", class_="p-syuppan-detail__info-binding")
    pub_type = binding.text if binding is not None else ""

    # User information: read only when the last panel headline matches.
    userid = ""
    user_name = ""
    user_mypage = ""
    headlines = soup.find_all("div", class_="c-panel__body-headline")
    last_headline = headlines[-1].text.strip() if headlines else ""
    # NOTE(review): the operand order (headline `in` literal) is kept from the
    # original; it looks inverted -- confirm against the live page.
    if last_headline and last_headline in "小説家になろう登録情報":
        userid_text = _cell_text(soup, "ユーザID")
        userid = _to_int(userid_text, userid_text)
        user_name = _cell_text(soup, "ユーザ名")
        if userid != "":
            user_mypage = "https://mypage.syosetu.com/%s/" % userid

    # Amazon URL
    amazon_url = ""
    if soup.find("div", class_="p-syuppan-detail__purchase") is not None:
        button = soup.find("a", class_="c-button c-button--half c-button--lg c-button--primary")
        if button is not None:
            amazon_url = button.get("href", "")

    # Flags: 1 when the first "c-panel" mentions 電子 / コミック, else 0.
    panel = soup.find("div", class_="c-panel")
    panel_text = panel.text if panel is not None else ""
    is_densi = 1 if "電子" in panel_text else 0
    is_comics = 1 if "コミック" in panel_text else 0

    return [
        title, book_author, illustration, publisher, label, pub_type,
        publish_date, publish_year, isbn, is_densi, is_comics,
        userid, user_name, user_mypage, detail_url, amazon_url,
    ]


def get_data():
    """Crawl listing pages 1..all_page_num and append one row per book to all_list.

    For every listing page, each link with class "p-syuppan-list__title" is
    followed and its detail page is parsed into a row ordered like
    columns_name. Pages that cannot be downloaded are reported with print()
    and skipped. Returns None; results accumulate in the module-level all_list.
    """
    for page in tqdm(range(1, all_page_num + 1)):
        url = "https://syosetu.com/syuppan/list/?p=%s" % page
        print(url)
        list_soup = _fetch_soup(url)
        if list_soup is None:
            continue

        # Fetch each book's detail page.
        for link in list_soup.find_all("a", class_="p-syuppan-list__title"):
            href = link.get("href")
            if not href:
                continue
            detail_url = "https://syosetu.com" + href
            detail_soup = _fetch_soup(detail_url)
            if detail_soup is not None:
                all_list.append(_parse_detail(detail_soup, link.text, detail_url))
            # NOTE(review): the original indentation was lost; throttling after
            # every detail-page request is the conservative reading.
            time.sleep(interval)
def export_data():
    """Write the rows collected in all_list to the Excel file named by file_name.

    The rows are loaded into a DataFrame with columns_name as the header and
    written to sheet "Sheet1" through the xlsxwriter engine, with xlsxwriter's
    strings_to_urls option turned off.
    """
    df = pd.DataFrame(all_list, columns=columns_name)  # hold the rows in a DataFrame
    workbook_options = {"strings_to_urls": False}
    try:
        # pandas >= 1.3: engine options go through engine_kwargs
        # (the bare `options=` keyword was removed in pandas 2.0).
        writer = pd.ExcelWriter(
            file_name,
            engine="xlsxwriter",
            engine_kwargs={"options": workbook_options},
        )
    except TypeError:
        # Older pandas has no engine_kwargs parameter and takes `options=` directly.
        writer = pd.ExcelWriter(file_name, engine="xlsxwriter", options=workbook_options)
    # The context manager saves and closes the workbook even if to_excel raises.
    with writer:
        df.to_excel(writer, sheet_name="Sheet1")
# Run the tasks: scrape every page, export the result, then report completion.
# Guarded so that importing this file does not start the crawl.
if __name__ == "__main__":
    get_data()
    export_data()
    print("end")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment