# Fetch info on published works announced on the R18 site
# (gist by @karupoimou, last active July 16, 2019)
import re
import requests
import time
from bs4 import BeautifulSoup
import pandas as pd
# Interval between requests, in seconds
interval = 1
# Output file name
file_name = "R-18書籍化情報.xlsx"
# Number of listing pages (check the site and enter by hand; it may be easier to
# overshoot and drop the duplicates afterwards; see preview_page() below for a
# quick page check and drop_duplicates_from_output() at the bottom)
all_page_num = 143
# For pandas: accumulated rows and output column names
all_list = []
columns_name = ["書籍名", "出版形態", "著者名", "発売日", "出版社名", "レーベル名",
                "ISBN", "価格", "購入方法", "イラストレータ名", "ユーザー名", "XID",
                "ノクタ掲載", "ムーン掲載", "ミッド掲載", "電子書籍の有無",
                "コミックの記載", "記事のurl", "作者マイページ"]
# requests settings: the site requires a User-Agent header
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0"}
cookie = {'over18': 'yes'}  # cookie for the X (R-18) site
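
# ------------------------------------------------------------------
# Illustrative helper, added here as a sketch (not part of the
# original gist): fetch a single listing page and report how many
# article bodies it contains, which helps verify all_page_num and
# the "contentbody" selector before the full scrape. It reuses the
# headers/cookie defined above; the name preview_page is a
# hypothetical choice.
# ------------------------------------------------------------------
def preview_page(page):
    url = "https://blog.nightlantern.net/index.php?catid=2&page=%s" % page
    response = requests.get(url=url, headers=headers, cookies=cookie)
    response.raise_for_status()  # fail fast on HTTP errors
    soup = BeautifulSoup(response.content, "lxml")
    print(page, len(soup.find_all("div", class_="contentbody")))
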
def get_data():
    for page in range(1, all_page_num + 1):
        url = "https://blog.nightlantern.net/index.php?catid=2&page=%s" % page
        print(url)
        response = requests.get(url=url, headers=headers, cookies=cookie)
        html = response.content
        soup = BeautifulSoup(html, "lxml")
        sp2 = soup.find_all("div", class_="contentbody")
        for num in range(len(sp2)):
            # URL of the article
            get_url = soup.find_all("h2")[num].find("a").get("href")
            url = "https://blog.nightlantern.net/%s" % get_url
            # split the article body into lines
            contentbody = sp2[num].text.split("\n")
            # fields parsed from the article body (flags default to 0)
            book_title = ""
            book_type = ""
            author_name = ""
            publish_date = ""
            publisher = ""
            label = ""
            ISBN = ""
            price = ""
            sales_channel = ""
            illustrator = ""
            user_name = ""
            xid = ""
            is_noc = 0
            is_moon = 0
            is_mid = 0
            is_densi = 0
            is_comic = 0
            for text in contentbody:
                # strip the label prefix (e.g. "・書籍名") and the trailing character
                if "・書籍名" in text:
                    book_title = text[5:-1]
                if "・作品名" in text:
                    book_title = text[5:-1]
                if "の紹介です。" in text:
                    if "電子" in text:
                        book_type = "電子書籍"
                    else:
                        book_type = "書籍"
                if "・作者名" in text:
                    author_name = text[5:-1]
                if "・発売日" in text:
                    publish_date = text[5:-1]
                if "・出版社" in text:
                    publisher = text[5:-1]
                if "・レーベル" in text:
                    label = text[6:-1]
                if "・ISBN" in text:
                    ISBN_pre = text[6:-1]
                    ISBN = int(re.sub(r"\D", "", ISBN_pre))  # keep digits only
                if "・販売価格" in text:
                    price = text[6:-1]
                if "・購入方法" in text:
                    sales_channel = text[6:-1]
                if "・イラストレータ名" in text:
                    illustrator = text[10:-1]
                # flags: which sites/editions the article mentions
                if "ノクタ" in text:
                    is_noc = 1
                if "ムーン" in text:
                    is_moon = 1
                if "ミッド" in text:
                    is_mid = 1
                if "電子版" in text:
                    is_densi = 1
                if "電子書籍" in text:
                    is_densi = 1
                if "コミック" in text:
                    is_comic = 1
                if "コミカラ" in text:
                    is_comic = 1
            # user name
            user_name = sp2[num].find("a").text
            # XID (user id embedded in the linked URL)
            xid_url = sp2[num].find("a").get("href")
            index1 = xid_url.find("x", 20, -1)
            xid = xid_url[index1:-1]
            # author's mypage URL
            mypage_url = "https://xmypage.syosetu.com/%s/" % xid
            # assemble one row, in the same order as columns_name
            temp_list = [
                book_title, book_type, author_name, publish_date,
                publisher, label, ISBN, price, sales_channel,
                illustrator, user_name, xid, is_noc, is_moon,
                is_mid, is_densi, is_comic, url, mypage_url,
            ]
            all_list.append(temp_list)
        time.sleep(interval)
def save_data():
    # store the rows in a pandas DataFrame and write them to Excel
    df = pd.DataFrame(all_list, columns=columns_name)
    df.to_excel(file_name, sheet_name="Sheet1")

# run the tasks
t1 = time.time()
get_data()
save_data()
print("Elapsed time: %s" % str(time.time() - t1))