Created
January 27, 2018 10:25
-
-
Save yosuke1985/740d92937b3d02571a8c52a4b4e13399 to your computer and use it in GitHub Desktop.
Blogosメルマガが終了するので、Pythonでスクレイピングして過去記事を保存してみた
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#https://qiita.com/YOSUKE8080/items/a59e5bcbf5ded6c8d5f9 | |
import requests | |
from bs4 import BeautifulSoup | |
from urllib.parse import urljoin | |
# Archive Blogos mail-magazine back issues before the service shuts down:
# log in to livedoor, locate the magazine "my page", then walk the article
# listing pages and save each article's HTML to the current directory.

# Livedoor account credentials (fill in before running).
USER = ""
PASS = ""

# One session so the login cookie is reused by every later request.
session = requests.session()

# Log in.
login_info = {
    "livedoor_id": USER,
    "password": PASS,
}
url_login = "https://member.livedoor.com/login/?.next=http%3A%2F%2Fmagazine.livedoor.com%2Fmanage%2F&.sv=magazine"
res = session.post(url_login, data=login_info)
res.raise_for_status()  # fail loudly if the login request itself errored

# Pick the "my page" link out of the post-login page.
soup = BeautifulSoup(res.text, "html.parser")
anchors = soup.find_all("a")
# The my-page link is assumed to be the 7th <a> on the page — TODO confirm
# against the live markup. Guard on the list length: find_all returns a
# list, so a bare anchors[6] would raise IndexError rather than yield None
# (the original `if a is None` check was unreachable).
if len(anchors) <= 6:
    print("マイページが取得できませんでした")
    quit()
mypage_link = anchors[6]

# Convert the relative href to an absolute URL.
url_mypage = urljoin(url_login, mypage_link.attrs["href"])

# Number of listing pages to walk (site-specific; adjust as needed).
pages = 8
for page in range(1, pages + 1):
    listing = session.get(f"{url_mypage}?p={page}")
    listing.raise_for_status()
    listing_soup = BeautifulSoup(listing.text, "html.parser")
    titles = listing_soup.find_all(class_="title")
    # Skip the first "title" element — presumably a header, not an
    # article link (the original loop started at index 1 too).
    for entry in titles[1:]:
        url = entry.find("a").attrs["href"]
        article = session.get(url)
        article.raise_for_status()
        # File name = last path segment without its extension,
        # e.g. ".../archives/12345.html" -> "12345".
        name = url.split('/')[-1].split(".")[0]
        # Write as UTF-8 explicitly so the saved HTML does not depend on
        # the platform's default encoding.
        with open(f'{name}.html', 'w', encoding='utf-8') as file:
            file.write(article.text)
print("done!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment