Skip to content

Instantly share code, notes, and snippets.

@yosuke1985
Created January 27, 2018 10:25
Show Gist options
  • Save yosuke1985/740d92937b3d02571a8c52a4b4e13399 to your computer and use it in GitHub Desktop.
Blogosメルマガが終了するので、Pythonでスクレイピングして過去記事を保存してみた
#https://qiita.com/YOSUKE8080/items/a59e5bcbf5ded6c8d5f9
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Livedoor account credentials — fill these in before running.
USER = ""
PASS = ""

# Use a session so the login cookie persists across all later requests.
session = requests.session()

# Log in. The ?.next= query parameter makes livedoor redirect to the
# magazine management page after a successful login.
login_info = {
    "livedoor_id": USER,
    "password": PASS,
}
url_login = "https://member.livedoor.com/login/?.next=http%3A%2F%2Fmagazine.livedoor.com%2Fmanage%2F&.sv=magazine"
res = session.post(url_login, data=login_info)
res.raise_for_status()  # raise immediately on an HTTP error status
# Next step: pick the "my page" (management page) URL out of the
# post-login HTML.
soup = BeautifulSoup(res.text, "html.parser")
anchors = soup.find_all("a")
# The 7th <a> element on the page is assumed to be the management-page
# link — TODO confirm against the live page. Guard on the list length:
# find_all() returns Tag objects, so an `is None` check on an element
# can never fire, and anchors[6] would raise IndexError on a short page
# (e.g. when the login failed).
if len(anchors) <= 6:
    print("マイページが取得できませんでした")
    quit()
a = anchors[6]
# Convert the relative href to an absolute URL.
url_mypage = urljoin(url_login, a.attrs["href"])
# Next step: walk every listing page and save each linked article
# as a local HTML file.
# Number of listing pages to fetch (?p=1 .. ?p=8).
pages = 8
for page in range(1, pages + 1):
    res = session.get(f"{url_mypage}?p={page}")
    res.raise_for_status()
    soup2 = BeautifulSoup(res.text, "html.parser")
    titles = soup2.find_all(class_="title")
    # The first "title" element is skipped — presumably a page header
    # rather than an article link; verify against the live markup.
    # (Slicing also avoids re-using the outer loop variable `i`,
    # which the original code shadowed.)
    for entry in titles[1:]:
        url = entry.find("a").attrs["href"]
        res = session.get(url)
        res.raise_for_status()
        # File name = last path segment of the article URL, extension dropped.
        name = url.split("/")[-1].split(".")[0]
        # Write as UTF-8 explicitly so Japanese text is not mangled by
        # a platform-default encoding (e.g. cp932 on Windows).
        with open(f"{name}.html", "w", encoding="utf-8") as file:
            file.write(res.text)
print("done!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment