Created
January 26, 2021 14:04
-
-
Save kotoripiyopiyo/a303512a7dadd48644dc6beed5689806 to your computer and use it in GitHub Desktop.
URLを指定するとリンクされてるページをDLする
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# linkcheck.py: given a URL, extract the links on that page, check whether each target is alive, download the live ones and report the dead ones.
import os
import sys
from urllib.parse import urljoin

import bs4
import requests
# Directory that downloaded pages are written into.
SAVE_DIR = 'webpages'
# Seconds to wait on any single HTTP request; without a timeout one
# unresponsive host would stall the whole run.
REQUEST_TIMEOUT = 30
# Chunk size (bytes) used when writing a response body to disk.
CHUNK_SIZE = 100000

os.makedirs(SAVE_DIR, exist_ok=True)  # create the destination directory

# Fetch the source page named on the command line.
if len(sys.argv) < 2:
    print('使い方 linkcheck.py URL')
    sys.exit()
URL = sys.argv[1]
res = requests.get(URL, timeout=REQUEST_TIMEOUT)
res.raise_for_status()

# Extract every anchor that carries an href attribute.
soup = bs4.BeautifulSoup(res.text, 'html.parser')
link_elements = soup.select('a[href]')

for anchor in link_elements:
    href = anchor.attrs['href']

    # Check that the link target is alive. Relative hrefs are resolved
    # against the source page first; handing them to requests unresolved
    # always fails with a missing-schema error.
    try:
        dl_url = urljoin(URL, href)
        res_linked = requests.get(dl_url, timeout=REQUEST_TIMEOUT)
        res_linked.raise_for_status()
    except (requests.RequestException, ValueError) as err:
        print(f'エラーです:{err}')
        # Dead link: report it and move on. Falling through would save the
        # previous link's response under this link's name.
        continue

    # Download the link target. A URL that ends in '/' has an empty
    # basename, which would make the save path point at the directory
    # itself, so report it instead of attempting the write.
    filename = os.path.basename(dl_url)
    if not filename:
        print(f'たぶんファイル名で終わってない:{dl_url}')
        continue

    save_path = os.path.join(SAVE_DIR, filename)
    try:
        # 'with' guarantees the file is closed even if a write fails.
        with open(save_path, 'wb') as html_download:
            for chunk in res_linked.iter_content(CHUNK_SIZE):
                html_download.write(chunk)
    except (OSError, requests.RequestException) as err:
        print(f'たぶんファイル名で終わってない:{err}')
    else:
        print(f'ダウンロード完了;{save_path}')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment