Created
September 22, 2021 06:39
-
-
Save raimusyndrome/6665016e2eaa0d407d269c0085c83bb4 to your computer and use it in GitHub Desktop.
Webページをファイルに保存する
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding:utf-8 -*- | |
import argparse | |
import requests | |
from bs4 import BeautifulSoup | |
if __name__ == '__main__':
    # Command-line interface:
    #   url                positional; a URL to download, or a local file
    #                      path when --file is given
    #   --file / -f        treat `url` as a local HTML file and print the
    #                      <a> tags it contains instead of downloading
    #   --pretty / -p      save the downloaded page re-indented by
    #                      BeautifulSoup.prettify()
    #   --output_file / -o destination file for the downloaded page
    #                      (default: output.html)
    # Exit status is 0 on success and 1 when the HTTP status is not 200.
    parser = argparse.ArgumentParser()
    parser.add_argument('url')
    parser.add_argument('--file', '-f', action='store_true', default=False)
    parser.add_argument('--pretty', '-p', action='store_true', default=False)
    parser.add_argument('--output_file', '-o', default='output.html')
    args = parser.parse_args()

    if args.file:
        # Local file mode. Read raw bytes so BeautifulSoup can detect the
        # document's own encoding rather than decoding with the platform
        # default, which fails on pages saved in a different charset.
        with open(args.url, 'rb') as fp:
            markup = fp.read()
        bs = BeautifulSoup(markup, 'lxml')
        # find_all() returns every <a> tag. find() returns only the first
        # match (or None), so looping over its result walked that single
        # tag's children, or raised TypeError when the page had no links.
        for link in bs.find_all('a'):
            print(link)
    else:
        # Fetch over HTTP. `header` is an empty placeholder for any request
        # headers that need to be sent.
        header = {}
        page = requests.get(args.url, headers=header)
        if page.status_code != 200:
            # Download failed: report the status code and stop.
            print(page.status_code)
            raise SystemExit(1)

        # Parse once; the tree serves both encoding detection and the
        # pretty-printed output.
        bs = BeautifulSoup(page.content, 'lxml')
        if args.pretty:
            # Ask BeautifulSoup for UTF-8 bytes directly so the write does
            # not depend on the platform's default text encoding.
            data = bs.prettify(encoding='utf-8')
        else:
            # Decode the body with the encoding BeautifulSoup detected,
            # then normalise the saved copy to UTF-8.
            page.encoding = bs.original_encoding
            data = page.text.encode('utf-8')
        with open(args.output_file, 'wb') as fp:
            fp.write(data)

    # SystemExit is used instead of the site-provided exit() helper, which
    # is intended for the interactive shell and is absent under `python -S`.
    raise SystemExit(0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment