@raimusyndrome
Created September 22, 2021 06:39
Save a web page to a file
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import argparse

import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # Parse command-line arguments
    parser = argparse.ArgumentParser()
    # URL (or local file path) to read
    parser.add_argument('url')
    # Treat the positional argument as a local file
    parser.add_argument('--file', '-f', action='store_true', default=False)
    # Pretty-print the saved HTML
    parser.add_argument('--pretty', '-p', action='store_true', default=False)
    # Output file name
    parser.add_argument('--output_file', '-o', default='output.html')
    args = parser.parse_args()

    if not args.file:
        # Fetch the page over the web
        header = {}
        page = requests.get(args.url, headers=header)
        if page.status_code == 200:
            bs = BeautifulSoup(page.content, 'lxml')
            # Save to a file, using the encoding detected by BeautifulSoup
            page.encoding = bs.original_encoding
            if not args.pretty:
                with open(args.output_file, 'wb') as fp:
                    fp.write(page.text.encode('utf-8'))
            else:
                with open(args.output_file, 'w', encoding='utf-8') as fp:
                    fp.write(bs.prettify())
        else:
            # Failed to fetch the page
            print(page.status_code)
            exit(1)
    else:
        # Read a local file instead of fetching over the web
        with open(args.url, 'r') as fp:
            page = fp.read()
        bs = BeautifulSoup(page, 'lxml')
        # Print every anchor tag (find_all, not find, returns all matches)
        for link in bs.find_all('a'):
            print(link)
    exit(0)
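
If only the link targets are needed rather than the full anchor tags, the saved file can be reparsed and the href attribute read from each match. A minimal sketch, assuming the script above was run with its default output name output.html and that the lxml parser is installed:

from bs4 import BeautifulSoup

# Reparse the file saved by the script above (default name: output.html).
with open('output.html', 'r', encoding='utf-8') as fp:
    soup = BeautifulSoup(fp.read(), 'lxml')

# find_all('a') returns every anchor tag; get('href') skips anchors without an href.
for a in soup.find_all('a'):
    href = a.get('href')
    if href:
        print(href)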