Last active
August 29, 2015 14:20
-
-
Save hasen/157c2bac219933343d55 to your computer and use it in GitHub Desktop.
Get 'last-modified' from target page's header by Python.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Visit each URL in a newline-separated URL list and
retrieve the last-modified value from each page's response headers.
"""
import ssl
import time

import httplib2
# Path of the file holding the target URLs, one per line.
INPUT_FILE_NAME = "INPUT_FILE_NAME"
# Path of the file the results are appended to.
OUTPUT_FILE_NAME = "OUTPUT_FILE_NAME"
# Pause, in seconds, passed to time.sleep() before each request.
SLEEP_TIME = 0
# Response-header key whose value is recorded for every URL.
TARGET_KEY = "last-modified"
def main():
    """Request every URL in INPUT_FILE_NAME and record its TARGET_KEY header.

    Reads INPUT_FILE_NAME line by line, issues a GET request for each
    non-blank line and appends one result line per URL to OUTPUT_FILE_NAME:
    the header value when TARGET_KEY is present in the response, a
    "no response[...]" marker when it is absent, or the name of the error
    when the request fails. Progress (line number and URL) is printed to
    stdout before each request.

    A failing URL never aborts the run; the failure is written to the
    output file and processing moves on to the next line.
    """
    client = httplib2.Http()
    with open(INPUT_FILE_NAME, mode="r", encoding="utf-8") as input_file, \
            open(OUTPUT_FILE_NAME, mode="a", encoding="utf-8") as output_file:
        # `count` is the 1-based line number in the input file.
        for count, line in enumerate(input_file, start=1):
            url = line.rstrip()
            if not url:
                # Blank lines carry no URL; requesting "" would raise an
                # httplib2 error and stop the whole batch.
                continue
            time.sleep(SLEEP_TIME)
            # Progress output so a long run can be monitored.
            print(count, url)
            try:
                # Send "cache-control: no-cache" so caches are bypassed.
                response, _ = client.request(
                    url, "GET", headers={"cache-control": "no-cache"}
                )
            except httplib2.ServerNotFoundError:
                output_file.write(url + " => ServerNotFoundError\n")
            except ssl.CertificateError:
                # Must precede OSError: on current Python versions
                # CertificateError is itself an OSError subclass.
                output_file.write(url + " => httplib2.ssl.CertificateError\n")
            except OSError:
                output_file.write(url + " OSError\n")
            except httplib2.HttpLib2Error as exc:
                # Any other httplib2 failure: record its class name and
                # keep going instead of losing the rest of the batch.
                output_file.write(url + " => " + type(exc).__name__ + "\n")
            else:
                if TARGET_KEY in response:
                    output_file.write(url + " => " + response[TARGET_KEY] + ",\n")
                else:
                    output_file.write(url + " => no response[" + TARGET_KEY + "]\n")
# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment