Skip to content

Instantly share code, notes, and snippets.

@hasen
Last active August 29, 2015 14:20
Show Gist options
  • Save hasen/157c2bac219933343d55 to your computer and use it in GitHub Desktop.
Save hasen/157c2bac219933343d55 to your computer and use it in GitHub Desktop.
Get the 'last-modified' value from each target page's HTTP response header using Python.
# -*- coding: utf-8 -*-
"""
Visit each URL in a newline-separated URL list and
collect the last-modified value found in each page's response headers.
"""
import httplib2
import time
# File holding the list of target URLs (read line by line by main()).
# NOTE(review): the value looks like a placeholder -- presumably it is
# replaced with a real path before running; confirm.
INPUT_FILE_NAME = "INPUT_FILE_NAME"
# File the results are written to; main() opens it in append mode.
# NOTE(review): also looks like a placeholder value -- confirm.
OUTPUT_FILE_NAME = "OUTPUT_FILE_NAME"
# Interval between requests: main() passes this to time.sleep(), so it is in seconds.
SLEEP_TIME = 0
# Key of the response header whose value is recorded.
TARGET_KEY = "last-modified"
def main():
    """Record the TARGET_KEY response header of every URL in the input list.

    Reads INPUT_FILE_NAME line by line, treating each non-blank line as a
    URL. For every URL it waits SLEEP_TIME seconds, sends a GET request with
    caching disabled, and appends exactly one line to OUTPUT_FILE_NAME in the
    form ``<url> => <result>``, where ``<result>`` is one of:

    * the header value followed by a comma, when the response has TARGET_KEY;
    * ``no response[<TARGET_KEY>]`` when the header is absent;
    * the name of the error that prevented the request from completing.

    A running count and the URL are printed to stdout to show progress.
    A failed request never aborts the run; errors raised while reading the
    input or writing the output file are not caught and do propagate.
    """
    h = httplib2.Http()
    with open(INPUT_FILE_NAME, mode="r", encoding="utf-8") as input_file, \
            open(OUTPUT_FILE_NAME, mode="a", encoding="utf-8") as output_file:
        # Strip surrounding whitespace and drop blank lines: an empty string
        # is not a valid request target and would raise inside httplib2.
        urls = filter(None, (line.strip() for line in input_file))
        for count, url in enumerate(urls, start=1):
            time.sleep(SLEEP_TIME)
            # Progress output.
            print(count, url)
            try:
                # Ask intermediaries/httplib2 not to serve a cached copy.
                response, _content = h.request(
                    url, "GET", headers={"cache-control": "no-cache"}
                )
            except httplib2.ServerNotFoundError:
                result = "ServerNotFoundError"
            except httplib2.ssl.CertificateError:
                result = "httplib2.ssl.CertificateError"
            except OSError:
                result = "OSError"
            except httplib2.HttpLib2Error as error:
                # Any other httplib2 failure (redirect limit, malformed URI,
                # ...) is logged under its class name instead of killing
                # the whole run.
                result = type(error).__name__
            else:
                # Only the request is inside the try block, so an OSError
                # from writing the output file is not mistaken for a
                # request failure.
                if TARGET_KEY in response:
                    result = response[TARGET_KEY] + ","
                else:
                    result = "no response[" + TARGET_KEY + "]"
            # Single write point keeps the "<url> => <result>" format
            # identical for every outcome.
            output_file.write(url + " => " + result + "\n")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment