Skip to content

Instantly share code, notes, and snippets.

@inchoate
Created June 27, 2017 23:01
Show Gist options
  • Save inchoate/2f8649c517be41117a98cb8141f8b003 to your computer and use it in GitHub Desktop.
Save inchoate/2f8649c517be41117a98cb8141f8b003 to your computer and use it in GitHub Desktop.
Checks remote resources
"""Checks to see if media URIs listed in the given file are valid.
Warning: this code is untested and of rough quality.
"Valid" means:
- the file is no larger than 32 MB (32,000,000 bytes, the default `max_bytes`)
- the file exists at the URI
Usage:
# check all URLs in the `your-uri-file.txt` and output failures only.
python check_media.py your-uri-file.txt
# check all URLs in the `your-uri-file.txt` and output all results.
python check_media.py your-uri-file.txt --all
"""
import argparse
import logging
import requests
import sys
import time
def check_remote_resource(uri, max_bytes=32000000, timeout=10):
    """Validate that a remote resource exists and is not too large.

    Issues an HTTP HEAD request for ``uri``. Connection errors and timeouts
    are retried with exponential backoff: up to four attempts, sleeping
    1, 2 and 4 seconds between them.

    Args:
        uri: The URI to check.
        max_bytes: Largest acceptable ``Content-Length`` value, in bytes.
        timeout: Seconds to wait for the server on each attempt, so a
            stalled host cannot hang the whole run.

    Returns:
        A ``(status, msg)`` tuple. ``status`` is True only when the response
        status is 2xx and the reported ``Content-Length`` does not exceed
        ``max_bytes``. ``msg`` is ``["OK"]`` on success, or a list of problem
        descriptions on failure. When every attempt fails to connect, ``msg``
        is a single ``"[FAIL] ..."`` string rather than a list (kept that way
        for backward compatibility).
    """
    response = None
    # Delay to sleep after each failed attempt; None marks the final attempt,
    # after which sleeping would only postpone reporting the failure.
    for delay in (1, 2, 4, None):
        try:
            # NOTE(review): requests does not follow redirects for HEAD by
            # default, so a 3xx answer is reported as a failure below --
            # confirm that this is the intended behavior.
            response = requests.head(uri, timeout=timeout)
            break
        except (requests.ConnectionError, requests.Timeout):
            logging.warning(
                "[WARN] Exception fetching '%s'. Backing off and retrying",
                uri)
            if delay is not None:
                time.sleep(delay)
        except requests.RequestException as exc:
            # Not a transient problem (e.g. malformed URL): retrying cannot
            # help, so report it instead of crashing the caller's loop.
            return False, ["Cannot check file", "Error = {}".format(exc)]

    if response is None:
        return False, "[FAIL] Cannot check file: {}".format(uri)

    msg, status = [], True

    if not (200 <= response.status_code <= 299):
        msg.append("Cannot fetch file")
        msg.append("Status code = {}".format(response.status_code))
        msg.append("Headers = {}".format(response.headers))
        status = False

    content_length = response.headers.get("Content-Length")
    if content_length is None:
        msg.append("No Content Length. Verify by hand.")
        status = False
    else:
        try:
            size = int(content_length)
        except ValueError:
            # A server may send a malformed header; treat it as a failure
            # to verify rather than letting int() raise.
            msg.append("Invalid Content Length: {!r}".format(content_length))
            status = False
        else:
            if size > max_bytes:
                msg.append("File too large")
                status = False

    return status, msg or ["OK"]
def main(argv=None):
    """Check every URI listed in a file and print the results.

    Args:
        argv: Optional list of command-line arguments. Defaults to None, in
            which case argparse reads ``sys.argv[1:]``.

    The positional ``urls`` argument names a text file with one URI per
    line; blank lines are skipped. Only failures are printed unless the
    ``--all`` flag is given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("urls", help="File with list of URIs to check.")
    # store_true makes --all a real flag; without it argparse demands a
    # value and the documented usage (`... --all`) fails to parse.
    parser.add_argument(
        "--all",
        help="Prints all, not just failures.",
        action="store_true",
        default=False)
    args = parser.parse_args(argv)

    # Read as text (not bytes) so URLs print cleanly, and close the file
    # before starting the slow network checks.
    with open(args.urls, encoding="utf-8") as url_file:
        urls = [line.strip() for line in url_file]

    for url in urls:
        if not url:
            # Blank lines are not URIs; skip them instead of requesting "".
            continue
        status, msg = check_remote_resource(url)
        if (not status) or args.all:
            print("[{}]: {} => {}".format(
                "SUCCESS" if status else "FAILURE",
                url,
                msg))
# Run the checker only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment