Check the HTTP status codes for a list of URLs in a file. There are undoubtedly fancier ways to do this with wget or other common Linux utilities, but I'm not good enough at bash scripting to do it with minimal memory usage (important if the list is millions of URLs long) and nice output formatting, so I just wrote a script.

example_urls.txt:

www.energy.gov
gobbledegook
apple.com
www.gpo.gov
httpstat.us/404

get_urls.py:

import aiohttp
import argparse
import asyncio
import csv
import re
import sys
import time
from urllib.parse import urlparse

# Edit this to adjust how many HTTP requests you can make in parallel
CONCURRENT_REQUESTS = 20

PROTOCOL_PATTERN = re.compile(r'^\w+://')
DOMAIN_PATTERN = re.compile(r'^[^:]+://([^/:]+)')


async def get_status(session, url):
    '''Return the status code for a URL'''
    if PROTOCOL_PATTERN.match(url) is None:
        url = 'http://' + url

    # Apply the most specific matching rate limit (set via --rate), if any.
    domain = DOMAIN_PATTERN.match(url).group(1)
    rate_lock = next((x for x in options.rate if x.matches(domain)), None)
    if rate_lock:
        await rate_lock.wait()

    async with session.head(url, allow_redirects=True) as response:
        return response.status


async def worker(file, output, options):
    '''
    Reads URLs as lines from a file, requests their status, and outputs the
    results in CSV format. Spawn multiple of these to work on more than one
    URL in parallel.
    '''
    start_time = time.time()
    writer = csv.writer(output)
    ssl_mode = False if options.ignore_ssl_errors else None
    connector = aiohttp.TCPConnector(ssl=ssl_mode)
    async with aiohttp.ClientSession(connector=connector) as session:
        while True:
            line = file.readline()
            if line == '':
                break

            url = line.strip()
            try:
                status = await get_status(session, url)
                writer.writerow([url, status])
            except Exception as error:
                writer.writerow([url, f'ERROR: {error}'])

            # After the first couple of seconds, flush each result so output
            # appears promptly during long runs.
            if time.time() - start_time > 2:
                output.flush()


class RateLimit:
    'A lock that can only be acquired at a certain rate.'

    def __init__(self, raw):
        domain = None
        rate_text = raw
        if ':' in raw:
            domain, rate_text = raw.split(':', 1)
        try:
            rate = float(rate_text)
        except ValueError:
            raise ValueError(f'Invalid rate format: "{raw}"')

        self.domain = domain
        self.rate = rate
        self.interval = 1 / rate if rate > 0 else 0
        self.last_use = 0

    def matches(self, domain):
        'Determine if this rate limit should be used for the given domain.'
        if not self.domain:
            return True
        return domain == self.domain or domain.endswith(f'.{self.domain}')

    async def wait(self):
        'Wait for the next available usage according to the rate limit.'
        remaining = 1
        while remaining > 0:
            remaining = self.last_use + self.interval - time.time()
            if remaining > 0:
                await asyncio.sleep(remaining)
        self.last_use = time.time()


parser = argparse.ArgumentParser(description='Check the statuses of a list of URLs.')
parser.add_argument('path', help='path to a file that is a newline-delimited list of URLs')
parser.add_argument('--ignore-ssl-errors', action='store_true', help='ignore errors in SSL handshakes')
parser.add_argument('--rate', action='append', type=RateLimit, default=[], help='Maximum number of requests per second to make. Repeat with `--rate "example.com:2"` to set specific rate limits per domain.')
options = parser.parse_args()

# Sort rate limits by longest (i.e. most specific) domain first
options.rate.sort(key=lambda x: len(x.domain or ''), reverse=True)

# Start event loop, open the URL list, and spawn workers to read from it.
loop = asyncio.get_event_loop()
with open(options.path) as urls_file:
    workers = [worker(urls_file, sys.stdout, options)
               for i in range(CONCURRENT_REQUESTS)]
    loop.run_until_complete(asyncio.gather(*workers))

Usage:

python get_urls.py example_urls.txt > statuses.csv
# Result in statuses.csv is like:
# gobbledegook,"ERROR: Cannot connect to host gobbledegook:80 ssl:None [nodename nor servname provided, or not known]"
# httpstat.us/404,404
# apple.com,200
# www.energy.gov,200
# www.gpo.gov,200
# Or use --ignore-ssl-errors to retrieve status for URLs with SSL handshake
# problems, e.g. invalid or expired SSL certificates.
python get_urls.py example_urls.txt --ignore-ssl-errors > statuses.csv
# To apply a rate limit of 2 requests/second:
python get_urls.py example_urls.txt --rate 2
# To apply a rate limit of 1 request/second for energy.gov, 2 requests/second
# for epa.gov, and 10/second for everything else:
python get_urls.py example_urls.txt --rate 10 --rate 'energy.gov:1' --rate 'epa.gov:2'

Mr0grog commented Sep 19, 2018

Unsurprisingly, this could use some rate limiting and backoff semantics if you want to run it over many thousands of URLs that cover only a few hosts.
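
For the backoff part, one rough sketch (reusing the script's aiohttp/asyncio imports; head_with_backoff is a hypothetical helper, not part of get_urls.py) would be to retry the HEAD request with an exponentially growing sleep on connection errors or 429/503 responses:

# Hypothetical helper; get_status() could call this instead of session.head() directly.
async def head_with_backoff(session, url, attempts=4, base_delay=1):
    for attempt in range(attempts):
        try:
            async with session.head(url, allow_redirects=True) as response:
                if response.status not in (429, 503) or attempt == attempts - 1:
                    return response.status
        except aiohttp.ClientConnectionError:
            if attempt == attempts - 1:
                raise
        # Exponential backoff: 1s, 2s, 4s, ...
        await asyncio.sleep(base_delay * 2 ** attempt)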


Mr0grog commented Sep 19, 2018

The input file sometimes has some obvious formatting problems that we could fix before requesting each URL (a rough cleanup sketch follows the examples below), e.g.:

Doubled-up protocols:

http://http://www.ncd.gov

Quotes:

"http://library.usip.org/search?/tpakistani+public/tpakistani+public/1,1,1,B/l962&FF=tpakistani+public&1,1,,0,0/indexsort=-startreferer//search/tpakistani+public/tpakistani+public/1,1,1,B/frameset&FF=tpakistani+public&1,1,/endreferer/"

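A possible cleanup pass (clean_url is a hypothetical helper, not part of get_urls.py; it reuses the script's re import) that strips surrounding quotes and collapses a doubled-up protocol:

# Hypothetical cleanup for each line read from the input file.
def clean_url(line):
    url = line.strip().strip('"\'')                   # drop surrounding whitespace and quotes
    url = re.sub(r'^(\w+://)(\w+://)+', r'\1', url)   # http://http://example.com -> http://example.com
    return url

The worker would then pass clean_url(line) to get_status() instead of line.strip().
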

Mr0grog commented Nov 22, 2018

Added rate limiting: --rate 5 limits the script to 5 requests/second. You can also repeat the option to apply different rate limits to different domains:

# Limit to 10 requests/s, but only 1 request/s for energy.gov and 2 requests/s for epa.gov
--rate 10 --rate 'energy.gov:1' --rate 'epa.gov:2'

Note that domains include subdomains, so in the above example, requests to arpa-e.energy.gov would be rate-limited together with requests to energy.gov.
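
Concretely, given the suffix check in RateLimit.matches(), a limit like 'energy.gov:1' behaves as follows (illustration only, not part of the script):

limit = RateLimit('energy.gov:1')
limit.matches('energy.gov')         # True
limit.matches('arpa-e.energy.gov')  # True, matched via the '.energy.gov' suffix
limit.matches('www.epa.gov')        # False, falls through to any catch-all --rate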
