Skip to content

Instantly share code, notes, and snippets.

@clopez
Last active October 11, 2017 14:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save clopez/dcae6ba605e7fa521b5e989ca323f522 to your computer and use it in GitHub Desktop.
Save clopez/dcae6ba605e7fa521b5e989ca323f522 to your computer and use it in GitHub Desktop.
Check Alexa top N sites to see how many support a specific Content-Encoding
#!/usr/bin/env python3
# Author: Carlos Alberto Lopez Perez <clopez@igalia.com>
# License: MIT
#
# Check Alexa top N sites to see how many support a specific Content-Encoding
#
# Examples of use:
# * Check top 100 alexa for encoding gzip:
# $ ./alexa_check_content_encoding.py gzip 100
# * Check top 500 alexa for encoding brotli:
# $ ./alexa_check_content_encoding.py br 500
# * Check top 1000 alexa for encoding Zstandard:
# $ ./alexa_check_content_encoding.py zstd 1000
#
import argparse
import io
import zipfile
import urllib.request, urllib.parse, urllib.error
def alexa_top_list(num=1000):
    """Return the first *num* domains of the Alexa top-1M ranking.

    The ranking is downloaded as a zip archive and its ``top-1m.csv``
    member is read as ``rank,domain`` rows.

    Args:
        num: How many domains to return. Values <= 0 yield an empty list
            (and skip the download); if the file holds fewer than *num*
            rows, every row is returned.

    Returns:
        A list of domain names, in rank order.

    Raises:
        ValueError: If a row's rank does not match its position in the
            file (the data is not sorted as expected) or a row is malformed.
        urllib.error.URLError: If the download fails.
    """
    if num <= 0:
        return []

    # ZipFile needs a seekable file object, so the archive is buffered in
    # memory; the HTTP response is closed as soon as it has been read.
    with urllib.request.urlopen('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip') as response:
        archive = io.BytesIO(response.read())

    alexa_list = []
    with zipfile.ZipFile(archive) as zfile:
        # Stream the CSV member so that only the rows actually needed are
        # decompressed and decoded, instead of the whole 1M-line file.
        with io.TextIOWrapper(zfile.open('top-1m.csv'), encoding='utf-8') as rows:
            for line in rows:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                rank, domain = line.split(',')
                if int(rank.strip()) != len(alexa_list) + 1:
                    raise ValueError("Something is wrong with the alexa data file. Maybe not sorted?")
                alexa_list.append(domain.strip())
                if len(alexa_list) == num:
                    break
    return alexa_list
def server_supports_encoding(encoding, url):
    """Tell whether the server at *url* answers using content coding *encoding*.

    A GET request is sent with ``Accept-Encoding: <encoding>`` and a
    browser-like User-Agent; the ``Content-Encoding`` header of the response
    is then inspected.

    Args:
        encoding: Content-coding name to look for (e.g. ``"gzip"``, ``"br"``).
        url: Full URL to request.

    Returns:
        True if *encoding* is one of the codings listed in the response's
        ``Content-Encoding`` header, False otherwise (including when the
        header is absent).

    Raises:
        urllib.error.URLError (or another OSError such as a timeout) when
        the request fails; the request uses a 5 second timeout.
    """
    req_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
        'Accept-Encoding': encoding,
    }
    request = urllib.request.Request(url, headers=req_headers)

    # "x-gzip" and "x-compress" are legacy aliases of "gzip" and "compress".
    aliases = {'x-gzip': 'gzip', 'x-compress': 'compress'}

    def canonical(token):
        # Content-coding names are case-insensitive.
        token = token.strip().lower()
        return aliases.get(token, token)

    # Close the response right away: only the headers are needed, and this
    # function is called in a loop over many domains.
    with urllib.request.urlopen(request, timeout=5) as response:
        header_value = response.headers.get('Content-Encoding', '')

    # Compare whole comma-separated tokens rather than substrings, so that
    # e.g. "gzip" does not match "pack200-gzip".
    applied = {canonical(token) for token in header_value.split(',') if token.strip()}
    return canonical(encoding) in applied
def main():
    """Command-line entry point.

    Parses the encoding name and the number of Alexa top sites from the
    command line, queries every site over both http and https, prints a
    colored per-domain result and finally a summary of the counts.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("encoding", type=str, help="Name of the codename of the encoding to test")
    parser.add_argument("alexanumber", type=int, help="Number of alexa top n sites to check")
    args = parser.parse_args()
    encoding = args.encoding.strip()
    # argparse already converted this value to int (type=int).
    alexa_number_domains = args.alexanumber

    # Only used to warn about probable typos; unknown names are still queried.
    iana_encoding_list = ["aes128gcm", "br", "compress", "deflate", "exi", "gzip", "identity", "pack200-gzip", "x-compress", "x-gzip", "zstd"]
    if encoding not in iana_encoding_list:
        print ("\033[0;33mWARNING:\033[0m encoding %s is not on the IANA encoding list" %encoding)
        print ("Check: https://www.iana.org/assignments/http-parameters/http-parameters.xhtml#content-coding")
        print ("And check you spelled it correcly (example: is \"br\" instead of \"brotli\")\n")

    print("Querying alexa top %d domains for encoding %s ..." %(alexa_number_domains,encoding))
    domains_toquery = alexa_top_list(alexa_number_domains)
    domains_support_encoding = 0
    domains_notsupport_encoding = 0
    domains_error = 0
    for domain in domains_toquery:
        try:
            # Check both http and https
            supported = (server_supports_encoding(encoding, "http://" + domain + "/")
                         or server_supports_encoding(encoding, "https://" + domain + "/"))
        except Exception as err:
            # Best effort: a failing domain is counted and skipped.
            # KeyboardInterrupt and SystemExit are not Exception subclasses,
            # so Ctrl-C still aborts the run.
            print("Domain %s gave an error, likely timeout (%s: %s)" %(domain, type(err).__name__, err))
            domains_error += 1
            continue
        if supported:
            print("\033[0;32mDomain %s supports encoding %s\033[0m" %(domain,encoding))
            domains_support_encoding += 1
        else:
            print("\033[0;31mDomain %s NOT supports encoding %s\033[0m" %(domain,encoding))
            domains_notsupport_encoding += 1

    print("---- SUMMARY ----")
    if domains_error > 0:
        print("%d servers queried sucesfully (%d caused error)" %(len(domains_toquery) - domains_error, domains_error))
    else:
        print("%d servers queried sucesfully" %len(domains_toquery))
    print("%d servers support encoding %s" %(domains_support_encoding, encoding))
    print("%d servers NOT support encoding %s" %(domains_notsupport_encoding, encoding))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment