Skip to content

Instantly share code, notes, and snippets.

@rviscomi
Last active November 1, 2020 11:15
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save rviscomi/1d121e7e67eefc4770739e735d681e29 to your computer and use it in GitHub Desktop.
Exploring compression stats in HTTP Archive
# https://discuss.httparchive.org/t/how-many-text-files-are-not-served-with-gzip/1092
#
# https://bigquery.cloud.google.com/table/httparchive:runs.2017_10_15_requests
#
# Browse encoding and MIME type for 10 random requests.
#standardSQL
SELECT
resp_content_encoding,
mimeType,
url
FROM
httparchive.runs.2017_10_15_requests
LIMIT
10
# Top 50 encoding values
#standardSQL
SELECT
APPROX_TOP_COUNT(resp_content_encoding, 50)
FROM
httparchive.runs.2017_10_15_requests
# Most popular encoding values: '', 'br', 'gzip', 'deflate'
# Example MIME types: 'application/javascript', 'text/css', 'text/html', 'images/svg+xml'
#
# How many compressed requests are there?
#standardSQL
SELECT
COUNT(0)
FROM
httparchive.runs.2017_10_15_requests
WHERE
resp_content_encoding IN ('br', 'gzip', 'deflate')
# 16,361,764 requests
# Remove WHERE clause to see total number of requetsts: 51,504,225
# 31.77% of all requests have brotli or gzip compression.
#
# All in one query
#standardSQL
SELECT
SUM(IF(resp_content_encoding IN ('br', 'gzip', 'deflate'), 1, 0)) / COUNT(0)
FROM
httparchive.runs.2017_10_15_requests
# More verbose version, but supports more complicated IF conditions:
#standardSQL
SELECT
COUNT(0) / (
SELECT
COUNT(0)
FROM
httparchive.runs.2017_10_15_requests)
FROM
httparchive.runs.2017_10_15_requests
WHERE
resp_content_encoding IN ('br', 'gzip', 'deflate')
# How many of the remaining requests *could* have been compressed?
#standardSQL
SELECT
COUNT(0) / (
SELECT
COUNT(0)
FROM
httparchive.runs.2017_10_15_requests)
FROM
httparchive.runs.2017_10_15_requests
WHERE
resp_content_encoding NOT IN ('br', 'gzip', 'deflate')
AND mimeType IN (
'text/html',
'text/css',
'application/javascript',
'images/svg+xml')
# 8.42% of all requests
# Our MIME type whitelist is not comprehensive. Let's widen the net.
#
# https://raw.githubusercontent.com/jshttp/mime-db/master/db.json
# 148 MIME types marked as compressable
# JSON.stringify(Object.entries(types).filter(([mimeType, info]) => info.compressible).map(([mimeType, info]) => mimeType))
#standardSQL
SELECT
COUNT(0) / (
SELECT
COUNT(0)
FROM
httparchive.runs.2017_10_15_requests)
FROM
httparchive.runs.2017_10_15_requests
WHERE
resp_content_encoding NOT IN ('br', 'gzip', 'deflate') AND
mimeType IN ("application/alto-costmap+json","application/alto-costmapfilter+json","application/alto-directory+json","application/alto-endpointcost+json","application/alto-endpointcostparams+json","application/alto-endpointprop+json","application/alto-endpointpropparams+json","application/alto-error+json","application/alto-networkmap+json","application/alto-networkmapfilter+json","application/atom+xml","application/calendar+json","application/coap-group+json","application/csvm+json","application/dart","application/dicom+json","application/ecmascript","application/fido.trusted-apps+json","application/geo+json","application/javascript","application/jf2feed+json","application/jose+json","application/jrd+json","application/json","application/json-patch+json","application/jsonml+json","application/jwk+json","application/jwk-set+json","application/ld+json","application/manifest+json","application/merge-patch+json","application/mud+json","application/postscript","application/ppsp-tracker+json","application/problem+json","application/raml+yaml","application/rdap+json","application/rdf+xml","application/reputon+json","application/rss+xml","application/rtf","application/scim+json","application/soap+xml","application/tar","application/vcard+json","application/vnd.api+json","application/vnd.apothekende.reservation+json","application/vnd.avalon+json","application/vnd.bekitzur-stech+json","application/vnd.capasystems-pg+json","application/vnd.collection+json","application/vnd.collection.doc+json","application/vnd.collection.next+json","application/vnd.coreos.ignition+json","application/vnd.dart","application/vnd.datapackage+json","application/vnd.dataresource+json","application/vnd.document+json","application/vnd.drive+json","application/vnd.geo+json","application/vnd.google-earth.kml+xml","application/vnd.hal+json","application/vnd.hc+json","application/vnd.heroku+json","application/vnd.hyper-item+json","application/vnd.hyperdrive+json","application/vnd.ims.lis.v2.result+json","application/vnd.ims.lti.v2.toolconsumerprofile+json","application/vnd.ims.lti.v2.toolproxy+json","application/vnd.ims.lti.v2.toolproxy.id+json","application/vnd.ims.lti.v2.toolsettings+json","application/vnd.ims.lti.v2.toolsettings.simple+json","application/vnd.las.las+json","application/vnd.mason+json","application/vnd.micro+json","application/vnd.miele+json","application/vnd.mozilla.xul+xml","application/vnd.ms-fontobject","application/vnd.ms-opentype","application/vnd.nearst.inv+json","application/vnd.oftn.l10n+json","application/vnd.oma.lwm2m+json","application/vnd.oracle.resource+json","application/vnd.pagerduty+json","application/vnd.siren+json","application/vnd.sun.wadl+xml","application/vnd.tableschema+json","application/vnd.vel+json","application/vnd.xacml+json","application/wasm","application/webpush-options+json","application/x-httpd-php","application/x-javascript","application/x-ns-proxy-autoconfig","application/x-sh","application/x-tar","application/x-virtualbox-hdd","application/x-virtualbox-ova","application/x-virtualbox-ovf","application/x-virtualbox-vbox","application/x-virtualbox-vdi","application/x-virtualbox-vhd","application/x-virtualbox-vmdk","application/x-web-app-manifest+json","application/x-www-form-urlencoded","application/xhtml+xml","application/xml","application/xml-dtd","application/xop+xml","application/yang-data+json","application/yang-patch+json","font/otf","image/bmp","image/svg+xml","image/vnd.adobe.photoshop","image/x-icon","image/x-ms-bmp","message/imdn+xml","message/rfc822","model/gltf+json","model/gltf-binary","model/x3d+xml","text/cache-manifest","text/calender","text/cmd","text/css","text/csv","text/html","text/javascript","text/jsx","text/markdown","text/n3","text/plain","text/richtext","text/rtf","text/tab-separated-values","text/uri-list","text/vcard","text/vtt","text/x-gwt-rpc","text/x-jquery-tmpl","text/x-markdown","text/x-org","text/x-processing","text/x-suse-ymp","text/xml","x-shader/x-fragment","x-shader/x-vertex")
# 12.19% of all requests
#
# Less precise but more straightforward version
# Just look for:
# - text/*
# - */*+json
# - */*+text
# - */*+xml
#standardSQL
SELECT
COUNT(0) / (
SELECT
COUNT(0)
FROM
httparchive.runs.2017_10_15_requests)
FROM
httparchive.runs.2017_10_15_requests
WHERE
resp_content_encoding NOT IN ('br', 'gzip', 'deflate') AND
REGEXP_CONTAINS(mimeType, r'(text/.*|.*/.*\+json|.*/*\+text|.*/.*\+xml)')
# Back down to 8.74% :(
#
# Doesn't cover application/javascript though?
# What happens if we include all MIME types containing `javascript` or `json`?
# Will probably have some false positives.
#standardSQL
SELECT
COUNT(0) / (
SELECT
COUNT(0)
FROM
httparchive.runs.2017_10_15_requests)
FROM
httparchive.runs.2017_10_15_requests
WHERE
resp_content_encoding NOT IN ('br', 'gzip', 'deflate') AND
REGEXP_CONTAINS(mimeType, r'(text/.*|.*/.*\+json|.*/*\+text|.*/.*\+xml)') OR
REGEXP_CONTAINS(mimeType, r'(javascript|json)')
# 29.74%! WOW
#
# HA also has a _gzip_save field in the requests table: KB that could be saved with gzip
#standardSQL
SELECT
SUM(IF(_gzip_save > 0, 1, 0)) / COUNT(0)
FROM
httparchive.runs.2017_10_15_requests
# Even lower at 3.22% ;(
#
# Key point: The results are *highly* dependent on the constraints you set.
# You need to think deeply about and be confident in your filtering criteria.
#
# Doug Sillars: "Since files under 1KB fit into a single packet - you can generally skip the compression step."
#standardSQL
SELECT
COUNT(0) / (
SELECT
COUNT(0)
FROM
httparchive.runs.2017_10_15_requests)
FROM
httparchive.runs.2017_10_15_requests
WHERE
respSize > 1000 AND
resp_content_encoding NOT IN ('br', 'gzip', 'deflate') AND
REGEXP_CONTAINS(mimeType, r'(text/.*|.*/.*\+json|.*/*\+text|.*/.*\+xml)') OR
REGEXP_CONTAINS(mimeType, r'(javascript|json)')
# 23.70%
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment