Last active
August 2, 2024 22:38
-
-
Save rviscomi/1d121e7e67eefc4770739e735d681e29 to your computer and use it in GitHub Desktop.
Exploring compression stats in HTTP Archive
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://discuss.httparchive.org/t/how-many-text-files-are-not-served-with-gzip/1092 | |
# | |
# https://bigquery.cloud.google.com/table/httparchive:runs.2017_10_15_requests | |
# | |
# Browse encoding and MIME type for 10 random requests. | |
#standardSQL | |
SELECT | |
resp_content_encoding, | |
mimeType, | |
url | |
FROM | |
httparchive.runs.2017_10_15_requests | |
LIMIT | |
10 | |
# Top 50 encoding values | |
#standardSQL | |
SELECT | |
APPROX_TOP_COUNT(resp_content_encoding, 50) | |
FROM | |
httparchive.runs.2017_10_15_requests | |
# Most popular encoding values: '', 'br', 'gzip', 'deflate' | |
# Example MIME types: 'application/javascript', 'text/css', 'text/html', 'images/svg+xml' | |
# | |
# How many compressed requests are there? | |
#standardSQL | |
SELECT | |
COUNT(0) | |
FROM | |
httparchive.runs.2017_10_15_requests | |
WHERE | |
resp_content_encoding IN ('br', 'gzip', 'deflate') | |
# 16,361,764 requests | |
# Remove WHERE clause to see total number of requetsts: 51,504,225 | |
# 31.77% of all requests have brotli or gzip compression. | |
# | |
# All in one query | |
#standardSQL | |
SELECT | |
SUM(IF(resp_content_encoding IN ('br', 'gzip', 'deflate'), 1, 0)) / COUNT(0) | |
FROM | |
httparchive.runs.2017_10_15_requests | |
# More verbose version, but supports more complicated IF conditions: | |
#standardSQL | |
SELECT | |
COUNT(0) / ( | |
SELECT | |
COUNT(0) | |
FROM | |
httparchive.runs.2017_10_15_requests) | |
FROM | |
httparchive.runs.2017_10_15_requests | |
WHERE | |
resp_content_encoding IN ('br', 'gzip', 'deflate') | |
# How many of the remaining requests *could* have been compressed? | |
#standardSQL | |
SELECT | |
COUNT(0) / ( | |
SELECT | |
COUNT(0) | |
FROM | |
httparchive.runs.2017_10_15_requests) | |
FROM | |
httparchive.runs.2017_10_15_requests | |
WHERE | |
resp_content_encoding NOT IN ('br', 'gzip', 'deflate') | |
AND mimeType IN ( | |
'text/html', | |
'text/css', | |
'application/javascript', | |
'images/svg+xml') | |
# 8.42% of all requests | |
# Our MIME type whitelist is not comprehensive. Let's widen the net. | |
# | |
# https://raw.githubusercontent.com/jshttp/mime-db/master/db.json | |
# 148 MIME types marked as compressable | |
# JSON.stringify(Object.entries(types).filter(([mimeType, info]) => info.compressible).map(([mimeType, info]) => mimeType)) | |
#standardSQL | |
SELECT | |
COUNT(0) / ( | |
SELECT | |
COUNT(0) | |
FROM | |
httparchive.runs.2017_10_15_requests) | |
FROM | |
httparchive.runs.2017_10_15_requests | |
WHERE | |
resp_content_encoding NOT IN ('br', 'gzip', 'deflate') AND | |
mimeType IN ("application/alto-costmap+json","application/alto-costmapfilter+json","application/alto-directory+json","application/alto-endpointcost+json","application/alto-endpointcostparams+json","application/alto-endpointprop+json","application/alto-endpointpropparams+json","application/alto-error+json","application/alto-networkmap+json","application/alto-networkmapfilter+json","application/atom+xml","application/calendar+json","application/coap-group+json","application/csvm+json","application/dart","application/dicom+json","application/ecmascript","application/fido.trusted-apps+json","application/geo+json","application/javascript","application/jf2feed+json","application/jose+json","application/jrd+json","application/json","application/json-patch+json","application/jsonml+json","application/jwk+json","application/jwk-set+json","application/ld+json","application/manifest+json","application/merge-patch+json","application/mud+json","application/postscript","application/ppsp-tracker+json","application/problem+json","application/raml+yaml","application/rdap+json","application/rdf+xml","application/reputon+json","application/rss+xml","application/rtf","application/scim+json","application/soap+xml","application/tar","application/vcard+json","application/vnd.api+json","application/vnd.apothekende.reservation+json","application/vnd.avalon+json","application/vnd.bekitzur-stech+json","application/vnd.capasystems-pg+json","application/vnd.collection+json","application/vnd.collection.doc+json","application/vnd.collection.next+json","application/vnd.coreos.ignition+json","application/vnd.dart","application/vnd.datapackage+json","application/vnd.dataresource+json","application/vnd.document+json","application/vnd.drive+json","application/vnd.geo+json","application/vnd.google-earth.kml+xml","application/vnd.hal+json","application/vnd.hc+json","application/vnd.heroku+json","application/vnd.hyper-item+json","application/vnd.hyperdrive+json","application/vnd.ims.lis.v2.result+json","application/vnd.ims.lti.v2.toolconsumerprofile+json","application/vnd.ims.lti.v2.toolproxy+json","application/vnd.ims.lti.v2.toolproxy.id+json","application/vnd.ims.lti.v2.toolsettings+json","application/vnd.ims.lti.v2.toolsettings.simple+json","application/vnd.las.las+json","application/vnd.mason+json","application/vnd.micro+json","application/vnd.miele+json","application/vnd.mozilla.xul+xml","application/vnd.ms-fontobject","application/vnd.ms-opentype","application/vnd.nearst.inv+json","application/vnd.oftn.l10n+json","application/vnd.oma.lwm2m+json","application/vnd.oracle.resource+json","application/vnd.pagerduty+json","application/vnd.siren+json","application/vnd.sun.wadl+xml","application/vnd.tableschema+json","application/vnd.vel+json","application/vnd.xacml+json","application/wasm","application/webpush-options+json","application/x-httpd-php","application/x-javascript","application/x-ns-proxy-autoconfig","application/x-sh","application/x-tar","application/x-virtualbox-hdd","application/x-virtualbox-ova","application/x-virtualbox-ovf","application/x-virtualbox-vbox","application/x-virtualbox-vdi","application/x-virtualbox-vhd","application/x-virtualbox-vmdk","application/x-web-app-manifest+json","application/x-www-form-urlencoded","application/xhtml+xml","application/xml","application/xml-dtd","application/xop+xml","application/yang-data+json","application/yang-patch+json","font/otf","image/bmp","image/svg+xml","image/vnd.adobe.photoshop","image/x-icon","image/x-ms-bmp","message/imdn+xml","message/rfc822","model/gltf+json","model/gltf-binary","model/x3d+xml","text/cache-manifest","text/calender","text/cmd","text/css","text/csv","text/html","text/javascript","text/jsx","text/markdown","text/n3","text/plain","text/richtext","text/rtf","text/tab-separated-values","text/uri-list","text/vcard","text/vtt","text/x-gwt-rpc","text/x-jquery-tmpl","text/x-markdown","text/x-org","text/x-processing","text/x-suse-ymp","text/xml","x-shader/x-fragment","x-shader/x-vertex") | |
# 12.19% of all requests | |
# | |
# Less precise but more straightforward version | |
# Just look for: | |
# - text/* | |
# - */*+json | |
# - */*+text | |
# - */*+xml | |
#standardSQL | |
SELECT | |
COUNT(0) / ( | |
SELECT | |
COUNT(0) | |
FROM | |
httparchive.runs.2017_10_15_requests) | |
FROM | |
httparchive.runs.2017_10_15_requests | |
WHERE | |
resp_content_encoding NOT IN ('br', 'gzip', 'deflate') AND | |
REGEXP_CONTAINS(mimeType, r'(text/.*|.*/.*\+json|.*/*\+text|.*/.*\+xml)') | |
# Back down to 8.74% :( | |
# | |
# Doesn't cover application/javascript though? | |
# What happens if we include all MIME types containing `javascript` or `json`? | |
# Will probably have some false positives. | |
#standardSQL | |
SELECT | |
COUNT(0) / ( | |
SELECT | |
COUNT(0) | |
FROM | |
httparchive.runs.2017_10_15_requests) | |
FROM | |
httparchive.runs.2017_10_15_requests | |
WHERE | |
resp_content_encoding NOT IN ('br', 'gzip', 'deflate') AND | |
REGEXP_CONTAINS(mimeType, r'(text/.*|.*/.*\+json|.*/*\+text|.*/.*\+xml)') OR | |
REGEXP_CONTAINS(mimeType, r'(javascript|json)') | |
# 29.74%! WOW | |
# | |
# HA also has a _gzip_save field in the requests table: KB that could be saved with gzip | |
#standardSQL | |
SELECT | |
SUM(IF(_gzip_save > 0, 1, 0)) / COUNT(0) | |
FROM | |
httparchive.runs.2017_10_15_requests | |
# Even lower at 3.22% ;( | |
# | |
# Key point: The results are *highly* dependent on the constraints you set. | |
# You need to think deeply about and be confident in your filtering criteria. | |
# | |
# Doug Sillars: "Since files under 1KB fit into a single packet - you can generally skip the compression step." | |
#standardSQL | |
SELECT | |
COUNT(0) / ( | |
SELECT | |
COUNT(0) | |
FROM | |
httparchive.runs.2017_10_15_requests) | |
FROM | |
httparchive.runs.2017_10_15_requests | |
WHERE | |
respSize > 1000 AND | |
resp_content_encoding NOT IN ('br', 'gzip', 'deflate') AND | |
REGEXP_CONTAINS(mimeType, r'(text/.*|.*/.*\+json|.*/*\+text|.*/.*\+xml)') OR | |
REGEXP_CONTAINS(mimeType, r'(javascript|json)') | |
# 23.70% |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment