@dstufft
Created June 10, 2022 16:21
import requests
import zlib
import struct
import json
import html5lib
import re
import pprint

_gzip_header = b"\x1f\x8b\x08\x00\x00\x00\x00\x00\x02\xff"


def gzip_app_iter(app_iter):  # Taken from WebOb, which Warehouse uses
    size = 0
    crc = zlib.crc32(b"") & 0xFFFFFFFF
    compress = zlib.compressobj(
        9, zlib.DEFLATED, -zlib.MAX_WBITS, zlib.DEF_MEM_LEVEL, 0
    )

    yield _gzip_header

    for item in app_iter:
        size += len(item)
        crc = zlib.crc32(item, crc) & 0xFFFFFFFF

        # The compress function may return zero length bytes if the input is
        # small enough; it buffers the input for the next iteration or for a
        # flush.
        result = compress.compress(item)

        if result:
            yield result

    # Similarly, flush may also not yield a value.
    result = compress.flush()

    if result:
        yield result

    yield struct.pack("<2L", crc, size & 0xFFFFFFFF)


def normalize(name):  # Taken from PEP 503
    return re.sub(r"[-_.]+", "-", name).lower()


data = {}

resp = requests.get("https://pypi.org/simple", stream=True)
resp.raise_for_status()

data["current.compressed"] = 0
for chunk in resp.raw.stream(1024, decode_content=False):
    data["current.compressed"] += len(chunk)

resp = requests.get("https://pypi.org/simple")
resp.raise_for_status()

data["current.uncompressed"] = len(resp.content)

jdata = {"meta": {"api-version": "1.0"}, "projects": {}}
jdata2 = {"meta": {"api-version": "1.0"}, "projects": {}}

html = html5lib.parse(resp.content, namespaceHTMLElements=False)
for link in html.findall(".//a"):
    jdata["projects"][normalize(link.text)] = {"url": link.attrib["href"]}
    jdata2["projects"][normalize(link.text)] = {
        "name": link.text,
        "url": link.attrib["href"],
    }

jcontent = json.dumps(jdata, sort_keys=True, separators=(",", ":")).encode("utf8")
jcontent2 = json.dumps(jdata2, sort_keys=True, separators=(",", ":")).encode("utf8")

data["pep691.uncompressed"] = len(jcontent)
data["pep691.name.uncompressed"] = len(jcontent2)

data["pep691.compressed"] = sum(map(len, list(gzip_app_iter([jcontent]))))
data["pep691.name.compressed"] = sum(map(len, list(gzip_app_iter([jcontent2]))))

pprint.pprint(data)
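
For context: gzip_app_iter emits a complete gzip stream (header, raw deflate body, then CRC32 and size trailer), and normalize applies the PEP 503 name normalization. A quick sanity check of both (a sketch; the package names here are arbitrary examples):

import gzip

# The concatenated output should be a valid gzip stream.
body = b"".join(gzip_app_iter([b"hello ", b"world"]))
assert gzip.decompress(body) == b"hello world"

# PEP 503: runs of "-", "_", "." collapse to "-", then lowercase.
assert normalize("Flask_SQLAlchemy") == "flask-sqlalchemy"
assert normalize("zope.interface") == "zope-interface"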
pelson commented Jun 13, 2022

Thanks for sharing your code.

I had a look at representing the project list as:

{
  "meta": {"api-version": "1.0", "simple_url": "https://pypi.org/simple"},
  "projects": [
    {"name": project_name},
  ]
}

The conclusion: this representation would be a ~42% reduction in compressed size. Re-introducing the unnormalized name into each project dictionary makes it a ~1.4% increase in compressed size compared to the existing HTML response.
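
Concretely, the variant carrying both names (pep691.as_list_no_url_unnormalized below) has entries of the form (an illustrative entry, not real output):

{
  "meta": {"api-version": "1.0", "simple_url": "https://pypi.org/simple"},
  "projects": [
    {"name": "flask-sqlalchemy", "unnormalized-name": "Flask_SQLAlchemy"}
  ]
}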

With the following raw results:

{'current.compressed': 3175998,
 'current.uncompressed': 20931765,
 'pep691.as_list_no_url.compressed': 1841468,
 'pep691.as_list_no_url.uncompressed': 9512691,
 'pep691.as_list_no_url_unnormalized.compressed': 3220197,
 'pep691.as_list_no_url_unnormalized.uncompressed': 23219279,
 'pep691.compressed': 2917633,
 'pep691.name.compressed': 4454882,
 'pep691.name.uncompressed': 27394139,
 'pep691.no_url.compressed': 1791066,
 'pep691.no_url.uncompressed': 7225101,
 'pep691.uncompressed': 18643996}
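
The headline percentages follow directly from these numbers; as a quick check:

current = 3175998         # current.compressed
as_list = 1841468         # pep691.as_list_no_url.compressed
as_list_unnorm = 3220197  # pep691.as_list_no_url_unnormalized.compressed

print(f"{1 - as_list / current:.1%}")         # 42.0% smaller than the HTML response
print(f"{as_list_unnorm / current - 1:.1%}")  # 1.4% larger than the HTML response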

Using this code:

# Requires: html5lib, requests

import requests
import zlib
import struct
import json
import html5lib
import re
import pprint

_gzip_header = b"\x1f\x8b\x08\x00\x00\x00\x00\x00\x02\xff"


def gzip_app_iter(app_iter):  # Taken from WebOb, which Warehouse uses
    size = 0
    crc = zlib.crc32(b"") & 0xFFFFFFFF
    compress = zlib.compressobj(
        9, zlib.DEFLATED, -zlib.MAX_WBITS, zlib.DEF_MEM_LEVEL, 0
    )

    yield _gzip_header

    for item in app_iter:
        size += len(item)
        crc = zlib.crc32(item, crc) & 0xFFFFFFFF

        # The compress function may return zero length bytes if the input is
        # small enough; it buffers the input for the next iteration or for a
        # flush.
        result = compress.compress(item)

        if result:
            yield result

    # Similarly, flush may also not yield a value.
    result = compress.flush()

    if result:
        yield result
    yield struct.pack("<2L", crc, size & 0xFFFFFFFF)


def normalize(name):  # Taken from PEP 503
    return re.sub(r"[-_.]+", "-", name).lower()


data = {}

resp = requests.get("https://pypi.org/simple", stream=True)
resp.raise_for_status()

data["current.compressed"] = 0
for chunk in resp.raw.stream(1024, decode_content=False):
    data["current.compressed"] += len(chunk)


resp = requests.get("https://pypi.org/simple")
resp.raise_for_status()


data["current.uncompressed"] = len(resp.content)


jdata = {"meta": {"api-version": "1.0"}, "projects": {}}
jdata2 = {"meta": {"api-version": "1.0"}, "projects": {}}
jdata3 = {"meta": {"api-version": "1.0", "simple_url": "https://pypi.org/simple"}, "projects": {}}
jdata4 = {"meta": {"api-version": "1.0", "simple_url": "https://pypi.org/simple"}, "projects": []}
jdata5 = {"meta": {"api-version": "1.0", "simple_url": "https://pypi.org/simple"}, "projects": []}

html = html5lib.parse(resp.content, namespaceHTMLElements=False)
for link in html.findall(".//a"):
    jdata["projects"][normalize(link.text)] = {"url": link.attrib["href"]}
    jdata2["projects"][normalize(link.text)] = {
        "name": link.text,
        "url": link.attrib["href"],
    }
    jdata3["projects"][normalize(link.text)] = {}
    jdata4["projects"].append({'name': normalize(link.text)})
    jdata5["projects"].append({'name': normalize(link.text), 'unnormalized-name': link.text})


jcontent = json.dumps(jdata, sort_keys=True, separators=(",", ":")).encode("utf8")
jcontent2 = json.dumps(jdata2, sort_keys=True, separators=(",", ":")).encode("utf8")
jcontent3 = json.dumps(jdata3, sort_keys=True, separators=(",", ":")).encode("utf8")
jcontent4 = json.dumps(jdata4, sort_keys=True, separators=(",", ":")).encode("utf8")
jcontent5 = json.dumps(jdata5, sort_keys=True, separators=(",", ":")).encode("utf8")

data["pep691.uncompressed"] = len(jcontent)
data["pep691.name.uncompressed"] = len(jcontent2)
data["pep691.no_url.uncompressed"] = len(jcontent3)
data["pep691.as_list_no_url.uncompressed"] = len(jcontent4)
data["pep691.as_list_no_url_unnormalized.uncompressed"] = len(jcontent5)


data["pep691.compressed"] = sum(map(len, list(gzip_app_iter([jcontent]))))
data["pep691.name.compressed"] = sum(map(len, list(gzip_app_iter([jcontent2]))))
data["pep691.no_url.compressed"] = sum(map(len, list(gzip_app_iter([jcontent3]))))
data["pep691.as_list_no_url.compressed"] = sum(map(len, list(gzip_app_iter([jcontent4]))))
data["pep691.as_list_no_url_unnormalized.compressed"] = sum(map(len, list(gzip_app_iter([jcontent5]))))

pprint.pprint(data)
