Skip to content

Instantly share code, notes, and snippets.

@pfmoore
Created July 5, 2023 11:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pfmoore/3b7847c4ea35930f789918db158cc5e0 to your computer and use it in GitHub Desktop.
Save pfmoore/3b7847c4ea35930f789918db158cc5e0 to your computer and use it in GitHub Desktop.
Get the PyPI simple index in JSON format
from datetime import datetime
import email.message
import re
from multiprocessing.dummy import Pool
import json
import traceback
import urllib3
def parse_content_type(header: str) -> str:
    """Return the bare media type named by a Content-Type header value.

    The value is stored on a throwaway ``email.message.Message`` and read
    back through ``get_content_type()``, so parameters such as ``charset``
    are dropped and the ``type/subtype`` pair is lowercased.  A value that
    is not of the form ``type/subtype`` (including the empty string) is
    reported as ``text/plain``.
    """
    carrier = email.message.Message()
    carrier["content-type"] = header
    media_type = carrier.get_content_type()
    return media_type
def normalize(name):
    """Return the normalized spelling of a project name.

    Each run of ``-``, ``_`` and ``.`` characters is collapsed into a single
    ``-`` and the result is lowercased.
    """
    separator_runs = re.compile(r"[-_.]+")
    hyphenated = separator_runs.sub("-", name)
    return hyphenated.lower()
class PyPI:
    """Small client that fetches pypi.org "simple" index pages as JSON.

    Both fetch methods report failure by printing a diagnostic and returning
    a dict that carries an ``error`` key; they do not raise.  On success they
    return the decoded JSON body of the response.
    """

    # Media types offered in the Accept header.  Only the JSON form is
    # enabled; the commented-out entries are alternatives that are not sent.
    CONTENT_TYPES = [
        "application/vnd.pypi.simple.v1+json",
        # "application/vnd.pypi.simple.v1+html;q=0.2",
        # "text/html;q=0.01",  # For legacy compatibility
    ]
    ACCEPT = ", ".join(CONTENT_TYPES)

    def __init__(self):
        """Create the urllib3 pool manager used for every request."""
        self.pool_mgr = urllib3.PoolManager(maxsize=10)

    def get_root_data(self):
        """Fetch the root index at ``https://pypi.org/simple/``.

        Returns:
            The decoded JSON document on success.  On failure, a dict with an
            ``error`` key: for an unexpected content type it also carries the
            HTTP ``status``; for an exception ``error`` holds the formatted
            traceback.
        """
        try:
            url = "https://pypi.org/simple/"
            resp = self.pool_mgr.request("GET", url, headers={"Accept": self.ACCEPT})
            content_type = parse_content_type(resp.headers.get("content-type", ""))
            if content_type != "application/vnd.pypi.simple.v1+json":
                err = f"Unsupported content type for {url}: {content_type} ({resp.status})"
                print(err)
                # Report the failure as an error dict (not a bare None) so it
                # has the same shape as the exception path below.
                return dict(error=err, status=resp.status)
            # Decode inside the try so a malformed body is reported like any
            # other failure instead of escaping as an exception.
            return resp.json()
        except Exception as exc:
            traceback.print_exception(exc)
            return dict(error="".join(traceback.format_exception(exc)))

    def get_project_data(self, project):
        """Fetch the index page for one project.

        Args:
            project: Project name; it is passed through ``normalize`` before
                being placed in the URL.

        Returns:
            The decoded JSON document on success.  On failure, a dict with
            ``name`` and ``error`` keys; for an unexpected content type it
            also carries ``data`` (the ``repr`` of the raw body) and
            ``status``.
        """
        try:
            project = normalize(project)
            url = f"https://pypi.org/simple/{project}/"
            resp = self.pool_mgr.request("GET", url, headers={"Accept": self.ACCEPT})
            content_type = parse_content_type(resp.headers.get("content-type", ""))
            if content_type != "application/vnd.pypi.simple.v1+json":
                err = f"Unsupported content type for {url}: {content_type} ({resp.status})"
                print(err)
                return dict(
                    name=project, error=err, data=repr(resp.data), status=resp.status
                )
            # Decode inside the try: one undecodable response must yield an
            # error record rather than abort a whole batch of lookups.
            return resp.json()
        except Exception as exc:
            traceback.print_exception(exc)
            return dict(name=project, error="".join(traceback.format_exception(exc)))
if __name__ == "__main__":
    # Script entry point: download the root index, then every project's
    # index page, and dump the lot into one timestamped JSON file.
    pypi = PyPI()
    root_data = pypi.get_root_data()

    # get_root_data() signals failure through its return value instead of
    # raising, so check for the project list before using it.
    if not isinstance(root_data, dict) or "projects" not in root_data:
        raise SystemExit("Could not fetch the PyPI root index; nothing written.")
    print(len(root_data["projects"]))

    # multiprocessing.dummy.Pool is a thread-backed pool, so the same PyPI
    # instance (and its connection pool) is shared by all workers.
    with Pool() as workers:
        project_names = (normalize(entry["name"]) for entry in root_data["projects"])
        results = workers.map(pypi.get_project_data, project_names)

    filename = f"PyPI_simple.{datetime.now():%Y-%m-%d-%H-%M}.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump({"root": root_data, "projects": results}, f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment