Skip to content

Instantly share code, notes, and snippets.

@pfmoore
Created September 2, 2023 16:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pfmoore/fdcd1f6c68a1567b0a7430beb74938b7 to your computer and use it in GitHub Desktop.
Save pfmoore/fdcd1f6c68a1567b0a7430beb74938b7 to your computer and use it in GitHub Desktop.
PyPI downloader for py-code.org
import json
import subprocess
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from pathlib import Path
from urllib.request import urlopen
# URL of the JSON document describing the repositories to clone.
REPOSITORIES = "https://github.com/pypi-data/data/raw/main/stats/repositories.json"

# Download the document once, at start-up, and decode it. The entries are
# later read through their "index", "name" and "url" keys.
with urlopen(REPOSITORIES) as response:
    repo_data = json.loads(response.read())
def clone_repo(repo, repo_dir, output_dir="output"):
    """Clone one repository with ``git clone`` and write a log file for it.

    Args:
        repo: Mapping providing the keys ``"index"``, ``"name"`` and
            ``"url"``.
        repo_dir: ``pathlib.Path`` of the directory under which the clone is
            created, in a subdirectory named ``repo["name"]``.
        output_dir: Directory that receives ``<name>.output.txt``. It is
            created if it does not exist. Defaults to ``"output"``, the
            location this script has always written to.

    Returns:
        None. The exit status of ``git`` is only recorded in the log file;
        a non-zero status does not raise.
    """
    index = repo["index"]
    name = repo["name"]
    url = repo["url"]
    target = str(repo_dir / name)

    # Create the log directory up front: without it the open() below would
    # fail only after the (potentially long) clone had already completed.
    log_dir = Path(output_dir)
    log_dir.mkdir(parents=True, exist_ok=True)

    start = datetime.now()
    print(f"{name}: Started {start}")
    proc = subprocess.run(
        ["git", "clone", url, target],
        capture_output=True,
        text=True,
    )
    end = datetime.now()
    # Use the same timestamp that is reported as the end time, so the printed
    # end time and duration agree with each other.
    duration = end - start
    print(f"{name}: Ended {end} ({duration})")

    log_path = log_dir / f"{name}.output.txt"
    with open(log_path, "w", encoding="utf-8") as f:
        print(f"{index}. {name}: {url}", file=f)
        print(f"Duration: {duration}", file=f)
        print(f"Return code: {proc.returncode}", file=f)
        print("\nOutput:\n" + proc.stdout, file=f)
        print("\nErrors:\n" + proc.stderr, file=f)
# Directory that receives one clone per repository.
repo_dir = Path("repos")
repo_dir.mkdir(exist_ok=True, parents=True)

# NOTE(review): the [200:] slice skips the first 200 entries when ordered by
# "index" -- presumably those were cloned by an earlier run; confirm before
# reusing this script from scratch.
pending = sorted(repo_data, key=lambda r: r["index"])[200:]

with ThreadPoolExecutor() as executor:
    futures = [
        (repo, executor.submit(clone_repo, repo, repo_dir))
        for repo in pending
    ]

# Leaving the ``with`` block waits for every clone to finish. Inspect each
# future afterwards: an exception raised inside a worker is stored on the
# future and would otherwise never be reported.
for repo, future in futures:
    error = future.exception()
    if error is not None:
        print(f"{repo['name']}: FAILED: {error!r}")
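# Follow-up steps (PowerShell, run from the directory that contains .\repos).
#
# Once the clones have finished, enable long-path support in each repository:
# dir .\repos\pypi-mirro* | % { git -C "$_" config --local core.longpaths true }
# To fetch new data:
# dir .\repos\pypi-mirro* | Foreach-Object -Parallel { git -C "$_" fetch }
# (takes about 30 seconds with -Parallel, about 2 minutes without it)
# To write one object list per repository into the "objects" directory:
# dir .\repos\pypi-mirro* | Foreach-Object -Parallel { git -C "$_" rev-list --objects --all | Out-File -Encoding UTF8 (Join-Path objects $_.name)}
# (takes about 50 minutes)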
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment