Skip to content

Instantly share code, notes, and snippets.

@bennuttall
Last active October 30, 2022 13:43
Show Gist options
  • Save bennuttall/e5b15e4d0cffcacdd0a2b710e5ada572 to your computer and use it in GitHub Desktop.
from pathlib import Path
from collections import defaultdict

# Reconcile lists of wheel paths (missing.txt / extra.txt) with the local
# filesystem and emit rewrites.sh, a shell script that rebuilds the package
# index for every affected package.
# Each input line is a path ending .../<pkg>/<filename>.whl
# NOTE: indentation was flattened in the pasted original; structure below is
# reconstructed from the literal statement order.

missing_wheels = defaultdict(set)   # pkg -> set of wheel filenames present on disk
extra_wheels = defaultdict(set)     # pkg -> set of wheel filenames listed as extra
rewrites = set()                    # packages whose index needs regenerating
total_missing = 0
total_extra = 0

with open('missing.txt') as f:
    for whl in f:
        whl = whl.strip()
        pkg = whl.split('/')[-2]
        filename = whl.split('/')[-1]
        path = Path(whl)
        rewrites.add(pkg)
        # only record/count wheels that actually exist on disk
        if path.is_file():
            missing_wheels[pkg].add(filename)
            total_missing += 1

with open('extra.txt') as f:
    for whl in f:
        whl = whl.strip()
        pkg = whl.split('/')[-2]
        filename = whl.split('/')[-1]
        rewrites.add(pkg)
        extra_wheels[pkg].add(filename)
        total_extra += 1

# one piw-rebuild command per affected package
with open('rewrites.sh', 'w') as f:
    f.write('\n'.join([f'piw-rebuild index {pkg}' for pkg in rewrites]) + '\n')

print(total_missing, "missing wheels")
print(total_extra, "extra wheels")
print(len(rewrites), "package indexes need rewriting")
  1. Compose a list of build ids which exist in the database:

    $ psql piwheels -c "select build_id from builds" > build_ids.txt
  2. Cut the header and footer lines from the output

  3. Copy the file over to the piwheels master

  4. On piwheels master, iterate the logs directory. If you come across a log file with a build id which doesn't exist in the database, delete the file:

    # Delete orphaned build logs: any log file on disk whose build id does not
    # appear in the database dump (build_ids_db.txt, one integer id per line).
    from pathlib import Path
    from datetime import datetime

    logs_dir = Path('www/logs')
    db_file = Path('build_ids_db.txt')
    # every build_id known to the database, as a set for O(1) membership tests
    db_build_ids = {int(line) for line in db_file.read_text().split()}

    start = datetime.now()
    # Logs are laid out two directory levels deep; the build id is the
    # concatenation of both directory names and the leading part of the
    # filename (stem, with any secondary extension such as '.txt' stripped).
    # NOTE(review): assumes every entry under logs_dir / lvl_1 is a directory
    # and every name is numeric — confirm before running.
    for lvl_1 in logs_dir.iterdir():
        for lvl_2 in lvl_1.iterdir():
            for log_file in lvl_2.iterdir():
                build_id = int(f"{lvl_1.name}{lvl_2.name}{log_file.stem.split('.')[0]}")
                if build_id not in db_build_ids:
                    # not in the database: remove the orphaned log file
                    log_file.unlink()
    end = datetime.now()
    print(end - start)  # see how long it took
import xmlrpc.client

# Replay the PyPI changelog from serial 0 up to a fixed cut-off timestamp,
# recording the most recent timestamp of each event type per package/version.
# NOTE: indentation was flattened in the pasted original; structure below is
# reconstructed from the literal statement order.
client = xmlrpc.client.ServerProxy('https://pypi.org/pypi')

now = 1597017600  # cut-off (unix timestamp); stop once events pass this point
timestamp = 0     # timestamp of the last event seen (updated by the loop)
serial = 0        # changelog serial to resume from (updated by the loop)

packages_created = {}   # package -> latest 'create' timestamp
packages_removed = {}   # package -> latest whole-package 'remove' timestamp
versions_created = {}   # package -> latest 'new release' timestamp
versions_removed = {}   # (package, version) -> latest 'remove' timestamp
versions_yanked = {}    # (package, version) -> latest 'yank release' timestamp
versions_unyanked = {}  # (package, version) -> latest 'unyank release' timestamp

while timestamp < now:
    # each call returns the batch of events after `serial`; the loop variables
    # deliberately rebind `timestamp` and `serial` so the while-condition and
    # the next fetch resume from the last event processed
    for package, version, timestamp, action, serial in client.changelog_since_serial(serial):
        if action == 'create':
            last_timestamp = packages_created.get(package, 0)
            if timestamp > last_timestamp:
                packages_created[package] = timestamp
        elif action == 'new release':
            last_timestamp = versions_created.get(package, 0)
            if timestamp > last_timestamp:
                versions_created[package] = timestamp
        elif action == 'remove':
            # a 'remove' with no version means the whole package was removed
            if version is None:
                last_timestamp = packages_removed.get(package, 0)
                if timestamp > last_timestamp:
                    packages_removed[package] = timestamp
            else:
                last_timestamp = versions_removed.get((package, version), 0)
                if timestamp > last_timestamp:
                    versions_removed[(package, version)] = timestamp
        elif action == 'yank release':
            last_timestamp = versions_yanked.get((package, version), 0)
            if timestamp > last_timestamp:
                versions_yanked[(package, version)] = timestamp
        elif action == 'unyank release':
            last_timestamp = versions_unyanked.get((package, version), 0)
            if timestamp > last_timestamp:
                versions_unyanked[(package, version)] = timestamp
    # progress report after each batch
    print(
        serial,
        len(packages_created),
        len(packages_removed),
        len(versions_created),
        len(versions_removed),
        len(versions_yanked),
        len(versions_unyanked),
    )
# Post-process the event dicts built above into SQL / shell / text outputs.
# NOTE: indentation was flattened in the pasted original; structure below is
# reconstructed from the literal statement order.

# make a set of package versions which were yanked after being unyanked (or
# were never unyanked)
yanked = set()
for (p, v), yanked_ts in versions_yanked.items():
    unyanked_ts = versions_unyanked.get((p, v), 0)
    if yanked_ts > unyanked_ts:
        yanked.add((p, v))
print(f'{len(yanked):,} versions to yank')

# write out an sql file to update the versions table setting those versions as
# yanked
with open('yanked.sql', 'w') as f:
    for p, v in yanked:
        f.write(f"UPDATE versions SET yanked = true WHERE package = '{p}' AND version = '{v}';\n")

# create a set of packages which have had versions yanked
yanked_packages = {p for p, v in yanked}

# write out a bash script to rewrite the indexes and project pages for all
# packages which have had versions yanked
# BUG FIX: the original iterated (p, v) pairs here, writing one duplicate
# rebuild line per yanked version; yanked_packages was computed but unused.
with open('yanked.sh', 'w') as f:
    for p in yanked_packages:
        f.write(f"piw-rebuild index {p}\n")

# create a set of all packages which have been deleted since their last creation
# date
p_removed = set()
for p, removed_ts in packages_removed.items():
    created_ts = packages_created.get(p, 0)
    if removed_ts > created_ts:
        p_removed.add(p)
print(f'{len(p_removed):,} packages to delete')

# write out a plain text file of all packages marked for deletion
with open('deleted_packages.txt', 'w') as f:
    for p in p_removed:
        f.write(f"{p}\n")

# write out a bash script with rm's for deleted packages (simple and project)
# this may be a bad idea
with open('deleted_packages.sh', 'w') as f:
    for p in p_removed:
        f.write(f"rm -r /home/piwheels/www/simple/{p}/ /home/piwheels/www/project/{p}/\n")

# write out an sql script to delete versions which have been removed
with open('deleted_packages.sql', 'w') as f:
    for p in p_removed:
        f.write(f"DELETE FROM versions WHERE package = '{p}';\n")

# create a set of all versions which have been deleted since their last creation
# (skipping packages already deleted wholesale above)
v_removed = set()
for (p, v), removed_ts in versions_removed.items():
    created_ts = versions_created.get((p, v), 0)
    if p not in p_removed and removed_ts > created_ts:
        v_removed.add((p, v))
print(f'{len(v_removed):,} versions to delete')

# write out a text file of all deleted versions
with open('deleted_versions.txt', 'w') as f:
    for p, v in v_removed:
        f.write(f"{p} {v}\n")

# write out a bash script of piw-remove commands for all deleted versions
with open('deleted_versions.sh', 'w') as f:
    for p, v in v_removed:
        f.write(f"echo 'removing {p} {v}'\n")
        f.write(f"piw-remove -v -y '{p}' '{v}'\n")
import xmlrpc.client
import requests
from piwheels.format import canonicalize_name

# Compare the package list on PyPI with the one on piwheels, then write SQL to
# add missing packages/versions and shell scripts to rebuild/remove pages.
# NOTE: indentation was flattened in the pasted original; structure below is
# reconstructed from the literal statement order.
client = xmlrpc.client.ServerProxy('https://pypi.org/pypi')
package_data = {}  # pkg -> full JSON payload from pypi.org (filled lazily)


def get_pypi_packages():
    "Return the set of canonicalized package names on PyPI."
    return {canonicalize_name(p) for p in client.list_packages()}


def get_piwheels_packages():
    "Return the set of canonicalized package names on piwheels."
    url = "https://www.piwheels.org/packages.json"
    packages = requests.get(url).json()
    return {canonicalize_name(p[0]) for p in packages}


def get_package_data(pkg):
    "Fetch pkg's JSON metadata from PyPI and cache it in package_data."
    url = f"https://pypi.org/pypi/{pkg}/json"
    package_data[pkg] = requests.get(url).json()


def get_package_versions(pkg):
    "Return {(version, upload_time)} for every release of pkg that has files."
    versions = package_data[pkg]['releases']
    return {(v, d[0]['upload_time']) for v, d in versions.items() if d}


pypi_packages = get_pypi_packages()
piwheels_packages = get_piwheels_packages()
missing_packages = pypi_packages - piwheels_packages
extra_packages = piwheels_packages - pypi_packages

# create a dict of missing packages with their list of versions from pypi
missing_package_versions = {}
for pkg in missing_packages:
    try:
        get_package_data(pkg)
        versions = get_package_versions(pkg)
        if versions:
            missing_package_versions[pkg] = versions
    except Exception as e:
        # best-effort: report and carry on with the next package
        print(repr(e))

# write out an sql file to add the missing packages and versions
# write out a bash file to rebuild the project pages
# BUG FIX: both files were opened without mode 'w' in the original, so the
# .write() calls would fail with io.UnsupportedOperation
with open('missing_packages.sh', 'w') as shf, open('missing_packages.sql', 'w') as sqlf:
    for pkg, vers in missing_package_versions.items():
        # summary can be null in the PyPI JSON; guard before escaping quotes
        description = (package_data[pkg]['info']['summary'] or '').replace("'", "''")
        shf.write(f"piw-rebuild project {pkg}\n")
        sqlf.write(f"select add_package_name('{pkg}', '{pkg}', '1970-01-01 00:00:00');\n")
        sqlf.write(f"select add_new_package('{pkg}', '', '{description}');\n")
        for ver, rel in vers:
            rel = rel.replace('T', ' ')  # ISO 'T' separator -> SQL timestamp
            sqlf.write(f"select add_new_package_version('{pkg}', '{ver}', '{rel}', '');\n")

# write out a bash file to remove the extra packages
with open('extra_packages.sh', 'w') as f:
    for pkg in extra_packages:
        f.write(f"piw-remove {pkg} -y\n")
import csv
from time import sleep

from piwheels.master.pypi import PyPIEvents

# Stream PyPI changelog events into pypi.csv until we catch up with the
# latest serial on PyPI.
# NOTE: indentation was flattened in the pasted original; structure below is
# reconstructed from the literal statement order.
pypi = PyPIEvents()
# current last serial on PyPI, via the events buffer's XML-RPC client
# NOTE(review): reaches into private attributes (_buffer._client) — fragile
latest_serial = pypi._buffer._client.changelog_last_serial()

with open('pypi.csv', 'w') as f:
    writer = csv.writer(f)
    while pypi.serial < latest_serial:
        sleep(1)  # rate-limit the XML-RPC polling
        writer.writerows(list(pypi))
import xmlrpc.client
from time import sleep
import csv

from piwheels.format import canonicalize_name

# Dump the entire PyPI changelog (canonicalized package names) to a CSV file.
# NOTE: indentation was flattened in the pasted original; structure below is
# reconstructed from the literal statement order.
client = xmlrpc.client.ServerProxy('https://pypi.org/pypi')
serial = 0
latest_serial = client.changelog_last_serial()
sleep(1)  # rate-limit between XML-RPC calls

with open('pypi_log_full.csv', 'w') as f:
    w = csv.writer(f)
    while serial < latest_serial:
        sleep(1)
        # the loop variable deliberately rebinds `serial` so the next fetch
        # resumes after the last event written
        for package_alias, version, timestamp, action, serial in client.changelog_since_serial(serial):
            package = canonicalize_name(package_alias)
            w.writerow((package, version, timestamp, action, serial))
        # progress as a percentage of the changelog consumed
        print(100 * serial / latest_serial)
import requests
def get_piwheels_packages():
    "Return a set of all packages in piwheels"
    # packages.json is a list of rows whose first element is the package name
    url = "https://www.piwheels.org/packages.json"
    data = requests.get(url).json()
    return {d[0] for d in data}
def get_piwheels_versions(pkg):
    """
    Return a dict of versions of pkg in piwheels with a bool representing
    whether a version is currently skipped (True) or unskipped (False) in
    piwheels

    Raises requests.exceptions.HTTPError on a non-2xx response.
    """
    url = f"https://www.piwheels.org/project/{pkg}/json"
    r = requests.get(url)
    r.raise_for_status()
    # only the 'binary only' skip reason counts as skipped here
    return {
        v: info['skip_reason'] == 'binary only'
        for v, info in r.json()['releases'].items()
    }
def get_pypi_versions(pkg):
    """
    Return a dict mapping each version of pkg on PyPI (that has files) to a
    dict with its release date ('released') and whether it should be skipped
    in piwheels ('skipped'), according to whether or not the version includes
    an sdist

    Raises requests.exceptions.HTTPError on a non-2xx response.
    """
    url = f"https://pypi.org/pypi/{pkg}/json"
    r = requests.get(url)
    r.raise_for_status()
    versions = r.json()['releases']
    return {
        v: {
            'released': get_release_date(files),
            'skipped': not version_has_sdist(files),
        }
        for v, files in versions.items()
        if files  # releases with no files are ignored
    }
def version_has_sdist(files):
    "Return True if files contains an sdist"
    # files is a list of PyPI release-file dicts with a 'packagetype' key
    return 'sdist' in {f['packagetype'] for f in files}
def get_release_date(files):
    """
    Look up the release date of the first file and return it as a
    'YYYY-MM-DD HH:MM:SS' string, or None if there are no files.
    """
    if files:
        # upload_time is ISO format; swap the 'T' separator for a space
        return files[0]['upload_time'].replace('T', ' ')
    return None  # made the fall-through explicit
# Walk every package on piwheels and diff its versions against PyPI, sorting
# each version into add / add-skipped / remove / skip / unskip buckets.
# NOTE: indentation was flattened in the pasted original; structure below is
# reconstructed from the literal statement order.
versions_to_add = {}          # pkg -> {(version, released)} present on PyPI with an sdist
versions_to_add_skipped = {}  # pkg -> {(version, released)} present on PyPI, binary only
versions_to_remove = {}       # pkg -> {version} no longer on PyPI
versions_to_skip = {}         # pkg -> {version} to mark 'binary only' in piwheels
versions_to_unskip = {}       # pkg -> {version} to unmark in piwheels
packages_to_remove = set()    # packages that 404 on PyPI
piwheels_errors = {}          # pkg -> error from the piwheels API
pypi_errors = {}              # pkg -> error from the PyPI API

packages = get_piwheels_packages()
for i, pkg in enumerate(packages):
    print(f"Checking {pkg}... {100*(i+1) / len(packages):.2f}% [{len(versions_to_add)} {len(versions_to_add_skipped)} {len(versions_to_remove)} {len(versions_to_skip)} {len(versions_to_unskip)}]")
    # get dicts of versions in piwheels and pypi
    try:
        pypi_versions_dict = get_pypi_versions(pkg)
    except requests.exceptions.HTTPError as exc:
        # a 404 from PyPI means the package no longer exists there
        if exc.response.status_code == 404:
            packages_to_remove.add(pkg)
        else:
            pypi_errors[pkg] = exc.response.status_code
        continue
    except Exception as exc:
        pypi_errors[pkg] = str(exc)
        continue
    try:
        piwheels_versions_dict = get_piwheels_versions(pkg)
    except requests.exceptions.HTTPError as exc:
        piwheels_errors[pkg] = exc.response.status_code
        continue
    except Exception as exc:
        piwheels_errors[pkg] = repr(exc)
        continue
    # get sets of versions only to determine which are missing/extra
    piwheels_versions = set(piwheels_versions_dict)
    pypi_versions = set(pypi_versions_dict)
    # deal with versions missing from piwheels - either add skipped or unskipped
    missing_versions = pypi_versions - piwheels_versions
    _to_add = {
        (v, pypi_versions_dict[v]['released'])
        for v in missing_versions
        if not pypi_versions_dict[v]['skipped']
    }
    if _to_add:
        versions_to_add[pkg] = _to_add
    _to_add_skipped = {
        (v, pypi_versions_dict[v]['released'])
        for v in missing_versions
        if pypi_versions_dict[v]['skipped']
    }
    if _to_add_skipped:
        versions_to_add_skipped[pkg] = _to_add_skipped
    # deal with versions that should be removed from piwheels
    _to_remove = piwheels_versions - pypi_versions
    if _to_remove:
        versions_to_remove[pkg] = _to_remove
    versions_in_both = piwheels_versions & pypi_versions
    # deal with versions that should be skipped (no sdist on PyPI but not
    # currently skipped in piwheels)
    _to_skip = {
        v
        for v in versions_in_both
        if pypi_versions_dict[v]['skipped']
        and not piwheels_versions_dict[v]
    }
    if _to_skip:
        versions_to_skip[pkg] = _to_skip
    # deal with versions that should be unskipped (skipped in piwheels but an
    # sdist now exists on PyPI)
    _to_unskip = {
        v
        for v in versions_in_both
        if piwheels_versions_dict[v]
        and not pypi_versions_dict[v]['skipped']
    }
    if _to_unskip:
        versions_to_unskip[pkg] = _to_unskip
# write out bash scripts and sql scripts to make the necessary changes
# NOTE: indentation was flattened in the pasted original; structure below is
# reconstructed from the literal statement order.
with open('versions_to_add.sql', 'w') as f:
    for pkg, versions in versions_to_add.items():
        for v, released in versions:
            f.write(f"select add_new_package_version('{pkg}', '{v}', '{released}', '');\n")

with open('versions_to_add_skipped.sql', 'w') as f:
    for pkg, versions in versions_to_add_skipped.items():
        for v, released in versions:
            f.write(f"select add_new_package_version('{pkg}', '{v}', '{released}', 'binary only');\n")

with open('versions_to_skip.sh', 'w') as f:
    for pkg, versions in versions_to_skip.items():
        for v in versions:
            f.write(f"piw-remove {pkg} '{v}' --skip 'binary only' -y\n")

with open('versions_to_unskip.sql', 'w') as f:
    for pkg, versions in versions_to_unskip.items():
        for v in versions:
            f.write(f"update versions set skip = '' where package = '{pkg}' and version = '{v}';\n")

with open('versions_to_remove.sh', 'w') as f:
    for pkg, versions in versions_to_remove.items():
        for v in versions:
            f.write(f"piw-remove {pkg} '{v}' -y\n")

with open('packages_to_remove.sh', 'w') as f:
    for pkg in packages_to_remove:
        f.write(f"piw-remove {pkg} -y\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment