Skip to content

Instantly share code, notes, and snippets.

@jonashaag
Created September 10, 2023 20:21
Show Gist options
  • Save jonashaag/575e89f6e9493297e350902121518ff0 to your computer and use it in GitHub Desktop.
import json
import sqlite3

# Columns stored per package; they mirror the fields of a conda repodata
# package entry.  "filename" (the key of repodata["packages"]) is the
# primary key.
COLS = 'filename, build, build_number, depends, license, license_family, md5, name, sha256, size, subdir, timestamp, version'.split(', ')


def rows_from_repodata(repodata):
    """Flatten repodata["packages"] into sqlite-ready row dicts.

    Every column in COLS is present (fields missing from a package entry
    default to None); the "depends" list is JSON-encoded so it fits into a
    single TEXT column.
    """
    return [
        {
            **{col: None for col in COLS},
            **pkg,
            "filename": filename,
            "depends": json.dumps(pkg["depends"]),
        }
        for filename, pkg in repodata["packages"].items()
    ]


def build_db(json_path="497deca9.json", db_path="497deca9.sqlite"):
    """Load a conda repodata JSON dump and materialize it as a sqlite table.

    Creates table `repodata` (primary key: filename) and an index on `name`
    for the by-name dependency lookups done later in this file.
    """
    with open(json_path) as f:
        repodata = json.load(f)
    db = sqlite3.connect(db_path)
    try:
        # COLS is a trusted module-level constant, so assembling the SQL via
        # format() is safe here; row values still go through named-parameter
        # binding (:col placeholders).
        db.execute(
            "create table repodata ({}, primary key (filename))".format(",".join(COLS))
        )
        db.executemany(
            "insert into repodata ({}) values ({})".format(
                ",".join(COLS), ",".join(":" + c for c in COLS)
            ),
            rows_from_repodata(repodata),
        )
        db.execute("create index idx_name on repodata (name)")
        db.commit()
    finally:
        db.close()


if __name__ == "__main__":
    build_db()

# NOTE(review): the gist also contained a disabled experiment that compressed
# text columns with the sqlite_zstd extension (zstd_enable_transparent per
# column, then zstd_incremental_maintenance + vacuum); it is omitted from
# this cleaned-up version.
import time
import json
import sqlite3
# Second stage of the gist: benchmark resolving the transitive dependency
# closure of `want` against the sqlite DB built by the script above.
db = sqlite3.connect("497deca9.sqlite")
# Optional sqlite-zstd transparent-compression variant (left disabled):
# db.enable_load_extension(True)
# import sqlite_zstd
# sqlite_zstd.load(db)
# Smaller root set for quick experiments:
#want = {'pandas'}
# Root package names whose transitive dependency closure is resolved below.
want = set("_libgcc_mutex _openmp_mutex alabaster annotated-types anyio argon2-cffi argon2-cffi-bindings arrow asciitree asttokens async-lru attrs aws-c-auth aws-c-cal aws-c-common aws-c-compression aws-c-event-stream aws-c-http aws-c-io aws-c-mqtt aws-c-s3 aws-c-sdkutils aws-checksums aws-crt-cpp aws-sdk-cpp babel backcall backports backports.functools_lru_cache barmer_utils beautifulsoup4 blas bleach brotli brotli-bin brotli-python bzip2 c-ares ca-certificates cached-property cached_property certifi cffi cfgv charset-normalizer click colorama comm conda-env-lock contourpy coverage cycler debugpy decorator defusedxml dil_health_drg_fake_data distlib docutils drg-plausibility drg-simulation drg_external_data_1010 entrypoints exceptiongroup executing fasteners filelock fonttools fqdn freetype gflags glog greenlet identify idna imagesize importlib-metadata importlib_metadata importlib_resources iniconfig ipykernel ipython isoduration jedi jinja2 joblib json5 jsonpointer jsonschema jsonschema-specifications jsonschema-with-format-nongpl jupyter-lsp jupyter_client jupyter_core jupyter_events jupyter_server jupyter_server_terminals jupyterlab jupyterlab_pygments jupyterlab_server keyutils kiwisolver krb5 lcms2 ld_impl_linux-64 lerc libabseil libarrow libblas libbrotlicommon libbrotlidec libbrotlienc libcblas libcrc32c libcurl libdeflate libedit libev libevent libexpat libffi libgcc-ng libgfortran-ng libgfortran5 libgomp libgoogle-cloud libgrpc libiconv libjpeg-turbo liblapack libnghttp2 libnsl libnuma libopenblas libpng libprotobuf libsodium libsqlite libssh2 libstdcxx-ng libthrift libtiff libutf8proc libuuid libwebp-base libxcb libzlib lightgbm lz4-c make markupsafe matplotlib-base matplotlib-inline matplotlib-venn mistune msgpack-python multimethod munkres mypy_extensions nbclient nbconvert-core nbformat ncurses nest-asyncio nodeenv notebook notebook-shim numcodecs numpy numpydoc openblas openjpeg openssl orc overrides packaging pandas pandera-base pandocfilters parso patsy pbr pexpect pickleshare pillow pip pkgutil-resolve-name platformdirs pluggy polars pooch pre-commit prometheus_client prompt-toolkit prompt_toolkit psutil pthread-stubs ptyprocess pure_eval pyarrow pycparser pydantic pydantic-core pygments pyodbc pyparsing pysocks pytest pytest-cov python python-dateutil python-fastjsonschema python-json-logger python-tzdata python_abi pytz pyyaml pyzmq qc_drg_grouper_geos_binaries quantcore.ducttape re2 readline referencing requests rfc3339-validator rfc3986-validator rheia rpds-py ruamel.yaml ruamel.yaml.clib ruamel.yaml.jinja2 s2n scikit-learn scipy seaborn seaborn-base send2trash setuptools setuptools-scm six snappy sniffio snowballstemmer soupsieve sphinx sphinx_rtd_theme sphinxcontrib-apidoc sphinxcontrib-applehelp sphinxcontrib-devhelp sphinxcontrib-htmlhelp sphinxcontrib-jquery sphinxcontrib-jsmath sphinxcontrib-qthelp sphinxcontrib-serializinghtml sqlalchemy stack_data statsmodels terminado themis-assets themis-assets-axa themis-assets-external-data themis-assets-meta themis-assets-model themis-assets-plausibility themis-assets-simulation threadpoolctl tinycss2 tk toml tomli tornado tqdm traitlets turbodbc typeguard typing-extensions typing_extensions typing_inspect typing_utils tzdata ukkonen unixodbc uri-template urllib3 virtualenv wcwidth webcolors webencodings websocket-client wheel wrapt xorg-libxau xorg-libxdmcp xz yaml zarr zeromq zipp zstandard zstd".split())
def resolve_transitive_builds(db, roots):
    """Resolve the transitive dependency closure of *roots* via the repodata table.

    Walks the graph worklist-style: pop a package name, record every
    (version, build) pair it has in the DB, and enqueue each dependency not
    seen before.  Dependency spec strings such as "numpy >=1.21" are reduced
    to the bare package name (first whitespace-separated token).

    Returns {package_name: [(version, build), ...]}; a name with no rows in
    the table maps to an empty list.
    """
    versions = {}
    pending = set(roots)
    while pending:
        pkg = pending.pop()
        versions[pkg] = []
        rows = db.execute(
            "select version, build, depends from repodata where name = ?", [pkg]
        ).fetchall()
        for version, build, depends in rows:
            versions[pkg].append((version, build))
            for dep in (spec.split()[0] for spec in json.loads(depends)):
                # `versions` doubles as the visited set; re-adding a name
                # already sitting in `pending` is a harmless set no-op.
                if dep not in versions:
                    pending.add(dep)
    return versions


if __name__ == "__main__":
    start = time.perf_counter()
    versions = resolve_transitive_builds(db, want)
    duration = time.perf_counter() - start
    print("Took", duration, "s to reduce number of builds from", db.execute("select count(*) from repodata").fetchone()[0], "to", sum(map(len, versions.values())))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment