@henryiii
Last active February 7, 2024 03:21
Download all pyproject.tomls
# Tally the build-backend declared by each stored pyproject.toml.
import contextlib
import sqlite3
import tomllib
from collections import Counter


def main():
    counter = Counter()
    with contextlib.closing(sqlite3.connect("pyproject_contents.db")) as con:
        cursor = con.cursor()
        for row in cursor.execute("SELECT contents FROM pyproject"):
            (contents,) = row
            try:
                toml = tomllib.loads(contents)
                backend = toml.get("build-system", {}).get("build-backend", "unknown")
                if isinstance(backend, str):
                    counter[backend] += 1
                else:
                    counter["busted"] += 1
            except tomllib.TOMLDecodeError:
                counter["broken"] += 1
    for i, (k, v) in enumerate(counter.most_common()):
        print(f"{i:3} {k}: {v}")


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
# Count keys (or full contents) under a given pyproject.toml table, reading
# from the pickle cache produced by the cache-building script below.
import argparse
import pickle
from collections import Counter
from collections.abc import Generator
from pathlib import Path
from typing import Any


def dig(value: Any, key: str, *keys: str) -> Any:
    """Drill into nested dicts, returning {} for missing keys."""
    res = value.get(key, {})
    return dig(res, *keys) if keys else res


def all_keys(
    d: dict[str, Any], level: int, *prefixes: str
) -> Generator[str, None, None]:
    """Yield dotted key paths, recursing into nested dicts up to `level` deep."""
    for key, value in d.items():
        if isinstance(value, dict) and level > 0:
            yield from all_keys(value, level - 1, *prefixes, key)
        else:
            yield ".".join([*prefixes, key])


def get_tomls_cached(db: str) -> Generator[tuple[str, str, dict[str, Any]], None, None]:
    pkl = Path(f"{db}.pkl")
    with pkl.open("rb") as f:
        yield from pickle.load(f)


def main(tool: str, get_contents: bool, level: int = 0) -> None:
    if tool:
        if get_contents:
            print(f"{tool} contents:")
        else:
            print(tool + ".*" * (level + 1) + ":")
    else:
        if get_contents:
            raise AssertionError("Can't get contents with no section")
        print("*:")
    if get_contents and level > 0:
        raise AssertionError("Can't use level with contents")
    counter = Counter()
    for _, _, toml in get_tomls_cached("pyproject_contents.db"):
        item = dig(toml, *tool.split(".")) if tool else toml
        if item:
            if get_contents:
                counter[repr(item)] += 1
            else:
                counter += Counter(all_keys(item, level=level))
    for k, v in counter.most_common():
        print(f"{k}: {v}")


def blame(tool: str, string: str) -> None:
    """Print projects whose value at `tool` matches `string` (any value if empty)."""
    if string:
        print(tool, "=", string)
    else:
        print(tool, "= ...")
    for name, version, toml in get_tomls_cached("pyproject_contents.db"):
        item = dig(toml, *tool.split(".")) if tool else toml
        if not string and item:
            print(name, version, "=", repr(item))
        elif repr(item) == string:
            print(name, version)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("tool", help="Tool to process")
    parser.add_argument("-c", "--contents", action="store_true")
    parser.add_argument(
        "-l", "--level", type=int, default=0, help="Unpack nested levels"
    )
    parser.add_argument(
        "-b",
        "--blame",
        help="print matching project names, empty string to print any value (careful)",
    )
    args = parser.parse_args()
    if args.blame is not None:
        assert args.level == 0
        assert not args.contents
        blame(args.tool, args.blame)
    else:
        main(args.tool, args.contents, args.level)
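
For a quick sense of the two helpers above, here is a small illustration (the sample data is hypothetical; run it in the same module so dig and all_keys are in scope):

sample = {"tool": {"ruff": {"line-length": 100, "lint": {"select": ["E"]}}}}
print(dig(sample, "tool", "ruff", "line-length"))  # 100
print(list(all_keys(sample["tool"], level=1)))     # ['ruff.line-length', 'ruff.lint']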
#!/usr/bin/env python
# Count which [tool.*] tables appear across all stored pyproject.toml files.
import contextlib
import sqlite3
import tomllib
from collections import Counter


def main():
    counter = Counter()
    with contextlib.closing(sqlite3.connect("pyproject_contents.db")) as con:
        cursor = con.cursor()
        for row in cursor.execute("SELECT contents FROM pyproject"):
            (contents,) = row
            with contextlib.suppress(tomllib.TOMLDecodeError):
                toml = tomllib.loads(contents)
                tools = toml.get("tool", {}).keys()
                counter += Counter(f"tool.{k}" for k in tools)
    for i, (k, v) in enumerate(counter.most_common()):
        print(f"{i:3} {k}: {v}")


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
# Parse every stored pyproject.toml once and cache the results as a pickle,
# so the analysis scripts don't have to re-parse TOML on every run.
import contextlib
import pickle
import sqlite3
from collections.abc import Generator
from pathlib import Path
from typing import Any

import tomllib


def get_tomls(db: str) -> Generator[tuple[str, str, dict[str, Any]], None, None]:
    with contextlib.closing(sqlite3.connect(db)) as con:
        cursor = con.cursor()
        for row in cursor.execute(
            "SELECT project_name, project_version, contents FROM pyproject"
        ):
            project_name, project_version, contents = row
            with contextlib.suppress(tomllib.TOMLDecodeError):
                yield project_name, project_version, tomllib.loads(contents)


def make_cache(db: str) -> None:
    pkl = Path(f"{db}.pkl")
    with pkl.open("wb") as f:
        # The protocol belongs to pickle.dump, not Path.open (whose second
        # positional argument is the buffering size).
        pickle.dump(list(get_tomls(db)), f, pickle.HIGHEST_PROTOCOL)


if __name__ == "__main__":
    make_cache("pyproject_contents.db")
MIT License
Copyright (c) 2023 Françoise CONIL
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# /// script
# dependencies = ["aiohttp", "packaging"]
# requires-python = ">=3.11"
# ///
"""
Downloads all pyproject.toml files and puts them in a database. Doesn't talk to
GitHub if the package name + version is already in the database.

You need an input CSV to work on. To prepare one, you can use
pyproject-latest-to-csv.py.
"""
import asyncio
import contextlib
import csv
import itertools
import logging
import sqlite3
import sys
import time
from collections.abc import Iterator

import aiohttp
from packaging.version import Version

LOG = logging.getLogger(__name__)

PYPROJECT_CREATE = """CREATE TABLE IF NOT EXISTS
pyproject(project_name TEXT PRIMARY KEY, project_version TEXT, contents TEXT)
"""
INSERT_CONTENTS = """INSERT INTO pyproject
VALUES (:project_name, :project_version, :contents)
"""
GET_CONTENTS = """SELECT project_version FROM pyproject WHERE project_name=?
"""
DELETE_CONTENTS = """DELETE FROM pyproject WHERE project_name=?
"""

csv.field_size_limit(sys.maxsize)


async def get_data(
    session: aiohttp.ClientSession, path: str, repo: str, name: str
) -> str | None:
    if path.count("/") != 4 or not path.endswith("/pyproject.toml"):
        LOG.warning("Project %s has non-top-level path %s", name, path)
        return None
    url = f"https://raw.githubusercontent.com/pypi-data/pypi-mirror-{repo}/code/{path}"
    try:
        async with session.get(url) as response:
            if response.status == 429:
                LOG.error("Rate limited when accessing %s", name)
                raise RuntimeError("Rate limited")
            if response.status != 200:
                LOG.error("pycodeorg.get_data failed to retrieve %s", name)
                return None
            try:
                data = await response.text()
            except UnicodeDecodeError:
                LOG.exception("Unicode decode error on %s", name)
                return None
            return data
    except (
        aiohttp.http_exceptions.BadHttpMessage,
        aiohttp.client_exceptions.ClientResponseError,
    ):
        LOG.exception("Failed reading %s", name)
        return None


async def worker(
    iterator: Iterator[dict[str, str]],
    session: aiohttp.ClientSession,
    cursor: sqlite3.Cursor,
    thread: int,
) -> None:
    # All workers share one iterator; each next() hands this worker the next
    # CSV row, and StopIteration at the end cleanly shuts the worker down.
    with contextlib.suppress(StopIteration):
        for i in itertools.count(0):
            if i and i % 200 == 0:
                LOG.info("PROGRESS %d: %d", thread, i)
            line = next(iterator)
            with cursor.connection:
                result = cursor.execute(GET_CONTENTS, (line["project_name"],))
                value = result.fetchone()
            if value and Version(line["project_version"]) <= Version(value[0]):
                continue
            data = await get_data(
                session, line["path"], line["repository"], line["project_name"]
            )
            if not data:
                continue
            with cursor.connection:
                cursor.execute(DELETE_CONTENTS, (line["project_name"],))
                cursor.execute(
                    INSERT_CONTENTS,
                    {
                        "project_name": line["project_name"],
                        "project_version": line["project_version"],
                        "contents": data,
                    },
                )


async def main() -> None:
    with contextlib.closing(sqlite3.connect("pyproject_contents.db")) as cnx_backend:
        cur_backend = cnx_backend.cursor()
        cur_backend.execute(PYPROJECT_CREATE)
        # Note: the count comes from the all-versions extract, while the
        # workers below consume the latest-version extract.
        with open("extract-pyproject-all-versions.csv", newline="") as f:
            reader = csv.DictReader(f)
            total = len(list(reader))
        print(f"Processing {total} projects")
        with open("extract-pyproject-latest.csv", newline="") as f:
            reader = csv.DictReader(f)
            iterator = iter(reversed(list(reader)))
            async with aiohttp.ClientSession() as session, asyncio.TaskGroup() as tg:
                for i in range(8):
                    tg.create_task(worker(iterator, session, cur_backend, i))


if __name__ == "__main__":
    start_time = time.time()
    logging.basicConfig(filename="pyproject_contents.log", level=logging.INFO)
    asyncio.run(main())
    end_time = time.time()
    duration_msg = f"Getting files took {end_time - start_time:.3f} seconds."
    LOG.info(duration_msg)
    print(duration_msg)
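
For reference, the worker indexes four columns from each csv.DictReader row. A row looks roughly like this (hypothetical values, shaped to pass the check of exactly four slashes ending in /pyproject.toml):

row = {
    "project_name": "example-package",
    "project_version": "1.2.3",
    "repository": "42",  # number of the pypi-data/pypi-mirror-* repo
    "path": "packages/example-package/example_package-1.2.3.tar.gz/example_package-1.2.3/pyproject.toml",
}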
# /// script
# dependencies = ["duckdb"]
# ///
"""
Originally from https://framapiaf.org/@fcodvpt/111540079686191842
https://gitlab.liris.cnrs.fr/fconil-small-programs/packaging/get-pypi-packages-backends

https://sethmlarson.dev/security-developer-in-residence-weekly-report-18
https://gist.github.com/sethmlarson/852341a9b7899eda7d22d8c362c0a095

To fetch the dataset first:
curl -L --remote-name-all $(curl -L "https://github.com/pypi-data/data/raw/main/links/dataset.txt")

MIT licensed.
"""
import duckdb

ALL_VERSIONS_QUERY = """SELECT project_name, COUNT(project_name) AS nb_uploads,
    MAX(project_version) AS max_version,
    LIST(DISTINCT project_version) AS all_versions,
    MAX(uploaded_on) AS max_uploaded_on,
    LIST(DISTINCT uploaded_on) AS all_uploaded_on,
    LIST(DISTINCT repository) AS all_repository,
    LIST(DISTINCT path) AS all_path
FROM '*.parquet'
WHERE (date_part('year', uploaded_on) >= '2018') AND regexp_matches(path, 'pyproject.toml$') AND skip_reason == ''
GROUP BY project_name;
"""

res = duckdb.sql(ALL_VERSIONS_QUERY)
res.to_csv("extract-pyproject-all-versions.csv", header=True)

LATEST_QUERY = """WITH lpv AS (SELECT project_name, COUNT(project_name) AS nb_uploads,
    MAX(uploaded_on) AS max_uploaded_on,
    LIST(DISTINCT uploaded_on) AS all_uploaded_on
FROM '*.parquet'
WHERE (date_part('year', uploaded_on) >= '2018') AND regexp_matches(path, 'pyproject.toml$') AND skip_reason == ''
GROUP BY project_name)
SELECT ip.repository, ip.project_name, ip.project_version, lpv.nb_uploads,
    ip.uploaded_on, date_part('year', ip.uploaded_on) AS year, ip.path
FROM '*.parquet' as ip
JOIN lpv ON ip.project_name=lpv.project_name AND ip.uploaded_on=lpv.max_uploaded_on
WHERE regexp_matches(path, 'pyproject.toml$') AND skip_reason == '';
"""

# res = duckdb.sql(LATEST_QUERY).show()
res = duckdb.sql(LATEST_QUERY)
res.to_csv("extract-pyproject-latest.csv", header=True)