Last active
February 2, 2023 07:21
-
-
Save jvolkman/05658a8d6f67342ab49c55f707899038 to your computer and use it in GitHub Desktop.
how many pypi packages use different sets of requirements per wheel?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#standardSQL | |
-- https://stackoverflow.com/a/63364851 | |
-- max version number in the dataset is 217 chars (someone dumped many digits of pi into the version string >:( ) | |
CREATE TEMP FUNCTION normalizedSemanticVersion(semanticVersion STRING) | |
AS (( | |
SELECT STRING_AGG( | |
IF(isDigit, REPEAT('0', 220 - LENGTH(chars)) || chars, chars), '' ORDER BY grp | |
) || '..zzzzzzzzzzzzzz' | |
FROM ( | |
SELECT grp, isDigit, STRING_AGG(char, '' ORDER BY OFFSET) chars, | |
FROM ( | |
SELECT OFFSET, char, isDigit, | |
COUNTIF(NOT isDigit) OVER(ORDER BY OFFSET) AS grp | |
FROM UNNEST(SPLIT(semanticVersion, '')) AS char WITH OFFSET, | |
UNNEST([char IN ('1','2','3','4','5','6','7','8','9','0')]) isDigit | |
) | |
GROUP BY grp, isDigit | |
))); | |
-- get the latest version for each distribution that ships wheels | |
with latest_versions as ( | |
SELECT name, version | |
FROM `bigquery-public-data.pypi.distribution_metadata` | |
where packagetype = 'bdist_wheel' | |
QUALIFY ROW_NUMBER() OVER (PARTITION BY name ORDER BY normalizedSemanticVersion(version) DESC) = 1 | |
), | |
wheels as ( | |
select name, version, requires_dist | |
FROM `bigquery-public-data.pypi.distribution_metadata` join latest_versions using (name, version) | |
where packagetype = 'bdist_wheel' | |
), | |
-- order and dedupe the lines in requires_dist, placing them into a single string | |
deduped_requires as ( | |
select name, version, array_to_string(array(SELECT distinct elem FROM UNNEST(split(array_to_string(requires_dist, '\n'), '\n')) AS elem ORDER BY elem), '\n') requires_dist | |
from wheels | |
), | |
-- select distinct name, version, requires_dist rows | |
distinct_requires as ( | |
select name, version, requires_dist | |
from deduped_requires | |
group by name, version, requires_dist | |
) | |
-- select all name, version rows that have > 1 requires_dist set | |
select name, version, count(*) from distinct_requires group by name, version having count(*) > 1 order by name |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Results on 2023/02/01