Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jspeed-meyers/d309dc84c561845f8e3d77a7fbdf8de3 to your computer and use it in GitHub Desktop.
Save jspeed-meyers/d309dc84c561845f8e3d77a7fbdf8de3 to your computer and use it in GitHub Desktop.
Measure number of dependencies for each version of most depended upon packages using deps.dev data - SQL Query
DECLARE LatestSnapshot TIMESTAMP;
SET LatestSnapshot = (SELECT MAX(Time) FROM `bigquery-public-data.deps_dev_v1.Snapshots`);
WITH
-- Releases includes every release of every package.
Releases AS (
SELECT
System,
Name,
Version,
VersionInfo
FROM
`bigquery-public-data.deps_dev_v1.PackageVersions`
WHERE
SnapshotAt = LatestSnapshot
AND VersionInfo.IsRelease
AND System IS NOT NULL ),
-- OrderedReleases has every version of every package, sorted by the version (descending).
OrderedReleases AS (
SELECT
System,
Name,
Version,
VersionInfo.Ordinal AS VersionOrdinal,
ROW_NUMBER() OVER (PARTITION BY Name ORDER BY VersionInfo.Ordinal DESC) AS RowNumber
FROM
Releases ),
HighestReleases AS (
SELECT
System,
Name,
Version
FROM
OrderedReleases
WHERE
RowNumber = 1 ),
-- PackagesByPopularity has every package along with its popularity rank *within* each system, calculated as:
-- How many packages depend on this one, directly or indirectly, in their latest version?
PackagesByPopularity AS (
SELECT
D.System,
D.Dependency.Name AS Name,
COUNT(*) AS NDependents,
ROW_NUMBER() OVER (PARTITION BY D.System ORDER BY COUNT(*) DESC) AS Rank
FROM
`bigquery-public-data.deps_dev_v1.Dependencies` AS D
JOIN
HighestReleases H
ON
D.System = H.System
AND D.Version = H.Version
AND D.Name = H.Name
WHERE
D.SnapshotAt = LatestSnapshot
GROUP BY
D.System,
D.Dependency.Name ),
-- TopPopularPackages contains the top 100 packages for each system.
TopPopularPackages AS (
SELECT
P.System,
P.Name,
P.NDependents,
P.Rank
FROM
PackagesByPopularity P
WHERE
P.Rank <= 100
AND P.System IS NOT NULL)
SELECT
O.System,
O.Name,
T.Rank,
O.VersionOrdinal,
O.Version,
COUNT(*) AS NTotalDependencies,
COUNT(CASE WHEN E.System IS NOT NULL THEN 1 END) AS NDirectDependencies
FROM
TopPopularPackages T
JOIN
OrderedReleases O
ON
O.System = T.System
AND O.Name = T.Name
LEFT JOIN
`bigquery-public-data.deps_dev_v1.Dependencies` D
ON
D.System = O.System
AND D.Name = O.Name
AND D.Version = O.Version
-- all conditions need to be in ON for LEFT JOIN to include packages with no dependencies
AND D.SnapshotAt = LatestSnapshot
LEFT JOIN
`bigquery-public-data.deps_dev_v1.DependencyGraphEdges` E
ON
-- Matching this packages
E.System = O.System
AND E.Name = O.Name
AND E.Version = O.Version
-- Matching this dependencny
AND E.To.Name = D.Dependency.Name
AND E.To.Version = D.Dependency.Version
-- Only direct dependencies
AND E.From.Name = O.Name
-- all conditions need to be in ON for LEFT JOIN to include packages with no dependencies
AND E.SnapshotAt = LatestSnapshot
GROUP BY
O.System,
O.Name,
O.Version,
O.VersionOrdinal,
T.Rank
ORDER BY
O.System ASC,
T.Rank ASC,
O.VersionOrdinal ASC;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment