Instantly share code, notes, and snippets.

@alex /pypi.md
Last active Sep 8, 2018

Embed
What would you like to do?

Most downloaded projects

SELECT
  file.project,
  COUNT(*) as total_downloads,
FROM
  TABLE_DATE_RANGE(
    [the-psf:pypi.downloads],
    TIMESTAMP("20160114"),
    CURRENT_TIMESTAMP()
  )
GROUP BY
  file.project
ORDER BY
  total_downloads DESC
LIMIT 100

OpenSSL versions

SELECT
  details.system.name,
  REGEXP_EXTRACT(details.openssl_version, r"^OpenSSL ([^ ]+) ") as openssl_version,
  COUNT(*) as download_count,
FROM
  TABLE_DATE_RANGE(
    [the-psf:pypi.downloads],
    TIMESTAMP("20160114"),
    CURRENT_TIMESTAMP()
  )
WHERE
  details.openssl_version IS NOT NULL
GROUP BY
  details.system.name,
  openssl_version,
HAVING
  download_count >= 100
ORDER BY
  download_count DESC
LIMIT 100

Which python versions install cryptography

SELECT
  REGEXP_EXTRACT(details.python, r"^([^\.]+\.[^\.]+)") as python_version,
  COUNT(*) as download_count,
FROM
  TABLE_DATE_RANGE(
    [the-psf:pypi.downloads],
    TIMESTAMP("20160114"),
    CURRENT_TIMESTAMP()
  )
WHERE
  file.project = 'cryptography'
GROUP BY
  python_version,
ORDER BY
  download_count DESC
LIMIT 100

Which platforms are generating 2.6 downloads

SELECT
  details.distro.name,
  details.distro.version,
  details.system.release,
  COUNT(*) as download_count,
FROM
  TABLE_DATE_RANGE(
    [the-psf:pypi.downloads],
    TIMESTAMP("20160114"),
    CURRENT_TIMESTAMP()
  )
WHERE
  REGEXP_EXTRACT(details.python, r"^([^\.]+\.[^\.]+)") = "2.6"
GROUP BY
  details.distro.name,
  details.distro.version,
  details.system.release
ORDER BY
  download_count DESC
LIMIT 100

Which packages are downloaded by Python 2.6 most often (percent)

SELECT
  file.project,
  ROUND(100 * SUM(CASE WHEN REGEXP_EXTRACT(details.python, r"^([^\.]+\.[^\.]+)") = "2.6" THEN 1 ELSE 0 END) / COUNT(*), 1) AS percent_26,
  SUM(CASE WHEN REGEXP_EXTRACT(details.python, r"^([^\.]+\.[^\.]+)") = "2.6" THEN 1 ELSE 0 END) as total_26_downloads,
  COUNT(*) as total_downloads,
FROM
  TABLE_DATE_RANGE(
    [the-psf:pypi.downloads],
    TIMESTAMP("20160114"),
    CURRENT_TIMESTAMP()
  )
WHERE
  details.python IS NOT NULL AND
  # Exclude things which are stdlib backports *for* Python 2.6
  file.project NOT IN ("argparse", "ordereddict")
GROUP BY
  file.project,
HAVING
  total_downloads > 5000
ORDER BY
  percent_26 DESC
LIMIT 250

Which packages are downloaded by Python 2.6 most often (absolute)

SELECT
  file.project,
  COUNT(*) as total_downloads,
FROM
  TABLE_DATE_RANGE(
    [the-psf:pypi.downloads],
    TIMESTAMP("20160114"),
    CURRENT_TIMESTAMP()
  )
WHERE
  REGEXP_EXTRACT(details.python, r"^([^\.]+\.[^\.]+)") == "2.6" AND
  # Exclude things which are stdlib backports *for* Python 2.6
  file.project NOT IN ("argparse", "ordereddict")
GROUP BY
  file.project,
ORDER BY
  total_downloads DESC
LIMIT 100

Most used installers

SELECT
  details.installer.name,
  details.installer.version,
  COUNT(*) as total_downloads
FROM
  TABLE_DATE_RANGE(
    [the-psf:pypi.downloads],
    TIMESTAMP("20160114"),
    CURRENT_TIMESTAMP()
  )
GROUP BY
  details.installer.name,
  details.installer.version
ORDER BY
  total_downloads DESC
LIMIT 100

Packages installed with OpenSSL 0.9.8 (excluding Windows and OS X)

SELECT
  file.project,
  ROUND(100 * SUM(CASE WHEN SUBSTR(REGEXP_EXTRACT(details.openssl_version, r"^OpenSSL ([^ ]+) "), 0, 5) == "0.9.8" THEN 1 ELSE 0 END) / COUNT(*), 1) as percent_098,
  ROUND(100 * SUM(CASE WHEN SUBSTR(REGEXP_EXTRACT(details.openssl_version, r"^OpenSSL ([^ ]+) "), 0, 5) == "1.0.0" THEN 1 ELSE 0 END) / COUNT(*), 1) as percent_100,
  ROUND(100 * SUM(CASE WHEN SUBSTR(REGEXP_EXTRACT(details.openssl_version, r"^OpenSSL ([^ ]+) "), 0, 5) == "1.0.1" THEN 1 ELSE 0 END) / COUNT(*), 1) as percent_101,
  ROUND(100 * SUM(CASE WHEN SUBSTR(REGEXP_EXTRACT(details.openssl_version, r"^OpenSSL ([^ ]+) "), 0, 5) == "1.0.2" THEN 1 ELSE 0 END) / COUNT(*), 1) as percent_102,
  COUNT(*) as total_downloads,
FROM
  TABLE_DATE_RANGE(
    [the-psf:pypi.downloads],
    TIMESTAMP("20160114"),
    CURRENT_TIMESTAMP()
  )
WHERE
  details.openssl_version IS NOT NULL AND
  # Exclude Windows and OS X where Cryptography ships a static wheel
  details.system.name NOT IN ("Windows", "Darwin")
GROUP BY
  file.project,
HAVING
  total_downloads > 5000
ORDER BY
  percent_098 DESC
LIMIT 250
@kennethreitz

This comment has been minimized.

kennethreitz commented Jun 11, 2016

an example for 2 vs 3 usage (month-over-month, perhaps) for a given library would be awesome.

@diimdeep

This comment has been minimized.

diimdeep commented Aug 6, 2016

this is awesome !

@linwoodc3

This comment has been minimized.

linwoodc3 commented Oct 30, 2016

Is there a way to distinguish mirror bots vs true user downloads? Just asking but I agree with poster above, THIS IS AWESOME!!

@browniebroke

This comment has been minimized.

browniebroke commented Jun 12, 2018

Total downloads by main Python version over time:

SELECT
  STRFTIME_UTC_USEC(timestamp, "%Y-%m") AS month,
  SUM(CASE WHEN REGEXP_EXTRACT(details.python, r"^([^\.]+\.[^\.]+)") = '2.7' THEN 1 ELSE 0 END) AS py2.7,
  SUM(CASE WHEN REGEXP_EXTRACT(details.python, r"^([^\.]+\.[^\.]+)") = '3.2' THEN 1 ELSE 0 END) AS py3.2,
  SUM(CASE WHEN REGEXP_EXTRACT(details.python, r"^([^\.]+\.[^\.]+)") = '3.3' THEN 1 ELSE 0 END) AS py3.3,
  SUM(CASE WHEN REGEXP_EXTRACT(details.python, r"^([^\.]+\.[^\.]+)") = '3.4' THEN 1 ELSE 0 END) AS py3.4,
  SUM(CASE WHEN REGEXP_EXTRACT(details.python, r"^([^\.]+\.[^\.]+)") = '3.5' THEN 1 ELSE 0 END) AS py3.5,
  SUM(CASE WHEN REGEXP_EXTRACT(details.python, r"^([^\.]+\.[^\.]+)") = '3.6' THEN 1 ELSE 0 END) AS py3.6,
  SUM(CASE WHEN REGEXP_EXTRACT(details.python, r"^([^\.]+\.[^\.]+)") = '3.7' THEN 1 ELSE 0 END) AS py3.7
FROM
  TABLE_DATE_RANGE(
    [the-psf:pypi.downloads],
    TIMESTAMP("20160101"),
    CURRENT_TIMESTAMP()
  )
GROUP BY
  month
ORDER BY
  month
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment