Total of pip packages downloaded separated by Python versions
From June 26, 2016 (python 3.5.2 release) to Aug. 31, 2016.
Python versions from 2.6 to 3.5
Without 2.7
From June 26, 2016 (python 3.5.2 release) to Aug. 31, 2016.
Python versions from 2.6 to 3.5
Without 2.7
-- https://bigquery.cloud.google.com/dataset/the-psf:pypi | |
SELECT concat( | |
date(timestamp), '_', REGEXP_EXTRACT(details.python, r'^([2-3]).[0-9].') | |
) as date_python, count(details.python) as downloads | |
FROM (TABLE_DATE_RANGE([the-psf:pypi.downloads], | |
TIMESTAMP('2016-06-26'), | |
TIMESTAMP('2016-08-31'))) | |
group by date_python |
-- https://bigquery.cloud.google.com/dataset/the-psf:pypi | |
-- https://bigquery.cloud.google.com/table/the-psf:pypi.downloads20160903 | |
SELECT concat( | |
date(timestamp), '_', REGEXP_EXTRACT(details.python, r'^([2-3].[0-9]).') | |
) as date_python, count(details.python) as downloads | |
FROM (TABLE_DATE_RANGE([the-psf:pypi.downloads], | |
TIMESTAMP('2016-06-26'), | |
TIMESTAMP('2016-08-31'))) | |
group by date_python |
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
# To plot chart from csv generated by bigquery | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
plt.figure() | |
ts = pd.read_csv('download_python_version_by_day.csv') | |
ts['date'] = pd.to_datetime(ts['date']) | |
df = ts.pivot(index='date', columns='python', values='downloads') | |
#df.plot() | |
#df[[2.6, 2.7, 3.1, 3.2, 3.3, 3.4, 3.5]].plot() | |
df[[2.6, 3.1, 3.2, 3.3, 3.4, 3.5]].plot() | |
plt.show() |
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
ts = pd.read_csv( | |
'download_python_major_version_by_day.csv', parse_dates=True, | |
) | |
ts['date'] = pd.to_datetime(ts['date']) | |
df = ts.pivot(index='date', columns='python', values='downloads') | |
ax = df[[2, 3]].plot(logy=True, figsize=(12, 9)) | |
ax.set_ylabel('log(downloads)') | |
ax.set_title('Python packages downloads') | |
plt.show() |
I also think it's important for this analysis to filter down to SELECT
CONCAT( DATE(timestamp), '_', REGEXP_EXTRACT(details.python, r'^([2-3].[0-9]).') ) AS date_python,
COUNT(details.python) AS downloads
FROM (TABLE_DATE_RANGE([the-psf:pypi.downloads], TIMESTAMP('2016-06-26'), TIMESTAMP('2016-08-31')))
WHERE
details.installer.name LIKE 'pip'
GROUP BY
date_python although it doesn't have a huge impact on the results. |
Also, if you just want a relative comparison (rather than absolute), it might also be better to add |
This comment has been minimized.
PeridexisErrant commentedSep 3, 2016
Try a log scale on the y-axis, it's as simple as
df.plot(logy=True)
(docs)