Instantly share code, notes, and snippets.

@brettcannon /pypi_stats.py Secret
Last active Aug 29, 2015

Embed
What would you like to do?
Script to calculate Python 3 porting progress based on various criteria of what qualifies a project to be counted.
import collections
from concurrent import futures
import datetime
import enum
import json
import operator
import os
import pathlib
import pickle
from urllib import request
from urllib.error import URLError
from xml import sax
from xml.sax import handler
# Doesn't subclass HTTPError as it requires a bunch of arguments I don't want
# to have to deal with.
# TODO: deal with the arguments.
class PyPIFetchError(Exception):
    """Signal that fetching a URL from PyPI failed.

    The exception text is the failure message followed by the offending
    URL in parentheses.
    """

    def __init__(self, url, message):
        description = '{} ({})'.format(message, url)
        super().__init__(description)
class PySupport(enum.Enum):
    """Major Python version a project declares support for."""

    unknown = 1  # No usable version classifier was found.
    py2 = 2      # Only Python 2 classifiers were found.
    py3 = 3      # At least one Python 3 classifier was found.

    @classmethod
    def classify(cls, project):
        """Derive the supported major version from a project's trove classifiers.

        `project` is the loaded JSON mapping for a project; only
        ``project['info']['classifiers']`` is read.  Any Python 3 classifier
        wins immediately, as does an explicit "2 :: Only".  A plain Python 2
        classifier is remembered but scanning continues in case a Python 3
        classifier shows up later in the list.
        """
        prefix = 'Programming Language :: Python :: '
        verdict = cls.unknown
        for entry in project['info']['classifiers']:
            if entry.startswith(prefix):
                tag = entry[len(prefix):].strip()
                if tag in ('3 :: Only', '3') or tag.startswith('3.'):
                    return cls.py3
                if tag == '2 :: Only':
                    return cls.py2
                if tag == '2' or tag.startswith('2.'):
                    verdict = cls.py2
        return verdict
def calc_release_dates(project):
    """Return one UTC datetime per release, sorted oldest to newest.

    For every entry in ``project['releases']`` the most recent
    ``upload_time`` among its uploads is taken as the release date.
    Releases without any uploads contribute nothing.
    """
    parse = datetime.datetime.strptime
    newest_per_release = []
    for uploads in project['releases'].values():
        if not uploads:
            continue
        newest = max(parse(upload['upload_time'], '%Y-%m-%dT%H:%M:%S')
                     for upload in uploads)
        # The timestamps carry no zone information; they are tagged as UTC.
        newest_per_release.append(newest.replace(tzinfo=datetime.timezone.utc))
    return sorted(newest_per_release)
def calc_downloads(project):
    """Return the last-month download figure recorded in the project's info."""
    download_stats = project['info']['downloads']
    return download_stats['last_month']
def count_projects(projects):
    """Tally projects by their 'Python support' value.

    `projects` may be any iterable (including a lazy ``filter`` object) of
    mappings that have a 'Python support' key.  Returns a
    ``collections.Counter`` keyed by that value.
    """
    tally = collections.Counter()
    for project in projects:
        tally[project['Python support']] += 1
    return tally
def is_project_active(project):
    """Check if a project is considered active.

    A project is active when, within the 365 days before now, it has made
    two releases that are at least 90 days apart.  ``project['Release
    dates']`` must be a non-empty list of timezone-aware datetimes sorted
    oldest to newest.
    """
    window_start = (datetime.datetime.now(datetime.timezone.utc)
                    - datetime.timedelta(365))
    dates = project['Release dates']
    newest = dates[-1]
    if newest < window_start:
        return False
    minimum_gap = datetime.timedelta(90)
    # Walk backwards from the second-newest release.  Because the dates are
    # sorted, the first one outside the window ends the search.
    for earlier in dates[-2::-1]:
        if earlier < window_start:
            break
        if newest - earlier >= minimum_gap:
            return True
    return False
def is_project_new(freshness):
    """Build a predicate telling whether a project's oldest release is new.

    The returned function accepts a project mapping and reports whether the
    first entry of its 'Release dates' list is at or after `freshness`.
    """
    def filter_func(project):
        first_release = project['Release dates'][0]
        return first_release >= freshness
    return filter_func
def has_released_since(cutoff):
    """Build a predicate telling whether a project released on or after `cutoff`.

    The returned function accepts a project mapping and looks only at the
    last entry of its 'Release dates' list.
    """
    def filter_func(project):
        latest_release = project['Release dates'][-1]
        return latest_release >= cutoff
    return filter_func
def release_often_enough(count, cutoff):
    """Build a predicate: has a project made `count` releases since `cutoff`?

    The returned function accepts a project mapping.  It is true when the
    'Release dates' list holds at least `count` entries and the last
    `count` of them are all at or after `cutoff`.
    """
    def filter_func(project):
        dates = project['Release dates']
        return (len(dates) >= count
                and all(date >= cutoff for date in dates[-count:]))
    return filter_func
def is_project_popular(cutoff):
    """Build a predicate telling whether a project reaches `cutoff` monthly downloads.

    The returned function accepts a project mapping and compares its
    'Monthly downloads' value against `cutoff` (inclusive).
    """
    def filter_func(project):
        monthly_downloads = project['Monthly downloads']
        return monthly_downloads >= cutoff
    return filter_func
class PyPIIndexHandler(handler.ContentHandler):
    """SAX handler that gathers link targets from PyPI's simple index page."""

    def __init__(self):
        super().__init__()
        # The href value of every <a> element encountered while parsing.
        self.projects = set()

    def startElement(self, name, attrs):
        """Record the href of each <a> element; every other element is ignored."""
        # TODO: Check for <meta name="api-version" value="2" /> .
        if name == 'a':
            target = attrs.get('href')
            if target is not None:
                self.projects.add(target)
def fetch_index():
    """Download PyPI's simple index and return the set of link targets on it.

    The page is parsed with `PyPIIndexHandler`; the result is that
    handler's ``projects`` set.
    """
    index_parser = PyPIIndexHandler()
    with request.urlopen('https://pypi.python.org/simple/') as response:
        payload = response.read()
    sax.parseString(payload, index_parser)
    return index_parser.projects
def fetch_project(name):
    """Return the loaded JSON data from PyPI for a project.

    Failures are *returned* rather than raised: when the download or the
    decoding fails, a `PyPIFetchError` instance describing the problem is
    handed back so that a caller mapping this function over many names can
    record the issue and keep going.

    Besides `URLError`, this also covers connection problems surfacing while
    the body is being read, `http.client` protocol errors, and responses
    that are not valid UTF-8 encoded JSON.  Any one of those would otherwise
    abort the whole crawl.
    """
    # Imported locally so the block does not depend on a new module-level
    # import; urllib.request already loads http.client itself.
    from http.client import HTTPException

    url = 'https://pypi.python.org/pypi/{}/json'.format(name)
    try:
        with request.urlopen(url) as response:
            payload = response.read()
        return json.loads(payload.decode('utf-8'))
    # URLError/HTTPError are OSError subclasses; UnicodeDecodeError and JSON
    # decoding errors are ValueError subclasses.
    except (OSError, HTTPException, ValueError) as exc:
        return PyPIFetchError(url, exc)
def fetch_main(pickle_path):
    """Download every project's JSON data from PyPI and pickle it.

    Project names come from `fetch_index` and are fetched in sorted order
    through a thread pool.  Successfully fetched projects are written as a
    list to `pickle_path`; fetch failures are printed as they happen and
    once more in a summary at the end.
    """
    print('Fetching index ...')
    names = sorted(fetch_index())
    fetched = []
    problems = []
    print('Fetching projects ...')
    with futures.ThreadPoolExecutor(os.cpu_count()) as pool:
        for result in pool.map(fetch_project, names):
            if isinstance(result, PyPIFetchError):
                # fetch_project hands failures back as values, not raises.
                problem = str(result)
                problems.append(problem)
                print(' ' + problem)
            else:
                fetched.append(result)
                print(' ' + result['info']['name'])
    # TODO: store fetch datetime with data for consistent timespan calculations.
    print('Writing pickle file ...')
    with open(str(pickle_path), 'wb') as file:
        pickle.dump(fetched, file, pickle.HIGHEST_PROTOCOL)
    print('Problems:')
    for problem in problems:
        print(' ' + problem)
def print_counts(counts):
    """Print one aligned line per `PySupport` category from `counts`.

    `counts` is indexed by `PySupport` members; the lines are printed in
    the order unknown, Python 2, Python 3.
    """
    rows = (
        ('Unknown:', PySupport.unknown),
        ('Python 2:', PySupport.py2),
        ('Python 3:', PySupport.py3),
    )
    for label, support in rows:
        print(' * {:<9} {:>6,}'.format(label, counts[support]))
def analyze_main(pickle_path):
    """Load the pickled project data and print Python-support breakdowns.

    Every loaded project is annotated in place with 'Python support',
    'Monthly downloads' and 'Release dates'.  Projects without any release
    date are dropped.  The remaining projects are then counted per
    `PySupport` category under a series of activity, popularity and age
    criteria, all measured relative to the current time.
    """
    print('Loading pickle file ...')
    with open(str(pickle_path), 'rb') as file:
        all_projects = pickle.load(file)
    print('{:,} project loaded'.format(len(all_projects)))
    print('Calculating stuff ...')
    projects = []
    for project in all_projects:
        project['Python support'] = PySupport.classify(project)
        project['Monthly downloads'] = calc_downloads(project)
        project['Release dates'] = calc_release_dates(project)
        if project['Release dates']:
            projects.append(project)

    # TODO: get freshness from data.
    data_freshness = datetime.datetime.now(datetime.timezone.utc)

    def days_before(days):
        return data_freshness - datetime.timedelta(days)

    two_years_ago = days_before(365 * 2)
    last_year = days_before(365)
    six_months_ago = days_before(365 // 2)
    last_month = days_before(30)

    # Each row is a heading plus the predicates to apply, in order, to the
    # projects before counting them.
    report = (
        ('Projects that have ever had a release:',
         ()),
        ('Projects with a release within the last 2 years:',
         (has_released_since(two_years_ago),)),
        ('Projects with a release within the last year:',
         (has_released_since(last_year),)),
        ('Projects with a release within the last 6 months:',
         (has_released_since(six_months_ago),)),
        ('Projects with 2 releases in the past year:',
         (release_often_enough(2, last_year),)),
        ('Projects with two releases in the last six months:',
         (release_often_enough(2, six_months_ago),)),
        ('Projects that were downloaded at least 1,440 times last month (i.e., twice an hour):',
         (is_project_popular(24 * 30 * 2),)),
        ('Projects that were downloaded at least 1,440 times last month with 2 releases in the past year:',
         (release_often_enough(2, last_year), is_project_popular(24 * 30 * 2))),
        ('Projects that were downloaded at least 300 times last month with 2 releases in the past year:',
         (release_often_enough(2, last_year), is_project_popular(300))),
        ('Projects that made two releases in the last year at least 90 days apart:',
         (is_project_active,)),
        ('Projects created during the past year:',
         (is_project_new(last_year),)),
        ('Projects created during the past six months:',
         (is_project_new(six_months_ago),)),
        ('Projects created during the past 30 days:',
         (is_project_new(last_month),)),
    )
    for heading, predicates in report:
        print(heading)
        selection = projects
        for predicate in predicates:
            selection = filter(predicate, selection)
        print_counts(count_projects(selection))
if __name__ == '__main__':
    # The fetched data is cached on disk.  PyPI is only crawled when the
    # cache file does not exist yet.
    cache_file = pathlib.Path('pypi_data.pickle')
    if not cache_file.exists():
        fetch_main(cache_file)
    analyze_main(cache_file)
Loading pickle file ...
60,543 project loaded
Calculating stuff ...
Projects that have ever had a release:
* Unknown: 34,447
* Python 2: 8,064
* Python 3: 11,377
Projects with a release within the last 2 years:
* Unknown: 19,760
* Python 2: 5,898
* Python 3: 10,295
Projects with a release within the last year:
* Unknown: 12,864
* Python 2: 4,091
* Python 3: 8,329
Projects with a release within the last 6 months:
* Unknown: 8,183
* Python 2: 2,809
* Python 3: 6,134
Projects with 2 releases in the past year:
* Unknown: 8,033
* Python 2: 2,779
* Python 3: 5,889
Projects with two releases in the last six months:
* Unknown: 4,747
* Python 2: 1,732
* Python 3: 3,920
Projects that were downloaded at least 1,440 times last month (i.e., twice an hour):
* Unknown: 4,115
* Python 2: 1,215
* Python 3: 3,332
Projects that were downloaded at least 1,440 times last month with 2 releases in the past year:
* Unknown: 2,241
* Python 2: 868
* Python 3: 2,465
Projects that were downloaded at least 300 times last month with 2 releases in the past year:
* Unknown: 6,412
* Python 2: 2,326
* Python 3: 5,202
Projects that made two releases in the last year at least 90 days apart:
* Unknown: 3,158
* Python 2: 1,188
* Python 3: 2,786
Projects created during the past year:
* Unknown: 8,242
* Python 2: 2,589
* Python 3: 5,050
Projects created during the past six months:
* Unknown: 4,260
* Python 2: 1,401
* Python 3: 2,952
Projects created during the past 30 days:
* Unknown: 761
* Python 2: 247
* Python 3: 534
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment