-
-
Save brettcannon/d03fbcf365a9c76d4aaa to your computer and use it in GitHub Desktop.
Script to calculate Python 3 porting progress based on various criteria of what qualifies a project to be counted.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections | |
from concurrent import futures | |
import datetime | |
import enum | |
import json | |
import operator | |
import os | |
import pathlib | |
import pickle | |
from urllib import request | |
from urllib.error import URLError | |
from xml import sax | |
from xml.sax import handler | |
# Deliberately not a subclass of HTTPError: that class requires a bunch of
# constructor arguments that are not worth dealing with here.
# TODO: deal with the arguments.
class PyPIFetchError(Exception):

    """Signals that a request to PyPI failed.

    The exception text has the form ``"<message> (<url>)"``.
    """

    def __init__(self, url, message):
        description = '{} ({})'.format(message, url)
        super().__init__(description)
class PySupport(enum.Enum):

    """Major Python version a project declares support for."""

    unknown = 1
    py2 = 2
    py3 = 3

    @classmethod
    def classify(cls, project):
        """Classify a project by its ``Programming Language :: Python`` classifiers.

        ``project`` is the project's JSON data; only
        ``project['info']['classifiers']`` is read.

        Any Python 3 classifier (``3``, ``3.x`` or ``3 :: Only``) yields
        ``py3`` immediately, and ``2 :: Only`` yields ``py2`` immediately.
        A plain ``2``/``2.x`` classifier is only remembered, so that a later
        Python 3 classifier can still win.  With no version classifier at
        all the result is ``unknown``.
        """
        prefix = 'Programming Language :: Python :: '
        verdict = cls.unknown
        for trove in project['info']['classifiers']:
            if trove.startswith(prefix):
                tail = trove[len(prefix):].strip()
                if tail in ('3 :: Only', '3') or tail.startswith('3.'):
                    return cls.py3
                if tail == '2 :: Only':
                    return cls.py2
                if tail == '2' or tail.startswith('2.'):
                    verdict = cls.py2
        return verdict
def calc_release_dates(project):
    """Return one UTC datetime per release, sorted oldest to newest.

    For every entry in ``project['releases']`` the newest ``upload_time``
    among its uploads is taken as that release's date.  Releases without
    any uploads contribute nothing.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%S'
    utc = datetime.timezone.utc
    dates = []
    for uploads in project['releases'].values():
        stamps = [
            datetime.datetime.strptime(item['upload_time'],
                                       timestamp_format).replace(tzinfo=utc)
            for item in uploads
        ]
        if stamps:
            dates.append(max(stamps))
    return sorted(dates)
def calc_downloads(project):
    """Return the ``last_month`` download figure from the project's JSON data."""
    download_stats = project['info']['downloads']
    return download_stats['last_month']
def count_projects(projects):
    """Tally projects by their ``'Python support'`` value.

    ``projects`` may be any iterable (including a lazy ``filter`` object);
    the result is a ``collections.Counter``.
    """
    tally = collections.Counter()
    for entry in projects:
        tally[entry['Python support']] += 1
    return tally
def is_project_active(project):
    """Check if a project is considered active.

    To be considered active, a project needs to have made two releases within
    the past year where they were at least 3 months (90 days) apart.

    ``project['Release dates']`` must hold timezone-aware datetimes sorted
    oldest to newest.  A project with no release dates is never active.
    Returns a bool.
    """
    release_dates = project['Release dates']
    if not release_dates:
        # Nothing released at all; avoid indexing into an empty list.
        return False
    year_ago = (datetime.datetime.now(datetime.timezone.utc)
                - datetime.timedelta(365))
    latest_release = release_dates[-1]
    if latest_release < year_ago:
        return False
    minimum_gap = datetime.timedelta(90)
    # Walk backwards from the second-newest release.  The gap to the latest
    # release only grows as we go back, so stop as soon as a release falls
    # outside the one-year window.
    for release in reversed(release_dates[:-1]):
        if release < year_ago:
            break
        if latest_release - release >= minimum_gap:
            return True
    return False
def is_project_new(freshness):
    """Determine if a project is new based on its oldest release.

    Returns a predicate taking a project dict.  The predicate is true when
    the first (oldest) entry of ``project['Release dates']`` is on or after
    ``freshness``, and false for a project with no release dates.
    """
    def filter_func(project):
        release_dates = project['Release dates']
        # Guard against an empty list before looking at the oldest release.
        return bool(release_dates) and release_dates[0] >= freshness
    return filter_func
def has_released_since(cutoff):
    """Make sure a project has released on or after a cutoff date.

    Returns a predicate taking a project dict.  The predicate is true when
    the last (newest) entry of ``project['Release dates']`` is on or after
    ``cutoff``, and false for a project with no release dates.
    """
    def filter_func(project):
        release_dates = project['Release dates']
        # Guard against an empty list before looking at the newest release.
        return bool(release_dates) and release_dates[-1] >= cutoff
    return filter_func
def release_often_enough(count, cutoff):
    """See if a project has made `count` releases since the cutoff.

    Returns a predicate taking a project dict.  The predicate is true when
    the newest ``count`` entries of ``project['Release dates']`` (expected
    to be sorted oldest to newest) are all on or after ``cutoff``.  A
    ``count`` of zero or less is trivially satisfied.
    """
    def filter_func(project):
        if count <= 0:
            # Without this guard ``dates[-0:]`` would be the *whole* list
            # and every release would have to be newer than the cutoff.
            return True
        release_dates = project['Release dates']
        if len(release_dates) < count:
            return False
        return all(date >= cutoff for date in release_dates[-count:])
    return filter_func
def is_project_popular(cutoff):
    """Check if the project meets the cutoff for monthly downloads.

    Returns a predicate taking a project dict.  The predicate is true when
    ``project['Monthly downloads']`` is at least ``cutoff``.
    """
    def filter_func(project):
        return project['Monthly downloads'] >= cutoff
    return filter_func
class PyPIIndexHandler(handler.ContentHandler):

    """Parse PyPI's simple index page.

    Every ``<a>`` element's ``href`` is reduced to a bare project name and
    collected in the ``projects`` set.
    """

    def __init__(self):
        super().__init__()
        self.projects = set()

    def startElement(self, name, attrs):
        """Record the project an ``<a>`` element links to; ignore other tags."""
        # TODO: Check for <meta name="api-version" value="2" /> .
        if name != 'a':
            return
        href = attrs.get('href', None)
        if href is None:
            return
        # The link may be a bare name ('foo'), a relative directory ('foo/')
        # or an absolute path ('/simple/foo/').  Only the final path segment
        # is the project name; anything else would corrupt URLs later built
        # from it.
        project_name = href.rstrip('/').rsplit('/', 1)[-1]
        if project_name:
            self.projects.add(project_name)
def fetch_index():
    """Download PyPI's simple index and return the projects found on it.

    The page body is fed through ``PyPIIndexHandler`` with a SAX parser and
    the handler's ``projects`` set is returned.
    """
    index_url = 'https://pypi.python.org/simple/'
    with request.urlopen(index_url) as response:
        page = response.read()
    collector = PyPIIndexHandler()
    sax.parseString(page, collector)
    return collector.projects
def fetch_project(name):
    """Return the loaded JSON data from PyPI for a project.

    A ``URLError`` while fetching is not raised; a ``PyPIFetchError``
    instance describing it is *returned* instead, so callers must check the
    type of the result.
    """
    url = 'https://pypi.python.org/pypi/{}/json'.format(name)
    try:
        with request.urlopen(url) as response:
            raw = response.read()
    except URLError as error:
        return PyPIFetchError(url, error)
    return json.loads(raw.decode('utf-8'))
def fetch_main(pickle_path):
    """Fetch every project's JSON data from PyPI and pickle it.

    Project names come from ``fetch_index()`` and are fetched in sorted
    order through a thread pool.  Successful results are written as a list
    to ``pickle_path``; failures are printed as they happen and again in a
    summary at the end.
    """
    print('Fetching index ...')
    names = sorted(fetch_index())
    fetched = []
    failures = []
    print('Fetching projects ...')
    with futures.ThreadPoolExecutor(os.cpu_count()) as pool:
        for result in pool.map(fetch_project, names):
            if not isinstance(result, PyPIFetchError):
                fetched.append(result)
                print(' ' + result['info']['name'])
            else:
                # fetch_project() hands back the error object instead of
                # raising, so one bad project does not abort the whole run.
                problem = str(result)
                failures.append(problem)
                print(' ' + problem)
    # TODO: store fetch datetime with data for consistent timespan calculations.
    print('Writing pickle file ...')
    with open(str(pickle_path), 'wb') as file:
        pickle.dump(fetched, file, pickle.HIGHEST_PROTOCOL)
    print('Problems:')
    for problem in failures:
        print(' ' + problem)
def print_counts(counts):
    """Print one line per ``PySupport`` category with its count.

    ``counts`` is indexed by ``PySupport`` member; the lines appear in the
    order unknown, Python 2, Python 3.
    """
    row = ' * {:<9} {:>6,}'
    categories = (
        ('Unknown:', PySupport.unknown),
        ('Python 2:', PySupport.py2),
        ('Python 3:', PySupport.py3),
    )
    for label, member in categories:
        print(row.format(label, counts[member]))
def analyze_main(pickle_path):
    """Load the pickled project data and print Python-support statistics.

    Each loaded project dict is annotated in place with ``'Python support'``,
    ``'Monthly downloads'`` and ``'Release dates'``; projects without any
    release date are dropped.  A series of reports is then printed, each a
    heading followed by the per-category counts of the projects that pass
    that report's filters.
    """
    print('Loading pickle file ...')
    # NOTE: pickle.load() is only safe on trusted files, such as the cache
    # this script wrote itself.
    with open(str(pickle_path), 'rb') as file:
        loaded = pickle.load(file)
    print('{:,} project loaded'.format(len(loaded)))
    print('Calculating stuff ...')
    projects = []
    for entry in loaded:
        entry['Python support'] = PySupport.classify(entry)
        entry['Monthly downloads'] = calc_downloads(entry)
        entry['Release dates'] = calc_release_dates(entry)
        if entry['Release dates']:
            projects.append(entry)

    # TODO: get freshness from data.
    data_freshness = datetime.datetime.now(datetime.timezone.utc)
    two_years_ago = data_freshness - datetime.timedelta(365 * 2)
    last_year = data_freshness - datetime.timedelta(365)
    six_months_ago = data_freshness - datetime.timedelta(365 // 2)
    last_month = data_freshness - datetime.timedelta(30)

    # Each report is a heading plus the predicates to apply, in order, to the
    # full project list (the first predicate is the innermost filter).
    reports = (
        ('Projects that have ever had a release:',
         ()),
        ('Projects with a release within the last 2 years:',
         (has_released_since(two_years_ago),)),
        ('Projects with a release within the last year:',
         (has_released_since(last_year),)),
        ('Projects with a release within the last 6 months:',
         (has_released_since(six_months_ago),)),
        ('Projects with 2 releases in the past year:',
         (release_often_enough(2, last_year),)),
        ('Projects with two releases in the last six months:',
         (release_often_enough(2, six_months_ago),)),
        ('Projects that were downloaded at least 1,440 times last month (i.e., twice an hour):',
         (is_project_popular(24 * 30 * 2),)),
        ('Projects that were downloaded at least 1,440 times last month with 2 releases in the past year:',
         (release_often_enough(2, last_year), is_project_popular(24 * 30 * 2))),
        ('Projects that were downloaded at least 300 times last month with 2 releases in the past year:',
         (release_often_enough(2, last_year), is_project_popular(300))),
        ('Projects that made two releases in the last year at least 90 days apart:',
         (is_project_active,)),
        ('Projects created during the past year:',
         (is_project_new(last_year),)),
        ('Projects created during the past six months:',
         (is_project_new(six_months_ago),)),
        ('Projects created during the past 30 days:',
         (is_project_new(last_month),)),
    )
    for heading, predicates in reports:
        print(heading)
        selected = projects
        for predicate in predicates:
            selected = filter(predicate, selected)
        print_counts(count_projects(selected))
if __name__ == '__main__':
    # The pickle file acts as a cache: PyPI is only contacted when no cached
    # data exists yet; the analysis always runs.
    pickle_path = pathlib.Path('pypi_data.pickle')
    cache_present = pickle_path.exists()
    if not cache_present:
        fetch_main(pickle_path)
    analyze_main(pickle_path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading pickle file ... | |
60,543 project loaded | |
Calculating stuff ... | |
Projects that have ever had a release: | |
* Unknown: 34,447 | |
* Python 2: 8,064 | |
* Python 3: 11,377 | |
Projects with a release within the last 2 years: | |
* Unknown: 19,760 | |
* Python 2: 5,898 | |
* Python 3: 10,295 | |
Projects with a release within the last year: | |
* Unknown: 12,864 | |
* Python 2: 4,091 | |
* Python 3: 8,329 | |
Projects with a release within the last 6 months: | |
* Unknown: 8,183 | |
* Python 2: 2,809 | |
* Python 3: 6,134 | |
Projects with 2 releases in the past year: | |
* Unknown: 8,033 | |
* Python 2: 2,779 | |
* Python 3: 5,889 | |
Projects with two releases in the last six months: | |
* Unknown: 4,747 | |
* Python 2: 1,732 | |
* Python 3: 3,920 | |
Projects that were downloaded at least 1,440 times last month (i.e., twice an hour): | |
* Unknown: 4,115 | |
* Python 2: 1,215 | |
* Python 3: 3,332 | |
Projects that were downloaded at least 1,440 times last month with 2 releases in the past year: | |
* Unknown: 2,241 | |
* Python 2: 868 | |
* Python 3: 2,465 | |
Projects that were downloaded at least 300 times last month with 2 releases in the past year: | |
* Unknown: 6,412 | |
* Python 2: 2,326 | |
* Python 3: 5,202 | |
Projects that made two releases in the last year at least 90 days apart: | |
* Unknown: 3,158 | |
* Python 2: 1,188 | |
* Python 3: 2,786 | |
Projects created during the past year: | |
* Unknown: 8,242 | |
* Python 2: 2,589 | |
* Python 3: 5,050 | |
Projects created during the past six months: | |
* Unknown: 4,260 | |
* Python 2: 1,401 | |
* Python 3: 2,952 | |
Projects created during the past 30 days: | |
* Unknown: 761 | |
* Python 2: 247 | |
* Python 3: 534 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment