brettcannon/pypi_stats.py Secret

## pypi_stats.py
import collections
from concurrent import futures
import datetime
import enum
import json
import operator
import os
import pathlib
import pickle
from urllib import request
from urllib.error import URLError
from xml import sax
from xml.sax import handler


# Doesn't subclass HTTPError as it requires a bunch of arguments I don't want
# to have to deal with.
# TODO: deal with the arguments.
class PyPIFetchError(Exception):

    """Error describing a request to PyPI that failed.

    The exception text is the failure message followed by the requested URL
    in parentheses.
    """

    def __init__(self, url, message):
        description = '{} ({})'.format(message, url)
        super().__init__(description)


class PySupport(enum.Enum):

    """Which major version of Python a project declares support for."""

    unknown = 1
    py2 = 2
    py3 = 3

    @classmethod
    def classify(cls, project):
        """Derive a member from the project's trove classifiers.

        Only 'Programming Language :: Python :: <version>' classifiers are
        considered.  Any Python 3 classifier wins as soon as it is seen, as
        does an explicit '2 :: Only'; a plain Python 2 classifier is merely
        remembered in case nothing stronger follows.
        """
        prefix = 'Programming Language :: Python :: '
        verdict = cls.unknown
        for entry in project['info']['classifiers']:
            if not entry.startswith(prefix):
                continue
            tag = entry[len(prefix):].strip()
            if tag in ('3 :: Only', '3') or tag.startswith('3.'):
                return cls.py3
            if tag == '2 :: Only':
                return cls.py2
            if tag == '2' or tag.startswith('2.'):
                verdict = cls.py2
        return verdict


def calc_release_dates(project):
    """Return each release's most recent upload time, oldest release first.

    Every entry under project['releases'] contributes the latest
    'upload_time' among its uploads, parsed and tagged as UTC; releases
    without any uploads are skipped.
    """
    utc = datetime.timezone.utc
    stamp_format = '%Y-%m-%dT%H:%M:%S'
    newest_per_release = []
    for uploads in project['releases'].values():
        stamps = [
            datetime.datetime.strptime(upload['upload_time'], stamp_format)
                             .replace(tzinfo=utc)
            for upload in uploads
        ]
        if stamps:
            newest_per_release.append(max(stamps))
    return sorted(newest_per_release)  # Oldest to newest.


def calc_downloads(project):
    """Return the 'last_month' figure from the project's download info."""
    download_stats = project['info']['downloads']
    return download_stats['last_month']


def count_projects(projects):
    """Tally the given projects by their 'Python support' value."""
    tally = collections.Counter()
    for project in projects:
        tally[project['Python support']] += 1
    return tally


def is_project_active(project, now=None):
    """Check if a project is considered active.

    To be considered active, a project needs to have made two releases within
    the past year where they were at least 3 months (90 days) apart.

    `project` must provide 'Release dates', a list of timezone-aware
    datetimes sorted oldest to newest.  `now` is the aware datetime the
    one-year window is measured back from; it defaults to the current UTC
    time.  A project without any release dates is never active.
    """
    release_dates = project['Release dates']
    if not release_dates:
        # Nothing released at all; avoids an IndexError on the lookup below.
        return False
    if now is None:
        now = datetime.datetime.now(datetime.timezone.utc)
    year_ago = now - datetime.timedelta(365)
    latest_release = release_dates[-1]
    if latest_release < year_ago:
        return False
    minimum_gap = datetime.timedelta(90)
    # Walk back from the second-newest release: since the dates are sorted,
    # the first one far enough from the latest decides the answer, and
    # hitting one older than the window means no later candidate exists.
    for release in reversed(release_dates[:-1]):
        if release < year_ago:
            return False
        if (latest_release - release) >= minimum_gap:
            return True
    return False


def is_project_new(freshness):
    """Build a predicate accepting projects whose first release is recent.

    The returned callable is true when the oldest entry of the project's
    'Release dates' is at or after `freshness`.
    """
    def first_release_is_fresh(project):
        return project['Release dates'][0] >= freshness
    return first_release_is_fresh


def has_released_since(cutoff):
    """Build a predicate accepting projects released at or after `cutoff`.

    The returned callable looks only at the newest entry of the project's
    'Release dates'.
    """
    def latest_release_is_recent(project):
        return project['Release dates'][-1] >= cutoff
    return latest_release_is_recent


def release_often_enough(count, cutoff):
    """See if a project has made `count` releases since the cutoff.

    Returns a predicate taking a project whose 'Release dates' list is
    sorted oldest to newest.  The predicate is true when the `count` newest
    releases all fall at or after `cutoff`.  A `count` of zero or less asks
    for no releases at all and is therefore always satisfied.
    """
    def filter_func(project):
        if count <= 0:
            # Guard the slice below: [-0:] would select every release
            # rather than none of them.
            return True
        release_dates = project['Release dates']
        if len(release_dates) < count:
            return False
        return all(date >= cutoff for date in release_dates[-count:])
    return filter_func


def is_project_popular(cutoff):
    """Build a predicate accepting projects with enough monthly downloads.

    The returned callable is true when the project's 'Monthly downloads'
    value is at least `cutoff`.
    """
    def meets_download_cutoff(project):
        return project['Monthly downloads'] >= cutoff
    return meets_download_cutoff


class PyPIIndexHandler(handler.ContentHandler):

    """SAX handler that gathers link targets from PyPI's simple index page.

    After parsing, `projects` holds the `href` value of every <a> element
    that had one.
    """

    def __init__(self):
        super().__init__()
        self.projects = set()

    def startElement(self, name, attrs):
        # TODO: Check for <meta name="api-version" value="2" /> .
        if name == 'a':
            target = attrs.get('href')
            if target is not None:
                self.projects.add(target)


def fetch_index():
    """Download PyPI's simple index and return the project names found.

    The page is fed to a PyPIIndexHandler via SAX and that handler's
    `projects` collection is returned.
    """
    collector = PyPIIndexHandler()
    with request.urlopen('https://pypi.python.org/simple/') as response:
        page = response.read()
    sax.parseString(page, collector)
    return collector.projects


def fetch_project(name):
    """Fetch and decode the JSON document PyPI serves for one project.

    On a URLError the failure is *returned* (not raised) as a
    PyPIFetchError so the caller can tell failed lookups from data.
    """
    url = 'https://pypi.python.org/pypi/{}/json'.format(name)
    try:
        with request.urlopen(url) as response:
            raw_body = response.read()
            document = json.loads(raw_body.decode('utf-8'))
    except URLError as failure:
        return PyPIFetchError(url, failure)
    return document


def fetch_main(pickle_path):
    """Download data for every project on the index and pickle it.

    Progress is printed as each project comes back.  Projects whose fetch
    failed are left out of the pickle and listed again at the end.
    """
    print('Fetching index ...')
    names = sorted(fetch_index())
    fetched = []
    problems = []
    print('Fetching projects ...')
    with futures.ThreadPoolExecutor(os.cpu_count()) as pool:
        for result in pool.map(fetch_project, names):
            if not isinstance(result, PyPIFetchError):
                fetched.append(result)
                print('    ' + result['info']['name'])
            else:
                # fetch_project hands failures back as values.
                message = str(result)
                problems.append(message)
                print('    ' + message)

    # TODO: store fetch datetime with data for consistent timespan calculations.
    print('Writing pickle file ...')
    with open(str(pickle_path), 'wb') as sink:
        pickle.dump(fetched, sink, pickle.HIGHEST_PROTOCOL)
    print('Problems:')
    for message in problems:
        print('    ' + message)


def print_counts(counts):
    """Print one aligned line per PySupport member with its count."""
    row = '  * {:<9} {:>6,}'
    labelled_members = (
        ('Unknown:', PySupport.unknown),
        ('Python 2:', PySupport.py2),
        ('Python 3:', PySupport.py3),
    )
    for label, support in labelled_members:
        print(row.format(label, counts[support]))


def analyze_main(pickle_path):
    """Load the pickled project data and print per-Python-version counts.

    Each project is annotated with its Python support, monthly downloads
    and release dates; projects without any release date are dropped.  A
    series of headed reports is then printed, each counting the projects
    that pass that report's filters.
    """
    print('Loading pickle file ...')
    # NOTE: unpickling can execute arbitrary code; only load a trusted file.
    with open(str(pickle_path), 'rb') as source:
        all_projects = pickle.load(source)
    print('{:,} project loaded'.format(len(all_projects)))
    print('Calculating stuff ...')
    projects = []
    for project in all_projects:
        project['Python support'] = PySupport.classify(project)
        project['Monthly downloads'] = calc_downloads(project)
        project['Release dates'] = calc_release_dates(project)
        if project['Release dates']:
            projects.append(project)

    # TODO: get freshness from data.
    data_freshness = datetime.datetime.now(datetime.timezone.utc)
    two_years_ago = data_freshness - datetime.timedelta(365 * 2)
    last_year = data_freshness - datetime.timedelta(365)
    six_months_ago = data_freshness - datetime.timedelta(365 // 2)
    last_month = data_freshness - datetime.timedelta(30)

    # Each report is a heading plus the predicates to apply, in order.
    reports = [
        ('Projects that have ever had a release:',
         []),
        ('Projects with a release within the last 2 years:',
         [has_released_since(two_years_ago)]),
        ('Projects with a release within the last year:',
         [has_released_since(last_year)]),
        ('Projects with a release within the last 6 months:',
         [has_released_since(six_months_ago)]),
        ('Projects with 2 releases in the past year:',
         [release_often_enough(2, last_year)]),
        ('Projects with two releases in the last six months:',
         [release_often_enough(2, six_months_ago)]),
        ('Projects that were downloaded at least 1,440 times last month (i.e., twice an hour):',
         [is_project_popular(24 * 30 * 2)]),
        ('Projects that were downloaded at least 1,440 times last month with 2 releases in the past year:',
         [release_often_enough(2, last_year), is_project_popular(24 * 30 * 2)]),
        ('Projects that were downloaded at least 300 times last month with 2 releases in the past year:',
         [release_often_enough(2, last_year), is_project_popular(300)]),
        ('Projects that made two releases in the last year at least 90 days apart:',
         [is_project_active]),
        ('Projects created during the past year:',
         [is_project_new(last_year)]),
        ('Projects created during the past six months:',
         [is_project_new(six_months_ago)]),
        ('Projects created during the past 30 days:',
         [is_project_new(last_month)]),
    ]
    for heading, predicates in reports:
        print(heading)
        selected = projects
        for predicate in predicates:
            selected = filter(predicate, selected)
        print_counts(count_projects(selected))


if __name__ == '__main__':
    # The pickle file acts as a cache of the downloaded data: it is only
    # fetched when the file does not exist yet, then the analysis runs on it.
    pickle_path = pathlib.Path('pypi_data.pickle')
    if not pickle_path.exists():
        fetch_main(pickle_path)
    analyze_main(pickle_path)

## results.txt
Loading pickle file ...
60,543 project loaded
Calculating stuff ...
Projects that have ever had a release:
  * Unknown:  34,447
  * Python 2:  8,064
  * Python 3: 11,377
Projects with a release within the last 2 years:
  * Unknown:  19,760
  * Python 2:  5,898
  * Python 3: 10,295
Projects with a release within the last year:
  * Unknown:  12,864
  * Python 2:  4,091
  * Python 3:  8,329
Projects with a release within the last 6 months:
  * Unknown:   8,183
  * Python 2:  2,809
  * Python 3:  6,134
Projects with 2 releases in the past year:
  * Unknown:   8,033
  * Python 2:  2,779
  * Python 3:  5,889
Projects with two releases in the last six months:
  * Unknown:   4,747
  * Python 2:  1,732
  * Python 3:  3,920
Projects that were downloaded at least 1,440 times last month (i.e., twice an hour):
  * Unknown:   4,115
  * Python 2:  1,215
  * Python 3:  3,332
Projects that were downloaded at least 1,440 times last month with 2 releases in the past year:
  * Unknown:   2,241
  * Python 2:    868
  * Python 3:  2,465
Projects that were downloaded at least 300 times last month with 2 releases in the past year:
  * Unknown:   6,412
  * Python 2:  2,326
  * Python 3:  5,202
Projects that made two releases in the last year at least 90 days apart:
  * Unknown:   3,158
  * Python 2:  1,188
  * Python 3:  2,786
Projects created during the past year:
  * Unknown:   8,242
  * Python 2:  2,589
  * Python 3:  5,050
Projects created during the past six months:
  * Unknown:   4,260
  * Python 2:  1,401
  * Python 3:  2,952
Projects created during the past 30 days:
  * Unknown:     761
  * Python 2:    247
  * Python 3:    534
	import collections
	from concurrent import futures
	import datetime
	import enum
	import json
	import operator
	import os
	import pathlib
	import pickle
	from urllib import request
	from urllib.error import URLError
	from xml import sax
	from xml.sax import handler


	# Doesn't subclass HTTPError as it requires a bunch of arguments I don't want
	# to have to deal with.
	# TODO: deal with the arguments.
	class PyPIFetchError(Exception):

	"""Exception representing when PyPI triggers an HTTPError."""

	def __init__(self, url, message):
	super().__init__('{} ({})'.format(message, url))


	class PySupport(enum.Enum):

	"""Enum representing major version support of Python."""

	unknown = 1
	py2 = 2
	py3 = 3

	@classmethod
	def classify(cls, project):
	classifiers = project['info']['classifiers']
	supports = cls.unknown
	classifier_prefix = 'Programming Language :: Python :: '
	for classifier in classifiers:
	if not classifier.startswith(classifier_prefix):
	continue
	version = classifier[len(classifier_prefix):].strip()
	if version == '3 :: Only' or version == '3' or version.startswith('3.'):
	return cls.py3
	elif version == '2 :: Only':
	return cls.py2
	elif version == '2' or version.startswith('2.'):
	supports = cls.py2
	return supports


	def calc_release_dates(project):
	release_dates = []
	for upload_info in project['releases'].values():
	newest_upload_time = None
	for upload in upload_info:
	naive_upload_time = datetime.datetime.strptime(upload['upload_time'],
	'%Y-%m-%dT%H:%M:%S')
	upload_time = naive_upload_time.replace(tzinfo=datetime.timezone.utc)
	if newest_upload_time is None or upload_time > newest_upload_time:
	newest_upload_time = upload_time
	if newest_upload_time is not None:
	release_dates.append(newest_upload_time)
	release_dates.sort() # Oldest to newest.
	return release_dates


	def calc_downloads(project):
	return project['info']['downloads']['last_month']


	def count_projects(projects):
	return collections.Counter(map(operator.itemgetter('Python support'), projects))


	def is_project_active(project):
	"""Check if a project is considered active.

	To be considered active, a project needs to have made two releases within
	the past year where they were at least 3 months apart.
	"""
	year_ago = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(365)
	latest_release = project['Release dates'][-1]
	if latest_release < year_ago:
	return False
	for release in reversed(project['Release dates'][:-1]):
	if release < year_ago:
	return False
	if (latest_release - release) >= datetime.timedelta(90):
	return True
	return False


	def is_project_new(freshness):
	"""Determine if a project is new based on its oldest release."""
	def filter_func(project):
	if project['Release dates'][0] >= freshness:
	return True
	return False
	return filter_func


	def has_released_since(cutoff):
	"""Make sure a project has released by a cutoff date."""
	def filter_func(project):
	if project['Release dates'][-1] >= cutoff:
	return True
	return False
	return filter_func


	def release_often_enough(count, cutoff):
	"""See if a project has made `count` releases since the cutoff."""
	def filter_func(project):
	if len(project['Release dates']) < count:
	return False
	return all(date >= cutoff for date in project['Release dates'][-count:])
	return filter_func


	def is_project_popular(cutoff):
	"""Check if the project meets the cutoff for monthly downloads."""
	def filter_func(project):
	if project['Monthly downloads'] >= cutoff:
	return True
	return False
	return filter_func


	class PyPIIndexHandler(handler.ContentHandler):

	"""Parse PyPI's simple index page."""

	def __init__(self):
	super().__init__()
	self.projects = set()

	def startElement(self, name, attrs):
	# TODO: Check for <meta name="api-version" value="2" /> .
	if name != 'a':
	return

	project_name = attrs.get('href', None)
	if project_name is not None:
	self.projects.add(project_name)


	def fetch_index():
	"""Return an iterable of every project name on PyPI."""
	with request.urlopen('https://pypi.python.org/simple/') as response:
	html_index = response.read()
	sax_handler = PyPIIndexHandler()
	sax.parseString(html_index, sax_handler)
	return sax_handler.projects


	def fetch_project(name):
	"""Return the loaded JSON data from PyPI for a project."""
	url = 'https://pypi.python.org/pypi/{}/json'.format(name)
	try:
	with request.urlopen(url) as response:
	return json.loads(response.read().decode('utf-8'))
	except URLError as exc:
	return PyPIFetchError(url, exc)


	def fetch_main(pickle_path):
	print('Fetching index ...')
	project_set = sorted(fetch_index())
	project_data = list()
	project_issues = list()
	print('Fetching projects ...')
	with futures.ThreadPoolExecutor(os.cpu_count()) as executor:
	for data in executor.map(fetch_project, project_set):
	if isinstance(data, PyPIFetchError):
	issue = str(data)
	project_issues.append(issue)
	print(' ' + issue)
	continue
	project_data.append(data)
	print(' ' + data['info']['name'])

	# TODO: store fetch datetime with data for consistent timespan calculations.
	print('Writing pickle file ...')
	with open(str(pickle_path), 'wb') as file:
	pickle.dump(project_data, file, pickle.HIGHEST_PROTOCOL)
	print('Problems:')
	for issue in project_issues:
	print(' ' + issue)


	def print_counts(counts):
	count_format = ' * {:<9} {:>6,}'
	print(count_format.format('Unknown:', counts[PySupport.unknown]))
	print(count_format.format('Python 2:', counts[PySupport.py2]))
	print(count_format.format('Python 3:', counts[PySupport.py3]))


	def analyze_main(pickle_path):
	print('Loading pickle file ...')
	with open(str(pickle_path), 'rb') as file:
	all_projects = pickle.load(file)
	print('{:,} project loaded'.format(len(all_projects)))
	print('Calculating stuff ...')
	projects = []
	for project in all_projects:
	project['Python support'] = PySupport.classify(project)
	project['Monthly downloads'] = calc_downloads(project)
	project['Release dates'] = calc_release_dates(project)
	if not len(project['Release dates']):
	continue
	projects.append(project)


	# TODO: get freshness from data.
	data_freshness = datetime.datetime.now(datetime.timezone.utc)
	two_years_ago = data_freshness - datetime.timedelta(365 * 2)
	last_year = data_freshness - datetime.timedelta(365)
	six_months_ago = data_freshness - datetime.timedelta(365 // 2)
	last_month = data_freshness - datetime.timedelta(30)

	print('Projects that have ever had a release:')
	print_counts(count_projects(projects))
	print('Projects with a release within the last 2 years:')
	print_counts(count_projects(
	filter(has_released_since(two_years_ago), projects)))
	print('Projects with a release within the last year:')
	print_counts(count_projects(
	filter(has_released_since(last_year), projects)))
	print('Projects with a release within the last 6 months:')
	print_counts(count_projects(
	filter(has_released_since(six_months_ago), projects)))
	print('Projects with 2 releases in the past year:')
	print_counts(count_projects(
	filter(release_often_enough(2, last_year), projects)))
	print('Projects with two releases in the last six months:')
	print_counts(count_projects(
	filter(release_often_enough(2, six_months_ago), projects)))
	print('Projects that were downloaded at least 1,440 times last month (i.e., twice an hour):')
	print_counts(count_projects(
	filter(is_project_popular(24 * 30 * 2), projects)))
	print('Projects that were downloaded at least 1,440 times last month with 2 releases in the past year:')
	print_counts(count_projects(
	filter(is_project_popular(24 * 30 * 2),
	filter(release_often_enough(2, last_year), projects))))
	print('Projects that were downloaded at least 300 times last month with 2 releases in the past year:')
	print_counts(count_projects(
	filter(is_project_popular(300),
	filter(release_often_enough(2, last_year), projects))))
	print('Projects that made two releases in the last year at least 90 days apart:')
	print_counts(count_projects(
	filter(is_project_active, projects)))
	print('Projects created during the past year:')
	print_counts(count_projects(
	filter(is_project_new(last_year), projects)))
	print('Projects created during the past six months:')
	print_counts(count_projects(
	filter(is_project_new(six_months_ago), projects)))
	print('Projects created during the past 30 days:')
	print_counts(count_projects(
	filter(is_project_new(last_month), projects)))


	if __name__ == '__main__':
	pickle_path = pathlib.Path('pypi_data.pickle')
	if not pickle_path.exists():
	fetch_main(pickle_path)
	analyze_main(pickle_path)
	Loading pickle file ...
	60,543 project loaded
	Calculating stuff ...
	Projects that have ever had a release:
	* Unknown: 34,447
	* Python 2: 8,064
	* Python 3: 11,377
	Projects with a release within the last 2 years:
	* Unknown: 19,760
	* Python 2: 5,898
	* Python 3: 10,295
	Projects with a release within the last year:
	* Unknown: 12,864
	* Python 2: 4,091
	* Python 3: 8,329
	Projects with a release within the last 6 months:
	* Unknown: 8,183
	* Python 2: 2,809
	* Python 3: 6,134
	Projects with 2 releases in the past year:
	* Unknown: 8,033
	* Python 2: 2,779
	* Python 3: 5,889
	Projects with two releases in the last six months:
	* Unknown: 4,747
	* Python 2: 1,732
	* Python 3: 3,920
	Projects that were downloaded at least 1,440 times last month (i.e., twice an hour):
	* Unknown: 4,115
	* Python 2: 1,215
	* Python 3: 3,332
	Projects that were downloaded at least 1,440 times last month with 2 releases in the past year:
	* Unknown: 2,241
	* Python 2: 868
	* Python 3: 2,465
	Projects that were downloaded at least 300 times last month with 2 releases in the past year:
	* Unknown: 6,412
	* Python 2: 2,326
	* Python 3: 5,202
	Projects that made two releases in the last year at least 90 days apart:
	* Unknown: 3,158
	* Python 2: 1,188
	* Python 3: 2,786
	Projects created during the past year:
	* Unknown: 8,242
	* Python 2: 2,589
	* Python 3: 5,050
	Projects created during the past six months:
	* Unknown: 4,260
	* Python 2: 1,401
	* Python 3: 2,952
	Projects created during the past 30 days:
	* Unknown: 761
	* Python 2: 247
	* Python 3: 534