@leventov
Created September 4, 2014 23:53
See http://habrahabr.ru/post/235689/. Usage instructions are in the first comment below.
# analyze.py — builds the Habrahabr-ready HTML tables from the collected data
import json

MIN_STARS = 700

with open('{}.json'.format(MIN_STARS)) as json_file:
    repos = json.load(json_file)

repos_by_language = {}
by_top = {}  # number of 'top' authors -> how many repos have that value
for repo in repos:
    by_top[repo['top']] = by_top.get(repo['top'], 0) + 1
    repos_by_language.setdefault(repo['language'], []).append(repo)

print('{} repos'.format(len(repos)))
for top, count in by_top.items():
    print('<tr><td>{}</td><td>{} - {:.1f}%</td></tr>'
          .format(top, count, count / len(repos) * 100)
          .replace('.', ','))  # decimal comma for the Russian article
# Russian labels for the metric keys, used in the generated table headers
russian = {
    'top': 'авторов',
    'total_commits': 'коммитов',
    'total_additions': 'добавлений',
    'total_changes': 'изменений',
    'stars': 'звездочек',
}

def ending(n):
    # Russian plural ending for words like 'проект': 1 проект, 3 проекта, 7 проектов
    if 10 < n < 20:
        return 'ов'
    if n % 10 == 1:
        return ''
    if 2 <= n % 10 <= 4:
        return 'а'
    return 'ов'
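# Illustrative check (not in the original): the branches above cover
# 12 проектов, 21 проект, 3 проекта and 7 проектов respectively.
assert (ending(12), ending(21), ending(3), ending(7)) == ('ов', '', 'а', 'ов')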
def top_and_print(repos, keys, where=None, top=5, reverse=False):
    # Prints an HTML table of the `top` repos ranked by `keys`, descending by
    # default (reverse=True flips to ascending); `where` filters on exact field values.
    sort_key = lambda r: tuple(map(lambda k: r[k], keys))
    if where:
        repos = filter(lambda r: all(map(lambda kv: r[kv[0]] == kv[1], where.items())), repos)
    sorted_repos = sorted(repos, key=sort_key, reverse=not reverse)
    top_repos = sorted_repos[:top]
    if top_repos:
        print('<h4>Топ {} проект{} по количеству {}{}</h4><table>'.format(
            top, ending(top), russian[keys[0]],
            ' с одним автором' if where else ''))
        print('<tr><th>Проект</th><th>Язык</th><th>Звездочки</th><th>Коммиты</th>'
              '<th>Добавления</th><th>Изменения</th><th>Авторы</th></tr>')
        for r in top_repos:
            print(('<tr>'
                   '<td><a href="https://github.com/{}">{}</a></td>'
                   '<td>{}</td>'
                   '<td>{}</td>'
                   '<td>{}</td>'
                   '<td>{}</td>'
                   '<td>{}</td>'
                   '<td>{}</td>'
                   '</tr>').format(r['name'], r['name'], r['language'], r['stars'],
                                   r['total_commits'], r['total_additions'],
                                   r['total_changes'], r['top']))
        print('</table>')
        print()
    else:
        print('No repos with {}'.format(where))
top_and_print(repos, ['top'], top=20)
top_and_print(repos, ['total_commits'], top=20)
top_and_print(repos, ['total_additions'], top=20)
top_and_print(repos, ['total_changes'], top=20)
top_and_print(repos, ['stars'], where={'top': 1}, top=7)
top_and_print(repos, ['total_commits'], where={'top': 1}, top=7)

total_repos = len(repos)
lang_stats = {}
# Per-language breakdown; `lang_repos` avoids shadowing the global `repos` list
for lang, lang_repos in sorted(repos_by_language.items(), key=lambda e: len(e[1]), reverse=True):
    print('<spoiler title="{}">'.format(lang))
    print()
    n = len(lang_repos)
    print('{} проект{} - {:.1f}%'.format(n, ending(n), n / total_repos * 100).replace('.', ','))
    print()
    average_top = sum(map(lambda r: r['top'], lang_repos)) / n
    print('Среднее количество авторов: {:.1f}.'.format(average_top).replace('.', ',', 1))
    lang_stats[lang] = (n, average_top)
    print()
    top_and_print(lang_repos, ['top'])
    top_and_print(lang_repos, ['total_commits'])
    top_and_print(lang_repos, ['total_additions'])
    top_and_print(lang_repos, ['total_changes'])
    top_and_print(lang_repos, ['stars'], where={'top': 1}, top=3)
    top_and_print(lang_repos, ['total_commits'], where={'top': 1}, top=3)
    print('</spoiler>')

for lang, stats in sorted(lang_stats.items(), key=lambda e: e[1][0], reverse=True):
    print('<tr><td>{}</td><td>{} - {:.1f}%</td><td>{:.1f}</td></tr>'
          .format(lang, stats[0], stats[0] / total_repos * 100, stats[1])
          .replace('.', ','))
# contrib.py — derives, for every repo, the minimal number of 'top' contributors
# that together account for THRESHOLD of the repo's activity by at least two metrics.
import json

MIN_STARS = 700
THRESHOLD = 0.9

with open('{}.json'.format(MIN_STARS)) as json_file:
    repos = json.load(json_file)

keys = ['commits', 'additions', 'changes']

def sum_contributions(contributors):
    # `contributors` is an iterable of (login, stats) pairs; sums each metric across them
    return {k: sum(map(lambda c: c[1][k], contributors)) for k in keys}
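# For example (hypothetical data):
# sum_contributions([('a', {'commits': 7, 'additions': 10, 'changes': 12}),
#                    ('b', {'commits': 3, 'additions': 5, 'changes': 6})])
# == {'commits': 10, 'additions': 15, 'changes': 18}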
for repo in repos:
    cs = list(repo['contributors'].items())
    total_sums = sum_contributions(cs)
    # Contributors sorted descending by each metric separately
    cs_by = {k: sorted(cs, key=lambda c: c[1][k], reverse=True) for k in keys}
    result_top = 0
    result_passing_criteria = []
    for top in range(len(cs)):
        for k in keys:
            top_by = cs_by[k][:top + 1]
            top_nicknames = map(lambda e: e[0], top_by)  # note: computed but never used
            top_sums = sum_contributions(top_by)
            if total_sums[k] > 0 and top_sums[k] / total_sums[k] >= THRESHOLD:
                passing_criteria = [k]
                rest_keys = list(keys)
                rest_keys.remove(k)
                for rest_key in rest_keys:
                    if total_sums[rest_key] > 0 and top_sums[rest_key] / total_sums[rest_key] >= THRESHOLD:
                        passing_criteria.append(rest_key)
                # The same `top` contributors must dominate by at least two of the three metrics
                if len(passing_criteria) >= 2 and result_top == 0:
                    result_top = top + 1
                    result_passing_criteria = passing_criteria
        if result_top != 0:
            break
    repo['top'] = result_top
    repo['criteria'] = sorted(result_passing_criteria)

with open('{}.json'.format(MIN_STARS), 'w') as json_file:
    json.dump(repos, json_file, sort_keys=True, indent=4, separators=(',', ': '))
# empty.py — records the repos whose contributor statistics came back empty,
# so that github.py can retry them with parse_empty = True.
import json

MIN_STARS = 700

with open('{}.json'.format(MIN_STARS)) as json_file:
    repos = json.load(json_file)

empty = []
for repo in repos:
    if not repo['contributors']:
        empty.append(repo['name'])

with open('{}-empty.json'.format(MIN_STARS), 'w') as json_file:
    json.dump(empty, json_file, sort_keys=True, indent=4, separators=(',', ': '))
# github.py — collects repositories above `min_stars` stars and their
# per-contributor statistics via the GitHub API (github3.py, Python 3).
from github3 import login
import json
from time import sleep

gh = login('login', 'pass')  # fill in real credentials, see the instructions below

def all_repos_by_stars(min_stars):
    repos = {}

    def add_repos(date_range):
        if gh.rate_limit()['resources']['search']['remaining'] < 10:
            print('Throttling requests, sleeping for 60 seconds')
            sleep(60)
        for res in gh.search_repositories('created:{} stars:>{}'.format(date_range, min_stars)):
            repo = res.repository
            repos[repo.full_name] = repo

    # One query per creation-date range; the one-day overlaps are harmless
    # because repos are deduplicated by full_name in the dict above.
    add_repos('<2008-01-02')
    add_repos('2008-01-01..2009-01-02')
    add_repos('2009-01-01..2010-01-02')
    add_repos('2010-01-01..2011-01-02')
    add_repos('2011-01-01..2012-01-02')
    add_repos('2012-01-01..2013-01-02')
    add_repos('2013-01-01..2014-01-02')
    add_repos('>2014-01-01')
    print('Found {} repos'.format(len(repos)))
    return repos
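# Presumably the per-year split works around the search API's cap of 1000
# results per query; a single 'stars:>700' search could not return them all.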
def parse_repos(repos):
    result = []
    for i, repo in enumerate(repos.values(), 1):
        print('{}% {}'.format(round(i / len(repos) * 100), repo.full_name))
        repo_result = {
            'name': repo.full_name,
            'language': repo.language,
            'stars': repo.stargazers,
            'total_commits': 0,
            'total_additions': 0,
            'total_changes': 0,
            'contributors': {}
        }
        iter_contrib_stats = iter(repo.iter_contributor_statistics())
        while True:
            # Sometimes a strange exception like "'NoneType' object has no
            # attribute 'get'" is thrown -- a github3.py bug or connection problems?
            try:
                contrib_stats = next(iter_contrib_stats)
                contrib_additions = 0
                contrib_changes = 0
                for week in contrib_stats.weeks:
                    additions = int(week['a'])
                    deletions = int(week['d'])
                    contrib_additions += additions - deletions  # net lines added
                    contrib_changes += additions + deletions    # gross lines touched
                repo_result['total_commits'] += contrib_stats.total
                repo_result['total_additions'] += contrib_additions
                repo_result['total_changes'] += contrib_changes
                repo_result['contributors'][contrib_stats.author.login] = {
                    'commits': contrib_stats.total,
                    'additions': contrib_additions,
                    'changes': contrib_changes
                }
            except StopIteration:
                break
            except Exception as e:
                print(e)
        result.append(repo_result)
    return result
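# Why some repos come back with an empty 'contributors' dict: GitHub computes
# contributor statistics asynchronously and may answer with an empty body until
# they are cached, which is presumably why empty.py collects such repos and
# github.py is then re-run with parse_empty = True (see the instructions below).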
if __name__ == '__main__':
    min_stars = 700
    parse_empty = True  # False for the initial full fetch; True to retry empty repos
    if parse_empty:
        with open('{}-empty.json'.format(min_stars)) as json_file:
            all_empty = json.load(json_file)
        # Each repo costs roughly two requests (metadata + stats), hence // 2
        repos_able_to_process = (gh.rate_limit()['rate']['remaining'] // 2) - 1
        print('Rate limits allow to process {} repos'.format(repos_able_to_process))
        empty_to_process = list(all_empty[:repos_able_to_process])
        repos = {}
        for i, full_name in enumerate(empty_to_process, 1):
            print('{}% {}'.format(round(i / len(empty_to_process) * 100), full_name))
            owner, repo = full_name.split('/')
            repos[full_name] = gh.repository(owner, repo)
        result = parse_repos(repos)
        with open('{}.json'.format(min_stars)) as json_file:
            old_result = json.load(json_file)
        new_repos = {repo['name']: repo for repo in old_result}
        for repo_result in result:
            name = repo_result['name']
            if repo_result['contributors']:
                new_repos[name] = repo_result
            else:
                # Still empty: keep it in the -empty list for the next run
                empty_to_process.remove(name)
        with open('{}.json'.format(min_stars), 'w') as json_file:
            json.dump(list(new_repos.values()), json_file, sort_keys=True, indent=4, separators=(',', ': '))
        for processed in empty_to_process:
            all_empty.remove(processed)
        with open('{}-empty.json'.format(min_stars), 'w') as json_file:
            json.dump(all_empty, json_file, sort_keys=True, indent=4, separators=(',', ': '))
    else:
        result = parse_repos(all_repos_by_stars(min_stars))
        with open('{}.json'.format(min_stars), 'w') as json_file:
            json.dump(result, json_file, sort_keys=True, indent=4, separators=(',', ': '))
# Standalone snippet: check how much of the API rate limit is left
from github3 import login

gh = login('login', 'pass')
print(gh.rate_limit())
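# The printed dict is GitHub's /rate_limit response; the other scripts read
# rate_limit()['resources']['search']['remaining'] and rate_limit()['rate']['remaining'].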
# to_csv.py — flattens the collected JSON into a CSV of per-repo metrics
import json
import csv

MIN_STARS = 700

with open('{}.json'.format(MIN_STARS)) as json_file:
    repos = json.load(json_file)

with open('{}.csv'.format(MIN_STARS), 'w') as csv_file:
    field_names = ['name', 'language', 'stars', 'total_commits', 'total_additions', 'total_changes', 'top', 'criteria']
    # extrasaction='ignore' drops the bulky 'contributors' dict from each row
    writer = csv.DictWriter(csv_file, field_names, extrasaction='ignore')
    writer.writeheader()
    for repo in repos:
        writer.writerow(repo)

leventov commented Sep 4, 2014

Dependency: github3.py; environment: Python 3.

  1. Put your own login and password into the login('login', 'pass') calls.
  2. Set the desired minimum number of stars in all the files, but so that the total number of repos stays below 5000, because that is GitHub's hourly request limit.
  3. Set parse_empty = False in github.py.
  4. Run github.py.
  5. Run empty.py.
  6. Set parse_empty = True and run github.py 2-3 times in a row.
  7. Run contrib.py.
  8. Run to_csv.py.
  9. Run analyze.py.
  10. Copy-paste the output straight into the Habrahabr article.

I know it's all pretty rough, but I'm too lazy to polish it.
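The whole sequence could be scripted roughly like this (a sketch only: steps 1-3 and the parse_empty flip in step 6 still have to be edited into the files by hand; the file names are the ones from the steps above):

import subprocess

for script in ['github.py', 'empty.py']:
    subprocess.check_call(['python3', script])
# flip parse_empty to True in github.py before this point
for _ in range(3):
    subprocess.check_call(['python3', 'github.py'])
for script in ['contrib.py', 'to_csv.py', 'analyze.py']:
    subprocess.check_call(['python3', script])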
