See http://habrahabr.ru/post/235689/. Usage instructions are in the first comment.
# analyze.py (step 9 below): aggregate the collected data and print HTML
# fragments (tables and <spoiler> blocks) ready to paste into the Habrahabr article.
import json

MIN_STARS = 700

with open('{}.json'.format(MIN_STARS)) as json_file:
    repos = json.load(json_file)

# Group repos by language and count how many repos have each 'top' value
# (the minimal number of authors covering most of the activity, see contrib.py).
repos_by_language = {}
by_top = {}
for repo in repos:
    by_top[repo['top']] = by_top.get(repo['top'], 0) + 1
    repos_by_language.setdefault(repo['language'], []).append(repo)

print('{} repos'.format(len(repos)))
for top, count in by_top.items():
    print('<tr><td>{}</td><td>{} - {:.1f}%</td></tr>'.format(
        top, count, count / len(repos) * 100).replace('.', ','))

# Russian genitive plurals used in the table headings.
russian = {
    'top': 'авторов',
    'total_commits': 'коммитов',
    'total_additions': 'добавлений',
    'total_changes': 'изменений',
    'stars': 'звездочек',
}


def ending(n):
    """Russian plural ending for words like 'проект' after the number n."""
    if 10 < n < 20:
        return 'ов'
    if n % 10 == 1:
        return ''
    if 2 <= n % 10 <= 4:
        return 'а'
    return 'ов'


def top_and_print(repos, keys, where=None, top=5, reverse=False):
    """Print an HTML table of the top repos sorted by the given keys (descending by default)."""
    sort_key = lambda r: tuple(map(lambda k: r[k], keys))
    if where:
        repos = filter(lambda r: all(map(lambda kv: r[kv[0]] == kv[1], where.items())), repos)
    sorted_repos = sorted(repos, key=sort_key, reverse=not reverse)
    top_repos = sorted_repos[:top]
    if top_repos:
        print('<h4>Топ {} проект{} по количеству {}{}</h4><table>'.format(
            top, ending(top), russian[keys[0]], ' с одним автором' if where else ''))
        print('<tr><th>Проект</th><th>Язык</th><th>Звездочки</th><th>Коммиты</th>'
              '<th>Добавления</th><th>Изменения</th><th>Авторы</th></tr>')
        for r in top_repos:
            print(('<tr>'
                   '<td><a href="https://github.com/{}">{}</a></td>'
                   '<td>{}</td>'
                   '<td>{}</td>'
                   '<td>{}</td>'
                   '<td>{}</td>'
                   '<td>{}</td>'
                   '<td>{}</td>'
                   '</tr>').format(r['name'], r['name'], r['language'], r['stars'], r['total_commits'],
                                   r['total_additions'], r['total_changes'], r['top']))
        print('</table>')
        print()
    else:
        print('No repos with {}'.format(where))


# Overall tops across all languages.
top_and_print(repos, ['top'], top=20)
top_and_print(repos, ['total_commits'], top=20)
top_and_print(repos, ['total_additions'], top=20)
top_and_print(repos, ['total_changes'], top=20)
top_and_print(repos, ['stars'], where={'top': 1}, top=7)
top_and_print(repos, ['total_commits'], where={'top': 1}, top=7)

# Per-language breakdown, most popular languages first.
total_repos = len(repos)
lang_stats = {}
for lang, lang_repos in sorted(repos_by_language.items(), key=lambda e: len(e[1]), reverse=True):
    print('<spoiler title="{}">'.format(lang))
    print()
    n = len(lang_repos)
    print('{} проект{} - {:.1f}%'.format(n, ending(n), n / total_repos * 100).replace('.', ','))
    print()
    average_top = sum(map(lambda r: r['top'], lang_repos)) / n
    print('Среднее количество авторов: {:.1f}.'.format(average_top).replace('.', ',', 1))
    lang_stats[lang] = (n, average_top)
    print()
    top_and_print(lang_repos, ['top'])
    top_and_print(lang_repos, ['total_commits'])
    top_and_print(lang_repos, ['total_additions'])
    top_and_print(lang_repos, ['total_changes'])
    top_and_print(lang_repos, ['stars'], where={'top': 1}, top=3)
    top_and_print(lang_repos, ['total_commits'], where={'top': 1}, top=3)
    print('</spoiler>')

# Summary table: repos per language and average number of authors.
for lang, stats in sorted(lang_stats.items(), key=lambda e: e[1][0], reverse=True):
    print('<tr><td>{}</td><td>{} - {:.1f}%</td><td>{:.1f}</td></tr>'.format(
        lang, stats[0], stats[0] / total_repos * 100, stats[1]).replace('.', ','))
# contrib.py (step 7 below): for every repo, find the smallest number of top
# contributors that accounts for at least THRESHOLD of the repo's activity by
# at least two of the three metrics; store it as 'top' and the metrics as 'criteria'.
import json

MIN_STARS = 700
THRESHOLD = 0.9

with open('{}.json'.format(MIN_STARS)) as json_file:
    repos = json.load(json_file)

keys = ['commits', 'additions', 'changes']


def sum_contributions(contributors):
    """Sum each metric over a collection of (login, stats) pairs."""
    return {k: sum(map(lambda c: c[1][k], contributors)) for k in keys}


for repo in repos:
    cs = repo['contributors'].items()
    total_sums = sum_contributions(cs)
    # Contributors sorted by each metric separately, biggest first.
    cs_by = {k: sorted(cs, key=lambda c: c[1][k], reverse=True) for k in keys}
    result_top = 0
    result_passing_criteria = []
    for top in range(len(cs)):
        for k in keys:
            top_by = cs_by[k][:top + 1]
            top_nicknames = map(lambda e: e[0], top_by)
            top_sums = sum_contributions(top_by)
            if total_sums[k] > 0 and top_sums[k] / total_sums[k] >= THRESHOLD:
                passing_criteria = [k]
                rest_keys = list(keys)
                rest_keys.remove(k)
                for rest_key in rest_keys:
                    if total_sums[rest_key] > 0 and top_sums[rest_key] / total_sums[rest_key] >= THRESHOLD:
                        passing_criteria.append(rest_key)
                if len(passing_criteria) >= 2 and result_top == 0:
                    result_top = top + 1
                    result_passing_criteria = passing_criteria
        if result_top != 0:
            break
    repo['top'] = result_top
    repo['criteria'] = sorted(result_passing_criteria)

with open('{}.json'.format(MIN_STARS), 'w') as json_file:
    json.dump(repos, json_file, sort_keys=True, indent=4, separators=(',', ': '))
# empty.py (step 5 below): list repos for which GitHub returned no contributor
# statistics, so that github.py can retry them with parse_empty = True.
import json

MIN_STARS = 700

repos = json.load(open('{}.json'.format(MIN_STARS)))

empty = []
for repo in repos:
    if not repo['contributors']:
        empty.append(repo['name'])

with open('{}-empty.json'.format(MIN_STARS), 'w') as json_file:
    json.dump(empty, json_file, sort_keys=True, indent=4, separators=(',', ': '))
# github.py (steps 3-6 below): crawl repositories above the star threshold and
# collect per-contributor statistics via the GitHub API (github3.py).
from github3 import login
import json
from time import sleep

gh = login('login', 'pass')


def all_repos_by_stars(min_stars):
    repos = {}

    def add_repos(date_range):
        if gh.rate_limit()['resources']['search']['remaining'] < 10:
            print('Throttling requests, sleeping for 60 seconds')
            sleep(60)
        for res in gh.search_repositories('created:{} stars:>{}'.format(date_range, min_stars)):
            repo = res.repository
            repos[repo.full_name] = repo

    # Search results are capped per query, so the crawl is split by creation date.
    add_repos('<2008-01-02')
    add_repos('2008-01-01..2009-01-02')
    add_repos('2009-01-01..2010-01-02')
    add_repos('2010-01-01..2011-01-02')
    add_repos('2011-01-01..2012-01-02')
    add_repos('2012-01-01..2013-01-02')
    add_repos('2013-01-01..2014-01-02')
    add_repos('>2014-01-01')
    print('Found {} repos'.format(len(repos)))
    return repos


def parse_repos(repos):
    result = []
    for i, repo in enumerate(repos.values(), 1):
        print('{}% {}'.format(round(i / len(repos) * 100), repo.full_name))
        repo_result = {
            'name': repo.full_name,
            'language': repo.language,
            'stars': repo.stargazers,
            'total_commits': 0,
            'total_additions': 0,
            'total_changes': 0,
            'contributors': {}
        }
        iter_contrib_stats = iter(repo.iter_contributor_statistics())
        while True:
            # Sometimes a strange exception such as "'NoneType' object has no attribute 'get'"
            # is thrown -- a github3.py bug or connection problems?
            try:
                contrib_stats = next(iter_contrib_stats)
                contrib_additions = 0
                contrib_changes = 0
                for week in contrib_stats.weeks:
                    additions = int(week['a'])
                    deletions = int(week['d'])
                    contrib_additions += additions - deletions
                    contrib_changes += additions + deletions
                repo_result['total_commits'] += contrib_stats.total
                repo_result['total_additions'] += contrib_additions
                repo_result['total_changes'] += contrib_changes
                repo_result['contributors'][contrib_stats.author.login] = {
                    'commits': contrib_stats.total,
                    'additions': contrib_additions,
                    'changes': contrib_changes
                }
            except StopIteration:
                break
            except Exception as e:
                print(e)
        result.append(repo_result)
    return result


if __name__ == '__main__':
    min_stars = 700
    parse_empty = True
    if parse_empty:
        # Retry only the repos listed by empty.py, as many as the rate limit allows,
        # and merge the results back into the main JSON file.
        with open('{}-empty.json'.format(min_stars)) as json_file:
            all_empty = json.load(json_file)
        repos_able_to_process = (gh.rate_limit()['rate']['remaining'] // 2) - 1
        print('Rate limits allow to process {} repos'.format(repos_able_to_process))
        empty_to_process = list(all_empty[:repos_able_to_process])
        repos = {}
        for i, full_name in enumerate(empty_to_process):
            print('{}% {}'.format(round(i / len(empty_to_process) * 100), full_name))
            owner, repo = full_name.split('/')
            repos[full_name] = gh.repository(owner, repo)
        result = parse_repos(repos)
        with open('{}.json'.format(min_stars)) as json_file:
            old_result = json.load(json_file)
        new_repos = {repo['name']: repo for repo in old_result}
        for repo_result in result:
            name = repo_result['name']
            if repo_result['contributors']:
                new_repos[name] = repo_result
            else:
                empty_to_process.remove(name)
        with open('{}.json'.format(min_stars), 'w') as json_file:
            json.dump(list(new_repos.values()), json_file, sort_keys=True, indent=4, separators=(',', ': '))
        for processed in empty_to_process:
            all_empty.remove(processed)
        with open('{}-empty.json'.format(min_stars), 'w') as json_file:
            json.dump(all_empty, json_file, sort_keys=True, indent=4, separators=(',', ': '))
    else:
        # Initial crawl of everything above the star threshold.
        result = parse_repos(all_repos_by_stars(min_stars))
        with open('{}.json'.format(min_stars), 'w') as json_file:
            json.dump(result, json_file, sort_keys=True, indent=4, separators=(',', ': '))
# Standalone helper: print the current GitHub API rate limits.
from github3 import login

gh = login('login', 'pass')
print(gh.rate_limit())
# to_csv.py (step 8 below): flatten the per-repo summary into a CSV file.
import json
import csv

MIN_STARS = 700

repos = json.load(open('{}.json'.format(MIN_STARS)))

with open('{}.csv'.format(MIN_STARS), 'w') as csv_file:
    field_names = ['name', 'language', 'stars', 'total_commits', 'total_additions', 'total_changes', 'top', 'criteria']
    writer = csv.DictWriter(csv_file, field_names, extrasaction='ignore')
    writer.writeheader()
    for repo in repos:
        writer.writerow(repo)
leventov commented Sep 4, 2014

The only dependency is github3.py; the environment is Python 3.

  1. Enter your login and password in the login('login', 'pass') calls
  2. Set the desired minimum number of stars in all the files, but so that the total number of repos does not exceed 5000, because that is GitHub's limit on requests per hour
  3. Set parse_empty = False in github.py
  4. Run github.py
  5. Run empty.py
  6. Set parse_empty = True and run github.py 2-3 times in a row
  7. Run contrib.py
  8. Run to_csv.py
  9. Run analyze.py (steps 7-9 can also be chained, as in the sketch below)
  10. Copy-paste the output straight into the Habr article

I know it's all very rough, but I'm too lazy to polish it.
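
A minimal driver sketch for chaining the post-crawl steps 7-9, assuming the scripts are saved under the file names used in the steps above and that the crawl (steps 1-6, which require editing parse_empty by hand) has already finished; the article.html output name is only an illustration:

import subprocess
import sys


def run(script, output=None):
    # Run one pipeline stage with the current interpreter; abort on the first failure.
    print('=== {} ==='.format(script), file=sys.stderr)
    if output:
        with open(output, 'w') as out:
            subprocess.check_call([sys.executable, script], stdout=out)
    else:
        subprocess.check_call([sys.executable, script])


run('contrib.py')                          # step 7: compute 'top' and 'criteria'
run('to_csv.py')                           # step 8: export the summary to CSV
run('analyze.py', output='article.html')   # step 9: collect the HTML fragments for the article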
