@leventov
Created September 4, 2014 23:53
See http://habrahabr.ru/post/235689/. Usage instructions are in the first comment below.
# analyze.py — builds the Habrahabr-ready HTML tables from the collected data
import json

MIN_STARS = 700

with open('{}.json'.format(MIN_STARS)) as json_file:
    repos = json.load(json_file)

repos_by_language = {}
by_top = {}  # number of 'top' authors -> how many repos have that value
for repo in repos:
    by_top[repo['top']] = by_top.get(repo['top'], 0) + 1
    repos_by_language.setdefault(repo['language'], []).append(repo)

print('{} repos'.format(len(repos)))
for top, count in by_top.items():
    print('<tr><td>{}</td><td>{} - {:.1f}%</td></tr>'
          .format(top, count, count / len(repos) * 100)
          .replace('.', ','))  # decimal comma for the Russian article
# Russian labels for the metric keys, used in the generated table headers
russian = {
    'top': 'авторов',
    'total_commits': 'коммитов',
    'total_additions': 'добавлений',
    'total_changes': 'изменений',
    'stars': 'звездочек',
}

def ending(n):
    # Russian plural ending for words like 'проект': 1 проект, 3 проекта, 7 проектов
    if 10 < n < 20:
        return 'ов'
    if n % 10 == 1:
        return ''
    if 2 <= n % 10 <= 4:
        return 'а'
    return 'ов'
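# Illustrative check (not in the original): the branches above cover
# 12 проектов, 21 проект, 3 проекта and 7 проектов respectively.
assert (ending(12), ending(21), ending(3), ending(7)) == ('ов', '', 'а', 'ов')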
def top_and_print(repos, keys, where=None, top=5, reverse=False):
    # Prints an HTML table of the `top` repos ranked by `keys`, descending by
    # default (reverse=True flips to ascending); `where` filters on exact field values.
    sort_key = lambda r: tuple(map(lambda k: r[k], keys))
    if where:
        repos = filter(lambda r: all(map(lambda kv: r[kv[0]] == kv[1], where.items())), repos)
    sorted_repos = sorted(repos, key=sort_key, reverse=not reverse)
    top_repos = sorted_repos[:top]
    if top_repos:
        print('<h4>Топ {} проект{} по количеству {}{}</h4><table>'.format(
            top, ending(top), russian[keys[0]],
            ' с одним автором' if where else ''))
        print('<tr><th>Проект</th><th>Язык</th><th>Звездочки</th><th>Коммиты</th>'
              '<th>Добавления</th><th>Изменения</th><th>Авторы</th></tr>')
        for r in top_repos:
            print(('<tr>'
                   '<td><a href="https://github.com/{}">{}</a></td>'
                   '<td>{}</td>'
                   '<td>{}</td>'
                   '<td>{}</td>'
                   '<td>{}</td>'
                   '<td>{}</td>'
                   '<td>{}</td>'
                   '</tr>').format(r['name'], r['name'], r['language'], r['stars'],
                                   r['total_commits'], r['total_additions'],
                                   r['total_changes'], r['top']))
        print('</table>')
        print()
    else:
        print('No repos with {}'.format(where))
top_and_print(repos, ['top'], top=20)
top_and_print(repos, ['total_commits'], top=20)
top_and_print(repos, ['total_additions'], top=20)
top_and_print(repos, ['total_changes'], top=20)
top_and_print(repos, ['stars'], where={'top': 1}, top=7)
top_and_print(repos, ['total_commits'], where={'top': 1}, top=7)

total_repos = len(repos)
lang_stats = {}
# Per-language breakdown; `lang_repos` avoids shadowing the global `repos` list
for lang, lang_repos in sorted(repos_by_language.items(), key=lambda e: len(e[1]), reverse=True):
    print('<spoiler title="{}">'.format(lang))
    print()
    n = len(lang_repos)
    print('{} проект{} - {:.1f}%'.format(n, ending(n), n / total_repos * 100).replace('.', ','))
    print()
    average_top = sum(map(lambda r: r['top'], lang_repos)) / n
    print('Среднее количество авторов: {:.1f}.'.format(average_top).replace('.', ',', 1))
    lang_stats[lang] = (n, average_top)
    print()
    top_and_print(lang_repos, ['top'])
    top_and_print(lang_repos, ['total_commits'])
    top_and_print(lang_repos, ['total_additions'])
    top_and_print(lang_repos, ['total_changes'])
    top_and_print(lang_repos, ['stars'], where={'top': 1}, top=3)
    top_and_print(lang_repos, ['total_commits'], where={'top': 1}, top=3)
    print('</spoiler>')

for lang, stats in sorted(lang_stats.items(), key=lambda e: e[1][0], reverse=True):
    print('<tr><td>{}</td><td>{} - {:.1f}%</td><td>{:.1f}</td></tr>'
          .format(lang, stats[0], stats[0] / total_repos * 100, stats[1])
          .replace('.', ','))
# contrib.py — derives, for every repo, the minimal number of 'top' contributors
# that together account for THRESHOLD of the repo's activity by at least two metrics.
import json

MIN_STARS = 700
THRESHOLD = 0.9

with open('{}.json'.format(MIN_STARS)) as json_file:
    repos = json.load(json_file)

keys = ['commits', 'additions', 'changes']

def sum_contributions(contributors):
    # `contributors` is an iterable of (login, stats) pairs; sums each metric across them
    return {k: sum(map(lambda c: c[1][k], contributors)) for k in keys}
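# For example (hypothetical data):
# sum_contributions([('a', {'commits': 7, 'additions': 10, 'changes': 12}),
#                    ('b', {'commits': 3, 'additions': 5, 'changes': 6})])
# == {'commits': 10, 'additions': 15, 'changes': 18}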
for repo in repos:
    cs = list(repo['contributors'].items())
    total_sums = sum_contributions(cs)
    # Contributors sorted descending by each metric separately
    cs_by = {k: sorted(cs, key=lambda c: c[1][k], reverse=True) for k in keys}
    result_top = 0
    result_passing_criteria = []
    for top in range(len(cs)):
        for k in keys:
            top_by = cs_by[k][:top + 1]
            top_nicknames = map(lambda e: e[0], top_by)  # note: computed but never used
            top_sums = sum_contributions(top_by)
            if total_sums[k] > 0 and top_sums[k] / total_sums[k] >= THRESHOLD:
                passing_criteria = [k]
                rest_keys = list(keys)
                rest_keys.remove(k)
                for rest_key in rest_keys:
                    if total_sums[rest_key] > 0 and top_sums[rest_key] / total_sums[rest_key] >= THRESHOLD:
                        passing_criteria.append(rest_key)
                # The same `top` contributors must dominate by at least two of the three metrics
                if len(passing_criteria) >= 2 and result_top == 0:
                    result_top = top + 1
                    result_passing_criteria = passing_criteria
        if result_top != 0:
            break
    repo['top'] = result_top
    repo['criteria'] = sorted(result_passing_criteria)

with open('{}.json'.format(MIN_STARS), 'w') as json_file:
    json.dump(repos, json_file, sort_keys=True, indent=4, separators=(',', ': '))
# empty.py — records the repos whose contributor statistics came back empty,
# so that github.py can retry them with parse_empty = True.
import json

MIN_STARS = 700

with open('{}.json'.format(MIN_STARS)) as json_file:
    repos = json.load(json_file)

empty = []
for repo in repos:
    if not repo['contributors']:
        empty.append(repo['name'])

with open('{}-empty.json'.format(MIN_STARS), 'w') as json_file:
    json.dump(empty, json_file, sort_keys=True, indent=4, separators=(',', ': '))
# github.py — collects repositories above `min_stars` stars and their
# per-contributor statistics via the GitHub API (github3.py, Python 3).
from github3 import login
import json
from time import sleep

gh = login('login', 'pass')  # fill in real credentials, see the instructions below

def all_repos_by_stars(min_stars):
    repos = {}

    def add_repos(date_range):
        if gh.rate_limit()['resources']['search']['remaining'] < 10:
            print('Throttling requests, sleeping for 60 seconds')
            sleep(60)
        for res in gh.search_repositories('created:{} stars:>{}'.format(date_range, min_stars)):
            repo = res.repository
            repos[repo.full_name] = repo

    # One query per creation-date range; the one-day overlaps are harmless
    # because repos are deduplicated by full_name in the dict above.
    add_repos('<2008-01-02')
    add_repos('2008-01-01..2009-01-02')
    add_repos('2009-01-01..2010-01-02')
    add_repos('2010-01-01..2011-01-02')
    add_repos('2011-01-01..2012-01-02')
    add_repos('2012-01-01..2013-01-02')
    add_repos('2013-01-01..2014-01-02')
    add_repos('>2014-01-01')
    print('Found {} repos'.format(len(repos)))
    return repos
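# Presumably the per-year split works around the search API's cap of 1000
# results per query; a single 'stars:>700' search could not return them all.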
def parse_repos(repos):
    result = []
    for i, repo in enumerate(repos.values(), 1):
        print('{}% {}'.format(round(i / len(repos) * 100), repo.full_name))
        repo_result = {
            'name': repo.full_name,
            'language': repo.language,
            'stars': repo.stargazers,
            'total_commits': 0,
            'total_additions': 0,
            'total_changes': 0,
            'contributors': {}
        }
        iter_contrib_stats = iter(repo.iter_contributor_statistics())
        while True:
            # Sometimes a strange exception like "'NoneType' object has no
            # attribute 'get'" is thrown -- a github3.py bug or connection problems?
            try:
                contrib_stats = next(iter_contrib_stats)
                contrib_additions = 0
                contrib_changes = 0
                for week in contrib_stats.weeks:
                    additions = int(week['a'])
                    deletions = int(week['d'])
                    contrib_additions += additions - deletions  # net lines added
                    contrib_changes += additions + deletions    # gross lines touched
                repo_result['total_commits'] += contrib_stats.total
                repo_result['total_additions'] += contrib_additions
                repo_result['total_changes'] += contrib_changes
                repo_result['contributors'][contrib_stats.author.login] = {
                    'commits': contrib_stats.total,
                    'additions': contrib_additions,
                    'changes': contrib_changes
                }
            except StopIteration:
                break
            except Exception as e:
                print(e)
        result.append(repo_result)
    return result
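# Why some repos come back with an empty 'contributors' dict: GitHub computes
# contributor statistics asynchronously and may answer with an empty body until
# they are cached, which is presumably why empty.py collects such repos and
# github.py is then re-run with parse_empty = True (see the instructions below).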
if __name__ == '__main__':
    min_stars = 700
    parse_empty = True  # False for the initial full fetch; True to retry empty repos
    if parse_empty:
        with open('{}-empty.json'.format(min_stars)) as json_file:
            all_empty = json.load(json_file)
        # Each repo costs roughly two requests (metadata + stats), hence // 2
        repos_able_to_process = (gh.rate_limit()['rate']['remaining'] // 2) - 1
        print('Rate limits allow to process {} repos'.format(repos_able_to_process))
        empty_to_process = list(all_empty[:repos_able_to_process])
        repos = {}
        for i, full_name in enumerate(empty_to_process, 1):
            print('{}% {}'.format(round(i / len(empty_to_process) * 100), full_name))
            owner, repo = full_name.split('/')
            repos[full_name] = gh.repository(owner, repo)
        result = parse_repos(repos)
        with open('{}.json'.format(min_stars)) as json_file:
            old_result = json.load(json_file)
        new_repos = {repo['name']: repo for repo in old_result}
        for repo_result in result:
            name = repo_result['name']
            if repo_result['contributors']:
                new_repos[name] = repo_result
            else:
                # Still empty: keep it in the -empty list for the next run
                empty_to_process.remove(name)
        with open('{}.json'.format(min_stars), 'w') as json_file:
            json.dump(list(new_repos.values()), json_file, sort_keys=True, indent=4, separators=(',', ': '))
        for processed in empty_to_process:
            all_empty.remove(processed)
        with open('{}-empty.json'.format(min_stars), 'w') as json_file:
            json.dump(all_empty, json_file, sort_keys=True, indent=4, separators=(',', ': '))
    else:
        result = parse_repos(all_repos_by_stars(min_stars))
        with open('{}.json'.format(min_stars), 'w') as json_file:
            json.dump(result, json_file, sort_keys=True, indent=4, separators=(',', ': '))
# Standalone snippet: check how much of the API rate limit is left
from github3 import login

gh = login('login', 'pass')
print(gh.rate_limit())
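# The printed dict is GitHub's /rate_limit response; the other scripts read
# rate_limit()['resources']['search']['remaining'] and rate_limit()['rate']['remaining'].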
# to_csv.py — flattens the collected JSON into a CSV of per-repo metrics
import json
import csv

MIN_STARS = 700

with open('{}.json'.format(MIN_STARS)) as json_file:
    repos = json.load(json_file)

with open('{}.csv'.format(MIN_STARS), 'w') as csv_file:
    field_names = ['name', 'language', 'stars', 'total_commits', 'total_additions', 'total_changes', 'top', 'criteria']
    # extrasaction='ignore' drops the bulky 'contributors' dict from each row
    writer = csv.DictWriter(csv_file, field_names, extrasaction='ignore')
    writer.writeheader()
    for repo in repos:
        writer.writerow(repo)

leventov commented Sep 4, 2014

Dependency: github3.py; environment: Python 3.

  1. Put your own login and password into the login('login', 'pass') calls.
  2. Set the desired minimum number of stars in all the files, but so that the total number of repos stays below 5000, because that is GitHub's hourly request limit.
  3. Set parse_empty = False in github.py.
  4. Run github.py.
  5. Run empty.py.
  6. Set parse_empty = True and run github.py 2-3 times in a row.
  7. Run contrib.py.
  8. Run to_csv.py.
  9. Run analyze.py.
  10. Copy-paste the output straight into the Habrahabr article.

I know it's all pretty rough, but I'm too lazy to polish it.
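The whole sequence could be scripted roughly like this (a sketch only: steps 1-3 and the parse_empty flip in step 6 still have to be edited into the files by hand; the file names are the ones from the steps above):

import subprocess

for script in ['github.py', 'empty.py']:
    subprocess.check_call(['python3', script])
# flip parse_empty to True in github.py before this point
for _ in range(3):
    subprocess.check_call(['python3', 'github.py'])
for script in ['contrib.py', 'to_csv.py', 'analyze.py']:
    subprocess.check_call(['python3', script])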
