Skip to content

Instantly share code, notes, and snippets.

@brendansudol
Created March 31, 2017 18:05
Show Gist options
  • Save brendansudol/7eadcd92df0431eb893ff0c78046b579 to your computer and use it in GitHub Desktop.
Save brendansudol/7eadcd92df0431eb893ff0c78046b579 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import glob\n",
"import json\n",
"import os\n",
"from datetime import datetime\n",
"\n",
"from github import Github, RateLimitExceededException\n",
"\n",
"GH_KEYS = [os.environ['GH_KEY{}'.format(i)] for i in range(1, 10)]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"839\n",
"['14c-prototype', '18f-bot', '18f-cli', '18f-education-discovery', '18f-reveal.js-theme']\n"
]
}
],
"source": [
"# list every repo in the 18F org, alphabetically by name\n",
"\n",
"gh = Github(GH_KEYS[0])\n",
"org = gh.get_organization('18F')\n",
"repo_names = [r.name for r in org.get_repos()]\n",
"repo_names.sort()\n",
"\n",
"# sanity check: total count plus the first few names\n",
"print(len(repo_names))\n",
"print(repo_names[:5])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"# get and save all commits (since 2016) per repo\n",
"\n",
"SINCE = datetime(2016, 1, 1)\n",
"IDX_START = 0\n",
"IDX_END = 100\n",
"\n",
"\n",
"class Extract:\n",
"    \"\"\"Download commits for 18F repos, rotating API keys on rate limits.\"\"\"\n",
"\n",
"    def __init__(self):\n",
"        # Work on a copy so the module-level GH_KEYS list is never mutated.\n",
"        self.keys = GH_KEYS[:]\n",
"        # Pop the key we start with; leaving it in the pool would make the\n",
"        # first rotation hand back the same, already rate-limited key.\n",
"        self.gh = Github(self.keys.pop(0))\n",
"\n",
"    def fetch(self, name):\n",
"        \"\"\"Return {'repo': name, 'commits': [...]} for commits since SINCE.\n",
"\n",
"        Restarts the repo from scratch with the next key whenever the\n",
"        current one is rate limited; update_key() raises once none remain.\n",
"        \"\"\"\n",
"        while True:\n",
"            try:\n",
"                repo = self.gh.get_organization('18F').get_repo(name)\n",
"                commits = repo.get_commits(since=SINCE)\n",
"                # The commit list is iterated inside the try so a rate limit\n",
"                # raised part-way through is retried as well.\n",
"                return {'repo': name, 'commits': [c.raw_data for c in commits]}\n",
"            except RateLimitExceededException:\n",
"                self.update_key()\n",
"\n",
"    def save(self, data, name):\n",
"        \"\"\"Write data as JSON to github-data/repos/<name>.json.\"\"\"\n",
"        # Create the output folder on first use so a missing directory does\n",
"        # not throw away an already-fetched result.\n",
"        os.makedirs('github-data/repos', exist_ok=True)\n",
"        with open('github-data/repos/{}.json'.format(name), 'w') as f:\n",
"            json.dump(data, f)\n",
"\n",
"    def update_key(self):\n",
"        \"\"\"Switch self.gh to the next unused key; raise if none are left.\"\"\"\n",
"        if not self.keys:\n",
"            raise RuntimeError('no more API keys to use :(')\n",
"        key = self.keys.pop(0)\n",
"        # Only show the tail of the token: notebook output is saved and shared.\n",
"        print('using new key \"...{}\"...'.format(key[-4:]))\n",
"        self.gh = Github(key)\n",
"\n",
" \n",
"# walk the selected slice of repos, writing one JSON file per repo\n",
"e = Extract()\n",
"for offset, repo in enumerate(repo_names[IDX_START:IDX_END]):\n",
"    position = offset + IDX_START\n",
"    print('{}. fetching commits for {}...'.format(position, repo))\n",
"    commits = e.fetch(repo)\n",
"    e.save(commits, repo)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# combine & save into one big dataset\n",
"\n",
"# only pick up the per-repo dumps written by Extract.save (skips stray files)\n",
"data_files = sorted(glob.glob('github-data/repos/*.json'))\n",
"results = {}\n",
"\n",
"for fname in data_files:\n",
"    with open(fname, encoding='utf-8') as f:\n",
"        d = json.load(f)\n",
"    results[d['repo']] = d['commits']\n",
"\n",
"# ensure_ascii=False emits raw non-ASCII characters, so pin the file encoding\n",
"# instead of relying on the platform default\n",
"with open('github-data/all-commits.json', 'w', encoding='utf-8') as f:\n",
"    json.dump(results, f, ensure_ascii=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
@thecapacity
Copy link

I had to look up the behavior of `self.keys = GH_KEYS[:]`.

I was pretty sure it was there to make a copy, but I had never seen the "slice as copy" idiom before. Apparently I'm not alone: http://stackoverflow.com/questions/2612802/how-to-clone-or-copy-a-list

it is a weird syntax and it does not make sense to use it ever 😂

Love seeing your code, thanks for teaching me!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment