@seocam
Last active May 8, 2017 14:56

Index GitHub data in Elasticsearch

#!/usr/bin/env python3
import functools
import itertools

import requests
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

ELASTICSEARCH_IP = '<set>'
GITHUB_TOKEN = '<set>'
GITHUB_ORGANIZATION = '<set>'
GITHUB_PAGE_SIZE = 100
GITHUB_AUTH_HEADERS = {'Authorization': 'token ' + GITHUB_TOKEN}

def grouper(n, iterable):
    """Divide an iterable into groups of size n.

    The grouper pattern is documented in the itertools recipes; the last
    group is padded with None when the iterable runs out.
    """
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args)
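
# Illustrative example (not in the original gist):
# list(grouper(3, range(7))) == [(0, 1, 2), (3, 4, 5), (6, None, None)]
# The None padding on the last group is filtered out later in batch_index().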

def get_doc_field(doc, field):
    """Resolve a dotted field name such as 'assignee.login' against a dict."""
    subdoc = doc
    for part in field.split('.'):
        if not subdoc:
            return None
        subdoc = subdoc.get(part)
    return subdoc

def doc2es_action(doc, index_name, extra=None):
    """Turn a GitHub API document into an action for the bulk() helper."""
    extra_data = {}
    if extra:
        for action_name, field_name in extra.items():
            # Values wrapped in underscores (e.g. '__issue__') are literals;
            # anything else is a dotted path looked up in the document.
            if field_name.startswith('_'):
                extra_data[action_name] = field_name.strip('_')
            else:
                extra_data[action_name] = get_doc_field(doc, field_name)

    action = {
        '_type': 'document',
        '_id': doc.get('id', doc.get('sha')),
        '_index': index_name,
    }
    action.update(doc)
    action.update(extra_data)
    return action
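
# Illustrative example (hypothetical values, not in the original gist): for an
# issue {'id': 1, 'assignee': {'login': 'alice'}, 'updated_at': '2017-04-19T14:00:00Z'}
# and the extra mapping used below for issues, the resulting action contains
# '_index': 'gh-issue', '_id': 1, '@user': 'alice' and '@type': 'issue'.
# bulk() treats the underscore-prefixed keys as metadata and indexes the
# remaining keys as the document source.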

def gh_request_next_page(url):
    """Yield one page of results at a time until GitHub returns an empty page.

    Assumes `url` already ends in a query string, so '&page=N' can be appended.
    """
    page_num = 1
    while True:
        print('.', end='', flush=True)
        page = requests.get(url + '&page=' + str(page_num),
                            headers=GITHUB_AUTH_HEADERS)
        results = page.json()
        if not results:
            break
        yield results
        page_num += 1
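
# Note (not part of the original script): GitHub also advertises the last page
# in the `Link` response header; following that header is a more robust way to
# paginate than probing for an empty page.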

def gh_issues_to_actions():
    print('Getting Issues from {} organization'.format(GITHUB_ORGANIZATION))
    url = 'https://api.github.com/orgs/{}/issues'.format(GITHUB_ORGANIZATION)
    url += '?filter=all&state=all&per_page={}'.format(GITHUB_PAGE_SIZE)
    actions = []
    extra = {
        '@timestamp': 'updated_at',
        '@user': 'assignee.login',
        '@type': '__issue__',
    }
    for page in gh_request_next_page(url):
        for issue in page:
            # The issues endpoint also returns pull requests; skip them here
            # because gh_pulls_to_actions() indexes them separately.
            if issue.get('pull_request'):
                continue
            actions.append(doc2es_action(issue, 'gh-issue', extra))
    print('\n{} Issues recovered'.format(len(actions)))
    return actions

@functools.lru_cache()
def gh_get_repos():
    """List the organization's repositories (cached for reuse by callers)."""
    url = 'https://api.github.com/orgs/{}/repos'.format(GITHUB_ORGANIZATION)
    url += '?per_page={}'.format(GITHUB_PAGE_SIZE)
    repos = []
    # Paginate so organizations with more repositories than fit on one
    # page are fully covered.
    for page in gh_request_next_page(url):
        repos += page
    return repos

def gh_commits_to_actions():
    print('Getting Commits from {} organization'.format(GITHUB_ORGANIZATION))
    all_actions = []
    extra = {
        '@timestamp': 'commit.author.date',
        '@user': 'author.login',
        '@type': '__commit__',
    }
    for repo in gh_get_repos():
        url = 'https://api.github.com/repos/{}/{}/commits'.format(
            GITHUB_ORGANIZATION,
            repo['name'],
        )
        url += '?per_page={}'.format(GITHUB_PAGE_SIZE)
        for page in gh_request_next_page(url):
            for commit in page:
                all_actions.append(doc2es_action(commit, 'gh-commit', extra))
    print('\n{} Commits recovered'.format(len(all_actions)))
    return all_actions

def gh_pulls_to_actions():
    print('Getting Pull Requests from {} organization'.format(
        GITHUB_ORGANIZATION,
    ))
    all_actions = []
    extra = {
        '@timestamp': 'created_at',
        '@user': 'user.login',
        '@type': '__pull-request__',
    }
    for repo in gh_get_repos():
        url = 'https://api.github.com/repos/{}/{}/pulls'.format(
            GITHUB_ORGANIZATION,
            repo['name'],
        )
        url += '?state=all&per_page={}'.format(GITHUB_PAGE_SIZE)
        for page in gh_request_next_page(url):
            for pull in page:
                all_actions.append(doc2es_action(pull, 'gh-pull', extra))
    print('\n{} Pull Requests recovered'.format(len(all_actions)))
    return all_actions

def gh_releases_to_actions():
    print('Getting Releases from {} organization'.format(
        GITHUB_ORGANIZATION,
    ))
    all_actions = []
    extra = {
        '@timestamp': 'published_at',
        '@user': 'author.login',
        '@type': '__release__',
    }
    for repo in gh_get_repos():
        url = 'https://api.github.com/repos/{}/{}/releases'.format(
            GITHUB_ORGANIZATION,
            repo['name'],
        )
        # The releases endpoint has no state filter, so only per_page is sent.
        url += '?per_page={}'.format(GITHUB_PAGE_SIZE)
        for page in gh_request_next_page(url):
            for release in page:
                all_actions.append(doc2es_action(release, 'gh-release', extra))
    print('\n{} Releases recovered'.format(len(all_actions)))
    return all_actions

def batch_index(actions, batch_size=50):
    print('Indexing documents')
    es = Elasticsearch([ELASTICSEARCH_IP], timeout=180)
    for action_group in grouper(batch_size, actions):
        print('.', end='', flush=True)
        # Drop the None padding grouper() adds to the final group.
        action_group = list(filter(None, action_group))
        bulk(es, action_group)
    print()
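
# Note (not part of the original script): elasticsearch.helpers.bulk() also
# chunks requests internally (chunk_size=500 by default), so the manual
# batching above mainly keeps payloads small and drives the progress dots.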

def index_gh():
    issues_actions = gh_issues_to_actions()
    batch_index(issues_actions)

    pulls_actions = gh_pulls_to_actions()
    batch_index(pulls_actions)

    release_actions = gh_releases_to_actions()
    batch_index(release_actions)

    commits_actions = gh_commits_to_actions()
    batch_index(commits_actions)


if __name__ == '__main__':
    index_gh()

seocam commented Apr 19, 2017

This version indexes Releases, Issues, Pull Requests, and Commits.
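
A quick sanity check after a run (a minimal sketch, not part of the gist; it assumes the same ELASTICSEARCH_IP placeholder and the elasticsearch-py client the script already uses):

from elasticsearch import Elasticsearch

es = Elasticsearch(['<set>'], timeout=180)
for index_name in ('gh-issue', 'gh-pull', 'gh-release', 'gh-commit'):
    print(index_name, es.count(index=index_name)['count'])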
