@seocam
Last active May 8, 2017 14:56

Index GitHub data in Elasticsearch

#!/usr/bin/env python3
import functools
import itertools

import requests
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

ELASTICSEARCH_IP = '<set>'
GITHUB_TOKEN = '<set>'
GITHUB_ORGANIZATION = '<set>'
GITHUB_PAGE_SIZE = 100
GITHUB_AUTH_HEADERS = {'Authorization': 'token ' + GITHUB_TOKEN}

def grouper(n, iterable):
    """Divide an iterable into groups of size n.

    The grouper pattern is documented in the itertools recipes; the last
    group is padded with None when the iterable runs out.
    """
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args)
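
# Illustrative example (not in the original gist):
# list(grouper(3, range(7))) == [(0, 1, 2), (3, 4, 5), (6, None, None)]
# The None padding on the last group is filtered out later in batch_index().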

def get_doc_field(doc, field):
    """Resolve a dotted field name such as 'assignee.login' against a dict."""
    subdoc = doc
    for part in field.split('.'):
        if not subdoc:
            return None
        subdoc = subdoc.get(part)
    return subdoc

def doc2es_action(doc, index_name, extra=None):
    """Turn a GitHub API document into an action for the bulk() helper."""
    extra_data = {}
    if extra:
        for action_name, field_name in extra.items():
            # Values wrapped in underscores (e.g. '__issue__') are literals;
            # anything else is a dotted path looked up in the document.
            if field_name.startswith('_'):
                extra_data[action_name] = field_name.strip('_')
            else:
                extra_data[action_name] = get_doc_field(doc, field_name)

    action = {
        '_type': 'document',
        '_id': doc.get('id', doc.get('sha')),
        '_index': index_name,
    }
    action.update(doc)
    action.update(extra_data)
    return action
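
# Illustrative example (hypothetical values, not in the original gist): for an
# issue {'id': 1, 'assignee': {'login': 'alice'}, 'updated_at': '2017-04-19T14:00:00Z'}
# and the extra mapping used below for issues, the resulting action contains
# '_index': 'gh-issue', '_id': 1, '@user': 'alice' and '@type': 'issue'.
# bulk() treats the underscore-prefixed keys as metadata and indexes the
# remaining keys as the document source.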

def gh_request_next_page(url):
    """Yield one page of results at a time until GitHub returns an empty page.

    Assumes `url` already ends in a query string, so '&page=N' can be appended.
    """
    page_num = 1
    while True:
        print('.', end='', flush=True)
        page = requests.get(url + '&page=' + str(page_num),
                            headers=GITHUB_AUTH_HEADERS)
        results = page.json()
        if not results:
            break
        yield results
        page_num += 1
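
# Note (not part of the original script): GitHub also advertises the last page
# in the `Link` response header; following that header is a more robust way to
# paginate than probing for an empty page.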

def gh_issues_to_actions():
    print('Getting Issues from {} organization'.format(GITHUB_ORGANIZATION))
    url = 'https://api.github.com/orgs/{}/issues'.format(GITHUB_ORGANIZATION)
    url += '?filter=all&state=all&per_page={}'.format(GITHUB_PAGE_SIZE)
    actions = []
    extra = {
        '@timestamp': 'updated_at',
        '@user': 'assignee.login',
        '@type': '__issue__',
    }
    for page in gh_request_next_page(url):
        for issue in page:
            # The issues endpoint also returns pull requests; skip them here
            # because gh_pulls_to_actions() indexes them separately.
            if issue.get('pull_request'):
                continue
            actions.append(doc2es_action(issue, 'gh-issue', extra))
    print('\n{} Issues recovered'.format(len(actions)))
    return actions

@functools.lru_cache()
def gh_get_repos():
    """List the organization's repositories (cached for reuse by callers)."""
    url = 'https://api.github.com/orgs/{}/repos'.format(GITHUB_ORGANIZATION)
    url += '?per_page={}'.format(GITHUB_PAGE_SIZE)
    repos = []
    # Paginate so organizations with more repositories than fit on one
    # page are fully covered.
    for page in gh_request_next_page(url):
        repos += page
    return repos

def gh_commits_to_actions():
    print('Getting Commits from {} organization'.format(GITHUB_ORGANIZATION))
    all_actions = []
    extra = {
        '@timestamp': 'commit.author.date',
        '@user': 'author.login',
        '@type': '__commit__',
    }
    for repo in gh_get_repos():
        url = 'https://api.github.com/repos/{}/{}/commits'.format(
            GITHUB_ORGANIZATION,
            repo['name'],
        )
        url += '?per_page={}'.format(GITHUB_PAGE_SIZE)
        for page in gh_request_next_page(url):
            for commit in page:
                all_actions.append(doc2es_action(commit, 'gh-commit', extra))
    print('\n{} Commits recovered'.format(len(all_actions)))
    return all_actions

def gh_pulls_to_actions():
    print('Getting Pull Requests from {} organization'.format(
        GITHUB_ORGANIZATION,
    ))
    all_actions = []
    extra = {
        '@timestamp': 'created_at',
        '@user': 'user.login',
        '@type': '__pull-request__',
    }
    for repo in gh_get_repos():
        url = 'https://api.github.com/repos/{}/{}/pulls'.format(
            GITHUB_ORGANIZATION,
            repo['name'],
        )
        url += '?state=all&per_page={}'.format(GITHUB_PAGE_SIZE)
        for page in gh_request_next_page(url):
            for pull in page:
                all_actions.append(doc2es_action(pull, 'gh-pull', extra))
    print('\n{} Pull Requests recovered'.format(len(all_actions)))
    return all_actions

def gh_releases_to_actions():
    print('Getting Releases from {} organization'.format(
        GITHUB_ORGANIZATION,
    ))
    all_actions = []
    extra = {
        '@timestamp': 'published_at',
        '@user': 'author.login',
        '@type': '__release__',
    }
    for repo in gh_get_repos():
        url = 'https://api.github.com/repos/{}/{}/releases'.format(
            GITHUB_ORGANIZATION,
            repo['name'],
        )
        # The releases endpoint has no state filter, so only per_page is sent.
        url += '?per_page={}'.format(GITHUB_PAGE_SIZE)
        for page in gh_request_next_page(url):
            for release in page:
                all_actions.append(doc2es_action(release, 'gh-release', extra))
    print('\n{} Releases recovered'.format(len(all_actions)))
    return all_actions

def batch_index(actions, batch_size=50):
    print('Indexing documents')
    es = Elasticsearch([ELASTICSEARCH_IP], timeout=180)
    for action_group in grouper(batch_size, actions):
        print('.', end='', flush=True)
        # Drop the None padding grouper() adds to the final group.
        action_group = list(filter(None, action_group))
        bulk(es, action_group)
    print()
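
# Note (not part of the original script): elasticsearch.helpers.bulk() also
# chunks requests internally (chunk_size=500 by default), so the manual
# batching above mainly keeps payloads small and drives the progress dots.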

def index_gh():
    issues_actions = gh_issues_to_actions()
    batch_index(issues_actions)

    pulls_actions = gh_pulls_to_actions()
    batch_index(pulls_actions)

    release_actions = gh_releases_to_actions()
    batch_index(release_actions)

    commits_actions = gh_commits_to_actions()
    batch_index(commits_actions)


if __name__ == '__main__':
    index_gh()

seocam commented Apr 19, 2017

This version indexes Releases, Issues, Pull Requests, and Commits.
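
A quick sanity check after a run (a minimal sketch, not part of the gist; it assumes the same ELASTICSEARCH_IP placeholder and the elasticsearch-py client the script already uses):

from elasticsearch import Elasticsearch

es = Elasticsearch(['<set>'], timeout=180)
for index_name in ('gh-issue', 'gh-pull', 'gh-release', 'gh-commit'):
    print(index_name, es.count(index=index_name)['count'])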
