coyotebush/github_archive_query.sql

## github_archive_query.sql
/*
 * Fetch data from GitHub Archive using Google's BigQuery.
 *
 * Produces one row per (actor, repository_language) pair with the number of
 * PushEvent rows recorded for that pair during calendar year 2012.
 * Rows with an empty repository_language are ignored.
 */
SELECT
    actor,
    repository_language,
    COUNT(repository_language) AS pushes
FROM [githubarchive:github.timeline]
WHERE type = 'PushEvent'
    AND repository_language != ''
    /* Half-open window: 2012-01-01 inclusive up to 2013-01-01 exclusive. */
    AND PARSE_UTC_USEC(created_at) >= PARSE_UTC_USEC('2012-01-01 00:00:00')
    AND PARSE_UTC_USEC(created_at) < PARSE_UTC_USEC('2013-01-01 00:00:00')
GROUP BY actor, repository_language;

## github_language_correlations.py
'''Calculate and plot the correlations between the most popular languages on GitHub.

Details on my blog: http://datahackermd.com/2013/language-use-on-github/

'''
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


def plot_correlation(dataframe, filename, title='', corr_type=''):
    '''Draw a correlation matrix as a heatmap and save it to *filename*.

    Args:
        dataframe: DataFrame whose values are drawn as coloured cells and
            whose column names label both axes (so it is expected to be
            square, e.g. the result of ``DataFrame.corr()``).
        filename: Output path passed to ``savefig``.
        title: Text placed above the plot.
        corr_type: Label attached to the colour bar.

    The figure is closed once it has been written, so repeated calls do not
    accumulate open figures in pyplot's global registry.
    '''
    lang_names = dataframe.columns.tolist()
    # Offset by 0.5 so each label sits in the middle of its cell rather
    # than on the cell edge.
    tick_indices = np.arange(0.5, len(lang_names) + 0.5)
    figure = plt.figure()
    try:
        # The colour scale is fixed to [-0.2, 0.2]; values outside that
        # range are drawn with the end colours of the map.
        plt.pcolor(dataframe.values, cmap='RdBu', vmin=-.2, vmax=.2)
        colorbar = plt.colorbar()
        colorbar.set_label(corr_type)
        plt.title(title)
        plt.xticks(tick_indices, lang_names, rotation='vertical')
        plt.yticks(tick_indices, lang_names)
        plt.savefig(filename)
    finally:
        # Release the figure even if drawing or saving failed.
        plt.close(figure)


def _popular_languages(pushes, min_pushes=50000):
    '''Keep the columns of *pushes* whose total exceeds *min_pushes*; NaN becomes 0.'''
    # Boolean column mask instead of DataFrame.select, which was removed in
    # pandas 1.0. Series.sum skips NaN, matching the former np.sum call.
    column_totals = pushes.sum()
    return pushes.loc[:, column_totals > min_pushes].fillna(0)


def main():
    '''Plot Pearson and Spearman correlations between popular languages.

    Reads ``stacked_language_by_user.csv`` (columns ``actor``,
    ``repository_language`` and ``pushes``), pivots it into an
    actor-by-language table, keeps the languages with more than 50000
    pushes in total and writes one SVG heatmap per correlation method.
    '''
    pushes = pd.read_csv('stacked_language_by_user.csv').pivot(
        index='actor',
        columns='repository_language',
        values='pushes')

    popular = _popular_languages(pushes)

    plots = (
        ('pearson',
         'pearson_language_correlation.svg',
         'Pearson\'s Correlation'),
        ('spearman',
         'spearman_language_correlation.svg',
         'Spearman\'s Rank Correlation'),
    )
    for method, filename, corr_type in plots:
        plot_correlation(
            popular.corr(method=method),
            filename,
            title='2012 GitHub Language Correlations',
            corr_type=corr_type)


# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
	/*
	 * Fetch data from GitHub Archive using Google's BigQuery.
	 *
	 * Produces one row per (actor, repository_language) pair with the number
	 * of PushEvent rows recorded for that pair during calendar year 2012.
	 * Rows with an empty repository_language are ignored.
	 */
	SELECT
		actor,
		repository_language,
		COUNT(repository_language) AS pushes
	FROM [githubarchive:github.timeline]
	WHERE type = 'PushEvent'
		AND repository_language != ''
		/* Half-open window: 2012-01-01 inclusive up to 2013-01-01 exclusive. */
		AND PARSE_UTC_USEC(created_at) >= PARSE_UTC_USEC('2012-01-01 00:00:00')
		AND PARSE_UTC_USEC(created_at) < PARSE_UTC_USEC('2013-01-01 00:00:00')
	GROUP BY actor, repository_language;
	'''Calculate and plot the correlations between the most popular languages on GitHub.

	Details on my blog: http://datahackermd.com/2013/language-use-on-github/

	'''
	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd


	def plot_correlation(dataframe, filename, title='', corr_type=''):
		'''Draw a correlation matrix as a heatmap and save it to *filename*.

		Args:
			dataframe: DataFrame whose values are drawn as coloured cells
				and whose column names label both axes (so it is expected
				to be square, e.g. the result of ``DataFrame.corr()``).
			filename: Output path passed to ``savefig``.
			title: Text placed above the plot.
			corr_type: Label attached to the colour bar.

		The figure is closed once it has been written, so repeated calls do
		not accumulate open figures in pyplot's global registry.
		'''
		lang_names = dataframe.columns.tolist()
		# Offset by 0.5 so each label sits in the middle of its cell rather
		# than on the cell edge.
		tick_indices = np.arange(0.5, len(lang_names) + 0.5)
		figure = plt.figure()
		try:
			# The colour scale is fixed to [-0.2, 0.2]; values outside that
			# range are drawn with the end colours of the map.
			plt.pcolor(dataframe.values, cmap='RdBu', vmin=-.2, vmax=.2)
			colorbar = plt.colorbar()
			colorbar.set_label(corr_type)
			plt.title(title)
			plt.xticks(tick_indices, lang_names, rotation='vertical')
			plt.yticks(tick_indices, lang_names)
			plt.savefig(filename)
		finally:
			# Release the figure even if drawing or saving failed.
			plt.close(figure)


	def _popular_languages(pushes, min_pushes=50000):
		'''Keep the columns of *pushes* whose total exceeds *min_pushes*; NaN becomes 0.'''
		# Boolean column mask instead of DataFrame.select, which was removed
		# in pandas 1.0. Series.sum skips NaN, matching the former np.sum call.
		column_totals = pushes.sum()
		return pushes.loc[:, column_totals > min_pushes].fillna(0)


	def main():
		'''Plot Pearson and Spearman correlations between popular languages.

		Reads ``stacked_language_by_user.csv`` (columns ``actor``,
		``repository_language`` and ``pushes``), pivots it into an
		actor-by-language table, keeps the languages with more than 50000
		pushes in total and writes one SVG heatmap per correlation method.
		'''
		pushes = pd.read_csv('stacked_language_by_user.csv').pivot(
			index='actor',
			columns='repository_language',
			values='pushes')

		popular = _popular_languages(pushes)

		plots = (
			('pearson',
			 'pearson_language_correlation.svg',
			 'Pearson\'s Correlation'),
			('spearman',
			 'spearman_language_correlation.svg',
			 'Spearman\'s Rank Correlation'),
		)
		for method, filename, corr_type in plots:
			plot_correlation(
				popular.corr(method=method),
				filename,
				title='2012 GitHub Language Correlations',
				corr_type=corr_type)


	# Run the analysis only when this file is executed as a script, not when
	# it is imported as a module.
	if __name__ == '__main__':
		main()