Instantly share code, notes, and snippets.

Embed
What would you like to do?
/*
 * Fetch data from GitHub Archive using Google's BigQuery.
 *
 * Produces one row per (actor, repository_language) pair with the number
 * of PushEvent rows recorded for that pair during calendar year 2012.
 * Rows with an empty repository_language are excluded.  The date filter
 * is a half-open range: 2012-01-01 inclusive to 2013-01-01 exclusive.
 */
SELECT
    actor,
    repository_language,
    COUNT(repository_language) AS pushes
FROM [githubarchive:github.timeline]
WHERE type = 'PushEvent'
    AND repository_language != ''
    AND PARSE_UTC_USEC(created_at) >= PARSE_UTC_USEC('2012-01-01 00:00:00')
    AND PARSE_UTC_USEC(created_at) < PARSE_UTC_USEC('2013-01-01 00:00:00')
GROUP BY
    actor,
    repository_language;
'''Calculate and plot the correlations between the most popular languages on GitHub.
Details on my blog: http://datahackermd.com/2013/language-use-on-github/
'''
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def plot_correlation(dataframe, filename, title='', corr_type='',
                     vmin=-.2, vmax=.2):
    """Render a correlation matrix as a heat map and save it to disk.

    Args:
        dataframe: pandas DataFrame holding the matrix to draw. Its column
            names are used as the labels on BOTH axes, so it is expected
            to be square with rows in the same order as columns (e.g. the
            result of ``DataFrame.corr()``).
        filename: Path handed to ``plt.savefig``.
        title: Text shown above the plot.
        corr_type: Label placed on the colour bar.
        vmin: Value mapped to the low end of the 'RdBu' colour map.
        vmax: Value mapped to the high end of the 'RdBu' colour map.

    Returns:
        None. The only output is the image file written to ``filename``.
    """
    lang_names = dataframe.columns.tolist()
    # pcolor draws cell i across [i, i + 1]; the 0.5 offset centres each
    # tick label on its cell.
    tick_indices = np.arange(0.5, len(lang_names) + 0.5)
    fig = plt.figure()
    try:
        plt.pcolor(dataframe.values, cmap='RdBu', vmin=vmin, vmax=vmax)
        colorbar = plt.colorbar()
        colorbar.set_label(corr_type)
        plt.title(title)
        plt.xticks(tick_indices, lang_names, rotation='vertical')
        plt.yticks(tick_indices, lang_names)
        plt.savefig(filename)
    finally:
        # pyplot keeps every figure alive until it is explicitly closed;
        # release it so repeated calls do not accumulate open figures.
        plt.close(fig)
def main():
    """Plot Pearson and Spearman correlations between popular languages.

    Reads 'stacked_language_by_user.csv' (columns 'actor',
    'repository_language' and 'pushes'), pivots it to one row per actor
    and one column per language, keeps only the languages whose total
    push count exceeds 50000, and writes one SVG heat map per
    correlation method.
    """
    min_total_pushes = 50000
    chart_title = '2012 GitHub Language Correlations'

    pushes = pd.read_csv('stacked_language_by_user.csv').pivot(
        index='actor',
        columns='repository_language',
        values='pushes')

    # DataFrame.select() was removed in pandas 1.0. A boolean column mask
    # gives the same result: sum() skips the NaNs left by the pivot, just
    # as np.sum() on each column did.
    is_popular = pushes.sum() > min_total_pushes
    # Fill gaps only after filtering: an actor with no pushes in a
    # language counts as 0 for the correlation.
    popular = pushes.loc[:, is_popular].fillna(0)

    plots = (
        ('pearson', 'pearson_language_correlation.svg',
         "Pearson's Correlation"),
        ('spearman', 'spearman_language_correlation.svg',
         "Spearman's Rank Correlation"),
    )
    for method, filename, label in plots:
        plot_correlation(
            popular.corr(method=method),
            filename,
            title=chart_title,
            corr_type=label)
# Run the analysis only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment