Skip to content

Instantly share code, notes, and snippets.

@akshayjshah
Last active February 23, 2021 09:30
  • Star 5 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save akshayjshah/4772174 to your computer and use it in GitHub Desktop.
Calculate and plot correlations between the most popular languages on GitHub.
/*
 * Fetch data from GitHub Archive using Google's BigQuery.
 *
 * Produces one row per (actor, repository_language) pair holding the number
 * of PushEvents that actor made to repositories of that language, restricted
 * to events created on or after 2012-01-01 and before 2013-01-01. Events with
 * an empty repository_language are excluded.
 */
SELECT
  actor,
  repository_language,
  COUNT(repository_language) AS pushes
FROM [githubarchive:github.timeline]
WHERE type = 'PushEvent'
  AND repository_language != ''
  AND PARSE_UTC_USEC(created_at) >= PARSE_UTC_USEC('2012-01-01 00:00:00')
  AND PARSE_UTC_USEC(created_at) < PARSE_UTC_USEC('2013-01-01 00:00:00')
GROUP BY actor, repository_language;
'''Calculate and plot the correlations between the most popular languages on GitHub.
Details on my blog: http://datahackermd.com/2013/language-use-on-github/
'''
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def plot_correlation(dataframe, filename, title='', corr_type=''):
    """Draw a correlation matrix as a heatmap and save it to ``filename``.

    Args:
        dataframe: DataFrame whose values are drawn as colored cells. Its
            column names are used as tick labels on both axes, so it is
            expected to be square with matching row and column order (for
            example the result of ``DataFrame.corr()``).
        filename: Destination passed to ``plt.savefig``.
        title: Title drawn above the heatmap.
        corr_type: Label attached to the colorbar.
    """
    lang_names = dataframe.columns.tolist()
    # Offset ticks by 0.5 so each label sits at the center of its cell.
    tick_indices = np.arange(0.5, len(lang_names) + 0.5)

    fig = plt.figure()
    try:
        # Fix the color scale to [-1, 1], the range of a correlation value.
        plt.pcolor(dataframe.values, cmap='RdBu', vmin=-1, vmax=1)
        colorbar = plt.colorbar()
        colorbar.set_label(corr_type)
        plt.title(title)
        plt.xticks(tick_indices, lang_names, rotation='vertical')
        plt.yticks(tick_indices, lang_names)
        plt.savefig(filename)
    finally:
        # pyplot keeps every figure it creates alive until it is closed;
        # release it here so repeated calls do not accumulate open figures.
        plt.close(fig)
def main():
    """Plot Pearson and Spearman correlations between popular languages.

    Reads ``stacked_language_by_user.csv`` (columns ``actor``,
    ``repository_language`` and ``pushes``), pivots it into one row per actor
    and one column per language, keeps only the languages whose total push
    count exceeds 50000, and writes one SVG heatmap per correlation method.
    """
    pushes = pd.read_csv('stacked_language_by_user.csv').pivot(
        index='actor',
        columns='repository_language',
        values='pushes')

    # DataFrame.select() was deprecated in pandas 0.21 and removed in 1.0.
    # A boolean column mask keeps the same columns: DataFrame.sum() skips
    # NaN by default, just like the per-column np.sum() it replaces.
    popular = pushes.loc[:, pushes.sum() > 50000]

    title = '2012 GitHub Language Correlations'
    plots = (
        ('pearson', 'pearson_language_correlation.svg',
         "Pearson's Correlation"),
        ('spearman', 'spearman_language_correlation.svg',
         "Spearman's Rank Correlation"),
    )
    for method, filename, corr_type in plots:
        plot_correlation(
            popular.corr(method=method),
            filename,
            title=title,
            corr_type=corr_type)
# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment