|
'''Calculate and plot the correlations between the most popular languages on GitHub. |
|
|
|
Details on my blog: http://datahackermd.com/2013/language-use-on-github/ |
|
|
|
''' |
|
import matplotlib.pyplot as plt |
|
import numpy as np |
|
import pandas as pd |
|
|
|
|
|
def plot_correlation(dataframe, filename, title='', corr_type='',
                     vmin=-.2, vmax=.2):
    '''Draw a correlation matrix as a heatmap and save it to a file.

    Args:
        dataframe: DataFrame holding the matrix to plot. Its column names
            label both axes, so the row order is expected to match the
            column order (as it does for the output of ``DataFrame.corr``).
        filename: Destination passed to ``plt.savefig``.
        title: Text placed above the plot.
        corr_type: Label for the colorbar.
        vmin: Value mapped to the low end of the colormap.
        vmax: Value mapped to the high end of the colormap.
    '''
    lang_names = dataframe.columns.tolist()
    # pcolor draws cell i across [i, i + 1]; offsetting the ticks by 0.5
    # puts each label at the middle of its cell.
    tick_indices = np.arange(0.5, len(lang_names) + 0.5)

    fig = plt.figure()
    try:
        plt.pcolor(dataframe.values, cmap='RdBu', vmin=vmin, vmax=vmax)
        colorbar = plt.colorbar()
        colorbar.set_label(corr_type)
        plt.title(title)
        plt.xticks(tick_indices, lang_names, rotation='vertical')
        plt.yticks(tick_indices, lang_names)
        plt.savefig(filename)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # release it so repeated calls do not accumulate open figures.
        plt.close(fig)
|
|
|
|
|
def main():
    '''Plot Pearson and Spearman correlations between popular languages.

    Reads ``stacked_language_by_user.csv`` from the working directory,
    pivots it into one row per ``actor`` and one column per
    ``repository_language`` holding the ``pushes`` value, keeps the
    languages whose total pushes exceed 50000, and writes one SVG heatmap
    per correlation method.
    '''
    pushes = pd.read_csv('stacked_language_by_user.csv').pivot(
        index='actor',
        columns='repository_language',
        values='pushes')

    # DataFrame.select() was removed in pandas 1.0. A boolean mask over the
    # columns keeps the same languages: sum() skips NaN just as np.sum() on
    # each column Series did.
    popular = pushes.loc[:, pushes.sum() > 50000].fillna(0)

    # (corr method, output file, colorbar label); 'pearson' is also the
    # default method of DataFrame.corr().
    plots = (
        ('pearson',
         'pearson_language_correlation.svg',
         'Pearson\'s Correlation'),
        ('spearman',
         'spearman_language_correlation.svg',
         'Spearman\'s Rank Correlation'),
    )
    for method, filename, label in plots:
        plot_correlation(
            popular.corr(method=method),
            filename,
            title='2012 GitHub Language Correlations',
            corr_type=label)
|
|
|
|
|
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()