# Use demographica + sklearn to discover patterns in names
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
"""Cluster male first names by the shape of their age distribution.

Consolidated from an interactive IPython session.  The pipeline is:

1. pivot the demographica ``TOTALS`` table into a name x age-bin matrix,
2. normalise each name's row into a probability distribution over age bins,
3. keep only the most common names,
4. run k-means on those distributions and plot the cluster centres.

NOTE(review): ``TOTALS`` is assumed to be indexed by (name, sex, age bin)
and to carry an ``occurrences`` value, as the selections made in the
original session imply -- confirm against the demographica package.
"""

# Open-ended age bin that the original session excluded from the analysis.
DROPPED_AGE_BIN = '85 years and over'
# Names whose total count is above this percentile are considered "common".
# The session hard-coded 3433.8000000000011, i.e. the 90th percentile it had
# just printed; computing it keeps the filter valid if the data changes.
COMMON_NAME_PERCENTILE = 90
# Number of clusters used for the final figure in the session.
N_CLUSTERS = 12


def build_age_table(totals, sex='M', dropped_bins=(DROPPED_AGE_BIN,)):
    """Return a name x age-bin table of occurrence counts for one sex.

    Parameters:
        totals: pandas object whose index level 1 is the sex and which
            yields ``name``, ``age_bin`` and ``occurrences`` columns after
            ``reset_index()``.
        sex: value of index level 1 to select.
        dropped_bins: age-bin labels to exclude from the columns; labels
            that are not present are ignored.

    Returns:
        DataFrame indexed by name with one column per remaining age bin;
        (name, age bin) combinations missing from the input are filled
        with 0.
    """
    rows = totals.xs(sex, level=1).reset_index()
    table = rows.pivot(index='name', columns='age_bin', values='occurrences')
    table = table.fillna(0)
    # Boolean column mask instead of ``columns - [...]``: Index subtraction
    # is no longer a set difference in current pandas.
    keep = ~table.columns.isin(list(dropped_bins))
    return table.loc[:, keep]


def normalize_rows(table):
    """Divide every row by its own sum so that each row adds up to 1.

    Rows whose sum is 0 become NaN (0 / 0); they are not dropped here.
    """
    return table.div(table.sum(axis=1), axis='index')


def select_common_names(table, percentile=COMMON_NAME_PERCENTILE):
    """Return the normalised rows of the names above a count percentile.

    Parameters:
        table: name x age-bin table of raw counts.
        percentile: percentile (0-100) of the per-name totals that a name
            must strictly exceed to be kept.

    Returns:
        DataFrame of row-normalised distributions for the kept names.
    """
    name_totals = table.sum(axis=1)
    threshold = name_totals.quantile(percentile / 100.0)
    return normalize_rows(table).loc[name_totals > threshold]


def cluster_names(profiles, n_clusters=N_CLUSTERS, random_state=None):
    """Fit k-means on the rows of ``profiles`` and return the fitted model.

    ``random_state`` is passed straight to ``KMeans``; the default (None)
    keeps the session's unseeded behaviour.
    """
    # Imported lazily so the pure pandas helpers can be used without sklearn.
    from sklearn.cluster import KMeans

    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    model.fit(profiles.values)
    return model


def names_in_cluster(profiles, labels, cluster):
    """Return the index labels of the rows of ``profiles`` assigned to ``cluster``.

    ``labels`` is an array with one cluster id per row, such as the
    ``labels_`` attribute of a fitted k-means model.
    """
    return list(profiles[labels == cluster].index)


def plot_cluster_centers(model, age_bins, tick_step=3):
    """Plot every cluster centre of ``model`` on the current axes.

    Parameters:
        model: fitted estimator exposing ``cluster_centers_``.
        age_bins: column labels, in the order of the centre coordinates.
        tick_step: label every ``tick_step``-th age bin on the x axis.
    """
    import matplotlib.pyplot as plt

    centers = model.cluster_centers_
    # Iterate over the centres themselves: the session looped over
    # range(20) after refitting with 12 clusters, which runs off the end.
    for label, center in enumerate(centers):
        plt.plot(center, label=label)
    positions = list(range(len(age_bins)))
    plt.xticks(positions[::tick_step], list(age_bins[::tick_step]), rotation=0)
    plt.title('Common distributions of names\n'
              ' (Centers of the clusters, k=%d)' % len(centers))
    plt.ylabel('Probability')
    plt.legend()


def plot_name_distribution(totals, name, sex='M'):
    """Plot the entries of ``totals`` for one (name, sex) pair."""
    import matplotlib.pyplot as plt

    # ``xs`` with a tuple selects on the leading index levels and replaces
    # the removed ``.ix[name, sex, :]`` accessor.
    totals.xs((name, sex)).plot(label=name)
    plt.title('Name distribution of %s' % name)


def main():
    """Run the whole analysis and show the figures."""
    from demographica import TOTALS
    import matplotlib.pyplot as plt

    age_table = build_age_table(TOTALS)
    common_names = select_common_names(age_table)
    model = cluster_names(common_names)

    plt.figure()
    plot_name_distribution(TOTALS, 'Channing')

    plt.figure()
    plot_cluster_centers(model, common_names.columns)

    # Cluster ids change between unseeded runs, so list every cluster
    # rather than the handful of ids inspected by hand in the session.
    for cluster in range(len(model.cluster_centers_)):
        members = names_in_cluster(common_names, model.labels_, cluster)
        print('%d: %s' % (cluster, members))

    plt.show()


if __name__ == '__main__':
    main()
# Sign up for free
# to join this conversation on GitHub.
# Already have an account?
# Sign in to comment