-
-
Save CamDavidsonPilon/71f189a92050832c0dc1 to your computer and use it in GitHub Desktop.
Use demographica + sklearn to discover patterns in names
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# IPython session log, step 1: load the name counts and pivot the male rows
# into one row per name and one column per age bin.
# NOTE(review): the trailing " | |" on every line looks like table residue
# from the web-page export rather than Python -- strip it before running.
from demographica import TOTALS | |
TOTALS | |
# NOTE(review): DataFrame.ix was removed in pandas 1.0; the .xs call on the
# next input appears to be the replacement the author settled on.
males = TOTALS.ix[:,'M',:] | |
# Cross-section where index level 1 equals 'M' (presumably the sex level --
# verify against the index of TOTALS).
males = TOTALS.xs('M', level=1) | |
males | |
# Same selection, with the remaining index levels moved into ordinary columns
# so that pivot() below can refer to them by name.
males = TOTALS.xs('M', level=1).reset_index() | |
males | |
# Trailing "?" is an IPython help lookup, not valid plain Python.
males.pivot? | |
males.pivot(index='name', columns='age_bin', values='occurrences') | |
# Rows = name, columns = age_bin, cells = occurrences; name/age_bin pairs
# that are absent after the pivot are filled with 0.
data = males.pivot(index='name', columns='age_bin', values='occurrences').fillna(0) | |
data | |
# IPython session log, step 2: drop one age-bin column and turn each name's
# counts into proportions of that name's row total.
# NOTE(review): .ix (used several times below) was removed in pandas 1.0.
data.ix['Aaron'] | |
# Keeps every column except '85 years and over'.
# NOTE(review): this relies on Index "-" meaning set difference, which only
# older pandas supported; Index.difference is the current spelling -- confirm
# the pandas version this was run on.
data = data[ data.columns - ['85 years and over'] ] | |
data | |
# sum(0) totals each column; sum(1) totals each row.
data.sum(0) | |
data.sum(1) | |
# NOTE(review): DataFrame / Series aligns the Series index with the frame's
# columns, so this does not divide each row by its own total; the div(...,
# axis='index') form further down is the one that does.
data/data.sum(1) | |
data.sum(1) | |
data | |
# Attempt with a bare array of row totals instead of a Series.
# NOTE(review): presumably this also failed to give row-wise division, since
# the author went on to look up data.div -- confirm.
data/(data.sum(1).values) | |
data/data.sum(1) | |
# Trailing "?" is an IPython help lookup, not valid plain Python.
data.div? | |
data.div? | |
# Divides each row by its own total (the Series is matched on the row index).
data.div( data.sum(1), axis='index') | |
# NOTE(review): the next two lines are a single mistyped input (missing
# opening quote, stray line break); the corrected lookup follows them.
data.ix[Zygmunt | |
'] | |
data.ix['Zygmunt'] | |
normalized_data = data.div( data.sum(1), axis='index') | |
normalized_data = normalized_data.dropna() | |
normalized_data.shape | |
# NOTE(review): this reassignment discards the dropna() result from two
# lines up, so any NaN rows are back in normalized_data from here on.
normalized_data = data.div( data.sum(1), axis='index') | |
# IPython session log, step 3: look at the distribution of per-name totals
# and keep only the names whose total exceeds a fixed threshold.
# NOTE(review): bare median() and np are used here before any import visible
# in this log -- presumably the session already had them in scope; confirm.
median(data.sum(1)) | |
data.sum(1).median() | |
# Attribute probe only (no call); nothing is computed by this line.
data.sum(1).percentile | |
# 25th, 75th and 90th percentiles of the per-name row totals.
np.percentile( data.sum(1), 25 ) | |
np.percentile( data.sum(1), 75 ) | |
np.percentile( data.sum(1), 90 ) | |
# NOTE(review): 3433.8000000000011 is presumably the value printed by the
# 90th-percentile call above, pasted in by hand -- confirm. It is repeated
# verbatim in every filter below.
normalized_data.ix[ data.sum(1) > 3433.8000000000011 ] | |
normalized_data.ix[ data.sum(1) > 3433.8000000000011 ] | |
normalized_data | |
normalized_data.ix[ data.sum(1) > 3433.8000000000011 ] | |
np.percentile( data.sum(1), 90 ) | |
normalized_data.ix[ data.sum(1) > 3433.8000000000011 ] | |
normalized_data | |
normalized_data.ix[ data.sum(1) > 3433.8000000000011 ] | |
# Rows of normalized_data whose raw row total in `data` is above the threshold.
# NOTE(review): .ix was removed in pandas 1.0; .loc accepts the same boolean mask.
common_names = normalized_data.ix[ data.sum(1) > 3433.8000000000011 ] | |
# IPython session log, step 4: cluster the common names' age distributions
# with a 20-cluster KMeans and inspect centres and members interactively.
# NOTE(review): lines ending in "?" and the "%pylab" line are IPython-only
# syntax, not valid plain Python.
from sklearn.cluster import k_means | |
k_means? | |
from sklearn.cluster import KMeans | |
# NOTE(review): with the trailing "?" this is a help lookup, so no assignment
# to km is expected from this input -- confirm.
km = KMeans? | |
km = KMeans(20) | |
# Fit on the raw value matrix of common_names (row order is preserved, which
# is what lets km.labels_ be used as a row mask on common_names below).
km.fit(common_names.values) | |
km.labels_ | |
km.inertia_ | |
km.cluster_centers_ | |
km.cluster_centers_[0] | |
# NOTE(review): plot() is called here before the %pylab line that follows;
# presumably this input failed and prompted the %pylab -- confirm.
plot(km.cluster_centers_[0]) | |
%pylab | |
plot(km.cluster_centers_[0]) | |
plot(km.cluster_centers_[1]) | |
plot(km.cluster_centers_[2]) | |
plot(km.cluster_centers_[3]) | |
# NOTE(review): incomplete name, presumably a tab-completion fragment.
common_ | |
common_names | |
plot(km.cluster_centers_[2]) | |
km.labels_ | |
# Boolean mask over the fitted rows: True where the row was assigned label 2.
km.labels_ == 2 | |
common_names[km.labels_ == 2] | |
plot(km.cluster_centers_[1]) | |
plot(km.cluster_centers_[0]) | |
plot(km.cluster_centers_[4]) | |
plot(km.cluster_centers_[2]) | |
plot(km.cluster_centers_[3]) | |
common_names[km.labels_ == 3] | |
common_names[km.labels_ == 3].head(100) | |
common_names[km.labels_ == 3].index | |
# NOTE(review): ".value" looks like a typo for ".values" on the next line.
common_names[km.labels_ == 3].value | |
common_names[km.labels_ == 3].values | |
common_names[km.labels_ == 3] | |
plot(km.cluster_centers_[2]) | |
plot(km.cluster_centers_[0]) | |
common_names[km.labels_ == 0] | |
# NOTE(review): km was built with KMeans(20), so the centre rows run 0..19;
# index 20 is out of range. The next input uses 19 instead.
plot(km.cluster_centers_[20]) | |
plot(km.cluster_centers_[19]) | |
# NOTE(review): the loop bodies below sit at column 0 in this export; their
# indentation was lost and must be restored before running.
for i in range(20): | |
plot( km.cluster_centers_[i] ) | |
# Same loop, now labelling each centre with its index so legend() can show it.
for i in range(20): | |
plot( km.cluster_centers_[i], label=i ) | |
# NOTE(review): nothing visible in this log defines "pl"; the next line
# repeats the call with "plt", presumably as the correction.
pl.legend() | |
plt.legend() | |
plot(km.cluster_centers_[4]) | |
common_names[km.labels_ == 4] | |
# NOTE(review): .ix was removed in pandas 1.0; .loc is the label-based equivalent.
common_names.ix['Taylor'] | |
common_names.ix['Taylor'].plot() | |
# IPython session log, step 5: refit with fewer clusters and look at
# individual clusters and names.
# Three estimators are constructed in a row; only the last one (12 clusters)
# is still bound to km when fit() is called.
km = KMeans(1) | |
km = KMeans(2) | |
km = KMeans(12) | |
km.fit(common_names.values) | |
# NOTE(review): this loop still iterates range(20) although km now has 12
# clusters, so it would index past the last centre; the range(12) loop right
# after it is presumably the correction.
# NOTE(review): loop bodies sit at column 0 in this export; their
# indentation was lost and must be restored before running.
for i in range(20): | |
plot( km.cluster_centers_[i], label=i ) | |
for i in range(12): | |
plot( km.cluster_centers_[i], label=i ) | |
plt.legend() | |
# Rows of common_names assigned to a given cluster label.
common_names[km.labels_ == 5] | |
common_names[km.labels_ == 0] | |
plot(km.cluster_centers_[0]) | |
for i in range(12): | |
plot( km.cluster_centers_[i], label=i ) | |
plt.legend() | |
plot(km.cluster_centers_[10]) | |
common_names[km.labels_ == 10] | |
# NOTE(review): .ix was removed in pandas 1.0; .loc is the label-based equivalent.
common_names.ix['Declan'].plot() | |
# Looks up 'Channing' directly in TOTALS rather than in common_names.
TOTALS.ix['Channing', 'M', :] | |
TOTALS.ix['Channing', 'M', :].plot() | |
# NOTE(review): unbalanced extra ")" at the end of the next line -- a syntax
# error as written.
TOTALS.ix['Channing', 'M', :].plot(label='Channing')) | |
plt.title('Name distribution of Channing') | |
# IPython session log, step 6: redraw the 12 cluster centres repeatedly while
# tuning the x-axis tick labels, then list the names in selected clusters.
# NOTE(review): loop bodies sit at column 0 in this export; their
# indentation was lost and must be restored before running.
for i in range(12): | |
plot( km.cluster_centers_[i], label=i ) | |
common_names.columns | |
# NOTE(review): presumably this returned 17, which is the tick count
# hard-coded in every xticks call below -- confirm.
len(common_names.columns) | |
# NOTE(review): stray trailing "/" makes the next line a syntax error; it is
# repeated without the slash right after.
plt.xticks(range(17), common_names.columns)/ | |
# Put one tick per column position and label it with the column name.
plt.xticks(range(17), common_names.columns) | |
for i in range(12): | |
plot( km.cluster_centers_[i], label=i ) | |
plt.legend() | |
# Trailing "?" is an IPython help lookup, not valid plain Python.
plt.xticks? | |
plt.xticks(range(17), common_names.columns,rotation=25) | |
range(17) | |
for i in range(12): | |
plot( km.cluster_centers_[i], label=i ) | |
plt.xticks(range(17), common_names.columns,rotation=25) | |
for i in range(12): | |
plot( km.cluster_centers_[i], label=i ) | |
# NOTE(review): "[-1] + range(16)" only works where range() returns a list
# (Python 2); on Python 3 it needs list(range(16)).
plt.xticks( [-1] + range(16), common_names.columns,rotation=25) | |
for i in range(12): | |
plot( km.cluster_centers_[i], label=i ) | |
# The following inputs only vary the label rotation angle.
plt.xticks(range(17), common_names.columns,rotation=25) | |
plt.xticks(range(17), common_names.columns,rotation=60) | |
plt.xticks(range(17), common_names.columns,rotation=80) | |
for i in range(12): | |
plot( km.cluster_centers_[i], label=i ) | |
plt.xticks(range(17), common_names.columns,rotation=15) | |
for i in range(12): | |
plot( km.cluster_centers_[i], label=i ) | |
plt.xticks(range(17), common_names.columns,rotation=20) | |
for i in range(12): | |
plot( km.cluster_centers_[i], label=i ) | |
# Label every 2nd tick, unrotated.
plt.xticks(range(17)[::2], common_names.columns[::2],rotation=0) | |
for i in range(12): | |
plot( km.cluster_centers_[i], label=i ) | |
# Label every 3rd tick, unrotated; this is the last tick setting before the
# title, y-label and legend are added.
plt.xticks(range(17)[::3], common_names.columns[::3],rotation=0) | |
plt.title('Common distributions of names\n (Centers of the clusters, k=12)') | |
plt.ylabel('Probability') | |
plt.legend() | |
# Inspect which names were assigned to particular clusters.
common_names[km.labels_ == 0] | |
common_names[km.labels_ == 9] | |
common_names[km.labels_ == 9].index | |
# list(...) turns the index of matching rows into a plain Python list of names.
list(common_names[km.labels_ == 9].index) | |
list(common_names[km.labels_ == 4].index) | |
list(common_names[km.labels_ == 3].index) | |
plot(km.cluster_centers_[10]) | |
list(common_names[km.labels_ == 10].index) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment