Skip to content

Instantly share code, notes, and snippets.

@CamDavidsonPilon
Created August 24, 2014 22:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save CamDavidsonPilon/71f189a92050832c0dc1 to your computer and use it in GitHub Desktop.
Save CamDavidsonPilon/71f189a92050832c0dc1 to your computer and use it in GitHub Desktop.
Use demographica + sklearn to discover patterns in names
# Build `common_names`: one row per frequent male name, one column per age
# bin, each row scaled to sum to 1.
#
# Recovered from an interactive IPython history. Display-only expressions,
# `?` help lookups and mistyped commands from the session are dropped; only
# the statements that produce the final tables are kept.
import numpy as np
from demographica import TOTALS

# Select the 'M' entries on index level 1 of TOTALS (presumably the sex
# level -- verify against demographica), then flatten the remaining index
# levels into ordinary columns so they can be pivoted.
males = TOTALS.xs('M', level=1).reset_index()

# Rows are names, columns are age bins; name/bin combinations that are absent
# from the source become 0 instead of NaN.
data = males.pivot(index='name', columns='age_bin', values='occurrences').fillna(0)

# Remove the '85 years and over' column. `Index.difference` is the supported
# spelling of the set difference the session wrote as `data.columns - [...]`.
data = data[data.columns.difference(['85 years and over'])]

# Total occurrences per name across the remaining bins.
row_totals = data.sum(axis=1)

# Divide every row by its own total so each name becomes a distribution over
# age bins. Names with a total of 0 turn into NaN rows here; assuming counts
# are non-negative they are excluded by the threshold filter below.
normalized_data = data.div(row_totals, axis='index')

# Keep only names above the 90th percentile of total occurrences. The session
# pasted the literal 3433.8000000000011 right after evaluating this same
# percentile call -- presumably its printed result -- so compute it instead.
threshold = np.percentile(row_totals, 90)
common_names = normalized_data.loc[row_totals > threshold]
# Cluster the per-name age distributions in `common_names` with k-means and
# plot the cluster centers.
#
# Recovered from an interactive IPython history. The session first fitted
# k=20, then settled on k=12 for the final figure; both fits are kept. `?`
# help lookups, the `%pylab` magic, repeated re-plots used to tune tick
# rotation, and commands that raised errors are dropped.
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, k_means  # k_means kept: imported by the original session


def plot_cluster_centers(model):
    """Draw each center of a fitted KMeans ``model`` as a line labelled by its cluster id."""
    for cluster_id, center in enumerate(model.cluster_centers_):
        plt.plot(center, label=cluster_id)


def print_cluster_members(model, frame):
    """Print, for every cluster of ``model``, the index labels of ``frame`` assigned to it.

    ``frame`` must be the frame whose ``.values`` the model was fitted on, so
    that ``model.labels_`` lines up with its rows.
    """
    for cluster_id in range(model.n_clusters):
        members = frame.index[model.labels_ == cluster_id]
        print(cluster_id, list(members))


# --- First pass: k=20 -------------------------------------------------------
# A fixed seed keeps cluster ids stable between runs; they will not match the
# ids seen in the original (unseeded) interactive session.
km = KMeans(n_clusters=20, random_state=0)
km.fit(common_names.values)

plot_cluster_centers(km)
plt.legend()
plt.show()
print_cluster_members(km, common_names)

# Age distribution of a single name from the clustered table.
common_names.loc['Taylor'].plot()
plt.show()

# --- Second pass: k=12 (used for the final figure) ---------------------------
km = KMeans(n_clusters=12, random_state=0)
km.fit(common_names.values)

common_names.loc['Declan'].plot()
plt.show()

# Raw (un-normalized) entries for one name, selected on index levels 0 and 1
# of TOTALS; `xs` replaces the removed `.ix['Channing', 'M', :]` lookup.
TOTALS.xs(('Channing', 'M'), level=[0, 1]).plot(label='Channing')
plt.title('Name distribution of Channing')
plt.show()

# Final figure: all cluster centers, with every third age bin labelled.
# NOTE(review): the x axis follows the column order of common_names --
# confirm the age bins come out in chronological order.
plot_cluster_centers(km)
n_bins = len(common_names.columns)
plt.xticks(list(range(0, n_bins, 3)), common_names.columns[::3], rotation=0)
plt.title('Common distributions of names\n (Centers of the clusters, k=%d)' % km.n_clusters)
plt.ylabel('Probability')
plt.legend()
plt.show()

print_cluster_members(km, common_names)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment