# alinazhanguwo/advancedKMeans.py

## advancedKMeans.py
# Sweep KMeans over n_clusters = 2..20 on X_train.  For every candidate this
# script records the fitted model's inertia, the mean silhouette score and the
# value returned by the project's overallAccuracy helper, and builds a
# two-panel figure (silhouette plot on the left, scatter of the first two
# columns of X_train on the right).  All figures are displayed by the single
# plt.show() call at the end.

# set up the parameters
n_init = 12
max_iter = 225
tol = 0.0001
random_state = 42
# NOTE(review): scikit-learn deprecated the `n_jobs` argument of KMeans in
# 0.23 and removed it in 1.0 -- confirm against the installed version and drop
# it from the KMeans(...) call below if the constructor rejects it.
n_jobs = -1

t0 = dt.now()
print("=========  Start training ... ")

# One row per candidate n_clusters; each row is filled inside the loop below.
inertia_df = pd.DataFrame(data=[], index=range(2, 21), columns=['inertia'])
silhouette_avg_df = pd.DataFrame(data=[], index=range(2, 21), columns=['silhouetteAvg'])
overallAccuracy_df = pd.DataFrame(data=[], index=range(2, 21), columns=['overallAccuracy'])

for n_clusters in range(2, 21):
    clusterer = KMeans(n_clusters=n_clusters, n_init=n_init, max_iter=max_iter,
                       tol=tol, random_state=random_state, n_jobs=n_jobs)
    cluster_labels = clusterer.fit_predict(X_train)

    # inertia
    inertia_df.loc[n_clusters] = clusterer.inertia_

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters.  Keep it in a local as well: the red reference line drawn on
    # the silhouette plot further down reads `silhouette_avg`.
    silhouette_avg = silhouette_score(X_train, cluster_labels)
    silhouette_avg_df.loc[n_clusters] = silhouette_avg
    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X_train, cluster_labels)

    # self-defined accuracy function: overallAccuracy
    # It returns six values; only overallAccuracy_kMeans is stored in the
    # summary table, the others stay available as module-level names.
    kmeansClustered = pd.DataFrame(data=cluster_labels, index=X_train.index, columns=['cluster'])
    (countByCluster_kMeans, countByLabel_kMeans,
     countMostFreq_kMeans,
     accuracyDF_kMeans, overallAccuracy_kMeans,
     accuracyByLabel_kMeans) = overallAccuracy(
        kmeansClustered, pd.Series(y_Class, index=X_train.index))
    overallAccuracy_df.loc[n_clusters] = overallAccuracy_kMeans

    # Plot the silhouette scores for each 'n_clusters'
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X_train) + (n_clusters + 1) * 10])

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    # Only the first two columns of X_train are drawn.
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X_train.iloc[:, 0], X_train.iloc[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Draw white circles at cluster centers
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')

    # Overlay each centre's index as a text marker on top of its white circle
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                    s=50, edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')


# The elapsed time is taken before plt.show(), so it excludes the time the
# figures stay on screen.
t1 = dt.now()-t0
print("=========  Finished in ",t1)

plt.show()
	# set up the parameters
# NOTE(review): this section repeats the n_clusters = 2..20 KMeans sweep that
# appears earlier in this file; confirm whether both copies are intended.
# Its indentation had been flattened to a single level; the block structure
# below is reconstructed from the statements themselves.
n_init = 12
max_iter = 225
tol = 0.0001
random_state = 42
# NOTE(review): scikit-learn deprecated the `n_jobs` argument of KMeans in
# 0.23 and removed it in 1.0 -- confirm against the installed version and drop
# it from the KMeans(...) call below if the constructor rejects it.
n_jobs = -1

t0 = dt.now()
print("========= Start training ... ")

# One row per candidate n_clusters; each row is filled inside the loop below.
inertia_df = pd.DataFrame(data=[], index=range(2, 21), columns=['inertia'])
silhouette_avg_df = pd.DataFrame(data=[], index=range(2, 21), columns=['silhouetteAvg'])
overallAccuracy_df = pd.DataFrame(data=[], index=range(2, 21), columns=['overallAccuracy'])

for n_clusters in range(2, 21):
    clusterer = KMeans(n_clusters=n_clusters, n_init=n_init, max_iter=max_iter,
                       tol=tol, random_state=random_state, n_jobs=n_jobs)
    cluster_labels = clusterer.fit_predict(X_train)

    # inertia
    inertia_df.loc[n_clusters] = clusterer.inertia_

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters.  The value is also bound to `silhouette_avg` because the red
    # reference line on the silhouette plot reads that name.
    silhouette_avg = silhouette_score(X_train, cluster_labels)
    silhouette_avg_df.loc[n_clusters] = silhouette_avg
    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X_train, cluster_labels)

    # self-defined accuracy function: overallAccuracy
    # It returns six values; only overallAccuracy_kMeans is stored in the
    # summary table.
    kmeansClustered = pd.DataFrame(data=cluster_labels, index=X_train.index, columns=['cluster'])
    (countByCluster_kMeans, countByLabel_kMeans,
     countMostFreq_kMeans,
     accuracyDF_kMeans, overallAccuracy_kMeans,
     accuracyByLabel_kMeans) = overallAccuracy(
        kmeansClustered, pd.Series(y_Class, index=X_train.index))
    overallAccuracy_df.loc[n_clusters] = overallAccuracy_kMeans

    # Plot the silhouette scores for each 'n_clusters'
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X_train) + (n_clusters + 1) * 10])

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10 # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([]) # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    # Only the first two columns of X_train are drawn.
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X_train.iloc[:, 0], X_train.iloc[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Draw white circles at cluster centers
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')

    # Overlay each centre's index as a text marker on top of its white circle
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                    s=50, edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')


# The elapsed time is taken before plt.show(), so it excludes the time the
# figures stay on screen.
t1 = dt.now()-t0
print("========= Finished in ",t1)

plt.show()