# kidpixo/sklearn_classes_sort.py

## sklearn_classes_sort.py
# This code rearranges numeric labels based on an input array.
# I normally use some feature of the input data.

# X is the data matrix as a pandas.DataFrame
# X.shape : [ROWS, COLUMNS]

# Fit k-means on the scaled data and keep the per-row cluster labels.
# NOTE(review): `k`, `cluster`, `scaler`, `X`, `np` and `pd` are not defined
# in this snippet; they must already exist in the surrounding session
# (presumably sklearn.cluster, a fitted scaler, a pandas.DataFrame, numpy
# and pandas -- confirm against the caller).
n_clusters = k
k_means = cluster.KMeans(n_clusters=n_clusters, random_state=0).fit(scaler.transform(X))
labels = k_means.labels_

# labels : int labels array, one cluster id per row of X
# labels.shape = [ROWS]

#####
# colors: relabel the classes so that their numeric order follows the value
# of one centroid feature (the column selected by `feature_index`)

# per-cluster mean of every X column, computed on X itself (not on the
# scaled data) == centroids in data space; one row per label present
centroids = X.groupby(labels).mean()
y = centroids.values
# index of the data feature used to sort labels, one of X COLUMNS
feature_index = 1
# here the sorting index: centroid rows ordered by increasing feature value
centroids_sorting_index = np.argsort(y[:, feature_index])
# here the sorting labels, not the index!!
# (argsort of an argsort == rank of each centroid == its new label)
centroids_sorted_labels = np.argsort(centroids_sorting_index)
# pd.Series.map(dict) returns a new Series, so `labels` is rebound rather
# than modified in place. The dict is keyed on the groupby index so every old
# label is paired with the rank of its own centroid row.
labels = pd.Series(labels).map(dict(zip(centroids.index, centroids_sorted_labels))).values

# those are only debug prints
print('index for label sort :', feature_index)
print(' features y[:,index] :', y[:, feature_index])
print(centroids_sorting_index)
print(centroids_sorted_labels)
print('ind:y_feat  > new_index')
for i, (yf, ni) in enumerate(zip(y[:, feature_index], centroids_sorted_labels)):
    print(f'{i:3}:{yf:.5f} > {ni:>4}')