Skip to content

Instantly share code, notes, and snippets.

@thistleknot
Created November 21, 2021 18:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thistleknot/22858624e35a6115b8a7cebad66a9889 to your computer and use it in GitHub Desktop.
Save thistleknot/22858624e35a6115b8a7cebad66a9889 to your computer and use it in GitHub Desktop.
Cluster ANOVA
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr
import numpy as np
import os
import pandas as pd
import rpy2
import rpy2.robjects as ro
# Working directory at start-up (not referenced again in the visible script).
wd = os.getcwd()

# Point rpy2 at a machine-specific R installation.
# NOTE(review): these variables are assigned after the rpy2 imports at the top
# of the file; confirm rpy2 still honours them when they are set this late.
if os.name == "nt":  # documented platform check instead of matching os.defpath
    os.environ['R_HOME'] = 'C:/Users/User/Documents/R/R-4.1.2'
    os.environ['R_LIBS'] = 'C:/Users/User/Documents/R/R-4.1.2/library'
else:
    os.environ['R_HOME'] = '/mnt/distvol/R/4.0.5/lib64/R/'
# Enable rpy2's pandas conversion rules globally.
pandas2ri.activate()

# Load the states table from GitHub (requires network access).
df = pd.read_csv("https://raw.githubusercontent.com/thistleknot/Python-Stock/master/data/raw/states.csv")

# Hand the DataFrame to R under an explicit default + pandas converter.
pandas_rules = ro.default_converter + pandas2ri.converter
with localconverter(pandas_rules):
    r_from_pd_df = ro.conversion.py2rpy(df)

# Define an R helper that runs 2-centre k-means on every column but the first.
ro.r('''
f <- function(y) {
model <- kmeans(y[,2:ncol(y)], centers = 2)
return(model)
}
''')
r_f = ro.globalenv['f']

# Fit the model and show the raw result object.
d = r_f(r_from_pd_df)
print(d)

# The result is read positionally; the names below assume the element order
# documented for R's kmeans() value (cluster, centers, totss, withinss,
# tot.withinss, betweenss) -- TODO confirm against the installed R version.
labels, centers, totss, withinss, tot_withinss, betweenss = (
    d[position] for position in range(6)
)
clusters = len(centers)

print(labels)
print(clusters)
print(centers)
print('totss', totss)
print(withinss)
print('wss', tot_withinss)
print('bss', betweenss)
# Recompute the k-means sums of squares by hand and compare each with the
# value reported by R (every ratio printed at the end should be close to 1).

# Same columns the R helper clusters on: everything except column 0.
features = df.iloc[:, 1:]
# Coerce once so boolean masks and row lookups behave like NumPy arrays.
label_arr = np.asarray(labels)
center_arr = np.asarray(centers)
# Per-column means of the whole data set; axis is explicit because
# np.mean(DataFrame) changes meaning across pandas versions.
grand_mean = features.mean(axis=0)

# WSS: squared deviation of every point from its own cluster centre.
# Cluster ids are compared against n + 1 because the labels are 1-based.
within_ss = []
for n in range(clusters):
    members = features[label_arr == (n + 1)]
    within_ss.append(((members - center_arr[n]) ** 2).sum(axis=1).sum())
WSS = total_within_ss = np.sum(within_ss)
print('wss', total_within_ss)

# TSS: squared deviation of every value from its column mean, over ALL
# feature columns (the earlier version sliced twice and dropped one column).
tot_ss = ((features - grand_mean) ** 2).sum(axis=0).sum()
print('tot_ss', tot_ss)

# BSS: per cluster, squared distance between the cluster's column means and
# the overall column means, weighted by the number of cluster members.
cluster_BSS = []
for n in range(clusters):
    members = features[label_arr == (n + 1)]
    offset_sq = ((members.mean(axis=0) - grand_mean) ** 2).sum()
    cluster_BSS.append(offset_sq * len(members))
BSS = np.sum(cluster_BSS)
print('bss', BSS)

# Manual value divided by R's value for TSS, WSS and BSS respectively.
print(tot_ss / totss)
print(WSS / tot_withinss)
print(BSS / betweenss)
@thistleknot
Copy link
Author

A cleaner version of the same sum-of-squares computation (sourced from: https://stats.stackexchange.com/questions/81954/ssb-sum-of-squares-between-clusters):

import numpy as np
from scipy.cluster.vq import vq

# Sum-of-squares decomposition for the k-means result, using SciPy's vector
# quantiser to measure each row's distance to its nearest centroid.
X = np.array(df.iloc[:, 1:])  # every column except the first
codebook = np.array(centers)  # centroids used as the code book

# vq() returns, per observation, the index of the closest code and the
# distance to it.
partition, euc_distance_to_centroids = vq(X, codebook)

# Within-cluster SS: squared distances to the closest centroids.
WSS = np.square(euc_distance_to_centroids).sum()

# Total SS: squared deviations from the per-column means.
column_means = X.mean(axis=0)
TSS = np.square(X - column_means).sum()

# Between-cluster SS is whatever the within-cluster part does not explain.
BSS = TSS - WSS

print(TSS, WSS, BSS)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment