@thistleknot
Created November 21, 2021 18:42
Cluster ANOVA
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr
import numpy as np
import os
import pandas as pd
import rpy2
import rpy2.robjects as ro
wd = os.getcwd()
# Point rpy2 at the local R installation (Windows vs. Linux paths)
if os.defpath == ".;C:\\bin":
    os.environ['R_HOME'] = 'C:/Users/User/Documents/R/R-4.1.2'
    os.environ['R_LIBS'] = 'C:/Users/User/Documents/R/R-4.1.2/library'
else:
    os.environ['R_HOME'] = '/mnt/distvol/R/4.0.5/lib64/R/'
pandas2ri.activate()
df = pd.read_csv("https://raw.githubusercontent.com/thistleknot/Python-Stock/master/data/raw/states.csv")

# Convert the pandas DataFrame to an R data.frame
with localconverter(ro.default_converter + pandas2ri.converter):
    r_from_pd_df = ro.conversion.py2rpy(df)
# Define and call an R function that runs kmeans on every column except the first (state name)
ro.r('''
f <- function(y) {
    model <- kmeans(y[, 2:ncol(y)], centers = 2)
    return(model)
}
''')
r_f = ro.globalenv['f']
d = r_f(r_from_pd_df)
print(d)
# Unpack the kmeans result by position: cluster, centers, totss, withinss, tot.withinss, betweenss
labels = d[0]
print(labels)
clusters = len(d[1])
print(clusters)
centers = d[1]
print(centers)
totss = d[2]
print('totss', totss)
withinss = d[3]
print(withinss)
tot_withinss = d[4]
print('wss', tot_withinss)
betweenss = d[5]
print('bss', betweenss)
#model = KMeans(n_clusters=k, random_state=0, n_init=100).fit(df.iloc[:,1:])
# WSS: for each cluster, the sum of squared distances between its points and its centroid
within_ss = []
for n in range(0, clusters):
    data = df[labels == (n + 1)].iloc[:, 1:]
    within_ss.append(((data - centers[n])**2).sum(1).sum())
WSS = total_within_ss = np.sum(within_ss)
print('wss', total_within_ss)
# TSS: sum of squared deviations of every variable from its column mean
tot_ss = np.sum(np.sum((df.iloc[:, 1:] - df.iloc[:, 1:].mean())**2))
print('tot_ss', tot_ss)
# BSS: for each cluster, cluster size * sum over variables of (cluster mean - grand mean)^2
cluster_BSS = []
for n in range(0, clusters):
    cluster_means = df[labels == (n + 1)].iloc[:, 1:].mean()
    grand_means = df.iloc[:, 1:].mean()
    size = len(df[labels == (n + 1)])
    cluster_BSS.append(np.sum((cluster_means - grand_means)**2) * size)
BSS = np.sum(cluster_BSS)
print('bss', BSS)
# Each ratio should be ~1: the manual sums reproduce R's totss, tot.withinss, and betweenss
#print(BSS + total_within_ss)
print(tot_ss / totss)
print(WSS / tot_withinss)
#print(tot_ss - betweenss)
#print(betweenss + tot_withinss)
#print(totss - BSS)
print(BSS / betweenss)
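
For comparison, a minimal sketch of the same decomposition done entirely in Python, assuming scikit-learn is available; KMeans exposes tot.withinss as inertia_, and its clustering can differ slightly from R's because k-means depends on initialization.

# Sketch only: same WSS/TSS/BSS decomposition via scikit-learn (assumed installed)
from sklearn.cluster import KMeans

X = df.iloc[:, 1:].to_numpy()
model = KMeans(n_clusters=2, n_init=100, random_state=0).fit(X)

sk_tot_withinss = model.inertia_             # within-cluster sum of squares (tot.withinss)
sk_totss = np.sum((X - X.mean(axis=0))**2)   # total sum of squares (totss)
sk_betweenss = sk_totss - sk_tot_withinss    # between-cluster sum of squares (betweenss)
print(sk_betweenss / sk_totss)               # share of variance explained by the clustering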
@thistleknot (Author)

K-means clustering with 2 clusters of sizes 46, 4

Cluster means:
   Poverty Infant Mort    White    Crime  Doctors Traf Deaths University
1 12.62391       6.900 82.30217 395.7174 258.4152    1.415217   26.83478
2 13.97500       6.025 78.05000 542.7500 281.6750    1.280000   28.15000
  Unemployed   Income Population
1   5.213043 51834.35    4415558
2   5.925000 53718.75   24423016

Clustering vector:
 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
 1  1  1  1  2  1  1  1  2  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 
 1  1  1  1  1  2  1  1  1  1  1  1  1  1  1  1  2  1  1  1  1  1  1  1 

Within cluster sum of squares by cluster:
[1] 5.010768e+14 2.017156e+14
 (between_SS / total_SS =  67.7 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

[1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1
 1 1 1 1 1 2 1 1 1 1 1 1 1]
2
[[1.26239130e+01 6.90000000e+00 8.23021739e+01 3.95717391e+02
  2.58415217e+02 1.41521739e+00 2.68347826e+01 5.21304348e+00
  5.18343478e+04 4.41555770e+06]
 [1.39750000e+01 6.02500000e+00 7.80500000e+01 5.42750000e+02
  2.81675000e+02 1.28000000e+00 2.81500000e+01 5.92500000e+00
  5.37187500e+04 2.44230155e+07]]
totss [2.17589047e+15]
[5.01076821e+14 2.01715643e+14]
wss [7.02792464e+14]
bss [1.47309801e+15]
wss 702792464051332.8
tot_ss 2175890470677469.0
bss 1473098006626559.2
[1.]
[1.]
[1.]
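
The "(between_SS / total_SS = 67.7 %)" line in the R summary is just betweenss / totss; the manually computed bss and tot_ss above give the same figure:

# between_SS / total_SS from the values printed above
print(1473098006626559.2 / 2175890470677469.0)  # ~0.677, i.e. the 67.7 % reported by kmeans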

@thistleknot (Author)

Cleaner (sourced from: https://stats.stackexchange.com/questions/81954/ssb-sum-of-squares-between-clusters): vq assigns each observation to its nearest centroid, so WSS follows directly from the returned distances and BSS is recovered by subtraction as TSS - WSS.

import numpy as np
from scipy.cluster.vq import vq

# Assign each observation to its nearest centroid (reusing df and centers from above)
X = np.array(df.iloc[:, 1:])
codebook = np.array(centers)
partition, euc_distance_to_centroids = vq(X, codebook)

# Within-cluster sum of squares from the distances to the nearest centroids
WSS = np.sum(euc_distance_to_centroids**2)

# Total sum of squares around the grand means
TSS = np.sum((X - X.mean(0))**2)

# Between-cluster sum of squares by subtraction
BSS = TSS - WSS

print(TSS, WSS, BSS)
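
A small follow-up sketch, using the partition and distances returned by vq above: the per-cluster withinss that R reports can be recovered by grouping the squared distances by the 0-based vq labels.

# Per-cluster within-SS from the vq output; should match R's withinss component
within_ss_vq = [np.sum(euc_distance_to_centroids[partition == k]**2)
                for k in range(len(codebook))]
print(within_ss_vq)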
