@thistleknot
Created November 21, 2021 18:42
Cluster ANOVA
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr
import numpy as np
import os
import pandas as pd
import rpy2
import rpy2.robjects as ro
wd = os.getcwd()
# Point rpy2 at the local R installation (Windows vs. Linux paths)
if os.defpath == ".;C:\\bin":
    os.environ['R_HOME'] = 'C:/Users/User/Documents/R/R-4.1.2'
    os.environ['R_LIBS'] = 'C:/Users/User/Documents/R/R-4.1.2/library'
else:
    os.environ['R_HOME'] = '/mnt/distvol/R/4.0.5/lib64/R/'
pandas2ri.activate()
df = pd.read_csv("https://raw.githubusercontent.com/thistleknot/Python-Stock/master/data/raw/states.csv")

# Convert the pandas DataFrame to an R data.frame
with localconverter(ro.default_converter + pandas2ri.converter):
    r_from_pd_df = ro.conversion.py2rpy(df)
# Define and call an R function that runs kmeans on every column except the first (state name)
ro.r('''
f <- function(y) {
    model <- kmeans(y[, 2:ncol(y)], centers = 2)
    return(model)
}
''')
r_f = ro.globalenv['f']
d = r_f(r_from_pd_df)
print(d)
# Unpack the kmeans result by position: cluster, centers, totss, withinss, tot.withinss, betweenss
labels = d[0]
print(labels)
clusters = len(d[1])
print(clusters)
centers = d[1]
print(centers)
totss = d[2]
print('totss', totss)
withinss = d[3]
print(withinss)
tot_withinss = d[4]
print('wss', tot_withinss)
betweenss = d[5]
print('bss', betweenss)
#model = KMeans(n_clusters=k, random_state=0, n_init=100).fit(df.iloc[:,1:])
# WSS: for each cluster, the sum of squared distances between its points and its centroid
within_ss = []
for n in range(0, clusters):
    data = df[labels == (n + 1)].iloc[:, 1:]
    within_ss.append(((data - centers[n])**2).sum(1).sum())
WSS = total_within_ss = np.sum(within_ss)
print('wss', total_within_ss)
# TSS: sum of squared deviations of every variable from its column mean
tot_ss = np.sum(np.sum((df.iloc[:, 1:] - df.iloc[:, 1:].mean())**2))
print('tot_ss', tot_ss)
# BSS: for each cluster, cluster size * sum over variables of (cluster mean - grand mean)^2
cluster_BSS = []
for n in range(0, clusters):
    cluster_means = df[labels == (n + 1)].iloc[:, 1:].mean()
    grand_means = df.iloc[:, 1:].mean()
    size = len(df[labels == (n + 1)])
    cluster_BSS.append(np.sum((cluster_means - grand_means)**2) * size)
BSS = np.sum(cluster_BSS)
print('bss', BSS)
# Each ratio should be ~1: the manual sums reproduce R's totss, tot.withinss, and betweenss
#print(BSS + total_within_ss)
print(tot_ss / totss)
print(WSS / tot_withinss)
#print(tot_ss - betweenss)
#print(betweenss + tot_withinss)
#print(totss - BSS)
print(BSS / betweenss)
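
For comparison, a minimal sketch of the same decomposition done entirely in Python, assuming scikit-learn is available; KMeans exposes tot.withinss as inertia_, and its clustering can differ slightly from R's because k-means depends on initialization.

# Sketch only: same WSS/TSS/BSS decomposition via scikit-learn (assumed installed)
from sklearn.cluster import KMeans

X = df.iloc[:, 1:].to_numpy()
model = KMeans(n_clusters=2, n_init=100, random_state=0).fit(X)

sk_tot_withinss = model.inertia_             # within-cluster sum of squares (tot.withinss)
sk_totss = np.sum((X - X.mean(axis=0))**2)   # total sum of squares (totss)
sk_betweenss = sk_totss - sk_tot_withinss    # between-cluster sum of squares (betweenss)
print(sk_betweenss / sk_totss)               # share of variance explained by the clustering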
@thistleknot (Author)

K-means clustering with 2 clusters of sizes 46, 4

Cluster means:
   Poverty Infant Mort    White    Crime  Doctors Traf Deaths University
1 12.62391       6.900 82.30217 395.7174 258.4152    1.415217   26.83478
2 13.97500       6.025 78.05000 542.7500 281.6750    1.280000   28.15000
  Unemployed   Income Population
1   5.213043 51834.35    4415558
2   5.925000 53718.75   24423016

Clustering vector:
 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
 1  1  1  1  2  1  1  1  2  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 
 1  1  1  1  1  2  1  1  1  1  1  1  1  1  1  1  2  1  1  1  1  1  1  1 

Within cluster sum of squares by cluster:
[1] 5.010768e+14 2.017156e+14
 (between_SS / total_SS =  67.7 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

[1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1
 1 1 1 1 1 2 1 1 1 1 1 1 1]
2
[[1.26239130e+01 6.90000000e+00 8.23021739e+01 3.95717391e+02
  2.58415217e+02 1.41521739e+00 2.68347826e+01 5.21304348e+00
  5.18343478e+04 4.41555770e+06]
 [1.39750000e+01 6.02500000e+00 7.80500000e+01 5.42750000e+02
  2.81675000e+02 1.28000000e+00 2.81500000e+01 5.92500000e+00
  5.37187500e+04 2.44230155e+07]]
totss [2.17589047e+15]
[5.01076821e+14 2.01715643e+14]
wss [7.02792464e+14]
bss [1.47309801e+15]
wss 702792464051332.8
tot_ss 2175890470677469.0
bss 1473098006626559.2
[1.]
[1.]
[1.]
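
The "(between_SS / total_SS = 67.7 %)" line in the R summary is just betweenss / totss; the manually computed bss and tot_ss above give the same figure:

# between_SS / total_SS from the values printed above
print(1473098006626559.2 / 2175890470677469.0)  # ~0.677, i.e. the 67.7 % reported by kmeans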

@thistleknot (Author)

Cleaner (sourced from: https://stats.stackexchange.com/questions/81954/ssb-sum-of-squares-between-clusters): vq assigns each observation to its nearest centroid, so WSS follows directly from the returned distances and BSS is recovered by subtraction as TSS - WSS.

import numpy as np
from scipy.cluster.vq import vq

# Assign each observation to its nearest centroid (reusing df and centers from above)
X = np.array(df.iloc[:, 1:])
codebook = np.array(centers)
partition, euc_distance_to_centroids = vq(X, codebook)

# Within-cluster sum of squares from the distances to the nearest centroids
WSS = np.sum(euc_distance_to_centroids**2)

# Total sum of squares around the grand means
TSS = np.sum((X - X.mean(0))**2)

# Between-cluster sum of squares by subtraction
BSS = TSS - WSS

print(TSS, WSS, BSS)
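
A small follow-up sketch, using the partition and distances returned by vq above: the per-cluster withinss that R reports can be recovered by grouping the squared distances by the 0-based vq labels.

# Per-cluster within-SS from the vq output; should match R's withinss component
within_ss_vq = [np.sum(euc_distance_to_centroids[partition == k]**2)
                for k in range(len(codebook))]
print(within_ss_vq)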
