Madhivarman/Kmeans.py

## Kmeans.py

import pandas as pd
import numpy as np

# Build a synthetic RFM table: one row per user, each of the three scores
# drawn uniformly from the integers 1..9 (randint's `high` is exclusive).
n_users = 10000
user_id = list(range(n_users))
recency = np.random.randint(low=1, high=10, size=n_users)
monetary = np.random.randint(low=1, high=10, size=n_users)
frequency = np.random.randint(low=1, high=10, size=n_users)

# Per-user sum of the three scores
total_value = recency + monetary + frequency

# Assemble the frame; total_value is appended as the last column
dummy_data = pd.DataFrame({
    'user_id': user_id,
    'Recency': recency,
    'Monetary': monetary,
    'Frequency': frequency,
})
dummy_data["total_value"] = total_value

# Notebook-style preview; a no-op when executed as a plain script
dummy_data.head()

# Bucket every user's total score into a named segment.
# np.digitize counts how many lower bounds a value has reached, and that count
# indexes the names:
#   < 5 lost | 5..9 abouttosleep | 10..16 recentcustomer
#   17..24 loyalcustomer | >= 25 champions
segment_names = ("lost", "abouttosleep", "recentcustomer", "loyalcustomer", "champions")
segment_bounds = [5, 10, 17, 25]
segment = [segment_names[bucket] for bucket in np.digitize(total_value, segment_bounds)]

dummy_data["segment"] = segment
dummy_data.head()

# Encode each segment name as a single numeric label (a label mapping, not a
# one-hot vector).
# NOTE(review): "loyalcustomer" maps to 3.0 and "recentcustomer" to 4.0, which
# is not the order of their score ranges -- confirm this is intended.
segment_codes = {
    "lost": 1.0,
    "abouttosleep": 2.0,
    "loyalcustomer": 3.0,
    "recentcustomer": 4.0,
    "champions": 5.0,
}
dummy_data["segment"] = dummy_data["segment"].map(segment_codes)


# In[11]:


dummy_data.head()


# In[12]:


# Cast the three feature columns to float
feature_columns = ["Frequency", "Monetary", "Recency"]
dummy_data[feature_columns] = dummy_data[feature_columns].astype(float)
dummy_data.head()


# In[13]:


# Model inputs: the three RFM feature columns (Frequency, Monetary, Recency).
# Target: the encoded segment label.

feature_columns = ["Frequency", "Monetary", "Recency"]
input_data = dummy_data[feature_columns]
output_data = dummy_data["segment"]

input_data.head()


# In[14]:


output_data.head()


# ## Split the dataset into train and test sets

# In[15]:


# Positional split without shuffling: the first 7000 rows train, the rest test
n_train = 7000
train_X, test_X = input_data.iloc[:n_train], input_data.iloc[n_train:]
train_Y, test_Y = output_data.iloc[:n_train], output_data.iloc[n_train:]

print(train_X.shape, train_Y.shape, test_X.shape, test_Y.shape)

# Work with plain NumPy arrays from here on
train_X, test_X = train_X.values, test_X.values
train_Y, test_Y = train_Y.values, test_Y.values


import tensorflow as tf


# In[26]:


from tensorflow.contrib.factorization import KMeans

# Graph inputs: rows of 3 features, and one label column per row
with tf.name_scope("input_variables"):
    data = tf.placeholder(tf.float32, shape=[None, 3], name="data")
    target = tf.placeholder(tf.float32, shape=[None, 1], name="target")

# Hyper-parameters
epochs = 500
num_classes = 5
num_clusters = 5
batch_size = 64  # not referenced anywhere in the visible code

# Mini-batch K-Means over the `data` placeholder using cosine distance
with tf.name_scope("KMeans_Architecture"):
    Kmeans = KMeans(
        inputs=data,
        num_clusters=num_clusters,
        distance_metric='cosine',
        use_mini_batch=True,
    )

# Build the training graph. The returned tuple is unpacked as 7 entries
# (including the cluster-centre variable) when it has more than 6, and as
# 6 entries otherwise -- presumably this differs between TensorFlow versions.
training_graph = Kmeans.training_graph()

has_center_var = len(training_graph) > 6
if has_center_var:
    (all_scores, cluster_idx, scores, cluster_centers_initialized,
     cluster_center_var, init_op, train_op) = training_graph
else:
    (all_scores, cluster_idx, scores, cluster_centers_initialized,
     init_op, train_op) = training_graph

# Only the first entry of cluster_idx is used from here on
cluster_idx = cluster_idx[0]
avg_distance = tf.reduce_mean(scores)

# Create the variable-initialiser op and open a session
init_vars = tf.global_variables_initializer()
sess = tf.Session()

# Initialise the variables, then run the K-Means init op, both fed the training rows
train_feed = {data: train_X}
sess.run(init_vars, feed_dict=train_feed)
sess.run(init_op, feed_dict=train_feed)

# Saver used to checkpoint the model while training
saver = tf.train.Saver()

# Training: each step runs one update fed with the whole training set
checkpoint_prefix = "customer_segmentation_saved_model/model.ckpt"
for i in range(1, epochs+1):
    _, d, idx = sess.run([train_op, avg_distance, cluster_idx], feed_dict=train_feed)

    # Report and checkpoint only on the first step and on every 100th step
    if i != 1 and i % 100 != 0:
        continue

    print("step:{}, Avg-Distance:{}".format(i,d))
    save_path = saver.save(sess, checkpoint_prefix)
    print("Model in saved in the path:{dir}".format(dir=save_path))

# Assign a segment label to each centroid.
# counts[c, k] = number of training rows placed in cluster c whose label is k + 1.
# train_Y holds one scalar label (1.0..5.0) per row, not a one-hot row, so each
# training row must increment exactly one cell. Adding the scalar to the whole
# row made every column equal, and argmax then always picked column 0.
class_idx = train_Y.astype(np.int64) - 1
counts = np.zeros(shape=(num_clusters, num_classes))
np.add.at(counts, (idx, class_idx), 1)

# Most frequent label per centroid, shifted back to the 1..num_classes label values
lables_map = [int(np.argmax(c)) + 1 for c in counts]
lables_map = tf.convert_to_tensor(lables_map, dtype=tf.int32)

# Lookup: centroid_id -> label
cluster_label = tf.nn.embedding_lookup(lables_map, cluster_idx)

# Compute accuracy. `target` is a single column of label values, so flatten it
# instead of taking argmax over its only column (which is always 0).
true_label = tf.cast(tf.reshape(target, [-1]), tf.int32)
correct_prediction = tf.equal(cluster_label, true_label)
accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

print(correct_prediction)


# In[27]:


# Test labels as a single column, matching the [None, 1] shape of `target`
dup = test_Y.reshape(-1,1)
dup


# In[28]:


# Accuracy on the held-out rows
test_accuracy = sess.run(accuracy_op, feed_dict={data: test_X, target: dup})
print("Test Accuracy:{}".format(test_accuracy))


# ## Inspect what all variables are stored in the check point ##

# In[30]:


# import the inspect_checkpoint library
from tensorflow.python.tools import inspect_checkpoint as chkp

# Dump every tensor stored in the checkpoint
checkpoint_prefix = "customer_segmentation_saved_model/model.ckpt"
chkp.print_tensors_in_checkpoint_file(checkpoint_prefix, tensor_name='', all_tensors=True)


# In[31]:


# Look up a single tensor by name.
# NOTE(review): 'data' is the name given to the input placeholder in this
# script -- confirm a tensor with that name is actually stored in the checkpoint.
chkp.print_tensors_in_checkpoint_file(checkpoint_prefix, tensor_name='data', all_tensors=False)


# ## Do prediction in the Test data ##
# Generate 500 random rows of test data, each score an integer in 1..9

n_test_rows = 500
test_recency = np.random.randint(low=1, high=10, size=n_test_rows)
test_monetary = np.random.randint(low=1, high=10, size=n_test_rows)
test_frequency = np.random.randint(low=1, high=10, size=n_test_rows)

test_df = pd.DataFrame(dict(
    Recency=test_recency,
    Monetary=test_monetary,
    Frequency=test_frequency,
))
test_df.head()


# In[33]:


test_df.values


# In[37]:


## Restore the saved model into a fresh session
t_sess = tf.Session()
# Step 1: the graph already exists in this process; the Saver adds the save/restore ops.
saver = tf.train.Saver()
# Step 2: load the checkpointed variable values into the new session.
saver.restore(t_sess, 'customer_segmentation_saved_model/model.ckpt')

# The restored variables belong to the default graph
graph = tf.get_default_graph()

# Look the tensors up by the names they were actually given. A hard-coded
# "input_variables_3/..." scope only exists after the graph-building code has
# been re-run several times in one process; on a fresh run the lookup fails.
y_pred = graph.get_tensor_by_name(cluster_label.name)
x = graph.get_tensor_by_name(data.name)

# Feed the features in the column order used for training
# (Frequency, Monetary, Recency); test_df stores them in a different order.
feed_dict_testing = {x: test_df[["Frequency", "Monetary", "Recency"]].values}

# Predict one segment label per row. accuracy_op cannot be evaluated here: it
# depends on the `target` placeholder and these random rows carry no labels.
result = t_sess.run(y_pred, feed_dict=feed_dict_testing)
print(result)

# Release the session's resources
t_sess.close()

	import pandas as pd
	import numpy as np

	# Build a synthetic RFM table: one row per user, each of the three scores
	# drawn uniformly from the integers 1..9 (randint's `high` is exclusive).
	n_users = 10000
	user_id = list(range(n_users))
	recency = np.random.randint(low=1, high=10, size=n_users)
	monetary = np.random.randint(low=1, high=10, size=n_users)
	frequency = np.random.randint(low=1, high=10, size=n_users)

	# Per-user sum of the three scores
	total_value = recency + monetary + frequency

	# Assemble the frame; total_value is appended as the last column
	dummy_data = pd.DataFrame({
		'user_id': user_id,
		'Recency': recency,
		'Monetary': monetary,
		'Frequency': frequency,
	})
	dummy_data["total_value"] = total_value

	# Notebook-style preview; a no-op when executed as a plain script
	dummy_data.head()

	# Bucket every user's total score into a named segment.
	# np.digitize counts how many lower bounds a value has reached, and that
	# count indexes the names:
	#   < 5 lost | 5..9 abouttosleep | 10..16 recentcustomer
	#   17..24 loyalcustomer | >= 25 champions
	segment_names = ("lost", "abouttosleep", "recentcustomer", "loyalcustomer", "champions")
	segment_bounds = [5, 10, 17, 25]
	segment = [segment_names[bucket] for bucket in np.digitize(total_value, segment_bounds)]

	dummy_data["segment"] = segment
	dummy_data.head()

	# Encode each segment name as a single numeric label (a label mapping, not
	# a one-hot vector).
	# NOTE(review): "loyalcustomer" maps to 3.0 and "recentcustomer" to 4.0 --
	# confirm this ordering is intended.
	segment_codes = {
		"lost": 1.0,
		"abouttosleep": 2.0,
		"loyalcustomer": 3.0,
		"recentcustomer": 4.0,
		"champions": 5.0,
	}
	dummy_data["segment"] = dummy_data["segment"].map(segment_codes)


	# In[11]:


	dummy_data.head()


	# In[12]:


	# Cast the three feature columns to float
	feature_columns = ["Frequency", "Monetary", "Recency"]
	dummy_data[feature_columns] = dummy_data[feature_columns].astype(float)
	dummy_data.head()


	# In[13]:


	# Model inputs: the three RFM feature columns (Frequency, Monetary, Recency).
	# Target: the encoded segment label.

	feature_columns = ["Frequency", "Monetary", "Recency"]
	input_data = dummy_data[feature_columns]
	output_data = dummy_data["segment"]

	input_data.head()


	# In[14]:


	output_data.head()


	# ## Split the dataset into train and test sets

	# In[15]:


	# Positional split without shuffling: the first 7000 rows train, the rest test
	n_train = 7000
	train_X, test_X = input_data.iloc[:n_train], input_data.iloc[n_train:]
	train_Y, test_Y = output_data.iloc[:n_train], output_data.iloc[n_train:]

	print(train_X.shape, train_Y.shape, test_X.shape, test_Y.shape)

	# Work with plain NumPy arrays from here on
	train_X, test_X = train_X.values, test_X.values
	train_Y, test_Y = train_Y.values, test_Y.values



	import tensorflow as tf


	# In[26]:


	from tensorflow.contrib.factorization import KMeans

	# Graph inputs: rows of 3 features, and one label column per row
	with tf.name_scope("input_variables"):
		data = tf.placeholder(tf.float32, shape=[None, 3], name="data")
		target = tf.placeholder(tf.float32, shape=[None, 1], name="target")

	# Hyper-parameters
	epochs = 500
	num_classes = 5
	num_clusters = 5
	batch_size = 64  # not referenced anywhere in the visible code

	# Mini-batch K-Means over the `data` placeholder using cosine distance
	with tf.name_scope("KMeans_Architecture"):
		Kmeans = KMeans(
			inputs=data,
			num_clusters=num_clusters,
			distance_metric='cosine',
			use_mini_batch=True,
		)

	# Build the training graph. The returned tuple is unpacked as 7 entries
	# (including the cluster-centre variable) when it has more than 6, and as
	# 6 entries otherwise -- presumably this differs between TensorFlow versions.
	training_graph = Kmeans.training_graph()

	if len(training_graph) > 6:
		(all_scores, cluster_idx, scores, cluster_centers_initialized,
			cluster_center_var, init_op, train_op) = training_graph
	else:
		(all_scores, cluster_idx, scores, cluster_centers_initialized,
			init_op, train_op) = training_graph

	# Only the first entry of cluster_idx is used from here on
	cluster_idx = cluster_idx[0]
	avg_distance = tf.reduce_mean(scores)

	# Create the variable-initialiser op and open a session
	init_vars = tf.global_variables_initializer()
	sess = tf.Session()

	# Initialise the variables, then run the K-Means init op, both fed the training rows
	train_feed = {data: train_X}
	sess.run(init_vars, feed_dict=train_feed)
	sess.run(init_op, feed_dict=train_feed)

	# Saver used to checkpoint the model while training
	saver = tf.train.Saver()

	# Training: each step runs one update fed with the whole training set
	checkpoint_prefix = "customer_segmentation_saved_model/model.ckpt"
	for i in range(1, epochs+1):
		_, d, idx = sess.run([train_op, avg_distance, cluster_idx], feed_dict=train_feed)

		# Report and checkpoint only on the first step and on every 100th step
		if i != 1 and i % 100 != 0:
			continue

		print("step:{}, Avg-Distance:{}".format(i,d))
		save_path = saver.save(sess, checkpoint_prefix)
		print("Model in saved in the path:{dir}".format(dir=save_path))

	# Assign a segment label to each centroid.
	# counts[c, k] = number of training rows placed in cluster c whose label is k + 1.
	# train_Y holds one scalar label (1.0..5.0) per row, not a one-hot row, so
	# each training row must increment exactly one cell. Adding the scalar to
	# the whole row made every column equal, and argmax then always picked 0.
	class_idx = train_Y.astype(np.int64) - 1
	counts = np.zeros(shape=(num_clusters, num_classes))
	np.add.at(counts, (idx, class_idx), 1)

	# Most frequent label per centroid, shifted back to the 1..num_classes label values
	lables_map = [int(np.argmax(c)) + 1 for c in counts]
	lables_map = tf.convert_to_tensor(lables_map, dtype=tf.int32)

	# Lookup: centroid_id -> label
	cluster_label = tf.nn.embedding_lookup(lables_map, cluster_idx)

	# Compute accuracy. `target` is a single column of label values, so flatten
	# it instead of taking argmax over its only column (which is always 0).
	true_label = tf.cast(tf.reshape(target, [-1]), tf.int32)
	correct_prediction = tf.equal(cluster_label, true_label)
	accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

	print(correct_prediction)


	# In[27]:


	# Test labels as a single column, matching the [None, 1] shape of `target`
	dup = test_Y.reshape(-1,1)
	dup


	# In[28]:


	# Accuracy on the held-out rows
	test_accuracy = sess.run(accuracy_op, feed_dict={data: test_X, target: dup})
	print("Test Accuracy:{}".format(test_accuracy))


	# ## Inspect what all variables are stored in the check point ##

	# In[30]:


	# import the inspect_checkpoint library
	from tensorflow.python.tools import inspect_checkpoint as chkp

	# Dump every tensor stored in the checkpoint
	checkpoint_prefix = "customer_segmentation_saved_model/model.ckpt"
	chkp.print_tensors_in_checkpoint_file(checkpoint_prefix, tensor_name='', all_tensors=True)


	# In[31]:


	# Look up a single tensor by name.
	# NOTE(review): 'data' is the name given to the input placeholder in this
	# script -- confirm a tensor with that name is actually stored in the checkpoint.
	chkp.print_tensors_in_checkpoint_file(checkpoint_prefix, tensor_name='data', all_tensors=False)


	# ## Do prediction in the Test data ##
	# Generate 500 random rows of test data, each score an integer in 1..9

	n_test_rows = 500
	test_recency = np.random.randint(low=1, high=10, size=n_test_rows)
	test_monetary = np.random.randint(low=1, high=10, size=n_test_rows)
	test_frequency = np.random.randint(low=1, high=10, size=n_test_rows)

	test_df = pd.DataFrame(dict(
		Recency=test_recency,
		Monetary=test_monetary,
		Frequency=test_frequency,
	))
	test_df.head()


	# In[33]:


	test_df.values


	# In[37]:


	## Restore the saved model into a fresh session
	t_sess = tf.Session()
	# Step 1: the graph already exists in this process; the Saver adds the save/restore ops.
	saver = tf.train.Saver()
	# Step 2: load the checkpointed variable values into the new session.
	saver.restore(t_sess, 'customer_segmentation_saved_model/model.ckpt')

	# The restored variables belong to the default graph
	graph = tf.get_default_graph()

	# Look the tensors up by the names they were actually given. A hard-coded
	# "input_variables_3/..." scope only exists after the graph-building code
	# has been re-run several times in one process; on a fresh run the lookup fails.
	y_pred = graph.get_tensor_by_name(cluster_label.name)
	x = graph.get_tensor_by_name(data.name)

	# Feed the features in the column order used for training
	# (Frequency, Monetary, Recency); test_df stores them in a different order.
	feed_dict_testing = {x: test_df[["Frequency", "Monetary", "Recency"]].values}

	# Predict one segment label per row. accuracy_op cannot be evaluated here: it
	# depends on the `target` placeholder and these random rows carry no labels.
	result = t_sess.run(y_pred, feed_dict=feed_dict_testing)
	print(result)

	# Release the session's resources
	t_sess.close()