Skip to content

Instantly share code, notes, and snippets.

@Madhivarman
Created August 20, 2018 10:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Madhivarman/e9e0ab7f5ef03b7f93ffbd62f6afb848 to your computer and use it in GitHub Desktop.
Save Madhivarman/e9e0ab7f5ef03b7f93ffbd62f6afb848 to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
# Build a synthetic RFM (recency / frequency / monetary) table, one row per user.
user_id = list(range(10000))
# np.random.randint excludes `high`, so every score is an integer in 1..9.
# The draw order (recency, monetary, frequency) is kept so that a seeded RNG
# produces the same table.
recency = np.random.randint(1, 10, 10000)
monetary = np.random.randint(1, 10, 10000)
frequency = np.random.randint(1, 10, 10000)

# Collect the id and the three scores into a single DataFrame.
dummy_data = pd.DataFrame(
    {
        'user_id': user_id,
        'Recency': recency,
        'Monetary': monetary,
        'Frequency': frequency,
    }
)
dummy_data.shape
dummy_data.head()

# Element-wise sum of the three scores, stored as an extra column.
total_value = recency + monetary + frequency
total_value
dummy_data["total_value"] = total_value
dummy_data.head()
# Bucket every total_value into a named customer segment:
#   < 5 -> lost, 5..9 -> abouttosleep, 10..16 -> recentcustomer,
#   17..24 -> loyalcustomer, >= 25 -> champions.
segment_names = ["lost", "abouttosleep", "recentcustomer", "loyalcustomer", "champions"]
# side="right" counts how many lower bounds are <= the value, so a total that
# sits exactly on a bound (e.g. 5) falls into the higher segment, matching the
# `>=` comparisons of the rule above.
segment_index = np.searchsorted([5, 10, 17, 25], total_value, side="right")
segment = [segment_names[k] for k in segment_index]
dummy_data["segment"] = segment
dummy_data.head()
# Replace each segment name with a numeric class code. This is a label
# encoding, not a one-hot encoding: the column stays one float per row.
# NOTE(review): the codes are not monotonic in total_value (recentcustomer=4.0
# but loyalcustomer=3.0) -- confirm this ordering is intended.
segment_codes = {
    "lost": 1.0,
    "abouttosleep": 2.0,
    "loyalcustomer": 3.0,
    "recentcustomer": 4.0,
    "champions": 5.0,
}
dummy_data["segment"] = dummy_data["segment"].apply(segment_codes.get)
# In[11]:
dummy_data.head()
# In[12]:
# Cast the three score columns to float.
score_columns = ["Frequency", "Monetary", "Recency"]
dummy_data[score_columns] = dummy_data[score_columns].astype(float)
dummy_data.head()
# In[13]:
# Features are the three scores {Frequency, Monetary, Recency};
# the target is the numeric segment code.
feature_columns = ["Frequency", "Monetary", "Recency"]
input_data = dummy_data[feature_columns]
output_data = dummy_data["segment"]
input_data.head()
# In[14]:
output_data.head()
# ## Split the dataset into train and test sets
# In[15]:
# Positional split without shuffling: the first 7000 rows train, the rest test.
train_rows, test_rows = slice(None, 7000), slice(7000, None)
train_X, train_Y = input_data.iloc[train_rows], output_data.iloc[train_rows]
test_X, test_Y = input_data.iloc[test_rows], output_data.iloc[test_rows]
print(train_X.shape, train_Y.shape, test_X.shape, test_Y.shape)
# Keep only the underlying NumPy arrays; these are what get fed to the graph.
train_X, test_X = train_X.values, test_X.values
train_Y, test_Y = train_Y.values, test_Y.values
import tensorflow as tf
# In[26]:
from tensorflow.contrib.factorization import KMeans
# Placeholders: `data` takes rows of 3 features, `target` a column of labels.
with tf.name_scope("input_variables"):
    data = tf.placeholder(tf.float32, shape=[None, 3], name="data")
    target = tf.placeholder(tf.float32, shape=[None, 1], name="target")

# Hyper-parameters
epochs = 500
num_classes = 5
num_clusters = 5
batch_size = 64

# KMeans model over the `data` placeholder, using cosine distance.
# NOTE(review): the source's indentation was lost; only the constructor call is
# assumed to sit inside this name scope -- confirm.
with tf.name_scope("KMeans_Architecture"):
    Kmeans = KMeans(
        inputs=data,
        num_clusters=num_clusters,
        distance_metric='cosine',
        use_mini_batch=True,
    )

# Build the training graph. training_graph() returns either six or seven
# items; the longer form carries an extra cluster_center_var entry.
training_graph = Kmeans.training_graph()
if len(training_graph) <= 6:
    (all_scores, cluster_idx, scores, cluster_centers_initialized,
     init_op, train_op) = training_graph
else:
    (all_scores, cluster_idx, scores, cluster_centers_initialized,
     cluster_center_var, init_op, train_op) = training_graph
# Only the first element of cluster_idx is used from here on.
cluster_idx = cluster_idx[0]
avg_distance = tf.reduce_mean(scores)

# Op that initializes every global variable
init_vars = tf.global_variables_initializer()

# Open the session and run both initializers with the training rows fed in.
sess = tf.Session()
sess.run(init_vars, feed_dict={data: train_X})
sess.run(init_op, feed_dict={data: train_X})

# Saver used to write the trained model to disk
saver = tf.train.Saver()

# Training: each step feeds the full training set; progress is printed on the
# first step and on every 100th.
for step in range(1, epochs + 1):
    _, mean_distance, idx = sess.run(
        [train_op, avg_distance, cluster_idx],
        feed_dict={data: train_X},
    )
    if step == 1 or step % 100 == 0:
        print("step:{}, Avg-Distance:{}".format(step, mean_distance))

# NOTE(review): assumed to run once after the loop (indentation was lost in the
# source) -- confirm it was not meant to checkpoint on every step.
save_path = saver.save(sess, "customer_segmentation_saved_model/model.ckpt")
print("Model in saved in the path:{dir}".format(dir=save_path))
# Assign a class label to each centroid.
# `idx` holds the cluster index of every training row from the last training
# step. Count, per cluster, how many of its rows belong to each class.
#
# train_Y holds one scalar code per row (1.0..5.0), not a one-hot vector, so
# the code is converted to a 0-based column index and only that single cell is
# incremented. Adding the scalar to the whole row (the previous behaviour) left
# every row of `counts` constant, so argmax returned 0 for every centroid.
class_index = train_Y.astype(np.int64) - 1
counts = np.zeros(shape=(num_clusters, num_classes))
# np.add.at accumulates correctly when the same (cluster, class) pair repeats.
np.add.at(counts, (idx, class_index), 1)
# Most frequent class per centroid; int32 so it compares against the int32
# class tensor built from `target` below.
lables_map = np.argmax(counts, axis=1).astype(np.int32)
lables_map = tf.convert_to_tensor(lables_map)
# Lookup: centroid_id -> 0-based class index
cluster_label = tf.nn.embedding_lookup(lables_map, cluster_idx)
# Compute accuracy. `target` is an (N, 1) column of codes 1..5, so argmax over
# axis 1 would always be 0; instead flatten it and shift to the same 0-based
# class index used for `counts`.
true_class = tf.cast(tf.reshape(target, [-1]), tf.int32) - 1
correct_prediction = tf.equal(cluster_label, true_class)
accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print(correct_prediction)
# In[27]:
# The `target` placeholder is declared with shape [None, 1], so the flat test
# label vector is reshaped into a single column before feeding.
dup = test_Y.reshape(-1, 1)
dup
# In[28]:
# Accuracy on the held-out rows
test_feed = {data: test_X, target: dup}
print("Test Accuracy:{}".format(sess.run(accuracy_op, feed_dict=test_feed)))
# ## Inspect which tensors are stored in the checkpoint ##
# In[30]:
# Checkpoint inspection helper from TensorFlow's tools package
from tensorflow.python.tools import inspect_checkpoint as chkp
checkpoint_path = "customer_segmentation_saved_model/model.ckpt"
# Print every tensor stored in the checkpoint file
chkp.print_tensors_in_checkpoint_file(checkpoint_path, tensor_name='', all_tensors=True)
# In[31]:
# Print only the tensor named 'data'.
# NOTE(review): 'data' is the name given to the input placeholder; confirm the
# checkpoint actually contains an entry under that name.
chkp.print_tensors_in_checkpoint_file(checkpoint_path, tensor_name='data', all_tensors=False)
# ## Do prediction in the Test data ##
# Generate 500 random rows of scores; np.random.randint excludes `high`, so
# each value is an integer in 1..9.
test_recency = np.random.randint(1, 10, 500)
test_monetary = np.random.randint(1, 10, 500)
test_frequency = np.random.randint(1, 10, 500)
test_df = pd.DataFrame(
    {
        'Recency': test_recency,
        'Monetary': test_monetary,
        'Frequency': test_frequency,
    }
)
test_df.head()
# In[33]:
test_df.values
# In[37]:
## Restore the saved model and predict a label for each new row.
# The context manager closes the session once the prediction has been fetched.
with tf.Session() as t_sess:
    # Step-1: the graph already exists in this process; a new Saver only adds
    # save/restore ops over its variables.
    saver = tf.train.Saver()
    # Step-2: load the trained weights from the checkpoint.
    saver.restore(t_sess, 'customer_segmentation_saved_model/model.ckpt')
    # Access the default graph that the restored variables belong to.
    graph = tf.get_default_graph()
    # Look tensors up by the names they really have. A hard-coded
    # "input_variables_3/..." name only exists after the defining notebook cell
    # has been re-run several times, and is absent on a clean run.
    x = graph.get_tensor_by_name(data.name)
    # Fetch the per-row predicted label. accuracy_op cannot be used here: it
    # depends on the `target` placeholder and this data has no labels to feed.
    y_pred = graph.get_tensor_by_name(cluster_label.name)
    # Feed the columns in the order the model was trained on
    # (Frequency, Monetary, Recency); test_df stores them in a different order.
    feed_dict_testing = {x: test_df[["Frequency", "Monetary", "Recency"]].values}
    # One predicted label per row of test_df
    result = t_sess.run(y_pred, feed_dict=feed_dict_testing)
print(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment