Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Compare the execution times of two one-hot encoding implementations: a vectorized NumPy version and a plain Python loop
import numpy as np
import matplotlib.pyplot as plt
import random
import time
def one_hot(Y):
    """One-hot encode a label array without a Python-level loop.

    Parameters
    ----------
    Y : array_like
        Class labels, given either flat ``(n,)`` or as a column ``(n, 1)``.
        Any other shape is flattened in row-major order. Labels may be any
        values ``np.unique`` can sort (negative ints, floats, strings, ...).

    Returns
    -------
    encoded : numpy.ndarray
        Float array of shape ``(n, num_classes)`` with a single 1 per row.
    class_col : numpy.ndarray
        Column vector of shape ``(num_classes, 1)`` holding the distinct
        labels in sorted order; column ``j`` of ``encoded`` corresponds to
        ``class_col[j]``.
    """
    labels = np.asarray(Y).reshape(-1)
    # return_inverse gives, for every label, its index in the sorted unique
    # array. That index is exactly the one-hot column, so no label -> column
    # lookup table sized by the largest label is needed (such a table breaks
    # on negative or non-integer labels).
    classes, columns = np.unique(labels, return_inverse=True)
    data_size = labels.shape[0]
    encoded = np.zeros((data_size, classes.shape[0]))
    # The inverse is flattened explicitly because its shape for
    # multi-dimensional input differs between NumPy versions.
    encoded[np.arange(data_size), columns.reshape(-1)] = 1
    return encoded, classes.reshape(-1, 1)
def one_hot_for(Y):
    """One-hot encode a label array with an explicit per-row Python loop.

    This is the loop-based counterpart of ``one_hot`` used as the baseline
    in the timing comparison.

    Parameters
    ----------
    Y : numpy.ndarray
        Class labels; iterated along its first axis.

    Returns
    -------
    encoded : numpy.ndarray
        Float array of shape ``(Y.shape[0], num_classes)`` with a 1 in the
        column of each row's class.
    classes : numpy.ndarray
        Column vector ``(num_classes, 1)`` of the sorted distinct labels.
    """
    classes = np.unique(Y).reshape(-1, 1)
    encoded = np.zeros((Y.shape[0], classes.shape[0]))
    for row, label in enumerate(Y):
        # Comparing against the class column marks the row of `classes`
        # that holds this label; that row index is the one-hot column.
        is_match = classes == label
        encoded[row, np.where(is_match)[0]] = 1
    return encoded, classes
# Benchmark driver: write random integer labels to a text file, read them
# back, time both encoders on growing prefixes of the data, and plot the
# two timing curves.
NUM_LABELS = 10000  # number of random labels written to the file
MAX_LABEL = 1000  # labels are drawn uniformly from 0..MAX_LABEL inclusive
STEP = 100  # prefix size grows by this many labels per measurement

# Generate a file of random labels, one integer per line.
file_name = "randoms.txt"
with open(file_name, "w") as random_labels:
    random_labels.writelines(
        str(random.randint(0, MAX_LABEL)) + "\n" for _ in range(NUM_LABELS)
    )

# Read the labels back as an (n, 1) integer column.
with open(file_name, "r") as f:
    Y = np.asarray([int(line) for line in f]).reshape(-1, 1)

one_hot_timings = []
one_hot_for_timings = []
for i in range(STEP, NUM_LABELS, STEP):
    # perf_counter is the high-resolution clock meant for measuring short
    # intervals; time.time() can be too coarse and may jump if the system
    # clock is adjusted.
    start = time.perf_counter()
    one_hot(Y[:i])
    one_hot_timings.append(time.perf_counter() - start)

    start = time.perf_counter()
    one_hot_for(Y[:i])
    one_hot_for_timings.append(time.perf_counter() - start)

# The x axis is the measurement index; measurement k used (k + 1) * STEP labels.
plt.plot(one_hot_timings, label="one_hot_vector")
plt.plot(one_hot_for_timings, label="one_hot_for")
plt.xlabel('data_size for every 100 datapoints')
plt.ylabel('time of execution')
plt.legend(loc='best')
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment