import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

def softmax(class_scores):
    """
    Calculate class probability distribution for each digit from given class scores.
    :param class_scores: class scores of your model
    :return: probability distribution
    """
    # subtract the row-wise maximum for numerical stability; softmax is
    # shift-invariant, so the result is unchanged (this also avoids mutating the input)
    class_scores = class_scores - np.max(class_scores, axis=1, keepdims=True)
    return np.exp(class_scores) / np.sum(np.exp(class_scores), axis=1, keepdims=True)
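# Quick sanity check (illustrative, not part of the original gist): every row of
# the softmax output is a probability distribution and must sum to 1, even for
# very large scores, thanks to the max subtraction above.
scores_demo = np.array([[1.0, 2.0, 3.0],
                        [1000.0, 1001.0, 1002.0]])
print(softmax(scores_demo).sum(axis=1))  # -> [1. 1.]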
def onehot_encode_label(label):
    """
    Support function to convert a label vector into a one-hot encoding matrix.
    :param label: array with shape (D,), D can be whatever you want
    :return: one-hot encoding matrix
    """
    # note: scikit-learn >= 1.2 renames the sparse argument to sparse_output
    onehot_encoder = OneHotEncoder(sparse=False)
    label = label.reshape(len(label), 1)
    onehot_encoded_label = onehot_encoder.fit_transform(label)
    return onehot_encoded_label
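# Illustrative aside: for integer labels 0..C-1, the same one-hot matrix can be
# built with plain numpy instead of scikit-learn.
labels_demo = np.array([0, 2, 1, 2])
print(onehot_encode_label(labels_demo))            # sklearn version
print(np.eye(labels_demo.max() + 1)[labels_demo])  # equivalent numpy one-liner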
def data_loss(class_probabilities, onehot_labels):
    """
    Compute the data loss L_i for the correct class with a one-hot encoded label.
    :param class_probabilities: probabilities from the softmax function
    :param onehot_labels: correct labels in one-hot encoded form
    :return: the data loss L_i
    """
    # the one-hot mask zeroes out every entry except -log(p) of the correct class
    return onehot_labels * -np.log(class_probabilities)
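# Worked example (illustrative): multiplying by the one-hot labels keeps only
# -log(p) of the correct class and zeroes out everything else.
p_demo = np.array([[0.7, 0.2, 0.1]])
y_demo = np.array([[1.0, 0.0, 0.0]])
print(data_loss(p_demo, y_demo))  # -> [[0.357 0. 0.]] since -log(0.7) ~ 0.357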
def class_scores(X, theta):
    """
    Compute the raw class scores of the linear model.
    Note: this helper is called below but was not defined in the original
    gist; a plain linear score X.dot(theta) is assumed here.
    :param X: data with shape (N, D)
    :param theta: learnable parameters with shape (D, C)
    :return: class scores with shape (N, C)
    """
    return np.dot(X, theta)

def loss(X, y, theta, lam):
    """
    :param X: data
    :param y: label of the data
    :param theta: learnable parameters
    :param lam: regularization factor
    :return: loss and gradient as a tuple
    """
    # note: the one-hot matrix has one column per class present in y, so every
    # batch must contain all classes for the shapes to match theta
    encoded_labels = onehot_encode_label(y)  # also needed for the gradient, therefore computed separately
    probabilities = softmax(class_scores(X, theta))  # also needed for the gradient, therefore computed separately
    loss_Li = data_loss(probabilities, encoded_labels)
    m = X.shape[0]  # number of training samples, for normalization
    l2_regularization = (lam / 2) * np.sum(theta * theta)  # regularization loss
    loss = np.sum(loss_Li) / m + l2_regularization
    # gradients of the regularization term and of the cross-entropy data term
    dl2 = lam * theta
    dloss = np.dot(X.T, (probabilities - encoded_labels) / m)
    gradient = dloss + dl2
    return loss, gradient
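# Optional sanity check (illustrative, not part of the original gist): compare
# the analytic gradient from loss() with a numerical central-difference
# estimate on a tiny random problem. The two should agree to roughly 1e-7.
def numerical_gradient(X, y, theta, lam, eps=1e-5):
    numeric = np.zeros_like(theta)
    for idx in np.ndindex(*theta.shape):
        old = theta[idx]
        theta[idx] = old + eps
        l_plus, _ = loss(X, y, theta, lam)
        theta[idx] = old - eps
        l_minus, _ = loss(X, y, theta, lam)
        theta[idx] = old  # restore the original value
        numeric[idx] = (l_plus - l_minus) / (2 * eps)
    return numeric

rng = np.random.RandomState(0)
X_tiny = rng.randn(6, 5)
y_tiny = np.array([0, 1, 2, 0, 1, 2])  # every class present, so the one-hot width matches theta
theta_tiny = rng.randn(5, 3) * 0.01
_, analytic = loss(X_tiny, y_tiny, theta_tiny, lam=0.1)
numeric = numerical_gradient(X_tiny, y_tiny, theta_tiny, lam=0.1)
print('max gradient check error:', np.max(np.abs(numeric - analytic)))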
def sgd(training_data, training_label, theta, lam=0.5, iterations=100, learning_rate=1e-5, batch_size=256):
    losses = []
    for i in range(iterations):
        # draw a fresh random mini-batch each iteration
        shuffle_index = np.random.permutation(training_data.shape[0])
        data, label = training_data[shuffle_index], training_label[shuffle_index]
        data, label = data[:batch_size], label[:batch_size]
        l, grad = loss(data, label, theta, lam)
        losses.append(l)
        theta -= learning_rate * grad
    return theta, losses
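# Illustrative variant (not in the original gist): sgd() above draws an
# independent random mini-batch every iteration, so some samples may be seen
# several times before others are seen at all. The sketch below instead walks
# through one shuffled permutation per epoch, so every sample is used exactly
# once per epoch. As with sgd(), each batch must contain every class, or the
# one-hot matrix in loss() will be narrower than theta.
def sgd_epochs(training_data, training_label, theta, lam=0.5, epochs=5,
               learning_rate=1e-5, batch_size=256):
    losses = []
    n = training_data.shape[0]
    for _ in range(epochs):
        perm = np.random.permutation(n)
        for start in range(0, n, batch_size):
            batch = perm[start:start + batch_size]
            l, grad = loss(training_data[batch], training_label[batch], theta, lam)
            losses.append(l)
            theta -= learning_rate * grad
    return theta, losses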
# Initialize learnable parameters theta
theta = np.zeros([X_train.shape[1], len(np.unique(y_train))])
# Start optimization with training data, theta and optional hyperparameters
opt_model, loss_history = sgd(X_train, y_train, theta, iterations=250)

# evaluation
print('last iteration loss:', loss_history[-1])
print('first iteration loss:', loss_history[0])
# with theta initialized to zeros, every class has probability 1/10, so the
# first loss should be ln(10); if this prints False you have an implementation error
print('Is the first loss equal to ln(10)?', np.abs(np.log(10) - loss_history[0]) < 1e-6)
# plot a loss curve
plt.plot(loss_history)
plt.ylabel('loss')
plt.xlabel('iterations')
plt.show()

# plot weights: each column of opt_model is reshaped back into a 28x28 image
plt.figure(figsize=(20, 20))
num_classes = 10
for c in range(num_classes):
    f = plt.subplot(1, num_classes, c + 1)
    f.axis('off')
    plt.imshow(np.reshape(opt_model[:, c], [28, 28]))
plt.show()
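# Evaluation sketch (illustrative, not in the original gist): predict by taking
# the argmax over the class scores and measure accuracy on the training data.
def predict(X, theta):
    return np.argmax(class_scores(X, theta), axis=1)

print('training accuracy:', np.mean(predict(X_train, opt_model) == y_train))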