Lecture #2: Logistic Regression Demo
from keras.datasets import imdb
import numpy as np

TOP_N_WORDS = 1_000

(x_train, y_train), (x_test, y_test) = imdb.load_data(
    num_words=TOP_N_WORDS
)
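
# Optional helper (not part of the original demo): decode a review back into
# words for inspection. This is a sketch that assumes imdb.load_data's default
# settings, where word indices are offset by 3 so that 0, 1, and 2 are reserved
# for padding, the start token, and out-of-vocabulary words.
def decode_review(word_sequence):
    word_index = imdb.get_word_index()
    index_to_word = {idx + 3: word for word, idx in word_index.items()}
    return ' '.join(index_to_word.get(idx, '?') for idx in word_sequence)
# Example: print(decode_review(x_train[0]))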

# Transform dataset from variable-length word sequences to a
# binary valued dense matrix.
new_x_train = np.zeros((len(x_train), TOP_N_WORDS + 1))
for example_idx, word_sequence in enumerate(x_train):
    for word_idx in word_sequence:
        new_x_train[example_idx, word_idx] = 1
# We'll use a dummy column 0 to apply an intercept theta_0 to our model.
# It will always have value 1.
new_x_train[:, 0] = 1.0
x_train = new_x_train

new_x_test = np.zeros((len(x_test), TOP_N_WORDS + 1))
for example_idx, word_sequence in enumerate(x_test):
    for word_idx in word_sequence:
        new_x_test[example_idx, word_idx] = 1
new_x_test[:, 0] = 1.0
x_test = new_x_test
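
# Optional sketch (not in the original demo): the same multi-hot encoding
# written with NumPy fancy indexing, which also avoids duplicating the loops
# above for the train and test sets.
def multi_hot(sequences, num_cols=TOP_N_WORDS + 1):
    encoded = np.zeros((len(sequences), num_cols))
    for row, word_sequence in enumerate(sequences):
        # Assign every word position in this review at once.
        encoded[row, word_sequence] = 1.0
    encoded[:, 0] = 1.0  # dummy intercept column
    return encoded
# Example (would replace the loops above): x_train = multi_hot(x_train)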

# The logistic (sigmoid) function squashes any real z into a probability in (0, 1).
def sigma(z):
    return 1 / (1 + np.exp(-z))

# Mean binary cross-entropy loss:
#   CE = -(1/N) * sum_j [ y_j * log(p_j) + (1 - y_j) * log(1 - p_j) ]
def ce_error(probs, correct_ys):
    loss_on_positives = -np.sum(
        correct_ys * np.log(probs)
    )
    loss_on_negatives = -np.sum(
        (1 - correct_ys) * np.log(1 - probs)
    )
    return (loss_on_positives + loss_on_negatives) / len(probs)

# Partial derivative of the (summed) cross-entropy with respect to theta_i:
#   dCE/dtheta_i = sum_j (p_j - y_j) * x_{j,i}
# Note this is a sum, not a mean, so the effective step size scales with the
# batch size; ce_error above reports the mean instead.
def deriv_ce_error_wrt_theta_i(correct_ys, x_is, probs):
    return np.sum(
        (probs - correct_ys) * x_is
    )

# Build the full gradient vector, one coordinate (theta_i) at a time.
def gradient(correct_ys, x_values, thetas):
    probs = probabilities(x_values, thetas)
    gradient = np.zeros(len(thetas))
    for i in range(0, len(thetas)):
        x_is = x_values[:, i]
        gradient[i] = (
            deriv_ce_error_wrt_theta_i(correct_ys, x_is, probs)
        )
    return gradient

# Model predictions: P(y = 1 | x) = sigma(theta . x).
def probabilities(x_values, thetas):
    return sigma(x_values.dot(thetas))

# Fraction of examples where thresholding the probability at 0.5 matches y.
def accuracy(probs, correct_ys):
    num_correct = len(probs) - np.sum(
        np.abs(correct_ys - (probs > 0.5))
    )
    return num_correct / len(probs)
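
# Optional sanity check (not in the original demo): compare the analytic
# gradient against a finite-difference estimate of ce_error on a small slice.
# Because gradient() sums over examples while ce_error() averages, the analytic
# value is divided by the number of examples before comparing.
def check_gradient(x_values, correct_ys, thetas, coords=range(5), eps=1e-5):
    analytic = gradient(correct_ys, x_values, thetas) / len(correct_ys)
    for i in coords:
        bumped_up, bumped_down = thetas.copy(), thetas.copy()
        bumped_up[i] += eps
        bumped_down[i] -= eps
        numeric = (
            ce_error(probabilities(x_values, bumped_up), correct_ys)
            - ce_error(probabilities(x_values, bumped_down), correct_ys)
        ) / (2 * eps)
        print(f'theta_{i}: analytic {analytic[i]:.6f} | numeric {numeric:.6f}')
# Example: check_gradient(x_train[:100], y_train[:100], np.zeros(x_train.shape[1]))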

NUM_EPOCHS = 5
NUM_EXAMPLES = x_train.shape[0]
BATCH_SIZE = 32
LEARNING_RATE = 0.01

thetas = np.zeros(x_train.shape[1])

# Initial performance.
probs = probabilities(x_train, thetas)
ce = ce_error(probs, y_train)
acc = accuracy(probs, y_train)
print(f'Epoch: {0} | CE: {ce:0.2f} | Acc: {acc:0.2f}')

print('beginning training')
for epoch_idx in range(1, NUM_EPOCHS + 1):
    for batch_start_idx in range(0, NUM_EXAMPLES, BATCH_SIZE):
        # The slice end is exclusive, so cap at NUM_EXAMPLES (not
        # NUM_EXAMPLES - 1) to avoid dropping the final example.
        batch_end_idx = min(batch_start_idx + BATCH_SIZE, NUM_EXAMPLES)
        x_batch = x_train[batch_start_idx:batch_end_idx, :]
        y_batch = y_train[batch_start_idx:batch_end_idx]
        thetas -= LEARNING_RATE * gradient(y_batch, x_batch, thetas)

    probs = probabilities(x_train, thetas)
    ce = ce_error(probs, y_train)
    acc = accuracy(probs, y_train)
    test_probs = probabilities(x_test, thetas)
    test_acc = accuracy(test_probs, y_test)
    print(
        f'Epoch: {epoch_idx} | '
        f'CE: {ce:0.2f} | '
        f'Acc: {acc:0.2f} | '
        f'Test Acc: {test_acc:0.2f}'
    )
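
For comparison, here is a rough sketch (not part of the original demo) of the same model expressed with Keras itself: a single Dense unit with a sigmoid activation, trained with SGD on binary cross-entropy. It assumes the multi-hot x_train / x_test matrices built above; since those already contain a dummy intercept column, the layer's own bias is redundant but harmless.

from keras.models import Sequential
from keras.layers import Dense

model = Sequential([
    Dense(1, activation='sigmoid', input_shape=(TOP_N_WORDS + 1,))
])
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(
    x_train, y_train,
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(x_test, y_test)
)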