# labriedion/get_started_with_deep_learning_using_keras.py

## get_started_with_deep_learning_using_keras.py
# Get Started with Deep Learning using Keras (https://medium.com/@labriedion/get-started-with-deep-learning-using-keras-a45ee421f3ef)
# Étienne Labrie-Dion

""" This script will train a neural network to predict salaries
based on variables such as occupation, relationship status, the age of the person
and his or her education level. """

# Prepare all the packages this script will need.
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ModelCheckpoint


# Data preparation: load the two CSV files, binarise the salary label,
# one-hot encode the categorical columns, align the test columns with the
# training columns and scale every feature with a MinMaxScaler.

# Column names are supplied by hand since there were none in the data.
columns = ['Age','Workclass','Id','Education','Education Num','Marital Status',
       'Occupation','Relationship','Race','Sex','Capital Gain','Capital Loss',
       'Hours/Week','Country','Salary']

# Load both datasets.
# NOTE(review): header=0 already discards the first line of the test file and
# .loc[1:, :] then also drops the row labelled 0 -- confirm against the actual
# file that no real record is lost here.
df_train = pd.read_csv('adult-training.csv', names=columns)
df_test = pd.read_csv('adult-test.csv', header=0, names=columns).loc[1:,:]

# Prepare the salary column for classification: 1 for the ">50K" label, 0 for
# anything else. The comparisons match the label strings exactly, including
# the leading space (and the trailing period used for the test labels).
df_train['Salary'] = np.where(df_train['Salary'] == ' >50K', 1, 0)
df_test['Salary'] = np.where(df_test['Salary'] == ' >50K.', 1, 0)

# Check the class balance. As a bare expression this only displayed a value
# in an interactive session, so it is printed explicitly.
print(df_train['Salary'].mean())

# The original author reports a training mean of 0.2408, i.e. roughly one
# high salary for every three low ones, hence the 1:3 class weights.
class_weights = {0: 1,
                 1: 3}

# Produce dummy variables for all categories (one-hot encoded, each category
# becomes its own column). `cols` records the training layout.
df_train = pd.get_dummies(df_train)
cols = df_train.columns
df_test = pd.get_dummies(df_test)

# Give the test frame exactly the training columns, in the same order.
# reindex() creates columns that are absent from the test data (filled with 0)
# instead of raising KeyError the way .loc does for missing labels on recent
# pandas. fillna(0) is kept so any other missing values are still zeroed.
df_test = df_test.reindex(columns=cols, fill_value=0)
df_test = df_test.fillna(0)

# Drop the Salary (target) and Id columns from the feature matrices.
X_train = df_train.drop(['Salary', 'Id'], axis=1).values
X_test = df_test.drop(['Salary', 'Id'], axis=1).values

# Scale the variables, which helps train the neural network. The scaler is
# fitted on the training features only and then applied to both sets.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Extract the targets into y.
y_train = df_train['Salary'].values
y_test = df_test['Salary'].values

# Checkpoint callback: writes the weights to "weights.hdf5" whenever the
# monitored validation accuracy improves, so the best epoch can be restored
# after training.
# NOTE(review): depending on the installed Keras version the validation
# accuracy may be logged as 'val_accuracy' rather than 'val_acc' -- confirm.
checkpoint = ModelCheckpoint(
    "weights.hdf5",
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
    mode='auto',
    period=1,
)

# Build the network with the Sequential interface (Keras also offers a
# functional API). The layers are handed over as one list:
#   * three fully-connected layers of 100 relu units, each followed by 20%
#     dropout to limit overfitting; the first one declares the input shape,
#     which is the number of feature columns in X_train;
#   * a single sigmoid unit as output, suited to the binary salary target
#     (softmax would be the choice for several categories).
model = Sequential([
    Dense(100, input_shape=[X_train.shape[1]], activation='relu'),
    Dropout(0.2),
    Dense(100, activation='relu'),
    Dropout(0.2),
    Dense(100, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

# Compile with the Adam optimizer and binary crossentropy, the loss that
# matches a single sigmoid output, then print the layer summary.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Training and evaluation: fit the model, restore the best checkpointed
# weights, predict on the test set and print per-class ratios taken from the
# confusion matrix.

# The batch size specifies how many training examples are loaded in one
# iteration; epochs are the number of passes over the training data.
# Validation runs directly on the test data. This can lead to overfitting to
# the test set but is the fastest way to get good results.
# validation_data is passed as a tuple, the documented form; a list is not
# reliably unpacked into (x, y) by newer Keras releases.
model.fit(X_train, y_train, batch_size=5000, epochs=300, verbose=1,
          validation_data=(X_test, y_test),
          callbacks=[checkpoint], class_weight=class_weights)

# Load the best weights saved during training and recompile the network.
model.load_weights("weights.hdf5")
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Run the model to get the predicted probabilities.
y_pred = model.predict(X_test)

# To get a confusion matrix the predictions have to be binned.
# 0.5 is the threshold chosen here; other values could suit other purposes.
# labels=[0, 1] keeps the matrix 2x2 even if one class never occurs.
y_pred_bin = np.where(y_pred > 0.5, 1, 0)
matrix = confusion_matrix(y_test, y_pred_bin, labels=[0, 1])

# Each ratio is the diagonal entry divided by its row total, rounded to two
# decimals (the rounding digits previously sat outside round() and were
# printed as a stray "2").
# NOTE(review): sklearn puts true labels on the rows, so these row-normalised
# ratios are per-class recall even though the output says "Precision" --
# confirm which metric is intended before changing the label.
print("Precision: Salaries below 50K", round(matrix[0,0] / (matrix[0,0] + matrix[0,1]), 2))
print("Precision: Salaries higher than 50K", round(matrix[1,1] / (matrix[1,1] + matrix[1,0]), 2))
	# Get Started with Deep Learning using Keras (https://medium.com/@labriedion/get-started-with-deep-learning-using-keras-a45ee421f3ef)
	# Étienne Labrie-Dion

	""" This script will train a neural network to predict salaries
	based on variables such as occupation, relationship status, the age of the person
	and his or her education level. """

	# Prepare all the packages this script will need.
	import pandas as pd
	import numpy as np
	from sklearn.preprocessing import MinMaxScaler
	from sklearn.metrics import confusion_matrix
	from keras.models import Sequential
	from keras.layers import Dense, Dropout
	from keras.callbacks import ModelCheckpoint


	# Data preparation: load the two CSV files, binarise the salary label,
	# one-hot encode the categorical columns, align the test columns with the
	# training columns and scale every feature with a MinMaxScaler.

	# Column names are supplied by hand since there were none in the data.
	columns = ['Age','Workclass','Id','Education','Education Num','Marital Status',
	'Occupation','Relationship','Race','Sex','Capital Gain','Capital Loss',
	'Hours/Week','Country','Salary']

	# Load both datasets.
	# NOTE(review): header=0 already discards the first line of the test file and
	# .loc[1:, :] then also drops the row labelled 0 -- confirm against the actual
	# file that no real record is lost here.
	df_train = pd.read_csv('adult-training.csv', names=columns)
	df_test = pd.read_csv('adult-test.csv', header=0, names=columns).loc[1:,:]

	# Prepare the salary column for classification: 1 for the ">50K" label, 0 for
	# anything else. The comparisons match the label strings exactly, including
	# the leading space (and the trailing period used for the test labels).
	df_train['Salary'] = np.where(df_train['Salary'] == ' >50K', 1, 0)
	df_test['Salary'] = np.where(df_test['Salary'] == ' >50K.', 1, 0)

	# Check the class balance. As a bare expression this only displayed a value
	# in an interactive session, so it is printed explicitly.
	print(df_train['Salary'].mean())

	# The original author reports a training mean of 0.2408, i.e. roughly one
	# high salary for every three low ones, hence the 1:3 class weights.
	class_weights = {0: 1,
	1: 3}

	# Produce dummy variables for all categories (one-hot encoded, each category
	# becomes its own column). `cols` records the training layout.
	df_train = pd.get_dummies(df_train)
	cols = df_train.columns
	df_test = pd.get_dummies(df_test)

	# Give the test frame exactly the training columns, in the same order.
	# reindex() creates columns that are absent from the test data (filled with 0)
	# instead of raising KeyError the way .loc does for missing labels on recent
	# pandas. fillna(0) is kept so any other missing values are still zeroed.
	df_test = df_test.reindex(columns=cols, fill_value=0)
	df_test = df_test.fillna(0)

	# Drop the Salary (target) and Id columns from the feature matrices.
	# (This replaces the mask expression whose "|" had been escaped as "\|",
	# which is not valid Python.)
	X_train = df_train.drop(['Salary', 'Id'], axis=1).values
	X_test = df_test.drop(['Salary', 'Id'], axis=1).values

	# Scale the variables, which helps train the neural network. The scaler is
	# fitted on the training features only and then applied to both sets.
	scaler = MinMaxScaler()
	scaler.fit(X_train)
	X_train = scaler.transform(X_train)
	X_test = scaler.transform(X_test)

	# Extract the targets into y.
	y_train = df_train['Salary'].values
	y_test = df_test['Salary'].values

	# Checkpoint callback: writes the weights to "weights.hdf5" whenever the
	# monitored validation accuracy improves, so the best epoch can be restored
	# after training.
	# NOTE(review): depending on the installed Keras version the validation
	# accuracy may be logged as 'val_accuracy' rather than 'val_acc' -- confirm.
	checkpoint = ModelCheckpoint(
		"weights.hdf5",
		monitor='val_acc',
		verbose=1,
		save_best_only=True,
		save_weights_only=True,
		mode='auto',
		period=1,
	)

	# Build the network with the Sequential interface (Keras also offers a
	# functional API). The layers are handed over as one list:
	#   * three fully-connected layers of 100 relu units, each followed by 20%
	#     dropout to limit overfitting; the first one declares the input shape,
	#     which is the number of feature columns in X_train;
	#   * a single sigmoid unit as output, suited to the binary salary target
	#     (softmax would be the choice for several categories).
	model = Sequential([
		Dense(100, input_shape=[X_train.shape[1]], activation='relu'),
		Dropout(0.2),
		Dense(100, activation='relu'),
		Dropout(0.2),
		Dense(100, activation='relu'),
		Dropout(0.2),
		Dense(1, activation='sigmoid'),
	])

	# Compile with the Adam optimizer and binary crossentropy, the loss that
	# matches a single sigmoid output, then print the layer summary.
	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
	model.summary()

	# Training and evaluation: fit the model, restore the best checkpointed
	# weights, predict on the test set and print per-class ratios taken from the
	# confusion matrix.

	# The batch size specifies how many training examples are loaded in one
	# iteration; epochs are the number of passes over the training data.
	# Validation runs directly on the test data. This can lead to overfitting to
	# the test set but is the fastest way to get good results.
	# validation_data is passed as a tuple, the documented form; a list is not
	# reliably unpacked into (x, y) by newer Keras releases.
	model.fit(X_train, y_train, batch_size=5000, epochs=300, verbose=1,
		validation_data=(X_test, y_test),
		callbacks=[checkpoint], class_weight=class_weights)

	# Load the best weights saved during training and recompile the network.
	model.load_weights("weights.hdf5")
	model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

	# Run the model to get the predicted probabilities.
	y_pred = model.predict(X_test)

	# To get a confusion matrix the predictions have to be binned.
	# 0.5 is the threshold chosen here; other values could suit other purposes.
	# labels=[0, 1] keeps the matrix 2x2 even if one class never occurs.
	y_pred_bin = np.where(y_pred > 0.5, 1, 0)
	matrix = confusion_matrix(y_test, y_pred_bin, labels=[0, 1])

	# Each ratio is the diagonal entry divided by its row total, rounded to two
	# decimals (the rounding digits previously sat outside round() and were
	# printed as a stray "2").
	# NOTE(review): sklearn puts true labels on the rows, so these row-normalised
	# ratios are per-class recall even though the output says "Precision" --
	# confirm which metric is intended before changing the label.
	print("Precision: Salaries below 50K", round(matrix[0,0] / (matrix[0,0] + matrix[0,1]), 2))
	print("Precision: Salaries higher than 50K", round(matrix[1,1] / (matrix[1,1] + matrix[1,0]), 2))