Last active
February 18, 2022 13:45
-
-
Save labriedion/9e1480a22d2e259f460b56c84a01a186 to your computer and use it in GitHub Desktop.
Get Started with Deep Learning using Keras
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Get Started with Deep Learning using Keras (https://medium.com/@labriedion/get-started-with-deep-learning-using-keras-a45ee421f3ef)
# Étienne Labrie-Dion
""" This script will train a neural network to predict salaries
based on variables such as occupation, relationship status, the age of the person
and his or her education level. """
# Prepare all the packages this script will need. | |
import pandas as pd | |
import numpy as np | |
from sklearn.preprocessing import MinMaxScaler | |
from sklearn.metrics import confusion_matrix | |
from keras.models import Sequential | |
from keras.layers import Dense, Dropout | |
from keras.callbacks import ModelCheckpoint | |
# Column names for both CSV files; the data itself does not provide any.
columns = [
    'Age', 'Workclass', 'Id', 'Education', 'Education Num', 'Marital Status',
    'Occupation', 'Relationship', 'Race', 'Sex', 'Capital Gain', 'Capital Loss',
    'Hours/Week', 'Country', 'Salary',
]

# Load both datasets.
df_train = pd.read_csv('adult-training.csv', names=columns)
# header=0 together with names= makes pandas discard the first line of the
# test file and use `columns` instead; .loc[1:, :] then also drops the row
# labelled 0.
# NOTE(review): if the test file has only one non-data line at the top, this
# throws away one real sample -- confirm against the actual file.
# .copy() gives df_test its own data, so assigning the Salary column below
# is not a write into a slice of another frame (SettingWithCopyWarning).
df_test = pd.read_csv('adult-test.csv', header=0, names=columns).loc[1:, :].copy()

# Prepare the salary column for classification:
# Salary is 1 if higher than 50K, 0 otherwise.
# The compared labels start with a space, and the test label ends with '.'.
df_train['Salary'] = np.where(df_train['Salary'] == ' >50K', 1, 0)
df_test['Salary'] = np.where(df_test['Salary'] == ' >50K.', 1, 0)
# Look at the share of high salaries to decide whether the classes need
# balancing. As a bare expression this only shows a value in an interactive
# session (notebook/REPL); run as a script it prints nothing.
df_train['Salary'].mean()
# The author measured a train mean of 0.2408, i.e. roughly one high salary
# for every three low ones, so class 1 is weighted three times as much.
class_weights = {0: 1, 1: 3}
# Produce dummy variables for all categories
# (one-hot encoded: each category becomes its own column).
df_train = pd.get_dummies(df_train)
cols = df_train.columns
df_test = pd.get_dummies(df_test)
# Give the test frame exactly the training columns, in the same order.
# (Usually both datasets would be encoded as one dataframe and this
# wouldn't be necessary.)
# reindex() creates any column that is absent from the test frame as NaN,
# whereas .loc[:, cols] raises KeyError for missing labels on pandas >= 1.0.
# fillna(0) then turns those NaN values into 0.
df_test = df_test.reindex(columns=cols).fillna(0)
# Build the feature matrices: keep every column except the target
# ('Salary') and 'Id', which are not wanted as model inputs.
non_feature_cols = ['Salary', 'Id']
X_train = df_train.loc[:, ~df_train.columns.isin(non_feature_cols)].values
X_test = df_test.loc[:, ~df_test.columns.isin(non_feature_cols)].values
# Extract the targets into y.
y_train = df_train['Salary'].values
y_test = df_test['Salary'].values
# Scale the input variables, which helps train the neural network.
# The scaler is fitted on the training features only and then applied,
# with the same parameters, to both the training and the test features.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
# A checkpoint allows you to save the ideal weights during training.
# Here we save the weights that give us the best validation accuracy.
# The monitored name must match the name the metric is logged under: the
# model below is compiled with metrics=['acc'], so the validation metric is
# logged as 'val_acc'. (With metrics=['accuracy'], newer Keras releases log
# 'val_accuracy' instead; the checkpoint would then never save and the
# load_weights() call further down would fail.)
# The deprecated `period=1` argument is omitted; saving every epoch is the
# default.
checkpoint = ModelCheckpoint("weights.hdf5", monitor='val_acc',
                             verbose=1, save_best_only=True,
                             save_weights_only=True, mode='auto')

# Prepare the model.
# There are two ways to build models in Keras;
# here we use the Sequential interface, there's also the Functional API.
model = Sequential()
# Add a first Dense (fully-connected) layer.
# You always need to supply the input shape of your data in the first layer.
model.add(Dense(100, input_shape=[X_train.shape[1]], activation='relu'))
# Using dropout can prevent overfitting and should speed up the training.
model.add(Dropout(0.2))
# More layers to allow the network to learn more complex relationships.
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
# The last layer is the size of your output. In this case, we have 1 variable,
# the salary (0, 1). It's important to choose the right activation here for
# the task. Here, this is a classification task, so you need an activation
# function suited for classification. Sigmoid is suited for binary
# classification and softmax for multiple categorical classification.
# If you're doing regression, then you should do some research to find the
# right activation function for the task. Relu would work fine most of the time.
model.add(Dense(1, activation='sigmoid'))

# Compile and summarize the model. The optimizer is Adam, a common optimizer
# that works well in general.
# The loss function needs to be chosen carefully. For this classification
# task, only binary crossentropy makes sense.
# 'acc' is the short alias of the accuracy metric; see the checkpoint comment
# above for why the short name is used.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

# This is where the training starts. The training data is loaded here.
# The batch size specifies how many training examples are loaded in one
# iteration. Epochs are the number of training loops.
# You might need more or less, depending on your problem.
# Here I validate directly on the test data.
# This can lead to overfitting but is the fastest way to get good results.
# validation_data is passed as an (x, y) tuple, the form Keras documents;
# a list is not reliably unpacked into inputs and targets.
model.fit(X_train, y_train, batch_size=5000, epochs=300, verbose=1,
          validation_data=(X_test, y_test),
          callbacks=[checkpoint], class_weight=class_weights)

# Load the best weights saved during training and recompile the network.
model.load_weights("weights.hdf5")
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# Run the model to get the predictions.
y_pred = model.predict(X_test)
# To get a confusion matrix you need to bin your values.
# Here I chose the 0.5 threshold but different ones could suit other purposes.
y_pred_bin = np.where(y_pred > 0.5, 1, 0)
# Rows of the matrix are the true classes, columns the predicted classes.
matrix = confusion_matrix(y_test, y_pred_bin)
# Share of each true class that was predicted correctly, rounded to two
# decimals. The closing parenthesis of round() used to sit before the `2`,
# so the ratio was rounded to a whole number and `2` was printed as an
# extra argument.
# NOTE(review): dividing a diagonal entry by its row total is per-class
# recall, not precision; the labels are kept as the author wrote them.
print("Precision: Salaries below 50K",
      round(matrix[0, 0] / (matrix[0, 0] + matrix[0, 1]), 2))
print("Precision: Salaries higher than 50K",
      round(matrix[1, 1] / (matrix[1, 1] + matrix[1, 0]), 2))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment