Skip to content

Instantly share code, notes, and snippets.

@accessnash
Created September 4, 2018 00:30
Show Gist options
  • Save accessnash/60268a69687259e8b5d7fbb14c1ab932 to your computer and use it in GitHub Desktop.
Save accessnash/60268a69687259e8b5d7fbb14c1ab932 to your computer and use it in GitHub Desktop.
Forward stepwise variable selection for logistic regression - Chapter 2 - Predictive Analytics - Datacamp
# Import the linear_model module and the roc_auc_score function
from sklearn import linear_model
from sklearn.metrics import roc_auc_score
# Two candidate predictor sets; the second is a superset of the first
variables_1 = ["mean_gift", "income_low"]
variables_2 = ["mean_gift", "income_low", "gender_F", "country_India", "age"]

# Select the target with single brackets so it is a 1-D Series.
# Double brackets give an (n, 1) DataFrame, which makes scikit-learn emit a
# DataConversionWarning ("column-vector y was passed") on every fit.
Y = basetable["target"]

# Fit on the first set of variables and assign the in-sample AUC to auc_1
X_1 = basetable[variables_1]
logreg = linear_model.LogisticRegression()
logreg.fit(X_1, Y)
# Second column of predict_proba (the positive class, assuming a 0/1 target)
predictions_1 = logreg.predict_proba(X_1)[:, 1]
auc_1 = roc_auc_score(Y, predictions_1)

# Refit the same estimator on the second set and assign the AUC to auc_2
X_2 = basetable[variables_2]
logreg.fit(X_2, Y)
predictions_2 = logreg.predict_proba(X_2)[:, 1]
auc_2 = roc_auc_score(Y, predictions_2)

# Print auc_1 and auc_2
print(round(auc_1, 2))
print(round(auc_2, 2))
def auc(variables, target, basetable):
    """Return the in-sample AUC of a logistic regression on ``variables``.

    The model is fitted on ``basetable[variables]`` and scored on those same
    rows, so the result describes fit on the data the model was trained on.

    Args:
        variables: list of predictor column names in ``basetable``.
        target: target column name, given either as a string or as a
            one-element list such as ``["target"]``.
        basetable: DataFrame holding the predictor and target columns.

    Returns:
        The value computed by ``roc_auc_score`` for the fitted model.
    """
    X = basetable[variables]
    # Flatten to 1-D: a list-valued ``target`` selects an (n, 1) frame, and
    # passing that to ``fit`` triggers a DataConversionWarning on every call.
    y = basetable[target].values.ravel()
    model = linear_model.LogisticRegression()
    model.fit(X, y)
    # Second column of predict_proba (the positive class, assuming 0/1 labels)
    positive_scores = model.predict_proba(X)[:, 1]
    return roc_auc_score(y, positive_scores)
def next_best(current_variables, candidate_variables, target, basetable):
    """Pick the candidate whose addition to the current set scores highest.

    Each candidate is appended in turn to ``current_variables`` and the
    resulting set is scored with ``auc``. When several candidates reach the
    same score, the one appearing last in ``candidate_variables`` is kept.
    Returns ``None`` when ``candidate_variables`` is empty.
    """
    winner = None
    top_score = -1
    for candidate in candidate_variables:
        trial_set = current_variables + [candidate]
        score = auc(trial_set, target, basetable)
        # ">=" (not ">") so that a later candidate wins a tie
        if score >= top_score:
            top_score, winner = score, candidate
    return winner
# Use next_best to find the more predictive variable among gender_F and age,
# given that the three gift variables are already in the model
next_variable = next_best(
    current_variables=["max_gift", "mean_gift", "min_gift"],
    candidate_variables=["gender_F", "age"],
    target=["target"],
    basetable=basetable,
)
print(next_variable)
# Forward stepwise selection: start from an empty model and repeatedly move
# the best-scoring candidate into the selected set.
target = ["target"]
max_number_variables = 10

# Every column except the target is a candidate predictor
candidate_variables = list(basetable.columns.values)
candidate_variables.remove("target")
current_variables = []

# Never ask for more variables than there are candidates
number_iterations = min(len(candidate_variables), max_number_variables)
for i in range(number_iterations):
    next_variable = next_best(current_variables, candidate_variables, target, basetable)
    current_variables.append(next_variable)
    candidate_variables.remove(next_variable)

# Selected variables, in the order they were added
print(current_variables)
import numpy

# AUC of a model that uses min_gift as its only predictor
auc_min_gift = auc(variables=["min_gift"], target=["target"], basetable=basetable)
print(round(auc_min_gift, 2))

# AUC of a model that uses income_high as its only predictor
auc_income_high = auc(variables=["income_high"], target=["target"], basetable=basetable)
print(round(auc_income_high, 2))

# Correlation between min_gift and mean_gift: corrcoef returns a 2x2 matrix,
# and the off-diagonal entry [0, 1] is the coefficient between the two inputs
correlation = numpy.corrcoef(
    basetable["min_gift"],
    basetable["mean_gift"],
)[0, 1]
print(round(correlation, 2))
# Partitioning
# pandas is needed for pd.concat below; imported here so this section runs
# on its own.
import pandas as pd
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

# Create dataframes with variables and target.
# axis is passed by keyword: pandas 2.x no longer accepts it positionally.
X = basetable.drop("target", axis=1)
Y = basetable["target"]

# Carry out 50-50 partitioning, stratified on the target so both halves keep
# the same target incidence
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.50, stratify=Y)

# Create the final train and test basetables (predictors plus target column)
train = pd.concat([X_train, Y_train], axis=1)
test = pd.concat([X_test, Y_test], axis=1)

# Check whether train and test have the same percentage of targets
print(round(train["target"].mean(), 2))
print(round(test["target"].mean(), 2))
from sklearn import linear_model
from sklearn.metrics import roc_auc_score
def auc_train_test(variables, target, train, test):
    """Fit a logistic regression on ``train`` and return train and test AUC.

    Args:
        variables: list of predictor column names present in both frames.
        target: target column name, given either as a string or as a
            one-element list such as ``["target"]``.
        train: DataFrame the model is fitted on.
        test: DataFrame used only for scoring.

    Returns:
        A tuple ``(auc_train, auc_test)`` of ``roc_auc_score`` values.
    """
    X_train = train[variables]
    X_test = test[variables]
    # Flatten to 1-D: a list-valued ``target`` selects an (n, 1) frame, and
    # passing that to ``fit`` triggers a DataConversionWarning on every call.
    y_train = train[target].values.ravel()
    y_test = test[target].values.ravel()

    # Fit the model on train data only
    model = linear_model.LogisticRegression()
    model.fit(X_train, y_train)

    # Second column of predict_proba (the positive class, assuming 0/1 labels)
    predictions_train = model.predict_proba(X_train)[:, 1]
    predictions_test = model.predict_proba(X_test)[:, 1]

    # Calculate the AUC both on train and test data
    auc_train = roc_auc_score(y_train, predictions_train)
    auc_test = roc_auc_score(y_test, predictions_test)
    return auc_train, auc_test
# Train and test AUC of a model that uses age and gender_F
auc_train, auc_test = auc_train_test(
    variables=["age", "gender_F"],
    target=["target"],
    train=train,
    test=test,
)
print(round(auc_train, 2))
print(round(auc_test, 2))
# Variable order resulting from the forward stepwise variable selection procedure
variables = [
    'max_gift', 'time_since_last_gift', 'number_gift', 'mean_gift',
    'income_high', 'age', 'gender_F', 'time_since_first_gift', 'income_low',
    'country_UK', 'country_India', 'country_USA', 'min_gift',
]

# Train and test AUC after each variable is added
auc_values_train, auc_values_test = [], []

# Growing prefix of `variables` that the model is evaluated on
variables_evaluate = []
for v in variables:
    # Extend the evaluated set by one variable
    variables_evaluate += [v]
    # Score this set of variables on both partitions
    auc_train, auc_test = auc_train_test(variables_evaluate, ["target"], train, test)
    # Record both values
    auc_values_train += [auc_train]
    auc_values_test += [auc_test]
# Plot train and test AUC against the number of variables in the model
import matplotlib.pyplot as plt
import numpy as np

y_train = np.array(auc_values_train)
y_test = np.array(auc_values_test)
# One x position per evaluated variable set
x = np.array(range(len(auc_values_train)))

# Label each position with the variable added at that step
plt.xticks(x, variables, rotation=90)
plt.plot(x, y_train)
plt.plot(x, y_test)
# Fixed y-range of 0.7 to 0.8
plt.ylim((0.7, 0.8))
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment