Skip to content

Instantly share code, notes, and snippets.

@ksv-muralidhar
Created February 16, 2021 05:52
Show Gist options
  • Save ksv-muralidhar/bf712add6b2ee383c9bc9476b1435539 to your computer and use it in GitHub Desktop.
Learning Curve
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score, learning_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
#The function below builds the model and returns cross validation scores, train score and learning curve data
def learn_curve(X, y, c):
    """Build a scaled logistic-regression model and return its CV scores,
    training score, and learning-curve data.

    param X: Matrix of input features
    param y: Vector of target/label values
    param c: Inverse regularization strength to control overfitting
             (high value causes overfitting, low value causes underfitting)
    return: dict with keys "cv_scores" (per-fold CV accuracies),
            "train_score" (accuracy on the full training set) and
            "learning_curve" (long-format DataFrame of train/validation
            misclassification rate vs. training-set size)
    """
    # We aren't splitting the data into train and test because we use
    # StratifiedKFold CV. KFold CV is preferred over hold-out CV since the
    # model is tested on all the examples. Hold-out CV is preferred when the
    # model takes too long to train and we have a huge test set that truly
    # represents the universe.
    le = LabelEncoder()  # Label-encode the target
    sc = StandardScaler()  # Scale the input features
    y = le.fit_transform(y)
    log_reg = LogisticRegression(max_iter=200, random_state=11, C=c)
    # Pipeline with scaling and classification as steps; a pipeline is
    # required with KFold CV so scaling is fit only on each training fold.
    # NOTE: steps must be a list of (name, transformer) tuples — the original
    # tuple-of-lists form worked by accident.
    lr = Pipeline(steps=[("scaler", sc), ("classifier", log_reg)])
    cv = StratifiedKFold(n_splits=5, random_state=11, shuffle=True)
    cv_scores = cross_val_score(lr, X, y, scoring="accuracy", cv=cv)
    lr.fit(X, y)  # Fit on the full data for the training score
    train_score = lr.score(X, y)
    # Build the learning curve
    train_size, train_scores, test_scores = learning_curve(
        estimator=lr, X=X, y=y, cv=cv, scoring="accuracy", random_state=11
    )
    # Convert accuracy scores to misclassification rates (1 - accuracy)
    train_scores = 1 - np.mean(train_scores, axis=1)
    test_scores = 1 - np.mean(test_scores, axis=1)
    lc = pd.DataFrame(
        {
            "Training_size": train_size,
            "Training_loss": train_scores,
            "Validation_loss": test_scores,
        }
    ).melt(id_vars="Training_size")
    return {"cv_scores": cv_scores, "train_score": train_score, "learning_curve": lc}
# Well-regularized model (C = 1): expect train and validation losses to converge.
lc = learn_curve(X, y, 1)
wide_rule = "-" * 25
narrow_rule = "-" * 15
report = (
    f'Cross Validation Accuracies:\n{wide_rule}\n{list(lc["cv_scores"])}\n\n'
    f'Mean Cross Validation Accuracy:\n{wide_rule}\n{np.mean(lc["cv_scores"])}\n\n'
    f'Standard Deviation of Cross Validation Accuracy:\n{wide_rule}\n{np.std(lc["cv_scores"])}\n\n'
    f'Training Accuracy:\n{narrow_rule}\n{lc["train_score"]}\n\n'
)
print(report)
sns.lineplot(data=lc["learning_curve"], x="Training_size", y="value", hue="variable")
plt.title("Learning Curve of Good Fit Model")
plt.ylabel("Misclassification Rate/Loss");
# Very weak regularization (C = 10000): expect a large train/validation gap.
lc = learn_curve(X, y, 10000)
wide_rule = "-" * 25
narrow_rule = "-" * 15
report = (
    f'Cross Validation Accuracies:\n{wide_rule}\n{list(lc["cv_scores"])}\n\n'
    f'Mean Cross Validation Accuracy:\n{wide_rule}\n{np.mean(lc["cv_scores"])}\n\n'
    f'Standard Deviation of Cross Validation Accuracy:\n{wide_rule}\n{np.std(lc["cv_scores"])} (High Variance)\n\n'
    f'Training Accuracy:\n{narrow_rule}\n{lc["train_score"]}\n\n'
)
print(report)
sns.lineplot(data=lc["learning_curve"], x="Training_size", y="value", hue="variable")
plt.title("Learning Curve of an Overfit Model")
plt.ylabel("Misclassification Rate/Loss");
# Very strong regularization (C = 1/10000): expect both losses to stay high.
lc = learn_curve(X, y, 1 / 10000)
wide_rule = "-" * 25
narrow_rule = "-" * 15
report = (
    f'Cross Validation Accuracies:\n{wide_rule}\n{list(lc["cv_scores"])}\n\n'
    f'Mean Cross Validation Accuracy:\n{wide_rule}\n{np.mean(lc["cv_scores"])}\n\n'
    f'Standard Deviation of Cross Validation Accuracy:\n{wide_rule}\n{np.std(lc["cv_scores"])} (Low variance)\n\n'
    f'Training Accuracy:\n{narrow_rule}\n{lc["train_score"]}\n\n'
)
print(report)
sns.lineplot(data=lc["learning_curve"], x="Training_size", y="value", hue="variable")
plt.title("Learning Curve of an Underfit Model")
plt.ylabel("Misclassification Rate/Loss");
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment