Skip to content

Instantly share code, notes, and snippets.

@ksv-muralidhar
Created February 16, 2021 05:52
Show Gist options
  • Save ksv-muralidhar/bf712add6b2ee383c9bc9476b1435539 to your computer and use it in GitHub Desktop.
Learning Curve
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score, learning_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
#The function below builds the model and returns cross validation scores, train score and learning curve data
def learn_curve(X, y, c):
    """Build a scaled logistic-regression model and return its CV scores,
    training score, and learning-curve data.

    param X: Matrix of input features
    param y: Vector of target/label values
    param c: Inverse regularization strength to control overfitting
             (high value causes overfitting, low value causes underfitting)
    return: dict with keys "cv_scores" (per-fold CV accuracies),
            "train_score" (accuracy on the full training set) and
            "learning_curve" (long-format DataFrame of train/validation
            misclassification rate vs. training-set size)
    """
    # We aren't splitting the data into train and test because we use
    # StratifiedKFold CV. KFold CV is preferred over hold-out CV since the
    # model is tested on all the examples. Hold-out CV is preferred when the
    # model takes too long to train and we have a huge test set that truly
    # represents the universe.
    le = LabelEncoder()  # Label-encode the target
    sc = StandardScaler()  # Scale the input features
    y = le.fit_transform(y)
    log_reg = LogisticRegression(max_iter=200, random_state=11, C=c)
    # Pipeline with scaling and classification as steps; a pipeline is
    # required with KFold CV so scaling is fit only on each training fold.
    # NOTE: steps must be a list of (name, transformer) tuples — the original
    # tuple-of-lists form worked by accident.
    lr = Pipeline(steps=[("scaler", sc), ("classifier", log_reg)])
    cv = StratifiedKFold(n_splits=5, random_state=11, shuffle=True)
    cv_scores = cross_val_score(lr, X, y, scoring="accuracy", cv=cv)
    lr.fit(X, y)  # Fit on the full data for the training score
    train_score = lr.score(X, y)
    # Build the learning curve
    train_size, train_scores, test_scores = learning_curve(
        estimator=lr, X=X, y=y, cv=cv, scoring="accuracy", random_state=11
    )
    # Convert accuracy scores to misclassification rates (1 - accuracy)
    train_scores = 1 - np.mean(train_scores, axis=1)
    test_scores = 1 - np.mean(test_scores, axis=1)
    lc = pd.DataFrame(
        {
            "Training_size": train_size,
            "Training_loss": train_scores,
            "Validation_loss": test_scores,
        }
    ).melt(id_vars="Training_size")
    return {"cv_scores": cv_scores, "train_score": train_score, "learning_curve": lc}
# Well-regularized model (C = 1): expect train and validation losses to converge.
lc = learn_curve(X, y, 1)
wide_rule = "-" * 25
narrow_rule = "-" * 15
report = (
    f'Cross Validation Accuracies:\n{wide_rule}\n{list(lc["cv_scores"])}\n\n'
    f'Mean Cross Validation Accuracy:\n{wide_rule}\n{np.mean(lc["cv_scores"])}\n\n'
    f'Standard Deviation of Cross Validation Accuracy:\n{wide_rule}\n{np.std(lc["cv_scores"])}\n\n'
    f'Training Accuracy:\n{narrow_rule}\n{lc["train_score"]}\n\n'
)
print(report)
sns.lineplot(data=lc["learning_curve"], x="Training_size", y="value", hue="variable")
plt.title("Learning Curve of Good Fit Model")
plt.ylabel("Misclassification Rate/Loss");
# Very weak regularization (C = 10000): expect a large train/validation gap.
lc = learn_curve(X, y, 10000)
wide_rule = "-" * 25
narrow_rule = "-" * 15
report = (
    f'Cross Validation Accuracies:\n{wide_rule}\n{list(lc["cv_scores"])}\n\n'
    f'Mean Cross Validation Accuracy:\n{wide_rule}\n{np.mean(lc["cv_scores"])}\n\n'
    f'Standard Deviation of Cross Validation Accuracy:\n{wide_rule}\n{np.std(lc["cv_scores"])} (High Variance)\n\n'
    f'Training Accuracy:\n{narrow_rule}\n{lc["train_score"]}\n\n'
)
print(report)
sns.lineplot(data=lc["learning_curve"], x="Training_size", y="value", hue="variable")
plt.title("Learning Curve of an Overfit Model")
plt.ylabel("Misclassification Rate/Loss");
# Very strong regularization (C = 1/10000): expect both losses to stay high.
lc = learn_curve(X, y, 1 / 10000)
wide_rule = "-" * 25
narrow_rule = "-" * 15
report = (
    f'Cross Validation Accuracies:\n{wide_rule}\n{list(lc["cv_scores"])}\n\n'
    f'Mean Cross Validation Accuracy:\n{wide_rule}\n{np.mean(lc["cv_scores"])}\n\n'
    f'Standard Deviation of Cross Validation Accuracy:\n{wide_rule}\n{np.std(lc["cv_scores"])} (Low variance)\n\n'
    f'Training Accuracy:\n{narrow_rule}\n{lc["train_score"]}\n\n'
)
print(report)
sns.lineplot(data=lc["learning_curve"], x="Training_size", y="value", hue="variable")
plt.title("Learning Curve of an Underfit Model")
plt.ylabel("Misclassification Rate/Loss");
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment