Last active
November 9, 2022 16:03
-
-
Save lprowell/81ab813d84096290c5a833a8c7731355 to your computer and use it in GitHub Desktop.
Code for linear regression, cross validation, gridsearch, logistic regression, etc.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Linear Regression without GridSearch
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics

# Replace these placeholders with your real data:
# X: DataFrame / 2-D array of predictors; y: 1-D array of target values
# (e.g. y = df['target'].values).
X = ...
y = ...

# 0.3 is a common test size; pick what your data needs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)  # good practice to sanity-check the split

lm = LinearRegression()

# Cross-validation: fit the model on different folds of the TRAINING data
# and score each held-out fold, to estimate generalization before ever
# touching the test set.
scores = cross_val_score(lm, X_train, y_train, cv=5)  # cv = number of folds
print(scores, np.mean(scores), np.std(scores))

# cross_val_predict returns an out-of-fold prediction for every training
# sample (each point is predicted by a model that never saw it). It does
# NOT select a "best fold". Run it on the TRAINING data — using the test
# set here would leak it into model selection.
predictions = cross_val_predict(lm, X_train, y_train, cv=5)
cv_r2 = metrics.r2_score(y_train, predictions)  # note: r2_score, not r2_scores
print(cv_r2)

# If the CV results look good, fit on the full training set and evaluate
# exactly once on the held-out test set.
lm.fit(X_train, y_train)
y_hat = lm.predict(X_test)       # predictions on unseen data
print(lm.score(X_test, y_test))  # R^2 on the test set = model performance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Logistic Regression with GridSearch
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_val_predict,
    GridSearchCV,
)
from sklearn import metrics

# Replace these placeholders with your real data:
# X: DataFrame / 2-D array of predictors; y: 1-D array of class labels.
X = ...
y = ...

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Hyperparameter grid. NOTE: the 'liblinear' solver supports only the
# 'l1' and 'l2' penalties — including 'elasticnet' here would make
# GridSearchCV raise (elasticnet requires solver='saga').
params = {
    'penalty': ['l1', 'l2'],            # l1 is Lasso-style, l2 is Ridge-style
    'solver': ['liblinear'],
    'C': np.linspace(0.00002, 1, 100),  # inverse of regularization strength
}

lr = LogisticRegression()
lr_gs = GridSearchCV(lr, params, cv=3, verbose=1).fit(X_train, y_train)
print("Best Params", lr_gs.best_params_)
print("Best Score", lr_gs.best_score_)

# GridSearchCV (with the default refit=True) already refits the winning
# parameter combination on the full training set — use the fitted
# best_estimator_ instead of re-typing the parameters by hand.
lr_best = lr_gs.best_estimator_
print(lr_best.score(X_test, y_test))  # accuracy on the held-out test set
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@lprowell I think this doesn't demonstrate the effect of using
GridSearchCV
, because it compares two different models — Linear Regression (a regression model) and Logistic Regression (a classification model) — rather than the same model with and without a grid search.