@lprowell
Last active November 9, 2022 16:03
Code for linear regression, cross validation, gridsearch, logistic regression, etc.
# Linear Regression without GridSearch
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
X = ...  # a DataFrame of predictors
y = target.values  # the target as a Series/array
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)  # 0.3 is a common test size; pick what you need
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)  # it's good practice to check the shapes
lm = LinearRegression()
# Next we do cross validation, which splits the training data into folds, fits the model on each
# training split, and scores it on the held-out fold, so we can judge the fit before touching the test data.
scores = cross_val_score(lm, X_train, y_train, cv=5)  # cv is the number of folds; scores is an array with one score per fold
print(scores, np.mean(scores), np.std(scores))
# To get cross-validated predictions (y_hat) and score them all in one step
predictions = cross_val_predict(lm, X_train, y_train, cv=5)  # y is needed so each fold can be fit and scored
accuracy = metrics.r2_score(y_train, predictions)  # R^2 of the out-of-fold predictions (not just the best fold)
#If this is good, continue to fit the model on the data
lm.fit(X_train, y_train)
y_hat = lm.predict(X_test)  # predictions on the test set
lm.score(X_test, y_test)  # R^2 on the test set, i.e. model performance on unseen data
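
# (Sketch, not part of the original gist) R^2 alone can hide how large the errors are in the
# target's units. Two extra regression metrics, assuming y_hat and y_test from above:
mae = metrics.mean_absolute_error(y_test, y_hat)           # average absolute error, in target units
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_hat))  # root mean squared error, penalizes large misses
print(mae, rmse)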
# Logistic Regression with Gridsearch
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn import metrics
X = ...  # a DataFrame of predictors
y = target.values  # the target as a Series/array
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
params = {
    'penalty': ['l1', 'l2'],  # l1 is Lasso-style, l2 is Ridge-style; the liblinear solver does not support elasticnet
    'solver': ['liblinear'],
    'C': np.linspace(0.00002, 1, 100)
}
lr = LogisticRegression()
lr_gs = GridSearchCV(lr, params, cv=3, verbose=1).fit(X_train, y_train)
print "Best Params", lr_gs.best_params_
print "Best Score", lr_gs.best_score_
lr_best = LogisticRegression(**lr_gs.best_params_)  # plug in the best params found by the grid search
lr_best.fit(X_train, y_train)
lr_best.score(X_test, y_test)
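
# (Sketch, not part of the original gist) Accuracy alone can be misleading for classification;
# a confusion matrix and per-class report give a fuller picture, assuming lr_best from above:
from sklearn.metrics import confusion_matrix, classification_report
y_pred = lr_best.predict(X_test)
print(confusion_matrix(y_test, y_pred))       # rows = true classes, columns = predicted classes
print(classification_report(y_test, y_pred))  # per-class precision, recall, and F1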
@PhilipPurwoko

@lprowell I think this doesn't show the effect of using GridSearchCV, because you are comparing Linear Regression (Regression) and Logistic Regression (Classification)
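
For an apples-to-apples look at what GridSearchCV adds, one option (a sketch, not from the original gist) is to stay in the regression setting and grid-search a regularized regressor such as Ridge, reusing X_train, X_test, y_train, y_test from the first example:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

ridge_params = {'alpha': np.logspace(-3, 3, 20)}  # regularization strengths to search over
ridge_gs = GridSearchCV(Ridge(), ridge_params, cv=5).fit(X_train, y_train)
print(ridge_gs.best_params_, ridge_gs.best_score_)  # best alpha and its mean cross-validated R^2
ridge_gs.best_estimator_.score(X_test, y_test)      # R^2 on the held-out test set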
