@lprowell
Last active November 9, 2022 16:03
Code for linear regression, cross validation, gridsearch, logistic regression, etc.
# Linear Regression without GridSearch
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
X = ...  # a DataFrame of predictors
y = target.values  # the target as a Series/array
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)  # 0.3 is a common test size; pick what you need
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)  # it's good practice to check the shapes
lm = LinearRegression()
# Next we do cross validation, which splits the training data into folds, fits the model on each
# training split, and scores it on the held-out fold, so we can judge the fit before touching the test data.
scores = cross_val_score(lm, X_train, y_train, cv=5)  # cv is the number of folds; scores is an array with one score per fold
print(scores, np.mean(scores), np.std(scores))
# To get cross-validated predictions (y_hat) and score them all in one step
predictions = cross_val_predict(lm, X_train, y_train, cv=5)  # y is needed so each fold can be fit and scored
accuracy = metrics.r2_score(y_train, predictions)  # R^2 of the out-of-fold predictions (not just the best fold)
#If this is good, continue to fit the model on the data
lm.fit(X_train, y_train)
y_hat = lm.predict(X_test)  # predictions on the test set
lm.score(X_test, y_test)  # R^2 on the test set, i.e. model performance on unseen data
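
# (Sketch, not part of the original gist) R^2 alone can hide how large the errors are in the
# target's units. Two extra regression metrics, assuming y_hat and y_test from above:
mae = metrics.mean_absolute_error(y_test, y_hat)           # average absolute error, in target units
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_hat))  # root mean squared error, penalizes large misses
print(mae, rmse)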
# Logistic Regression with Gridsearch
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn import metrics
X = ...  # a DataFrame of predictors
y = target.values  # the target as a Series/array
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
params = {
    'penalty': ['l1', 'l2'],  # l1 is Lasso-style, l2 is Ridge-style; the liblinear solver does not support elasticnet
    'solver': ['liblinear'],
    'C': np.linspace(0.00002, 1, 100)
}
lr = LogisticRegression()
lr_gs = GridSearchCV(lr, params, cv=3, verbose=1).fit(X_train, y_train)
print "Best Params", lr_gs.best_params_
print "Best Score", lr_gs.best_score_
lr_best = LogisticRegression(**lr_gs.best_params_)  # plug in the best params found by the grid search
lr_best.fit(X_train, y_train)
lr_best.score(X_test, y_test)
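
# (Sketch, not part of the original gist) Accuracy alone can be misleading for classification;
# a confusion matrix and per-class report give a fuller picture, assuming lr_best from above:
from sklearn.metrics import confusion_matrix, classification_report
y_pred = lr_best.predict(X_test)
print(confusion_matrix(y_test, y_pred))       # rows = true classes, columns = predicted classes
print(classification_report(y_test, y_pred))  # per-class precision, recall, and F1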
@PhilipPurwoko

@lprowell I think this doesn't show the effect of using GridSearchCV, because you are comparing Linear Regression (Regression) and Logistic Regression (Classification)
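
For an apples-to-apples look at what GridSearchCV adds, one option (a sketch, not from the original gist) is to stay in the regression setting and grid-search a regularized regressor such as Ridge, reusing X_train, X_test, y_train, y_test from the first example:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

ridge_params = {'alpha': np.logspace(-3, 3, 20)}  # regularization strengths to search over
ridge_gs = GridSearchCV(Ridge(), ridge_params, cv=5).fit(X_train, y_train)
print(ridge_gs.best_params_, ridge_gs.best_score_)  # best alpha and its mean cross-validated R^2
ridge_gs.best_estimator_.score(X_test, y_test)      # R^2 on the held-out test set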
