How do I run linear regression with Python?
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan
import os

path = 'C:/Users/hp/PycharmProjects/datascience/'
os.chdir(path)
os.getcwd()
dataset = pd.read_csv('BostonHousing.csv')
#print(dataset)
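# A quick, optional sanity check on the load - a minimal sketch; it assumes
# BostonHousing.csv has the standard column layout described below:
print(dataset.shape)   # rows x columns
print(dataset.head())  # first five rows
print(dataset.dtypes)  # column types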
#--------------Data dictionary------------------------------------------------------------
# CRIM     per capita crime rate by town
# ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
# INDUS    proportion of non-retail business acres per town
# CHAS     Charles River dummy variable (1 if tract bounds river; 0 otherwise)
# NOX      nitric oxides concentration (parts per 10 million)
# RM       average number of rooms per dwelling
# AGE      proportion of owner-occupied units built prior to 1940
# DIS      weighted distances to five Boston employment centres
# RAD      index of accessibility to radial highways
# TAX      full-value property-tax rate per $10,000
# PTRATIO  pupil-teacher ratio by town
# B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
# LSTAT    % lower status of the population
# MEDV     median value of owner-occupied homes in $1000s
# -------------Script execution------------------------------------------------------------
# define the data/predictors as the pre-set feature names
df = pd.DataFrame(dataset, columns=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'PTRATIO', 'B', 'LSTAT', 'TAX'])
# set the target: the variable we're trying to predict/estimate
target = pd.DataFrame(dataset, columns=['MEDV'])
#-------------Running a linear regression with a constant-----------------------------------
x = df
x = sm.add_constant(x)  # add the intercept term
y = target
# Note: statsmodels expects (y, x) - the reverse of sklearn's (x, y) argument order
model = sm.OLS(y, x).fit()
predictions = model.predict(x)  # make the predictions by the model
# Print out the statistics
print(model.summary())
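# Besides the full summary, individual statistics can be read straight off
# the fitted results object; a small sketch using standard statsmodels
# RegressionResults attributes:
print('R-squared:', model.rsquared)
print('Adjusted R-squared:', model.rsquared_adj)
print(model.params)   # fitted coefficients, including the constant
print(model.pvalues)  # p-value for each coefficient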
# -------------1. Is the assumption of homoscedasticity violated? --------------------------
# NB! Test for heteroscedasticity (i.e. uneven variance across the error term),
# e.g. higher TAX values producing larger variances among higher tax brackets
name = ['Lagrange multiplier statistic', 'p-value',
        'f-value', 'f p-value']
bp = het_breuschpagan(model.resid, model.model.exog)
print(pd.Series(bp, index=name))
# the Breusch-Pagan test indicates whether results are being influenced by
# heteroscedasticity and are therefore unreliable:
# if the p-value is less than 0.05, heteroscedasticity is present
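# A minimal sketch of one common remedy if heteroscedasticity is present:
# refit the same model with heteroscedasticity-robust (HC3) standard errors,
# which statsmodels supports through the cov_type argument of fit()
robust_results = sm.OLS(y, x).fit(cov_type='HC3')
print(robust_results.summary())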
# --------------2. Is the assumption of multicollinearity violated? ------------------------
# Multicollinearity denotes the case when two x variables are highly correlated.
# Not only is it redundant to include both related variables in a multiple regression
# model, but it's also problematic. The bottom line is this: if two x variables
# are highly correlated, only include one of them in the regression
# model, not both. If you include both, the coefficients for each of the two variables
# will be unreliable because they share their contribution to determining the value of y.
# for the multicollinearity check to work, columns need to be declared as variables
age = dataset['AGE']
rm = dataset['RM']
tax = dataset['TAX']
crim = dataset['CRIM']
zn = dataset['ZN']
indus = dataset['INDUS']
chas = dataset['CHAS']
nox = dataset['NOX']
dis = dataset['DIS']
rad = dataset['RAD']
pratio = dataset['PTRATIO']
lstat = dataset['LSTAT']
# --------------------------------calculating VIF---------------------------------------------------------
z = np.column_stack((age, rm, crim, zn, indus, chas, nox, dis, rad, pratio, lstat))
z = sm.add_constant(z, prepend=True)
from sklearn.metrics import r2_score
from sklearn import linear_model
lm = linear_model.LinearRegression()
aux_model = lm.fit(z, tax)  # auxiliary regression: tax on the other predictors
predictions = lm.predict(z)
#print(predictions)
rsquared_t = r2_score(tax, predictions)
vif = 1 / (1 - rsquared_t)
print('VIF =', vif)
# We are defining our predictor variables as a variable z.
# Once we have done this, we run the tax variable against z (our auxiliary regression).
# Once we have the predictions, we obtain the R-squared value and then calculate the VIF statistic.
# If VIF > 10, there is cause for concern:
# Myers (1990) suggests that a value of 10 is a good value at which to worry.
# If the average VIF is substantially greater than 1, then multicollinearity
# may be biasing the regression model (Bowerman & O'Connell, 1990).
# To calculate the average VIF we simply add the VIF values for each predictor
# and divide by the number of predictors (k) - see the sketch below.
# To check the others, we simply select the relevant variable as the dependent one
# and regress it against the rest:
# rm, tax vs. age = 1.362
# age, tax vs. rm = 0.168
# rm, age vs. tax = 0.006
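# A sketch of automating the auxiliary regressions above with statsmodels'
# built-in helper; it assumes z still holds the predictors plus the prepended
# constant (the constant in column 0 is skipped), and the average VIF
# follows directly:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vifs = [variance_inflation_factor(z, i) for i in range(1, z.shape[1])]
print('per-predictor VIFs:', vifs)
print('average VIF =', sum(vifs) / len(vifs))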
# -------------------Printing correlation matrix-----------------------------------------
# age, rm, crim, zn, indus, chas, nox, dis, rad, pratio, lstat
d = {'age': age,
     'tax': tax,
     'rm': rm,
     'crim': crim,
     'zn': zn,
     'indus': indus,
     'chas': chas,
     'nox': nox,
     'dis': dis,
     'rad': rad,
     'pratio': pratio,
     'lstat': lstat}
cr = pd.DataFrame(data=d)
print("Correlation Matrix")
print(cr.corr())
print()
def get_redundant_pairs(cr):
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = set()
    cols = cr.columns
    for i in range(0, cr.shape[1]):
        for j in range(0, i + 1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

def get_top_abs_correlations(cr, n=5):
    au_corr = cr.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(cr)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[0:n]

print("Top Absolute Correlations")
print(get_top_abs_correlations(cr, 10))
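# A hypothetical follow-up: once a pair at the top of this list is judged
# too highly correlated, keep only one of the two columns before refitting,
# per the advice in section 2 above, e.g.:
# df_reduced = df.drop(columns=['TAX'])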