How do I run linear regression with Python?
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan
import os

path = 'C:/Users/hp/PycharmProjects/datascience/'
os.chdir(path)
os.getcwd()
dataset = pd.read_csv('BostonHousing.csv')
#print(dataset)
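# A quick, optional sanity check on the load - a minimal sketch; it assumes
# BostonHousing.csv has the standard column layout described below:
print(dataset.shape)   # rows x columns
print(dataset.head())  # first five rows
print(dataset.dtypes)  # column types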
#--------------Data dictionary------------------------------------------------------------
# CRIM     per capita crime rate by town
# ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
# INDUS    proportion of non-retail business acres per town
# CHAS     Charles River dummy variable (1 if tract bounds river; 0 otherwise)
# NOX      nitric oxides concentration (parts per 10 million)
# RM       average number of rooms per dwelling
# AGE      proportion of owner-occupied units built prior to 1940
# DIS      weighted distances to five Boston employment centres
# RAD      index of accessibility to radial highways
# TAX      full-value property-tax rate per $10,000
# PTRATIO  pupil-teacher ratio by town
# B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
# LSTAT    % lower status of the population
# MEDV     median value of owner-occupied homes in $1000s
# -------------Script execution------------------------------------------------------------
# define the data/predictors as the pre-set feature names
df = pd.DataFrame(dataset, columns=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'PTRATIO', 'B', 'LSTAT', 'TAX'])
# set the target: the variable we're trying to predict/estimate
target = pd.DataFrame(dataset, columns=['MEDV'])
#-------------Running a linear regression with a constant-----------------------------------
x = df
x = sm.add_constant(x)  # add the intercept term
y = target
# Note: statsmodels expects (y, x) - the reverse of sklearn's (x, y) argument order
model = sm.OLS(y, x).fit()
predictions = model.predict(x)  # make the predictions by the model
# Print out the statistics
print(model.summary())
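# Besides the full summary, individual statistics can be read straight off
# the fitted results object; a small sketch using standard statsmodels
# RegressionResults attributes:
print('R-squared:', model.rsquared)
print('Adjusted R-squared:', model.rsquared_adj)
print(model.params)   # fitted coefficients, including the constant
print(model.pvalues)  # p-value for each coefficient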
# -------------1. Is the assumption of homoscedasticity violated? --------------------------
# NB! Test for heteroscedasticity (i.e. uneven variance across the error term),
# e.g. higher TAX values producing larger variances among higher tax brackets
name = ['Lagrange multiplier statistic', 'p-value',
        'f-value', 'f p-value']
bp = het_breuschpagan(model.resid, model.model.exog)
print(pd.Series(bp, index=name))
# the Breusch-Pagan test indicates whether results are being influenced by
# heteroscedasticity and are therefore unreliable:
# if the p-value is less than 0.05, heteroscedasticity is present
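# A minimal sketch of one common remedy if heteroscedasticity is present:
# refit the same model with heteroscedasticity-robust (HC3) standard errors,
# which statsmodels supports through the cov_type argument of fit()
robust_results = sm.OLS(y, x).fit(cov_type='HC3')
print(robust_results.summary())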
# --------------2. Is the assumption of multicollinearity violated? ------------------------
# Multicollinearity denotes the case when two x variables are highly correlated.
# Not only is it redundant to include both related variables in a multiple regression
# model, but it's also problematic. The bottom line is this: if two x variables
# are highly correlated, only include one of them in the regression
# model, not both. If you include both, the coefficients for each of the two variables
# will be unreliable because they share their contribution to determining the value of y.
# for the multicollinearity check to work, columns need to be declared as variables
age = dataset['AGE']
rm = dataset['RM']
tax = dataset['TAX']
crim = dataset['CRIM']
zn = dataset['ZN']
indus = dataset['INDUS']
chas = dataset['CHAS']
nox = dataset['NOX']
dis = dataset['DIS']
rad = dataset['RAD']
pratio = dataset['PTRATIO']
lstat = dataset['LSTAT']
# --------------------------------calculating VIF---------------------------------------------------------
z = np.column_stack((age, rm, crim, zn, indus, chas, nox, dis, rad, pratio, lstat))
z = sm.add_constant(z, prepend=True)
from sklearn.metrics import r2_score
from sklearn import linear_model
lm = linear_model.LinearRegression()
aux_model = lm.fit(z, tax)  # auxiliary regression: tax on the other predictors
predictions = lm.predict(z)
#print(predictions)
rsquared_t = r2_score(tax, predictions)
vif = 1 / (1 - rsquared_t)
print('VIF =', vif)
# We are defining our predictor variables as a variable z.
# Once we have done this, we run the tax variable against z (our auxiliary regression).
# Once we have the predictions, we obtain the R-squared value and then calculate the VIF statistic.
# If VIF > 10, there is cause for concern:
# Myers (1990) suggests that a value of 10 is a good value at which to worry.
# If the average VIF is substantially greater than 1, then multicollinearity
# may be biasing the regression model (Bowerman & O'Connell, 1990).
# To calculate the average VIF we simply add the VIF values for each predictor
# and divide by the number of predictors (k) - see the sketch below.
# To check the others, we simply select the relevant variable as the dependent one
# and regress it against the rest:
# rm, tax vs. age = 1.362
# age, tax vs. rm = 0.168
# rm, age vs. tax = 0.006
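# A sketch of automating the auxiliary regressions above with statsmodels'
# built-in helper; it assumes z still holds the predictors plus the prepended
# constant (the constant in column 0 is skipped), and the average VIF
# follows directly:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vifs = [variance_inflation_factor(z, i) for i in range(1, z.shape[1])]
print('per-predictor VIFs:', vifs)
print('average VIF =', sum(vifs) / len(vifs))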
# -------------------Printing correlation matrix-----------------------------------------
# age, rm, crim, zn, indus, chas, nox, dis, rad, pratio, lstat
d = {'age': age,
     'tax': tax,
     'rm': rm,
     'crim': crim,
     'zn': zn,
     'indus': indus,
     'chas': chas,
     'nox': nox,
     'dis': dis,
     'rad': rad,
     'pratio': pratio,
     'lstat': lstat}
cr = pd.DataFrame(data=d)
print("Correlation Matrix")
print(cr.corr())
print()
def get_redundant_pairs(cr):
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = set()
    cols = cr.columns
    for i in range(0, cr.shape[1]):
        for j in range(0, i + 1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

def get_top_abs_correlations(cr, n=5):
    au_corr = cr.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(cr)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[0:n]

print("Top Absolute Correlations")
print(get_top_abs_correlations(cr, 10))
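# A hypothetical follow-up: once a pair at the top of this list is judged
# too highly correlated, keep only one of the two columns before refitting,
# per the advice in section 2 above, e.g.:
# df_reduced = df.drop(columns=['TAX'])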