@sachinsdate
Created June 13, 2022 10:47
#Source code illustrating three different uses of dummy variables in a regression model.
import pandas as pd
import statsmodels.formula.api as smf
#Import the 7-variable subset of the automobiles dataset into a DataFrame
df = pd.read_csv('automobiles_dataset_subset_uciml.csv', header=0)
#############################################################################################
# Dummy variables regression 1
#############################################################################################
#Add dummy variable columns to represent the aspiration variable
df_with_dummies = pd.get_dummies(data=df, columns=['aspiration'])
#Print out the dummy-augmented data set
print(df_with_dummies)
#Construct the regression expression. The regression intercept is added automatically.
#We add only one dummy variable, aspiration_std, and not both _std and _turbo, so as to avoid
# perfect collinearity. In this case, the regression intercept captures the effect of
# aspiration_turbo. Specifically, the value of the intercept is the coefficient of aspiration_turbo.
# Alternatively, we could have added both aspiration_std and aspiration_turbo and left out the
# regression intercept. In that case, because the model would not have an intercept,
# we would not be able to use the R-squared value to judge its goodness-of-fit.
reg_exp = 'price ~ aspiration_std'
#Build the Ordinary Least Squares Regression model. Even though the entire 7-variable data set
# is passed into the model, internally, statsmodels uses the regression expression (reg_exp) to
# carve out the columns of interest
olsr_model = smf.ols(formula=reg_exp, data=df_with_dummies)
#Train the model
olsr_model_results = olsr_model.fit()
#Print the training summary
print(olsr_model_results.summary())
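#The intercept-as-reference-level interpretation above can be checked on a tiny synthetic
# dataset (the numbers below are made up for illustration, not taken from the automobiles data):

```python
import pandas as pd
import statsmodels.formula.api as smf

#Toy data: three 'std' rows with mean price 110, two 'turbo' rows with mean price 210
toy = pd.DataFrame({
    'aspiration': ['std', 'std', 'std', 'turbo', 'turbo'],
    'price': [100.0, 110.0, 120.0, 200.0, 220.0]
})
#dtype=float keeps the dummies numeric (recent pandas versions default to bool)
toy = pd.get_dummies(toy, columns=['aspiration'], dtype=float)

#Regress on aspiration_std only; the intercept absorbs the omitted turbo level
res = smf.ols('price ~ aspiration_std', data=toy).fit()

#Intercept = mean price of the omitted (turbo) group: 210.0
#Coefficient of aspiration_std = std mean minus turbo mean: 110 - 210 = -100.0
print(res.params['Intercept'])
print(res.params['aspiration_std'])
```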
#############################################################################################
# Dummy variables regression 2
#############################################################################################
#Add dummy variable columns to represent body_style
df_with_dummies = pd.get_dummies(data=df, columns=['body_style'])
#Print out the dummy-augmented data set
print(df_with_dummies)
#Construct the regression expression. As before, we leave out one dummy variable
# (body_style_convertible) to avoid perfect collinearity. The regression model's intercept will
# hold the coefficient of body_style_convertible
reg_exp = 'price ~ body_style_hardtop + body_style_hatchback + body_style_sedan + \
body_style_wagon'
#Build the OLS Regression model.
olsr_model = smf.ols(formula=reg_exp, data=df_with_dummies)
#Train the model
olsr_model_results = olsr_model.fit()
#Print the training summary
print(olsr_model_results.summary())
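#Instead of hand-picking which dummy column to leave out of the formula, pd.get_dummies can
# drop the first level automatically via drop_first=True. A small sketch with made-up body
# styles (the dropped, alphabetically-first level becomes the intercept's reference level):

```python
import pandas as pd

toy = pd.DataFrame({'body_style': ['convertible', 'hardtop', 'sedan', 'sedan']})

#drop_first=True omits the first level ('convertible'), which then serves
# as the reference level captured by the regression intercept
dummies = pd.get_dummies(toy, columns=['body_style'], drop_first=True, dtype=int)
print(list(dummies.columns))
# ['body_style_hardtop', 'body_style_sedan']
```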
#############################################################################################
# Dummy variables regression 3
#############################################################################################
#Add dummy variable columns to represent num_of_cylinders
df_with_dummies = pd.get_dummies(data=df, columns=['num_of_cylinders'])
#Form the regression expression
reg_exp = 'price ~ num_of_cylinders_3 + num_of_cylinders_4 + ' \
'num_of_cylinders_5 + num_of_cylinders_6 + num_of_cylinders_8 + num_of_cylinders_12'
#Build and fit the model and print out the training summary
olsr_model = smf.ols(formula=reg_exp, data=df_with_dummies)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())
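#The alternative mentioned earlier (keeping all dummy levels and dropping the intercept) can be
# expressed in the formula by appending '- 1'. With no intercept, each dummy's coefficient is
# simply that group's mean price. A sketch on synthetic data (values made up for illustration):

```python
import pandas as pd
import statsmodels.formula.api as smf

toy = pd.DataFrame({
    'num_of_cylinders': ['four', 'four', 'six', 'six'],
    'price': [10.0, 14.0, 30.0, 34.0]
})
toy = pd.get_dummies(toy, columns=['num_of_cylinders'], dtype=float)

#'- 1' removes the intercept, so both dummies can enter without perfect collinearity
res = smf.ols('price ~ num_of_cylinders_four + num_of_cylinders_six - 1',
              data=toy).fit()

#num_of_cylinders_four coefficient = mean of the four-cylinder rows = 12.0
#num_of_cylinders_six coefficient  = mean of the six-cylinder rows  = 32.0
print(res.params)
```

#Note, though, that as the earlier comment warns, R-squared is not a reliable
# goodness-of-fit measure for this intercept-free parameterization.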