# Source code illustrating three different uses of dummy variables in a regression model.
import pandas as pd
import statsmodels.formula.api as smf
from patsy import dmatrices
import scipy.stats as st
from matplotlib import pyplot as plt
#Import the 7-variable subset of the automobiles dataset into a DataFrame
df = pd.read_csv('automobiles_dataset_subset_uciml.csv', header=0)
#############################################################################################
# Dummy variables regression 1
#############################################################################################
#Add dummy variable columns to represent the aspiration variable
df_with_dummies = pd.get_dummies(data=df, columns=['aspiration'])
#Print out the dummy-augmented data set
print(df_with_dummies)
#Construct the regression expression. The regression intercept is added automatically.
#We add only one dummy variable, aspiration_std, and not both aspiration_std and
# aspiration_turbo, so as to avoid perfect collinearity. In this case, the regression intercept
# captures the effect of aspiration_turbo; specifically, the value of the intercept is the
# coefficient of aspiration_turbo. Alternately, we could have added both aspiration_std and
# aspiration_turbo and left out the regression intercept. In this latter case, because the model
# would not have a regression intercept, we would not be able to use the R-squared value to
# judge its goodness-of-fit (a sketch of this alternative follows the training summary below).
reg_exp = 'price ~ aspiration_std'
#Build the Ordinary Least Squares Regression model. Even though the entire 7-variable data set
# is passed into the model, internally, statsmodels uses the regression expression (reg_exp) to
# carve out the columns of interest
olsr_model = smf.ols(formula=reg_exp, data=df_with_dummies)
#Train the model
olsr_model_results = olsr_model.fit()
#Print the training summary
print(olsr_model_results.summary())
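#Optional sketch (added for illustration, not part of the original recipe): the intercept-free
# alternative described in the comments above. Both aspiration dummies enter the model and the
# intercept is dropped via '- 1' in the formula. dtype=int is passed to get_dummies so that the
# dummy columns enter the formula as plain 0/1 regressors; each dummy's coefficient is then the
# mean price of that aspiration group.
df_both_dummies = pd.get_dummies(data=df, columns=['aspiration'], dtype=int)
reg_exp_no_intercept = 'price ~ aspiration_std + aspiration_turbo - 1'
olsr_no_intercept_results = smf.ols(formula=reg_exp_no_intercept, data=df_both_dummies).fit()
print(olsr_no_intercept_results.params)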
#############################################################################################
# Dummy variables regression 2
#############################################################################################
#Add dummy variable columns to represent body_style
df_with_dummies = pd.get_dummies(data=df, columns=['body_style'])
#Print out the dummy-augmented data set
print(df_with_dummies)
#Construct the regression expression. As before, we'll leave out one dummy variable
# (body_style_convertible) to avoid perfect collinearity. The regression model's intercept will
# hold the coefficient of body_style_convertible
reg_exp = 'price ~ body_style_hardtop + body_style_hatchback + body_style_sedan + ' \
          'body_style_wagon'
#Build the OLS Regression model.
olsr_model = smf.ols(formula=reg_exp, data=df_with_dummies)
#Train the model
olsr_model_results = olsr_model.fit()
#Print the training summary
print(olsr_model_results.summary())
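#Optional sketch (an added illustration, not from the original code): Patsy can build the dummy
# encoding directly from the raw body_style column inside the formula, with 'convertible' chosen
# as the reference level, so the manual get_dummies step and the hand-written column list are
# not needed. The fitted coefficients should match those of the regression above.
reg_exp_treatment = 'price ~ C(body_style, Treatment(reference="convertible"))'
olsr_treatment_results = smf.ols(formula=reg_exp_treatment, data=df).fit()
print(olsr_treatment_results.summary())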
#############################################################################################
# Dummy variables regression 3
#############################################################################################
#Add dummy variable columns to represent num_of_cylinders
df_with_dummies = pd.get_dummies(data=df, columns=['num_of_cylinders'])
#Form the regression expression. As before, one dummy level is left out to avoid perfect
# collinearity; its effect is absorbed by the regression intercept.
reg_exp = 'price ~ num_of_cylinders_3 + num_of_cylinders_4 + ' \
          'num_of_cylinders_5 + num_of_cylinders_6 + num_of_cylinders_8 + num_of_cylinders_12'
#Build and fit the model and print out the training summary
olsr_model = smf.ols(formula=reg_exp, data=df_with_dummies)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())
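#Optional sketch (added for illustration) that uses the matplotlib and scipy.stats imports
# above: plot the residuals of the num_of_cylinders model against its fitted values and run a
# Jarque-Bera test for normality of the residuals.
residuals = olsr_model_results.resid
fitted_values = olsr_model_results.fittedvalues
plt.scatter(fitted_values, residuals, alpha=0.6)
plt.axhline(y=0.0, linestyle='--')
plt.xlabel('Fitted price')
plt.ylabel('Residual')
plt.title('Residuals vs fitted values (num_of_cylinders model)')
plt.show()
jb_stat, jb_pvalue = st.jarque_bera(residuals)
print('Jarque-Bera statistic=' + str(jb_stat) + ', p-value=' + str(jb_pvalue))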