# Gist sachinsdate/bafd272a232fe1f06aad6cfec6200b05, created July 5, 2022 17:30.
# GitHub notice: this file contains bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review, open
# the file in an editor that reveals hidden Unicode characters.
import pandas as pd | |
import statsmodels.api as sm | |
import statsmodels.formula.api as smf | |
from patsy import dmatrices | |
from matplotlib import pyplot as plt | |
# Import the 7-variable subset of the automobiles dataset into a DataFrame.
# header=0 tells pandas to take the column names from the first row of the CSV.
df = pd.read_csv('automobiles_dataset_subset_uciml.csv', header=0)
# Plot price versus num_of_cylinders as a scatter plot in its own figure.
fig = plt.figure()
fig.suptitle('Price versus Number of Cylinders')
plt.xlabel('number_of_cylinders')
plt.ylabel('price')
plt.scatter(x=df['num_of_cylinders'], y=df['price'])
plt.show()
# Construct an OLS model.
# Start by defining the model's equation in Patsy syntax:
# price is the response, num_of_cylinders the single regressor.
reg_exp = 'price ~ num_of_cylinders'

# Carve out the y and X matrices (as DataFrames) from the dataset.
y_train, X_train = dmatrices(reg_exp, df, return_type='dataframe')

# Build and train the OLS model, then print the fitted model's summary.
olsr_model = sm.OLS(endog=y_train, exog=X_train)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())
# Plot the OLS regression line on the scatter plot of price versus
# num_of_cylinders.
fig = plt.figure()
fig.suptitle('Price versus Number of Cylinders')
plt.xlabel('number_of_cylinders')
plt.ylabel('price')
plt.scatter(x=df['num_of_cylinders'], y=df['price'])

# Predictions of the fitted OLS model on the training design matrix.
y_pred_ols = olsr_model_results.predict(X_train)

# plt.plot returns a list of lines; unpack the single line so it can be used
# as a legend handle.
ols, = plt.plot(
    X_train['num_of_cylinders'], y_pred_ols,
    color='red', marker='o', linestyle='solid', label='OLS Model',
)
plt.legend(handles=[ols])
plt.show()
# Build and train a quantile regression model that estimates the median
# (q=0.5), using the same formula as the OLS model, and print its summary.
median_model = smf.quantreg(formula=reg_exp, data=df)
median_model_results = median_model.fit(q=0.5)
print(median_model_results.summary())
# Plot the median regression line together with the OLS regression line on the
# scatter plot of price versus num_of_cylinders.
fig = plt.figure()
fig.suptitle('Price versus Number of Cylinders')
plt.xlabel('number_of_cylinders')
plt.ylabel('price')
plt.scatter(x=df['num_of_cylinders'], y=df['price'])

# Predictions of the fitted median model on the training design matrix.
y_pred_median = median_model_results.predict(X_train)

# Median fit drawn solid, OLS fit drawn dashed so the two can be told apart.
median, = plt.plot(
    X_train['num_of_cylinders'], y_pred_median,
    color='cyan', marker='o', linestyle='solid', label='Median Model',
)
ols, = plt.plot(
    X_train['num_of_cylinders'], y_pred_ols,
    color='red', marker='o', linestyle='dashed', label='OLS Model',
)
plt.legend(handles=[ols, median])
plt.show()
# Plot the regression lines for multiple quantiles, plus the OLS line, on the
# scatter plot of price versus num_of_cylinders.
fig = plt.figure()
fig.suptitle('Price versus Number of Cylinders')
plt.xlabel('number_of_cylinders')
plt.ylabel('price')
plt.scatter(x=df['num_of_cylinders'], y=df['price'])

# coeff collects the fitted num_of_cylinders coefficient for each quantile, in
# the same order as quantiles; handles collects the plotted lines for the legend.
coeff = []
handles = []
quantiles = [0.1, 0.25, 0.5, 0.75, 0.9]
# One line color per quantile, paired positionally with quantiles.
colors = ['orange', 'lime', 'yellow', 'cyan', 'violet']

for q, color in zip(quantiles, colors):
    # Fit a quantile regression model for this quantile and report it.
    quantile_model = smf.quantreg(formula=reg_exp, data=df)
    quantile_model_results = quantile_model.fit(q=q)
    print(quantile_model_results.summary())
    coeff.append(quantile_model_results.params['num_of_cylinders'])

    # Draw this quantile's fitted line over the scatter plot.
    y_pred_quantile = quantile_model_results.predict(X_train)
    # round() rather than int() so that floating-point error in q * 100
    # cannot truncate the percentage shown in the label.
    quantile, = plt.plot(
        X_train['num_of_cylinders'], y_pred_quantile,
        color=color, marker='o', linestyle='solid',
        label=f'{round(q * 100)}th quantile Model',
    )
    handles.append(quantile)

# Overlay the OLS fit once (dashed) for comparison with the quantile fits.
ols, = plt.plot(
    X_train['num_of_cylinders'], y_pred_ols,
    color='red', marker='o', linestyle='dashed', label='OLS Model',
)
handles.append(ols)
plt.legend(handles=handles)
plt.show()
# Plot the fitted coefficient of num_of_cylinders against the quantile at
# which each model was estimated (coeff and quantiles are in matching order).
fig = plt.figure()
fig.suptitle('Coefficient of num_of_cylinders versus Quantile number')
plt.xlabel('Quantile number')
plt.ylabel('Coefficient of num_of_cylinders')
plt.plot(quantiles, coeff)
plt.show()
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.