Skip to content

Instantly share code, notes, and snippets.

@sachinsdate
Created July 5, 2022 17:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sachinsdate/bafd272a232fe1f06aad6cfec6200b05 to your computer and use it in GitHub Desktop.
Save sachinsdate/bafd272a232fe1f06aad6cfec6200b05 to your computer and use it in GitHub Desktop.
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrices
from matplotlib import pyplot as plt
# Read the 7-variable subset of the automobiles dataset from its CSV file.
# The rest of the script uses the 'price' and 'num_of_cylinders' columns of df.
dataset_csv = 'automobiles_dataset_subset_uciml.csv'
df = pd.read_csv(dataset_csv, header=0)
# Scatter plot of the raw data: price (y-axis) against num_of_cylinders (x-axis)
cylinder_counts = df['num_of_cylinders']
car_prices = df['price']
fig = plt.figure()
plt.scatter(cylinder_counts, car_prices)
plt.xlabel('number_of_cylinders')
plt.ylabel('price')
fig.suptitle('Price versus Number of Cylinders')
plt.show()
# Fit an ordinary least-squares (OLS) regression of price on num_of_cylinders.
# The model equation is written in Patsy formula syntax: response ~ regressors.
reg_exp = 'price ~ num_of_cylinders'
# Have Patsy build the response (y) and regressor (X) matrices as DataFrames.
(y_train, X_train) = dmatrices(formula_like=reg_exp, data=df, return_type='dataframe')
# Set up the OLS estimator, fit it, and print the summary of the fitted model.
olsr_model = sm.OLS(y_train, X_train)
olsr_model_results = olsr_model.fit()
ols_summary = olsr_model_results.summary()
print(ols_summary)
# Overlay the fitted OLS line on the scatter plot of price versus num_of_cylinders.
# The predictions are stored in y_pred_ols because later plots draw the same line again.
y_pred_ols = olsr_model_results.predict(X_train)
fig = plt.figure()
fig.suptitle('Price versus Number of Cylinders')
plt.xlabel('number_of_cylinders')
plt.ylabel('price')
plt.scatter(df['num_of_cylinders'], df['price'])
# plt.plot returns a list of line objects; unpack the single line so it can be
# passed to the legend as a handle.
(ols,) = plt.plot(
    X_train['num_of_cylinders'],
    y_pred_ols,
    color='red',
    marker='o',
    linestyle='solid',
    label='OLS Model',
)
plt.legend(handles=[ols])
plt.show()
# Fit a quantile regression at q=0.5, i.e. a model that estimates the median of price
# for a given num_of_cylinders. It reuses the formula defined for the OLS model.
median_model = smf.quantreg(reg_exp, df)
median_model_results = median_model.fit(q=0.5)
median_summary = median_model_results.summary()
print(median_summary)
# Plot both the median-model line (solid cyan) and the OLS line (dashed red) on the
# scatter plot of price versus num_of_cylinders so the two fits can be compared.
y_pred_median = median_model_results.predict(X_train)
fig = plt.figure()
fig.suptitle('Price versus Number of Cylinders')
plt.xlabel('number_of_cylinders')
plt.ylabel('price')
plt.scatter(df['num_of_cylinders'], df['price'])
# Each plt.plot call returns a one-element list; unpack the line for use as a legend handle.
(median,) = plt.plot(
    X_train['num_of_cylinders'],
    y_pred_median,
    color='cyan',
    marker='o',
    linestyle='solid',
    label='Median Model',
)
(ols,) = plt.plot(
    X_train['num_of_cylinders'],
    y_pred_ols,
    color='red',
    marker='o',
    linestyle='dashed',
    label='OLS Model',
)
plt.legend(handles=[ols, median])
plt.show()
# Plot the regression lines for several quantiles, together with the OLS line, on the
# scatter plot of price versus num_of_cylinders.
fig = plt.figure()
fig.suptitle('Price versus Number of Cylinders')
plt.xlabel('number_of_cylinders')
plt.ylabel('price')
plt.scatter(x=df['num_of_cylinders'], y=df['price'])
# Coefficient of num_of_cylinders from each fitted quantile model, in the same order
# as `quantiles`; the coefficient-versus-quantile plot further down reads this list.
coeff = []
# One line colour per quantile (same length and order as `quantiles`).
colors = ['orange', 'lime', 'yellow', 'cyan', 'violet']
# Line handles collected for the legend.
handles = []
quantiles = [0.1, 0.25, 0.5, 0.75, 0.9]
# Pair each quantile with its colour directly rather than tracking a manual index.
for q, color in zip(quantiles, colors):
    # Fit a separate quantile regression for this quantile and report it.
    quantile_model = smf.quantreg(formula=reg_exp, data=df)
    quantile_model_results = quantile_model.fit(q=q)
    print(quantile_model_results.summary())
    coeff.append(quantile_model_results.params['num_of_cylinders'])
    # Draw this quantile's fitted line over the scatter plot.
    y_pred_quantile = quantile_model_results.predict(X_train)
    # round() rather than int(): q*100 is a float product (0.1*100 is
    # 10.000000000000002) and truncation could drop a unit for other quantiles.
    quantile, = plt.plot(X_train['num_of_cylinders'], y_pred_quantile,
                         color=color, marker='o', linestyle='solid',
                         label=f'{round(q * 100)}th quantile Model')
    handles.append(quantile)
# The OLS line is drawn once, after the loop, dashed so it stands apart from the
# quantile lines.
ols, = plt.plot(X_train['num_of_cylinders'], y_pred_ols,
                color='red', marker='o', linestyle='dashed', label='OLS Model')
handles.append(ols)
plt.legend(handles=handles)
plt.show()
# Line plot of the estimated coefficient of num_of_cylinders (y-axis) against the
# quantile it was fitted for (x-axis), using the lists filled in by the quantile loop.
fig = plt.figure()
plt.plot(quantiles, coeff)
plt.xlabel('Quantile number')
plt.ylabel('Coefficient of num_of_cylinders')
fig.suptitle('Coefficient of num_of_cylinders versus Quantile number')
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment