# sachinsdate/quantile_regression.py

## quantile_regression.py
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrices
from matplotlib import pyplot as plt


# Load the 7-variable subset of the automobiles dataset into a DataFrame.
# header=0 tells pandas that the first row of the CSV holds the column names.
df = pd.read_csv('automobiles_dataset_subset_uciml.csv', header=0)

# Exploratory scatter plot of price against num_of_cylinders.
fig = plt.figure()
plt.scatter(df['num_of_cylinders'], df['price'])
plt.xlabel('number_of_cylinders')
plt.ylabel('price')
fig.suptitle('Price versus Number of Cylinders')
plt.show()

# Fit an ordinary least squares (OLS) baseline.
# The model equation in Patsy formula syntax: price regressed on num_of_cylinders.
reg_exp = 'price ~ num_of_cylinders'

# Let Patsy carve the response (y) and design (X) matrices out of df, as DataFrames.
y_train, X_train = dmatrices(reg_exp, df, return_type='dataframe')

# Build the OLS model, fit it and print the fit summary.
olsr_model = sm.OLS(endog=y_train, exog=X_train)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())

# OLS predictions for the training design matrix; the later comparison plots
# draw this same line again.
y_pred_ols = olsr_model_results.predict(X_train)

# Overlay the OLS regression line on the scatter plot of price versus num_of_cylinders.
fig = plt.figure()
plt.scatter(df['num_of_cylinders'], df['price'])
(ols,) = plt.plot(
    X_train['num_of_cylinders'],
    y_pred_ols,
    color='red',
    marker='o',
    linestyle='solid',
    label='OLS Model',
)
plt.xlabel('number_of_cylinders')
plt.ylabel('price')
fig.suptitle('Price versus Number of Cylinders')
plt.legend(handles=[ols])

plt.show()

# Fit a quantile regression model at q=0.5, i.e. a model of the conditional median.
median_model = smf.quantreg(formula=reg_exp, data=df)
median_model_results = median_model.fit(q=0.5)
print(median_model_results.summary())

# Predictions of the median model for the observations in X_train.
y_pred_median = median_model_results.predict(X_train)

# Compare the median regression line (solid) with the OLS line (dashed) on the
# scatter plot of price versus num_of_cylinders.
fig = plt.figure()
plt.scatter(df['num_of_cylinders'], df['price'])
(median,) = plt.plot(
    X_train['num_of_cylinders'],
    y_pred_median,
    color='cyan',
    marker='o',
    linestyle='solid',
    label='Median Model',
)
(ols,) = plt.plot(
    X_train['num_of_cylinders'],
    y_pred_ols,
    color='red',
    marker='o',
    linestyle='dashed',
    label='OLS Model',
)
plt.xlabel('number_of_cylinders')
plt.ylabel('price')
fig.suptitle('Price versus Number of Cylinders')
plt.legend(handles=[ols, median])

plt.show()

# Fit one quantile regression per quantile and overlay every fitted line, plus
# the OLS line, on the scatter plot of price versus num_of_cylinders.
fig = plt.figure()
fig.suptitle('Price versus Number of Cylinders')
plt.xlabel('number_of_cylinders')
plt.ylabel('price')
plt.scatter(x=df['num_of_cylinders'], y=df['price'])

quantiles = [0.1, 0.25, 0.5, 0.75, 0.9]
# One line colour per quantile, in the same order as `quantiles`.
colors = ['orange', 'lime', 'yellow', 'cyan', 'violet']
coeff = []    # fitted coefficient of num_of_cylinders, one entry per quantile
handles = []  # line artists passed to the legend

# The model specification does not depend on q, so it is built once and only
# re-fitted inside the loop.
quantile_model = smf.quantreg(formula=reg_exp, data=df)
for q, color in zip(quantiles, colors):
    quantile_model_results = quantile_model.fit(q=q)
    print(quantile_model_results.summary())
    coeff.append(quantile_model_results.params['num_of_cylinders'])
    y_pred_quantile = quantile_model_results.predict(X_train)
    # round() rather than int(): q * 100 can land just below the intended
    # integer (e.g. 0.29 * 100 == 28.999...), which int() would truncate.
    quantile, = plt.plot(X_train['num_of_cylinders'], y_pred_quantile,
        color=color, marker='o', linestyle='solid',
        label='%dth quantile Model' % round(q * 100))
    handles.append(quantile)

# OLS line (dashed) for reference against the quantile lines.
ols, = plt.plot(X_train['num_of_cylinders'], y_pred_ols,
    color='red', marker='o', linestyle='dashed', label='OLS Model')

handles.append(ols)
plt.legend(handles=handles)
plt.show()

# Show how the fitted coefficient of num_of_cylinders changes from one
# quantile to the next.
fig = plt.figure()
plt.plot(quantiles, coeff)
plt.xlabel('Quantile number')
plt.ylabel('Coefficient of num_of_cylinders')
fig.suptitle('Coefficient of num_of_cylinders versus Quantile number')
plt.show()
	# Repeat of the quantile regression walkthrough above: OLS baseline, median model, then a sweep over quantiles.
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrices
from matplotlib import pyplot as plt


# Import the 7-variable subset of the automobiles dataset into a DataFrame
df = pd.read_csv('automobiles_dataset_subset_uciml.csv', header=0)

# Plot price versus num_of_cylinders
fig = plt.figure()
fig.suptitle('Price versus Number of Cylinders')
plt.xlabel('number_of_cylinders')
plt.ylabel('price')
plt.scatter(x=df['num_of_cylinders'], y=df['price'])
plt.show()

# Construct an OLS model
# Start with defining the model's equation in Patsy syntax
reg_exp = 'price ~ num_of_cylinders'

# Carve out the y and X matrices
y_train, X_train = dmatrices(reg_exp, df, return_type='dataframe')

# Build and train an OLS model
olsr_model = sm.OLS(endog=y_train, exog=X_train)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())

# Plot the OLS regression line on the scatter plot of price versus num_of_cylinders
fig = plt.figure()
fig.suptitle('Price versus Number of Cylinders')
plt.xlabel('number_of_cylinders')
plt.ylabel('price')
plt.scatter(x=df['num_of_cylinders'], y=df['price'])

y_pred_ols = olsr_model_results.predict(X_train)
ols, = plt.plot(X_train['num_of_cylinders'], y_pred_ols,
    color='red', marker='o', linestyle='solid', label='OLS Model')
plt.legend(handles=[ols])

plt.show()

# Build and train a model that estimates the median (quantile regression at q=0.5)
median_model = smf.quantreg(formula=reg_exp, data=df)
median_model_results = median_model.fit(q=0.5)
print(median_model_results.summary())

# Plot the median and OLS regression lines on the scatter plot of price versus
# num_of_cylinders
fig = plt.figure()
fig.suptitle('Price versus Number of Cylinders')
plt.xlabel('number_of_cylinders')
plt.ylabel('price')
plt.scatter(x=df['num_of_cylinders'], y=df['price'])

y_pred_median = median_model_results.predict(X_train)
median, = plt.plot(X_train['num_of_cylinders'], y_pred_median,
    color='cyan', marker='o', linestyle='solid', label='Median Model')
ols, = plt.plot(X_train['num_of_cylinders'], y_pred_ols,
    color='red', marker='o', linestyle='dashed', label='OLS Model')
plt.legend(handles=[ols, median])

plt.show()

# Plot all regression lines for multiple quantiles on the scatter plot of price versus
# num_of_cylinders
fig = plt.figure()
fig.suptitle('Price versus Number of Cylinders')
plt.xlabel('number_of_cylinders')
plt.ylabel('price')
plt.scatter(x=df['num_of_cylinders'], y=df['price'])

quantiles = [0.1, 0.25, 0.5, 0.75, 0.9]
# One line colour per quantile, in the same order as `quantiles`.
colors = ['orange', 'lime', 'yellow', 'cyan', 'violet']
coeff = []    # fitted coefficient of num_of_cylinders, one entry per quantile
handles = []  # line artists passed to the legend

# The model specification does not depend on q, so it is built once and only
# re-fitted inside the loop.
quantile_model = smf.quantreg(formula=reg_exp, data=df)
for q, color in zip(quantiles, colors):
    quantile_model_results = quantile_model.fit(q=q)
    print(quantile_model_results.summary())
    coeff.append(quantile_model_results.params['num_of_cylinders'])
    y_pred_quantile = quantile_model_results.predict(X_train)
    # round() rather than int(): q * 100 can land just below the intended
    # integer (e.g. 0.29 * 100 == 28.999...), which int() would truncate.
    quantile, = plt.plot(X_train['num_of_cylinders'], y_pred_quantile,
        color=color, marker='o', linestyle='solid',
        label='%dth quantile Model' % round(q * 100))
    handles.append(quantile)

# OLS line (dashed) for reference against the quantile lines.
ols, = plt.plot(X_train['num_of_cylinders'], y_pred_ols,
    color='red', marker='o', linestyle='dashed', label='OLS Model')

handles.append(ols)
plt.legend(handles=handles)
plt.show()

# Plot the coefficient of num_of_cylinders versus quantile number
fig = plt.figure()
fig.suptitle('Coefficient of num_of_cylinders versus Quantile number')
plt.xlabel('Quantile number')
plt.ylabel('Coefficient of num_of_cylinders')
plt.plot(quantiles, coeff)
plt.show()