Skip to content

Instantly share code, notes, and snippets.

@sachinsdate
Last active April 7, 2022 12:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sachinsdate/dcda1b0e45f3aa2d8b1e114f390e7925 to your computer and use it in GitHub Desktop.
Save sachinsdate/dcda1b0e45f3aa2d8b1e114f390e7925 to your computer and use it in GitHub Desktop.
Conditional variance and conditional covariance
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
import statsmodels.formula.api as sm
from patsy import dmatrices
from statsmodels.regression.linear_model import OLS
# Read the automobiles dataset into a Pandas DataFrame
df = pd.read_csv('automobile_uciml_6vars.csv', header=0)
# Drop every row that contains a missing value
df = df.dropna()

# Plot Engine_Size versus Num_Cylinders
fig = plt.figure()
fig.suptitle('Engine_Size versus Num_Cylinders')
plt.xlabel('Num_Cylinders')
plt.ylabel('Engine_Size')
plt.scatter(df['Num_Cylinders'], df['Engine_Size'])

# Plot the overall (unconditional) mean of Engine_Size as a horizontal dashed line.
# Only x and y are passed positionally: a non-string third positional argument
# is interpreted by plot() as the start of another data series.
engine_size_mean = df['Engine_Size'].mean()
plt.plot([0, df['Num_Cylinders'].max()], [engine_size_mean, engine_size_mean],
         color='red', linestyle='dashed')

# Group the DataFrame by Num_Cylinders and calculate the mean for each group
df_grouped_means = df.groupby(['Num_Cylinders']).mean()
# Print out all the grouped means
print(df_grouped_means)

# Plot the group-specific (conditional) means of Engine_Size, one marker per
# distinct value of Num_Cylinders
for num_cylinders, group_mean in df_grouped_means['Engine_Size'].items():
    plt.plot(num_cylinders, group_mean, color='red', marker='o')
plt.show()
# Calculate the variance of Engine_Size conditioned upon Num_Cylinders, Curb_Weight
# and Vehicle_Volume, and compare it with the unconditional variance
unconditional_variance_engine_size = df['Engine_Size'].var()
print(f'Unconditional variance in Engine_Size={unconditional_variance_engine_size}')

# Construct the regression expression. A regression intercept is included by default
olsr_expr = 'Engine_Size ~ Num_Cylinders + Curb_Weight + Vehicle_Volume'
# Carve out the y and X matrices based on the regression expression
y, X = dmatrices(olsr_expr, df, return_type='dataframe')
# Build the OLS linear regression model. OLS comes from
# statsmodels.regression.linear_model rather than from the formula API alias,
# which is not guaranteed to expose the array-based OLS class.
olsr_model = OLS(endog=y, exog=X)
# Train the model
olsr_model_results = olsr_model.fit()
# Print the fitted model's training summary
print(olsr_model_results.summary())

# The fitted values are the conditional expectations E(Engine_Size | regressors)
y_pred = np.array(olsr_model_results.predict(X))
y = np.array(y['Engine_Size'])

# Conditional variance = sum of squared residuals / (n - 1). The (n - 1) divisor
# matches the default of Series.var() used above, so the two are comparable.
conditional_variance_engine_size = np.sum(np.square(y - y_pred)) / (len(y) - 1)
print(f'Conditional variance in Engine_Size={conditional_variance_engine_size}')

# Fraction of the unconditional variance that the regressors account for
r_squared = 1 - conditional_variance_engine_size / unconditional_variance_engine_size
print(f'r_squared={r_squared}')
# Unconditional (total) covariance between Curb_Weight and Engine_Size
engine_size = df['Engine_Size']
curb_weight = df['Curb_Weight']
covariance = curb_weight.cov(engine_size)
print(f'Covariance between Curb_Weight and Engine_Size={covariance}')

# Scatter plot of the two variables after subtracting each one's own mean:
# Engine_Size on the x-axis, Curb_Weight on the y-axis
engine_size_centered = engine_size - engine_size.mean()
curb_weight_centered = curb_weight - curb_weight.mean()
fig = plt.figure()
fig.suptitle('Mean centered Curb_Weight versus Engine_Size')
plt.scatter(engine_size_centered, curb_weight_centered)
plt.xlabel('Mean centered Engine_Size')
plt.ylabel('Mean centered Curb_Weight')
plt.show()
# Calculate the covariance of X=Engine_Size and Z=Curb_Weight conditional upon
# W=Vehicle_Volume: regress each variable on W, then take the covariance of the
# two sets of residuals.

# Carve out the X and W matrices
X, W = dmatrices('Engine_Size ~ Vehicle_Volume', df, return_type='dataframe')
# Regress X on W. OLS comes from statsmodels.regression.linear_model rather than
# from the formula API alias, which is not guaranteed to expose the OLS class.
olsr_model_XW = OLS(endog=X, exog=W)
olsr_model_XW_results = olsr_model_XW.fit()
# Get the conditional expectations E(X|W)
X_pred = np.array(olsr_model_XW_results.predict(W))
X = np.array(df['Engine_Size'])

# Carve out the Z and W matrices
Z, W = dmatrices('Curb_Weight ~ Vehicle_Volume', df, return_type='dataframe')
# Regress Z on W
olsr_model_ZW = OLS(endog=Z, exog=W)
olsr_model_ZW_results = olsr_model_ZW.fit()
# Get the conditional expectations E(Z|W)
Z_pred = np.array(olsr_model_ZW_results.predict(W))
Z = np.array(df['Curb_Weight'])

# Construct the delta vectors: the part of each variable not explained by W
Z_delta = Z - Z_pred
X_delta = X - X_pred

# Calculate the conditional covariance
conditional_covariance = np.sum(Z_delta * X_delta) / (len(Z) - 1)
# The original script stored this value under the misnomer 'conditional_variance';
# the old name is kept as an alias for any code that still refers to it.
conditional_variance = conditional_covariance
print(f'Conditional Covariance between Curb_Weight and Engine_Size={conditional_covariance}')

# Plot conditional mean-centered Curb_Weight versus Engine_Size.
# X_delta (Engine_Size residuals) goes on the x-axis and Z_delta (Curb_Weight
# residuals) on the y-axis so that the data agree with the axis labels.
fig = plt.figure()
fig.suptitle('Conditional variation in Curb_Weight versus conditional variation in Engine_Size')
plt.xlabel('Engine_Size - E(Engine_Size|Vehicle_Volume)')
plt.ylabel('Curb_Weight - E(Curb_Weight|Vehicle_Volume)')
plt.scatter(X_delta, Z_delta)
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment