Last active
April 7, 2022 12:23
-
-
Save sachinsdate/dcda1b0e45f3aa2d8b1e114f390e7925 to your computer and use it in GitHub Desktop.
Conditional variance and conditional covariance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
import statsmodels.formula.api as sm
from patsy import dmatrices
from statsmodels.regression.linear_model import OLS
# Read the automobiles dataset into a Pandas DataFrame
df = pd.read_csv('automobile_uciml_6vars.csv', header=0)

# Drop every row that contains a missing value
df = df.dropna()

# Plot Engine_Size versus Num_Cylinders
fig = plt.figure()
fig.suptitle('Engine_Size versus Num_Cylinders')
plt.xlabel('Num_Cylinders')
plt.ylabel('Engine_Size')
plt.scatter(df['Num_Cylinders'], df['Engine_Size'])

# Plot a horizontal dashed line at the unconditional mean of Engine_Size.
# Only x and y are passed: an extra positional list would be read by
# matplotlib as a second (one-point) data series rather than ignored.
engine_size_mean = df['Engine_Size'].mean()
plt.plot([0, df['Num_Cylinders'].max()],
         [engine_size_mean, engine_size_mean],
         color='red', linestyle='dashed')

# Group the DataFrame by Num_Cylinders and calculate the mean for each group
# (computed once and reused for both the printout and the plot)
df_grouped_means = df.groupby(['Num_Cylinders']).mean()

# Print out all the grouped means
print(df_grouped_means)

# Plot the group-specific means of Engine_Size, one red marker per
# distinct Num_Cylinders value
for num_cylinders, group_mean in df_grouped_means['Engine_Size'].items():
    plt.plot(num_cylinders, group_mean, color='red', marker='o')
plt.show()
# Calculate the variance of Engine_Size conditioned upon Num_Cylinders,
# Curb_Weight and Vehicle_Volume (the regressors used in olsr_expr below)
unconditional_variance_engine_size = df['Engine_Size'].var()
print('Unconditional variance in Engine_Size='+str(unconditional_variance_engine_size))

# Construct the regression expression. A regression intercept is included by default
olsr_expr = 'Engine_Size ~ Num_Cylinders + Curb_Weight + Vehicle_Volume'

# Carve out the y and X matrices based on the regression expression
y, X = dmatrices(olsr_expr, df, return_type='dataframe')

# Build the OLS linear regression model. OLS is imported directly from
# statsmodels.regression.linear_model because statsmodels.formula.api only
# exposes the lowercase, formula-based constructors.
olsr_model = OLS(endog=y, exog=X)

# Train the model
olsr_model_results = olsr_model.fit()

# Print the fitted model's training summary
print(olsr_model_results.summary())

# Fitted values, i.e. the estimated conditional expectation of Engine_Size
# given the regressors, as plain NumPy arrays
y_pred = np.array(olsr_model_results.predict(X))
y = np.array(y['Engine_Size'])

# Conditional variance = sum of squared residuals / (n - 1). The (n - 1)
# divisor matches the default divisor of pandas' Series.var() used above,
# so the ratio below is directly comparable.
conditional_variance_engine_size = np.sum(np.square(y - y_pred)) / (len(y) - 1)
print('Conditional variance in Engine_Size='+str(conditional_variance_engine_size))

# Fraction of the unconditional variance explained by the regressors
r_squared = 1 - conditional_variance_engine_size/unconditional_variance_engine_size
print('r_squared='+str(r_squared))
# Unconditional (total) covariance between Curb_Weight and Engine_Size
curb_weight = df['Curb_Weight']
engine_size = df['Engine_Size']
covariance = curb_weight.cov(engine_size)
print('Covariance between Curb_Weight and Engine_Size=', covariance, sep='')

# Subtract each variable's sample mean so the scatter is centred on the origin
centered_engine_size = engine_size - engine_size.mean()
centered_curb_weight = curb_weight - curb_weight.mean()

# Scatter plot: mean-centred Engine_Size on x, mean-centred Curb_Weight on y
fig = plt.figure()
fig.suptitle('Mean centered Curb_Weight versus Engine_Size')
plt.xlabel('Mean centered Engine_Size')
plt.ylabel('Mean centered Curb_Weight')
plt.scatter(centered_engine_size, centered_curb_weight)
plt.show()
# Calculate the covariance of X=Engine_Size versus Z=Curb_Weight conditional
# upon W=Vehicle_Volume

# Carve out the X and W matrices
X, W = dmatrices('Engine_Size ~ Vehicle_Volume', df, return_type='dataframe')

# Regress X on W. OLS is imported directly from
# statsmodels.regression.linear_model because statsmodels.formula.api only
# exposes the lowercase, formula-based constructors.
olsr_model_XW = OLS(endog=X, exog=W)
olsr_model_XW_results = olsr_model_XW.fit()

# Get the conditional expectations E(X|W)
X_pred = np.array(olsr_model_XW_results.predict(W))
X = np.array(df['Engine_Size'])

# Carve out the Z and W matrices
Z, W = dmatrices('Curb_Weight ~ Vehicle_Volume', df, return_type='dataframe')

# Regress Z on W
olsr_model_ZW = OLS(endog=Z, exog=W)
olsr_model_ZW_results = olsr_model_ZW.fit()

# Get the conditional expectations E(Z|W)
Z_pred = np.array(olsr_model_ZW_results.predict(W))
Z = np.array(df['Curb_Weight'])

# Construct the delta vectors: what is left of each variable after removing
# the part predicted from Vehicle_Volume
Z_delta = Z - Z_pred
X_delta = X - X_pred

# Calculate the conditional covariance (same n - 1 divisor as the
# unconditional sample covariance)
conditional_covariance = np.sum(Z_delta * X_delta) / (len(Z) - 1)
print('Conditional Covariance between Curb_Weight and Engine_Size='+str(conditional_covariance))

# Plot conditional mean-centered Curb_Weight versus Engine_Size.
# Engine_Size residuals go on x and Curb_Weight residuals on y so that the
# data match the axis labels.
fig = plt.figure()
fig.suptitle('Conditional variation in Curb_Weight versus conditional variation in Engine_Size')
plt.xlabel('Engine_Size - E(Engine_Size|Vehicle_Volume)')
plt.ylabel('Curb_Weight - E(Curb_Weight|Vehicle_Volume)')
plt.scatter(X_delta, Z_delta)
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment