Skip to content

Instantly share code, notes, and snippets.

@sachinsdate
Created June 21, 2024 11:55
Show Gist options
  • Save sachinsdate/c35ee86666673789dcb79283e539cf64 to your computer and use it in GitHub Desktop.
Save sachinsdate/c35ee86666673789dcb79283e539cf64 to your computer and use it in GitHub Desktop.
Partial Auto-Correlation
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.stattools import pacf
from statsmodels.tsa.stattools import acf
import statsmodels.api as sm
from patsy import dmatrices
##################################################
#Load the ENSO data set into a Pandas Dataframe
##################################################
df_osc_long = pd.read_csv('southern_oscillations_standardized_long_may24.csv', header=0)
#Convert the Date field to datetime64[ns] format with the specified format
df_osc_long['Date'] = pd.to_datetime(df_osc_long['Date'], format='%Y-%m-%d')
df_osc_long = df_osc_long.sort_values(by='Date')
##################################################
#Plot the time series
##################################################
plt.figure(figsize=(12, 6))
plt.plot(df_osc_long['Date'], df_osc_long['Y_t'], color='blue', linewidth=1, label='Southern Oscillations')
# Set major ticks format on the X-axis to display only the years
plt.gca().xaxis.set_major_locator(mdates.YearLocator())
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
plt.gca().set_xlim([df_osc_long['Date'].min(), df_osc_long['Date'].max()])
plt.gcf().autofmt_xdate()
plt.title('Southern Oscillations Index Over Time (1951-2024)', fontsize=16)
plt.xlabel('Date', fontsize=14)
plt.ylabel('Difference in Standardized SSP', fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=8)
plt.grid(which='major', axis='y')
plt.legend(loc='upper left', fontsize=12)
plt.tight_layout()
plt.show()
##################################################
#Add the time-lagged copies of Y_t as columns
##################################################
for lag in range(1, 11):
df_osc_long[f'Y_(t-{lag})'] = df_osc_long['Y_t'].shift(lag)
df_osc_long = df_osc_long.dropna()
##################################################
#Plot Y_t against Y_(t-1)
##################################################
plt.figure(figsize=(12, 6))
#Plot Y_t versus Y_(t-1)
plt.scatter(df_osc_long['Y_(t-1)'], df_osc_long['Y_t'], alpha=0.5, edgecolors='w', s=20, c='b')
plt.title('Scatter Plot of Y_t versus Y_(t-1)', fontsize=16)
plt.xlabel('Y_(t-1)', fontsize=14)
plt.ylabel('Y_t', fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=12)
plt.grid(True)
plt.tight_layout()
plt.show()
##################################################
#Print a table of correlations of T_t with lagged copies of Y
##################################################
print(df_osc_long.corr())
##################################################
#Plot the PACF of Y_t and Y_(t-k) for 30 lags
##################################################
plot_pacf(x=df_osc_long['Y_t'], lags=30)
plt.title('PACF plot of Y_(t-k) with Y_t for k=0 to 30', fontsize=16)
plt.xlabel('Lag', fontsize=14)
plt.ylabel('Partial Auto-correlation coefficient', fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=12)
plt.grid(True)
plt.tight_layout()
plt.show()
##################################################
#Plot the OLS estimated Y_t as well as the actual Y_t
##################################################
X = df_osc_long['Y_(t-1)']
y = df_osc_long['Y_t']
X = sm.add_constant(X)
olsr_model = sm.OLS(y, X).fit()
df_osc_long['Y_t_estimated'] = olsr_model.predict(X)
plt.figure(figsize=(12, 6))
plt.scatter(df_osc_long['Y_(t-1)'], df_osc_long['Y_t'], alpha=0.5, edgecolors='w', s=20, c='b', label='Actual Y_t')
plt.scatter(df_osc_long['Y_(t-1)'], df_osc_long['Y_t_estimated'], alpha=0.5, edgecolors='w', s=20, c='r', label='Estimated Y_t')
plt.title('Scatter Plot of Actual vs Estimated Y_t', fontsize=16)
plt.xlabel('Y_(t-1)', fontsize=14)
plt.ylabel('Y_t', fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=12)
plt.legend(fontsize=12, loc='lower right')
r_squared = olsr_model.rsquared
plt.text(0.05, 0.95, f'R-squared: {r_squared:.2f}', transform=plt.gca().transAxes, fontsize=14, verticalalignment='top')
plt.grid(True)
plt.tight_layout()
plt.show()
##################################################
#Manually calculate the partial correlation coefficient of Y(t-4) with Y_t
##################################################
#Regress Y_t on Y_(t-1), Y_(t-2) and Y_(t-3) and a constant
X = df_osc_long[['Y_(t-1)', 'Y_(t-2)', 'Y_(t-3)']]
y = df_osc_long['Y_t']
X = sm.add_constant(X)
olsr_model = sm.OLS(y, X).fit()
#Calculate the estimated Y_t
df_osc_long['Y_t_estimated'] = olsr_model.predict(X)
#Get the residuals
df_osc_long['epsilon_a'] = df_osc_long['Y_t'] - df_osc_long['Y_t_estimated']
print(df_osc_long[['Y_t', 'Y_(t-1)', 'Y_(t-2)', 'Y_(t-3)', 'Y_t_estimated', 'epsilon_a']])
#Regress Y_(t-4) on Y_(t-1), Y_(t-2) and Y_(t-3) and a constant
X = df_osc_long[['Y_(t-1)', 'Y_(t-2)', 'Y_(t-3)']]
y = df_osc_long['Y_(t-4)']
X = sm.add_constant(X)
olsr_model = sm.OLS(y, X).fit()
#Calculate the estimated Y_t
df_osc_long['Y_(t-4)_estimated'] = olsr_model.predict(X)
#Get the residuals
df_osc_long['epsilon_b'] = df_osc_long['Y_(t-4)'] - df_osc_long['Y_(t-4)_estimated']
print(df_osc_long[['Y_(t-4)', 'Y_(t-1)', 'Y_(t-2)', 'Y_(t-3)', 'Y_(t-4)_estimated', 'epsilon_b']])
#Calculate the Pearson's r between df_osc_resid_a and df_osc_resid_b
cov_a_b = np.cov(df_osc_long['epsilon_a'], df_osc_long['epsilon_b'], ddof=1)
std_a = np.std(df_osc_long['epsilon_a'], ddof=1)
std_b = np.std(df_osc_long['epsilon_b'], ddof=1)
pac_a_b = cov_a_b/(std_a*std_b)
print('cov_a_b=',cov_a_b)
print('std_a=',std_a)
print('std_b=',std_b)
print('pac_a_b=',pac_a_b)
##################################################
#Load the Boston Average Monthly Max Temps data set into a Pandas Dataframe
##################################################
df_bos_temp = pd.read_csv('boston_monthly_tmax_1998_2019.csv', header=0)
#Convert the Date field to datetime64[ns] format with the specified format
df_bos_temp['Date'] = pd.to_datetime(df_bos_temp['Date'], format='%m/%d/%Y')
df_bos_temp = df_bos_temp.sort_values(by='Date')
##################################################
#Plot the Boston Average Monthly Max Temps data
##################################################
plt.figure(figsize=(12, 6))
plt.plot(df_bos_temp['Date'], df_bos_temp['Monthly Average Maximum'], color='blue', linewidth=1, label='Monthly Average Maximum')
plt.gca().xaxis.set_major_locator(mdates.YearLocator())
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
plt.gca().set_xlim([df_bos_temp['Date'].min(), df_bos_temp['Date'].max()])
plt.gcf().autofmt_xdate()
plt.title('Monthy Average Maximum Temp in Boston, MA (1998-2019)', fontsize=16)
plt.xlabel('Date', fontsize=14)
plt.ylabel('Temperature (F)', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.tick_params(axis='both', which='major', labelsize=12)
plt.grid(which='major', axis='y')
plt.legend(loc='upper left', fontsize=12)
plt.tight_layout()
plt.show()
##################################################
#Seasonal difference with a period of 12 months
##################################################
df_bos_temp['Diff12_TMax'] = df_bos_temp['Monthly Average Maximum'].diff(periods=12)
df_bos_temp = df_bos_temp.dropna()
##################################################
#Plot the seasonally differenced time series
##################################################
plt.figure(figsize=(12, 6))
plt.plot(df_bos_temp['Date'], df_bos_temp['Diff12_TMax'], color='blue', linewidth=1, label='Deseasonalized Monthly Average Maximum')
plt.gca().xaxis.set_major_locator(mdates.YearLocator())
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
plt.gca().set_xlim([df_bos_temp['Date'].min(), df_bos_temp['Date'].max()])
plt.gcf().autofmt_xdate()
plt.title('Deseasonalized Monthly Average Maximum Temp in Boston, MA (1998-2019)', fontsize=16)
plt.xlabel('Date', fontsize=14)
plt.ylabel('Temperature (F)', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.tick_params(axis='both', which='major', labelsize=12)
plt.grid(which='major', axis='y')
plt.legend(loc='upper left', fontsize=12)
plt.tight_layout()
plt.show()
##################################################
# #Plot the PACF for 72 lags
##################################################
plot_pacf(x=df_bos_temp['Diff12_TMax'], lags=72)
plt.title('PACF plot of the Seasonally differenced Temperature Series for k=0 to 72', fontsize=16)
plt.xlabel('Lag', fontsize=14)
plt.ylabel('Partial Auto-correlation coefficient', fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=12)
plt.grid(True)
plt.tight_layout()
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment