Created
June 21, 2024 11:55
-
-
Save sachinsdate/c35ee86666673789dcb79283e539cf64 to your computer and use it in GitHub Desktop.
Partial Auto-Correlation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from matplotlib import pyplot as plt | |
import matplotlib.dates as mdates | |
from statsmodels.graphics.tsaplots import plot_pacf | |
from statsmodels.graphics.tsaplots import plot_acf | |
from statsmodels.tsa.stattools import pacf | |
from statsmodels.tsa.stattools import acf | |
import statsmodels.api as sm | |
from patsy import dmatrices | |
################################################## | |
#Load the ENSO data set into a Pandas Dataframe | |
################################################## | |
df_osc_long = pd.read_csv('southern_oscillations_standardized_long_may24.csv', header=0) | |
#Convert the Date field to datetime64[ns] format with the specified format | |
df_osc_long['Date'] = pd.to_datetime(df_osc_long['Date'], format='%Y-%m-%d') | |
df_osc_long = df_osc_long.sort_values(by='Date') | |
################################################## | |
#Plot the time series | |
################################################## | |
plt.figure(figsize=(12, 6)) | |
plt.plot(df_osc_long['Date'], df_osc_long['Y_t'], color='blue', linewidth=1, label='Southern Oscillations') | |
# Set major ticks format on the X-axis to display only the years | |
plt.gca().xaxis.set_major_locator(mdates.YearLocator()) | |
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y')) | |
plt.gca().set_xlim([df_osc_long['Date'].min(), df_osc_long['Date'].max()]) | |
plt.gcf().autofmt_xdate() | |
plt.title('Southern Oscillations Index Over Time (1951-2024)', fontsize=16) | |
plt.xlabel('Date', fontsize=14) | |
plt.ylabel('Difference in Standardized SSP', fontsize=14) | |
plt.tick_params(axis='both', which='major', labelsize=8) | |
plt.grid(which='major', axis='y') | |
plt.legend(loc='upper left', fontsize=12) | |
plt.tight_layout() | |
plt.show() | |
################################################## | |
#Add the time-lagged copies of Y_t as columns | |
################################################## | |
for lag in range(1, 11): | |
df_osc_long[f'Y_(t-{lag})'] = df_osc_long['Y_t'].shift(lag) | |
df_osc_long = df_osc_long.dropna() | |
################################################## | |
#Plot Y_t against Y_(t-1) | |
################################################## | |
plt.figure(figsize=(12, 6)) | |
#Plot Y_t versus Y_(t-1) | |
plt.scatter(df_osc_long['Y_(t-1)'], df_osc_long['Y_t'], alpha=0.5, edgecolors='w', s=20, c='b') | |
plt.title('Scatter Plot of Y_t versus Y_(t-1)', fontsize=16) | |
plt.xlabel('Y_(t-1)', fontsize=14) | |
plt.ylabel('Y_t', fontsize=14) | |
plt.tick_params(axis='both', which='major', labelsize=12) | |
plt.grid(True) | |
plt.tight_layout() | |
plt.show() | |
################################################## | |
#Print a table of correlations of T_t with lagged copies of Y | |
################################################## | |
print(df_osc_long.corr()) | |
################################################## | |
#Plot the PACF of Y_t and Y_(t-k) for 30 lags | |
################################################## | |
plot_pacf(x=df_osc_long['Y_t'], lags=30) | |
plt.title('PACF plot of Y_(t-k) with Y_t for k=0 to 30', fontsize=16) | |
plt.xlabel('Lag', fontsize=14) | |
plt.ylabel('Partial Auto-correlation coefficient', fontsize=14) | |
plt.tick_params(axis='both', which='major', labelsize=12) | |
plt.grid(True) | |
plt.tight_layout() | |
plt.show() | |
################################################## | |
#Plot the OLS estimated Y_t as well as the actual Y_t | |
################################################## | |
X = df_osc_long['Y_(t-1)'] | |
y = df_osc_long['Y_t'] | |
X = sm.add_constant(X) | |
olsr_model = sm.OLS(y, X).fit() | |
df_osc_long['Y_t_estimated'] = olsr_model.predict(X) | |
plt.figure(figsize=(12, 6)) | |
plt.scatter(df_osc_long['Y_(t-1)'], df_osc_long['Y_t'], alpha=0.5, edgecolors='w', s=20, c='b', label='Actual Y_t') | |
plt.scatter(df_osc_long['Y_(t-1)'], df_osc_long['Y_t_estimated'], alpha=0.5, edgecolors='w', s=20, c='r', label='Estimated Y_t') | |
plt.title('Scatter Plot of Actual vs Estimated Y_t', fontsize=16) | |
plt.xlabel('Y_(t-1)', fontsize=14) | |
plt.ylabel('Y_t', fontsize=14) | |
plt.tick_params(axis='both', which='major', labelsize=12) | |
plt.legend(fontsize=12, loc='lower right') | |
r_squared = olsr_model.rsquared | |
plt.text(0.05, 0.95, f'R-squared: {r_squared:.2f}', transform=plt.gca().transAxes, fontsize=14, verticalalignment='top') | |
plt.grid(True) | |
plt.tight_layout() | |
plt.show() | |
################################################## | |
#Manually calculate the partial correlation coefficient of Y(t-4) with Y_t | |
################################################## | |
#Regress Y_t on Y_(t-1), Y_(t-2) and Y_(t-3) and a constant | |
X = df_osc_long[['Y_(t-1)', 'Y_(t-2)', 'Y_(t-3)']] | |
y = df_osc_long['Y_t'] | |
X = sm.add_constant(X) | |
olsr_model = sm.OLS(y, X).fit() | |
#Calculate the estimated Y_t | |
df_osc_long['Y_t_estimated'] = olsr_model.predict(X) | |
#Get the residuals | |
df_osc_long['epsilon_a'] = df_osc_long['Y_t'] - df_osc_long['Y_t_estimated'] | |
print(df_osc_long[['Y_t', 'Y_(t-1)', 'Y_(t-2)', 'Y_(t-3)', 'Y_t_estimated', 'epsilon_a']]) | |
#Regress Y_(t-4) on Y_(t-1), Y_(t-2) and Y_(t-3) and a constant | |
X = df_osc_long[['Y_(t-1)', 'Y_(t-2)', 'Y_(t-3)']] | |
y = df_osc_long['Y_(t-4)'] | |
X = sm.add_constant(X) | |
olsr_model = sm.OLS(y, X).fit() | |
#Calculate the estimated Y_t | |
df_osc_long['Y_(t-4)_estimated'] = olsr_model.predict(X) | |
#Get the residuals | |
df_osc_long['epsilon_b'] = df_osc_long['Y_(t-4)'] - df_osc_long['Y_(t-4)_estimated'] | |
print(df_osc_long[['Y_(t-4)', 'Y_(t-1)', 'Y_(t-2)', 'Y_(t-3)', 'Y_(t-4)_estimated', 'epsilon_b']]) | |
#Calculate the Pearson's r between df_osc_resid_a and df_osc_resid_b | |
cov_a_b = np.cov(df_osc_long['epsilon_a'], df_osc_long['epsilon_b'], ddof=1) | |
std_a = np.std(df_osc_long['epsilon_a'], ddof=1) | |
std_b = np.std(df_osc_long['epsilon_b'], ddof=1) | |
pac_a_b = cov_a_b/(std_a*std_b) | |
print('cov_a_b=',cov_a_b) | |
print('std_a=',std_a) | |
print('std_b=',std_b) | |
print('pac_a_b=',pac_a_b) | |
################################################## | |
#Load the Boston Average Monthly Max Temps data set into a Pandas Dataframe | |
################################################## | |
df_bos_temp = pd.read_csv('boston_monthly_tmax_1998_2019.csv', header=0) | |
#Convert the Date field to datetime64[ns] format with the specified format | |
df_bos_temp['Date'] = pd.to_datetime(df_bos_temp['Date'], format='%m/%d/%Y') | |
df_bos_temp = df_bos_temp.sort_values(by='Date') | |
################################################## | |
#Plot the Boston Average Monthly Max Temps data | |
################################################## | |
plt.figure(figsize=(12, 6)) | |
plt.plot(df_bos_temp['Date'], df_bos_temp['Monthly Average Maximum'], color='blue', linewidth=1, label='Monthly Average Maximum') | |
plt.gca().xaxis.set_major_locator(mdates.YearLocator()) | |
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y')) | |
plt.gca().set_xlim([df_bos_temp['Date'].min(), df_bos_temp['Date'].max()]) | |
plt.gcf().autofmt_xdate() | |
plt.title('Monthy Average Maximum Temp in Boston, MA (1998-2019)', fontsize=16) | |
plt.xlabel('Date', fontsize=14) | |
plt.ylabel('Temperature (F)', fontsize=14) | |
plt.xticks(rotation=45, ha='right') | |
plt.tick_params(axis='both', which='major', labelsize=12) | |
plt.grid(which='major', axis='y') | |
plt.legend(loc='upper left', fontsize=12) | |
plt.tight_layout() | |
plt.show() | |
################################################## | |
#Seasonal difference with a period of 12 months | |
################################################## | |
df_bos_temp['Diff12_TMax'] = df_bos_temp['Monthly Average Maximum'].diff(periods=12) | |
df_bos_temp = df_bos_temp.dropna() | |
################################################## | |
#Plot the seasonally differenced time series | |
################################################## | |
plt.figure(figsize=(12, 6)) | |
plt.plot(df_bos_temp['Date'], df_bos_temp['Diff12_TMax'], color='blue', linewidth=1, label='Deseasonalized Monthly Average Maximum') | |
plt.gca().xaxis.set_major_locator(mdates.YearLocator()) | |
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y')) | |
plt.gca().set_xlim([df_bos_temp['Date'].min(), df_bos_temp['Date'].max()]) | |
plt.gcf().autofmt_xdate() | |
plt.title('Deseasonalized Monthly Average Maximum Temp in Boston, MA (1998-2019)', fontsize=16) | |
plt.xlabel('Date', fontsize=14) | |
plt.ylabel('Temperature (F)', fontsize=14) | |
plt.xticks(rotation=45, ha='right') | |
plt.tick_params(axis='both', which='major', labelsize=12) | |
plt.grid(which='major', axis='y') | |
plt.legend(loc='upper left', fontsize=12) | |
plt.tight_layout() | |
plt.show() | |
################################################## | |
# #Plot the PACF for 72 lags | |
################################################## | |
plot_pacf(x=df_bos_temp['Diff12_TMax'], lags=72) | |
plt.title('PACF plot of the Seasonally differenced Temperature Series for k=0 to 72', fontsize=16) | |
plt.xlabel('Lag', fontsize=14) | |
plt.ylabel('Partial Auto-correlation coefficient', fontsize=14) | |
plt.tick_params(axis='both', which='major', labelsize=12) | |
plt.grid(True) | |
plt.tight_layout() | |
plt.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment