Created
October 30, 2020 07:08
-
-
Save accessnash/a37774f22cbafb6acec2c3545143edb8 to your computer and use it in GitHub Desktop.
Fitting a ARIMA model on the dataset for Milk production
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Created on Thu Oct 29 20:35:30 2020 | |
@author: User | |
""" | |
import pandas as pd | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
import statsmodels.api as sm | |
import numpy as np | |
df = pd.read_csv('monthly-milk-production-pounds-p.csv') | |
df.columns = ['Month', 'Milk in Pounds per Cow'] | |
df.drop(168, axis = 0, inplace = True) | |
df['Month'] = pd.to_datetime(df['Month']) | |
df.set_index('Month', inplace= True) | |
df.describe().transpose() | |
df.plot() | |
time_series = df['Milk in Pounds per Cow'] | |
type(time_series) | |
time_series.rolling(12).mean().plot(label = '12 Month Rolling Mean') | |
time_series.rolling(12).std().plot(label = '12 Month Rolling Std Dev') | |
time_series.plot() | |
plt.legend() | |
from statsmodels.tsa.seasonal import seasonal_decompose | |
decomp = seasonal_decompose(time_series) | |
decomp.plot() | |
from statsmodels.tsa.stattools import adfuller | |
result = adfuller(df['Milk in Pounds per Cow']) | |
#Function to format/report Aug. Dickey Fuller test results | |
def adf_check(time_series): | |
result = adfuller(time_series) | |
print("Augmented Dicky Fuller Test") | |
labels = ['ADF Test statistic', 'p-value', '# of lags', 'Num of Observations used'] | |
for value, label in zip(result, labels): | |
print(label+ ":"+str(value)) | |
if result[1] <= 0.05: | |
print("Strong evidence against Null hypothesis") | |
print('Reject Null hypothesis') | |
print("Data has no unit root & is stationary") | |
else: | |
print('weak evidence against Null hypothesis') | |
print('Fail to reject null hypothesis') | |
print("Data has unit root and is non-stationary") | |
adf_check(df['Milk in Pounds per Cow']) | |
df['First Difference'] = df['Milk in Pounds per Cow'] - df['Milk in Pounds per Cow'].shift(1) | |
df['First Difference'].plot() | |
adf_check(df['First Difference'].dropna()) | |
df['2nd Difference'] = df['First Difference'] - df['First Difference'].shift(1) | |
adf_check(df['2nd Difference'].dropna()) | |
df['2nd Difference'].plot() | |
df['Seasonal Difference'] = df['Milk in Pounds per Cow'] - df['Milk in Pounds per Cow'].shift(12) | |
adf_check(df['Seasonal Difference'].dropna()) | |
df['Seasonal Difference'].plot() | |
df['First Seasonal Difference'] = df['First Difference'] - df['First Difference'].shift(12) | |
adf_check(df['First Seasonal Difference'].dropna()) | |
#plot Autocorrelation and Partial autocorrelation | |
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf | |
fig_first = plot_acf(df['First Difference'].dropna()) | |
fig_seasonal_first = plot_acf(df['First Seasonal Difference'].dropna()) | |
result = plot_pacf(df['First Seasonal Difference'].dropna()) | |
from statsmodels.tsa.arima_model import ARIMA | |
model = sm.tsa.statespace.SARIMAX(df['Milk in Pounds per Cow'], order=(0,1,0), seasonal_order=(1,1,1,12)) | |
results = model.fit() | |
print(results.summary()) | |
results.resid | |
results.resid.plot(kind='kde') # Kernel density estimation to see distribution of errors | |
df['forecast'] = results.predict(start=150, end = 168) | |
df[['Milk in Pounds per Cow', 'forecast']].plot() | |
from pandas.tseries.offsets import DateOffset | |
future_dates = [df.index[-1] + DateOffset(months =x) for x in range(1,24)] | |
future_df= pd.DataFrame(index = future_dates, columns = df.columns) | |
final_df = pd.concat([df, future_df]) | |
final_df['forecast'] = results.predict(start = 168, end = 192) | |
final_df[['Milk in Pounds per Cow', 'forecast']].plot() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment