Skip to content

Instantly share code, notes, and snippets.

@accessnash
Created October 30, 2020 07:08
Show Gist options
  • Save accessnash/a37774f22cbafb6acec2c3545143edb8 to your computer and use it in GitHub Desktop.
Save accessnash/a37774f22cbafb6acec2c3545143edb8 to your computer and use it in GitHub Desktop.
Fitting a ARIMA model on the dataset for Milk production
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 29 20:35:30 2020
@author: User
"""
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np
df = pd.read_csv('monthly-milk-production-pounds-p.csv')
df.columns = ['Month', 'Milk in Pounds per Cow']
df.drop(168, axis = 0, inplace = True)
df['Month'] = pd.to_datetime(df['Month'])
df.set_index('Month', inplace= True)
df.describe().transpose()
df.plot()
time_series = df['Milk in Pounds per Cow']
type(time_series)
time_series.rolling(12).mean().plot(label = '12 Month Rolling Mean')
time_series.rolling(12).std().plot(label = '12 Month Rolling Std Dev')
time_series.plot()
plt.legend()
from statsmodels.tsa.seasonal import seasonal_decompose
decomp = seasonal_decompose(time_series)
decomp.plot()
from statsmodels.tsa.stattools import adfuller
result = adfuller(df['Milk in Pounds per Cow'])
#Function to format/report Aug. Dickey Fuller test results
def adf_check(time_series):
result = adfuller(time_series)
print("Augmented Dicky Fuller Test")
labels = ['ADF Test statistic', 'p-value', '# of lags', 'Num of Observations used']
for value, label in zip(result, labels):
print(label+ ":"+str(value))
if result[1] <= 0.05:
print("Strong evidence against Null hypothesis")
print('Reject Null hypothesis')
print("Data has no unit root & is stationary")
else:
print('weak evidence against Null hypothesis')
print('Fail to reject null hypothesis')
print("Data has unit root and is non-stationary")
adf_check(df['Milk in Pounds per Cow'])
df['First Difference'] = df['Milk in Pounds per Cow'] - df['Milk in Pounds per Cow'].shift(1)
df['First Difference'].plot()
adf_check(df['First Difference'].dropna())
df['2nd Difference'] = df['First Difference'] - df['First Difference'].shift(1)
adf_check(df['2nd Difference'].dropna())
df['2nd Difference'].plot()
df['Seasonal Difference'] = df['Milk in Pounds per Cow'] - df['Milk in Pounds per Cow'].shift(12)
adf_check(df['Seasonal Difference'].dropna())
df['Seasonal Difference'].plot()
df['First Seasonal Difference'] = df['First Difference'] - df['First Difference'].shift(12)
adf_check(df['First Seasonal Difference'].dropna())
#plot Autocorrelation and Partial autocorrelation
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
fig_first = plot_acf(df['First Difference'].dropna())
fig_seasonal_first = plot_acf(df['First Seasonal Difference'].dropna())
result = plot_pacf(df['First Seasonal Difference'].dropna())
from statsmodels.tsa.arima_model import ARIMA
model = sm.tsa.statespace.SARIMAX(df['Milk in Pounds per Cow'], order=(0,1,0), seasonal_order=(1,1,1,12))
results = model.fit()
print(results.summary())
results.resid
results.resid.plot(kind='kde') # Kernel density estimation to see distribution of errors
df['forecast'] = results.predict(start=150, end = 168)
df[['Milk in Pounds per Cow', 'forecast']].plot()
from pandas.tseries.offsets import DateOffset
future_dates = [df.index[-1] + DateOffset(months =x) for x in range(1,24)]
future_df= pd.DataFrame(index = future_dates, columns = df.columns)
final_df = pd.concat([df, future_df])
final_df['forecast'] = results.predict(start = 168, end = 192)
final_df[['Milk in Pounds per Cow', 'forecast']].plot()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment