accessnash/ARIMA_milk_prod.py

## ARIMA_milk_prod.py
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 29 20:35:30 2020

@author: User
"""


import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np

df = pd.read_csv('monthly-milk-production-pounds-p.csv')
df.columns = ['Month', 'Milk in Pounds per Cow']
df.drop(168, axis = 0, inplace = True)
df['Month'] = pd.to_datetime(df['Month'])
df.set_index('Month', inplace= True)
df.describe().transpose()

df.plot()
time_series = df['Milk in Pounds per Cow']
type(time_series)
time_series.rolling(12).mean().plot(label = '12 Month Rolling Mean')
time_series.rolling(12).std().plot(label = '12 Month Rolling Std Dev')
time_series.plot()
plt.legend()

from statsmodels.tsa.seasonal import seasonal_decompose
decomp = seasonal_decompose(time_series)
decomp.plot()


from statsmodels.tsa.stattools import adfuller
result = adfuller(df['Milk in Pounds per Cow'])

#Function to format/report Aug. Dickey Fuller test results
def adf_check(time_series):
    result = adfuller(time_series)
    print("Augmented Dicky Fuller Test")
    labels = ['ADF Test statistic', 'p-value', '# of lags', 'Num of Observations used']

    for value, label in zip(result, labels):
        print(label+ ":"+str(value))

    if result[1] <= 0.05:
        print("Strong evidence against Null hypothesis")
        print('Reject Null hypothesis')
        print("Data has no unit root & is stationary")
    else:
        print('weak evidence against Null hypothesis')
        print('Fail to reject null hypothesis')
        print("Data has unit root and is non-stationary")

adf_check(df['Milk in Pounds per Cow'])

df['First Difference'] = df['Milk in Pounds per Cow'] - df['Milk in Pounds per Cow'].shift(1)
df['First Difference'].plot()
adf_check(df['First Difference'].dropna())

df['2nd Difference'] = df['First Difference'] - df['First Difference'].shift(1)
adf_check(df['2nd Difference'].dropna())
df['2nd Difference'].plot()
df['Seasonal Difference'] = df['Milk in Pounds per Cow']  - df['Milk in Pounds per Cow'].shift(12)
adf_check(df['Seasonal Difference'].dropna())
df['Seasonal Difference'].plot()
df['First Seasonal Difference'] = df['First Difference'] - df['First Difference'].shift(12)
adf_check(df['First Seasonal Difference'].dropna())

#plot Autocorrelation and Partial autocorrelation
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
fig_first = plot_acf(df['First Difference'].dropna())
fig_seasonal_first = plot_acf(df['First Seasonal Difference'].dropna())
result = plot_pacf(df['First Seasonal Difference'].dropna())

from statsmodels.tsa.arima_model import ARIMA
model = sm.tsa.statespace.SARIMAX(df['Milk in Pounds per Cow'], order=(0,1,0), seasonal_order=(1,1,1,12))
results = model.fit()
print(results.summary())
results.resid
results.resid.plot(kind='kde') # Kernel density estimation to see distribution of errors
df['forecast'] = results.predict(start=150, end = 168)
df[['Milk in Pounds per Cow', 'forecast']].plot()
from pandas.tseries.offsets import DateOffset
future_dates = [df.index[-1] + DateOffset(months =x) for x in range(1,24)]
future_df= pd.DataFrame(index = future_dates, columns = df.columns)
final_df = pd.concat([df, future_df])
final_df['forecast'] = results.predict(start = 168, end = 192)
final_df[['Milk in Pounds per Cow', 'forecast']].plot()
	# -- coding: utf-8 --
	"""
	Created on Thu Oct 29 20:35:30 2020

	@author: User
	"""


	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt
	import statsmodels.api as sm
	import numpy as np

	df = pd.read_csv('monthly-milk-production-pounds-p.csv')
	df.columns = ['Month', 'Milk in Pounds per Cow']
	df.drop(168, axis = 0, inplace = True)
	df['Month'] = pd.to_datetime(df['Month'])
	df.set_index('Month', inplace= True)
	df.describe().transpose()

	df.plot()
	time_series = df['Milk in Pounds per Cow']
	type(time_series)
	time_series.rolling(12).mean().plot(label = '12 Month Rolling Mean')
	time_series.rolling(12).std().plot(label = '12 Month Rolling Std Dev')
	time_series.plot()
	plt.legend()

	from statsmodels.tsa.seasonal import seasonal_decompose
	decomp = seasonal_decompose(time_series)
	decomp.plot()


	from statsmodels.tsa.stattools import adfuller
	result = adfuller(df['Milk in Pounds per Cow'])

	#Function to format/report Aug. Dickey Fuller test results
	def adf_check(time_series):
	result = adfuller(time_series)
	print("Augmented Dicky Fuller Test")
	labels = ['ADF Test statistic', 'p-value', '# of lags', 'Num of Observations used']

	for value, label in zip(result, labels):
	print(label+ ":"+str(value))

	if result[1] <= 0.05:
	print("Strong evidence against Null hypothesis")
	print('Reject Null hypothesis')
	print("Data has no unit root & is stationary")
	else:
	print('weak evidence against Null hypothesis')
	print('Fail to reject null hypothesis')
	print("Data has unit root and is non-stationary")

	adf_check(df['Milk in Pounds per Cow'])

	df['First Difference'] = df['Milk in Pounds per Cow'] - df['Milk in Pounds per Cow'].shift(1)
	df['First Difference'].plot()
	adf_check(df['First Difference'].dropna())

	df['2nd Difference'] = df['First Difference'] - df['First Difference'].shift(1)
	adf_check(df['2nd Difference'].dropna())
	df['2nd Difference'].plot()
	df['Seasonal Difference'] = df['Milk in Pounds per Cow'] - df['Milk in Pounds per Cow'].shift(12)
	adf_check(df['Seasonal Difference'].dropna())
	df['Seasonal Difference'].plot()
	df['First Seasonal Difference'] = df['First Difference'] - df['First Difference'].shift(12)
	adf_check(df['First Seasonal Difference'].dropna())

	#plot Autocorrelation and Partial autocorrelation
	from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
	fig_first = plot_acf(df['First Difference'].dropna())
	fig_seasonal_first = plot_acf(df['First Seasonal Difference'].dropna())
	result = plot_pacf(df['First Seasonal Difference'].dropna())

	from statsmodels.tsa.arima_model import ARIMA
	model = sm.tsa.statespace.SARIMAX(df['Milk in Pounds per Cow'], order=(0,1,0), seasonal_order=(1,1,1,12))
	results = model.fit()
	print(results.summary())
	results.resid
	results.resid.plot(kind='kde') # Kernel density estimation to see distribution of errors
	df['forecast'] = results.predict(start=150, end = 168)
	df[['Milk in Pounds per Cow', 'forecast']].plot()
	from pandas.tseries.offsets import DateOffset
	future_dates = [df.index[-1] + DateOffset(months =x) for x in range(1,24)]
	future_df= pd.DataFrame(index = future_dates, columns = df.columns)
	final_df = pd.concat([df, future_df])
	final_df['forecast'] = results.predict(start = 168, end = 192)
	final_df[['Milk in Pounds per Cow', 'forecast']].plot()