Skip to content

Instantly share code, notes, and snippets.

@yabyzq
Last active June 3, 2018 22:24
Show Gist options
  • Save yabyzq/a36b294739830838adeb3d03831fa7f4 to your computer and use it in GitHub Desktop.
Save yabyzq/a36b294739830838adeb3d03831fa7f4 to your computer and use it in GitHub Desktop.
forecast
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import time
from numpy import newaxis
import math
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
def mean_absolute_percentage_error(y_true, y_pred):
y_true, y_pred = np.array(y_true), np.array(y_pred)
return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
#loading data
df = pd.read_csv('C:/Users/Administrator/Desktop/LSTM/simple.csv')
df.index = pd.DatetimeIndex(pd.to_datetime(df.date,format='%Y-%m-%d'),freq='D')
#split train and test
train_size = round(0.75*len(df))
train = df[1:train_size]
test = df[train_size:]
#http://www.seanabu.com/2016/03/22/time-series-seasonal-ARIMA-model-in-python/
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
#Determing rolling statistics
rolmean = timeseries.rolling(window = 12).mean()
rolstd = timeseries.rolling(window = 12).std()
#Plot rolling statistics:
#fig = plt.figure(figsize=(12, 8))
orig = plt.plot(timeseries, color='blue',label='Original')
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
std = plt.plot(rolstd, color='black', label = 'Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show()
#Perform Dickey-Fuller test:
print('Results of Dickey-Fuller Test:')
dftest = adfuller(timeseries, autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
dfoutput['Critical Value (%s)'%key] = value
print(dfoutput)
test_stationarity(df.balance)
# Method 1 - Baseline using last day value
prediction = test.copy().drop(['balance'], axis=1)
prediction['last_day'] = train['balance'][len(train)-1]
x = 30
prediction['last_x_day'] = train['balance'].rolling(x).mean().iloc[-1]
plt.plot(train.index, train['balance'], label='Train')
plt.plot(test.index,test['balance'], label='Test')
plt.plot(prediction.index,prediction['last_day'], label='Last Day')
plt.plot(prediction.index,prediction['last_x_day'], label='Last %d Days'%x)
plt.legend(loc='best')
plt.show()
print("#Last Day\nCoefficent:" ,round(np.corrcoef(test.balance,prediction.last_day)[0,1],4),\
" RMSE:", round(math.sqrt(mean_squared_error(test.balance, prediction.last_day)),4),\
" MAPE:", round(mean_absolute_percentage_error(test.balance, prediction.last_day),4))
print("#Last %d Days\nCoefficent:"%x ,round(np.corrcoef(test.balance,prediction.last_x_day)[0,1],4),\
" RMSE:", round(math.sqrt(mean_squared_error(test.balance, prediction.last_x_day)),4),\
" MAPE:", round(mean_absolute_percentage_error(test.balance, prediction.last_x_day),4))
# Method 2 - capture Seasonality using Holt, ARIMA
# analysing training data
sm.tsa.seasonal_decompose(train.balance).plot()
plt.show()
#fit and predict
fit1 = Holt(np.asarray(train['balance'])).fit(smoothing_level = 0.05,smoothing_slope = 0.05)
prediction['holt_linear'] = fit1.forecast(len(test))
fit2 = ExponentialSmoothing(np.asarray(train['balance']) ,seasonal_periods=7 ,trend='add', seasonal='add',).fit()
prediction['holt_winter'] = fit2.forecast(len(test))
fit3 = sm.tsa.statespace.SARIMAX(train.balance, order=(2, 1, 4),seasonal_order=(0,1,1,7)).fit(maxiter=500)
prediction['arima'] = fit3.predict(start=test.index[0], end=test.index[-1], dynamic=True)
plt.plot(train['balance'], label='Train')
plt.plot(test['balance'], label='Test')
plt.plot(prediction['holt_linear'], label='Holt Linear')
plt.plot(prediction['holt_winter'], label='Holt Winter')
plt.plot(prediction['arima'], label='ARIMA')
plt.legend(loc='best')
plt.show()
print("#Holt Linear\nCoefficent:",round(np.corrcoef(test.balance,prediction.holt_linear)[0,1],4),\
" RMSE:", round(math.sqrt(mean_squared_error(test.balance, prediction.holt_linear)),4),\
" MAPE:", round(mean_absolute_percentage_error(test.balance, prediction.holt_linear),4))
print("#Holt Winter\nCoefficent:",round(np.corrcoef(test.balance,prediction.holt_winter)[0,1],4),\
" RMSE:", round(math.sqrt(mean_squared_error(test.balance, prediction.holt_winter)),4),\
" MAPE:", round(mean_absolute_percentage_error(test.balance, prediction.holt_winter),4))
print("#ARIMA\nCoefficent:",round(np.corrcoef(test.balance,prediction.arima)[0,1],4),\
" RMSE:", round(math.sqrt(mean_squared_error(test.balance, prediction.arima)),4),\
" MAPE:", round(mean_absolute_percentage_error(test.balance, prediction.arima),4))
import Prophet from fbprophet
m = Prophet()
m.fit(pd.DataFrame({'ds':train.date, 'y':train.balance}))
forecast = m.predict(m.make_future_dataframe(periods=len(test)))
prediction['prophet'] = forecast['yhat']
#http://pythondata.com/forecasting-time-series-data-with-prophet-part-1/
# Method 3 - Facebook Prophet
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
type(forecast.yhat.index)
type(train.balance.index)
forecast = forecast.set_index(pd.DatetimeIndex(forecast['ds'],freq = 'D'))
plt.figure(figsize=(16,8))
plt.plot( train['balance'], label='Train')
plt.plot(test['balance'], label='Test')
plt.plot(forecast.yhat[-35:-1], label='PROPHET')
plt.legend(loc='best')
plt.show()
m = Prophet()
m.fit(pd.DataFrame({'ds':train.date, 'y':train.balance}))
forecast = m.predict(m.make_future_dataframe(periods=len(test)))
forecast = forecast.set_index(pd.DatetimeIndex(forecast['ds'],freq = 'D'))
prediction['prophet'] = forecast['yhat']
plt.figure(figsize=(16,8))
plt.plot(train['balance'], label='Train')
plt.plot(test['balance'], label='Test')
plt.plot(prediction['prophet'], label='Prophet')
plt.legend(loc='best')
plt.show()
print("#Prophet\nCoefficent:",round(np.corrcoef(test.balance,prediction.prophet)[0,1],4),\
" RMSE:", round(math.sqrt(mean_squared_error(test.balance, prediction.prophet)),4),\
" MAPE:", round(mean_absolute_percentage_error(test.balance, prediction.prophet),4))
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import LSTM
from keras import callbacks
from keras.callbacks import ModelCheckpoint
from keras import optimizers
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
%matplotlib inline
#loading the data
data = pd.read_csv("C:/Users/eye1/Desktop/sett_data.csv",parse_dates=["date"])
scaler = MinMaxScaler(feature_range=(0, 1))
data['balance'] = scaler.fit_transform(data['balance'].values.reshape(-1,1))
data = data.set_index(pd.DatetimeIndex(data['date'],freq = 'D')).drop(['date'], axis=1)
def get_timespan(input_data, date, minus, periods, freq='D'):
return input_data.loc[pd.date_range(date - timedelta(days=minus), periods=periods, freq=freq)]
train_start_date = pd.datetime(2017, 6, 10)
def convert_timeseries(data, start_date, is_train=True):
X = pd.DataFrame({"last_1": get_timespan(data, start_date, 1, 1).values.ravel(),
"last_3_mean": get_timespan(data, start_date, 3, 3).mean().values,#last 3
"last_5_mean": get_timespan(data, start_date, 5, 5).mean().values,#last 5
"last_7_mean": get_timespan(data, start_date, 7, 7).mean().values,#last 5
"last_14_mean": get_timespan(data, start_date, 14, 14).mean().values})
for i in range(7):
X['last_4_dow{}_mean'.format(i)] = get_timespan(data, start_date, 28-i, 4, freq='7D').mean().values #get every week last 7
if is_train:
y = data.loc[pd.date_range(start_date, periods=1)].values
return X, y
return X
#choose cut off for train and test
cut_off = pd.datetime(2018, 2, 1)
#set up training data
X_train, y_train = [], []
for i in range(pd.Timedelta(cut_off - train_start_date).days):
delta = timedelta(days=i)
X_tmp, y_tmp = convert_timeseries(data, train_start_date + delta)
X_train.append(X_tmp)
y_train.append(y_tmp)
X_train = pd.concat(X_train, axis=0)
y_train = np.concatenate(y_train, axis=0)
X_train = X_train.as_matrix()
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test, y_test = [], []
for i in range(pd.Timedelta(data.index[-2] - cut_off).days):
delta = timedelta(days=i)
X_tmp, y_tmp = convert_timeseries(data, cut_off + delta)
X_test.append(X_tmp)
y_test.append(y_tmp)
X_test = pd.concat(X_test, axis=0)
y_test = np.concatenate(y_test, axis=0)
X_test = X_test.as_matrix()
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
#model
model = Sequential()
model.add(LSTM(32, input_shape=(X_train.shape[1],X_train.shape[2])))
model.add(Dropout(.1))
model.add(Dense(8))
model.add(Dropout(.1))
model.add(Dense(1))
#model.compile(loss = 'mse', optimizer='adam', metrics=['mse'])
model.compile(loss='mse',optimizer=optimizers.SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True))
from numpy import newaxis
def predict_sequence_full(model, data, window_size):
#Shift the window by 1 new prediction each time, re-run predictions on new window
curr_frame = data
predicted = []
for i in range(len(data)):
predicted.append(model.predict(curr_frame[i:i+1]))
curr_frame = curr_frame[1:]
curr_frame = np.insert(curr_frame, [window_size-1], predicted[-1], axis=0)
return predicted
model.fit(X_train, y_train, batch_size = 512, epochs = 500, verbose=2)
#plt.plot(model.predict(X_test))
plt.plot(np.concatenate(predict_sequence_full(model, X_test, window_size = 1)).ravel())
plt.plot(y_test)
plt.show()
def get_timespan(input_data, date, minus, periods, freq='D'):
return input_data.loc[pd.date_range(date - timedelta(days=minus), periods=periods, freq=freq)]
def convert_timeseries(input_data, start_date, is_train=True):
X = pd.DataFrame({"last_1": get_timespan(input_data, start_date, 1, 1).mean().values,
"last_2_mean": get_timespan(input_data, start_date, 2, 2).mean().values,#last 2
"last_3_mean": get_timespan(input_data, start_date, 3, 3).mean().values,#last 3
"last_4_mean": get_timespan(input_data, start_date, 4, 4).mean().values,#last 4
"last_5_mean": get_timespan(input_data, start_date, 5, 5).mean().values,#last 5
"last_6_mean": get_timespan(input_data, start_date, 6, 6).mean().values,#last 6
"last_7_mean": get_timespan(input_data, start_date, 7, 7).mean().values,#last 7
"last_14_mean": get_timespan(input_data, start_date, 14, 14).mean().values})
for i in range(7):
X['last_4_dow{}_mean'.format(i)] = get_timespan(input_data, start_date, 28-i, 4, freq='7D').mean().values #get every week last 7
if is_train:
y = input_data.loc[pd.date_range(start_date, periods=1)].mean().values
return X, y
return X
def predict_sequence_full(model, data, window_size):
#Shift the window by 1 new prediction each time, re-run predictions on new window
curr_frame = data
predicted = []
for i in range(len(data)):
predicted.append(model.predict(curr_frame[i:i+1]))
curr_frame = curr_frame[1:]
curr_frame = np.insert(curr_frame, [window_size-1], predicted[-1], axis=0)
return predicted
#set up training data
X_train, y_train = [], []
for i in range(pd.Timedelta(cut_off - train_start_date).days):
delta = timedelta(days=i)
X_tmp, y_tmp = convert_timeseries(df, train_start_date + delta)
X_train.append(X_tmp)
y_train.append(y_tmp)
X_train = pd.concat(X_train, axis=0)
y_train = np.concatenate(y_train, axis=0)
X_train = X_train.as_matrix()
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
#set up testing data
X_test, y_test = [], []
for i in range(pd.Timedelta(df.index[-1] - cut_off).days):
delta = timedelta(days=i)
X_tmp, y_tmp = convert_timeseries(df, cut_off + delta)
X_test.append(X_tmp)
y_test.append(y_tmp)
X_test = pd.concat(X_test, axis=0)
y_test = np.concatenate(y_test, axis=0)
X_test = X_test.as_matrix()
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
#model
model = Sequential()
model.add(LSTM(32, input_shape=(X_train.shape[1],X_train.shape[2])))
model.add(Dropout(.1))
model.add(Dense(8))
model.add(Dropout(.1))
model.add(Dense(1))
model.compile(loss='mse',optimizer=optimizers.SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True))#optimizer='adam', metrics=['mse']
model.fit(X_train, y_train, batch_size = 512, epochs = 1000, verbose=2)
prediction['lstm'] = np.concatenate(predict_sequence_full(model, X_test, window_size = 1)).ravel()
plt.figure(figsize=(12,4))
plt.plot(train['balance'], label='Train')
plt.plot(test['balance'], label='Test')
plt.plot(prediction['lstm'], label='LSTM')
plt.legend(loc='best')
plt.show()
print("#LSTM\nCoefficent:",round(np.corrcoef(test.balance,prediction.lstm)[0,1],4),\
" RMSE:", round(math.sqrt(mean_squared_error(test.balance, prediction.lstm)),4),\
" MAPE:", round(mean_absolute_percentage_error(test.balance, prediction.lstm),4))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment