Skip to content

Instantly share code, notes, and snippets.

@rahit
Created September 2, 2016 22:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rahit/5dcbee8f63c0be06eec2b51110d21272 to your computer and use it in GitHub Desktop.
Save rahit/5dcbee8f63c0be06eec2b51110d21272 to your computer and use it in GitHub Desktop.
Time Series Prediction Using ARMA and Linear Regression
__author__ = "K.M. Tahsin Hassan Rahit"
__email__ = "tahsin.rahit@gmail.com"
import pandas as pd
import statsmodels.api as sm
import matplotlib.pylab as plt
from matplotlib import *
from spectrum import *
from pylab import *
import spectrum.arma
from pylab import plot, log10, hold, legend
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima_model import ARIMA
from sklearn.cross_validation import train_test_split
#####
# data = pd.read_csv('air_data.csv',
# parse_dates=[['Date', 'Time']], index_col='Date_Time')
# dta = data.ix[:,['T']]
# res = sm.tsa.ARMA(dta, (6, 23), freq='H').fit()
# #print res.predict(7000, 10000)
# fig, ax = plt.subplots()
# ax = dta.ix[6500:].plot(ax=ax)
# res.plot_predict(8000, 10000, dynamic=True, ax=ax,
# plot_insample=False)
# plt.show()
#####
def check_pq_value(data):
data_diff = data - data.shift()
lag_acf = acf([float(i) for i in data['L'].tolist()], nlags=10)
lag_pacf = pacf([float(i) for i in data['L'].tolist()], nlags=10, method='ols')
plt.subplot(121)
plt.plot(lag_acf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=2/np.sqrt(len(data_diff)),linestyle='--',color='gray')
plt.axhline(y=4/np.sqrt(len(data_diff)),linestyle='--',color='gray')
plt.title('Autocorrelation Function')
plt.subplot(122)
plt.plot(lag_pacf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=2/np.sqrt(len(data_diff)),linestyle='--',color='gray')
plt.axhline(y=4/np.sqrt(len(data_diff)),linestyle='--',color='gray')
plt.title('Partial Autocorrelation Function')
plt.tight_layout()
plt.show()
def do_arma(data, target, freq, train_start, pred_start, pred_end):
plt.style.use('ggplot')
y = [float(i) for i in data[target].tolist()]
nobs = len(y)
# dates = sm.tsa.datetools.dates_from_range('1680m1', length=nobs)
dates = pd.period_range(train_start, periods=nobs, freq=freq)
y = pd.Series(y, index=dates.to_timestamp(freq=freq))
arma_mod = sm.tsa.ARMA(y, order=(2, 2))
arma_res = arma_mod.fit()
pred = arma_res.predict(pred_start, pred_end)
# pred_idx = pd.period_range('1755-12-31', end='1888-12-31', freq='A')
fig, ax = plt.subplots()
ax.plot(y[100:300])
ax.plot(pred)
plt.show()
# data = pd.read_csv('air_data.csv')
# do_arma(data, 'T', 'H', '1755-12-31', '1756-01-05', '1756-01-10')
data = pd.read_csv('nile-water-level.csv')
# check_pq_value(data)
do_arma(data, 'L', 'A', '1755-12-31', '2025-12-31', '2150-12-31')
__author__ = "K.M. Tahsin Hassan Rahit"
__email__ = "tahsin.rahit@gmail.com"
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from matplotlib import *
from matplotlib.pylab import rcParams
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
plt.style.use('ggplot')
data = pd.read_csv('air_data.csv', parse_dates=[['Date', 'Time']], index_col='Date_Time')
# print data
data = data[data["T"] > 0]
data = data.dropna()
train, test = train_test_split(data, random_state=1, test_size=.1)
print data.corr()["T"]
columns = data.columns.tolist()
target = 'T'
columns = [c for c in columns if c not in ['T', 'Date', 'Time', 'RH', 'AH']]
model = LinearRegression()
# model = RandomForestRegressor(n_estimators=100, min_samples_leaf=5, random_state=1)
#print columns
# Fit the model to the training data.
model.fit(train[columns], train[target])
predictions = model.predict(test[columns])
print mean_squared_error(predictions, test[target])
test_indecies = test['T'].index.tolist()
fig, ax = plt.subplots()
ax.scatter(test_indecies, predictions, c='r', marker="s", label='Predicted')
ax.scatter(test_indecies, test['T'], c='g', marker="o", label='Original')
plt.legend(loc='upper left')
plt.grid(True)
# ax.plot([test[target].index.min(), test[target].index.max()], [predictions.min(), predictions.max()], 'k--', lw=4)
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment