Last active
April 12, 2016 13:29
-
-
Save Gedevan-Aleksizde/93ba87d1ede8ff49fdfbde17ce9867d6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyr) # ver. 0.3.1. | |
library(dplyr) # ver. 0.4.3 | |
library(rstan) # ver. 2.9.0 | |
library(forecast) # ver. 6.2 | |
library(zoo) # ver. 1.7-11 | |
library(gdata) # ver. 2.13.3 | |
library(loo) # ver. 0.1.6 | |
# read the xls file | |
# http://www.stat.go.jp/data/chouki/zuhyou/02-05.xls | |
temp <- read.xls("~/Documents/blog/20160327_VARIMA2/02-05.xls", skip=5, header=T, dec=",") | |
header <- as.character(read.xls("~/Documents/blog/20160327_VARIMA2/02-05.xls", skip=3, nrows=1, header=F, stringsAsFactors =F)) | |
colnames(temp) <- ifelse(header != "NA", header, 1:length(header)*10) | |
colnames(temp)[2] <- "year" | |
temp <- temp %>% dplyr::select(-starts_with("Japan"), -ends_with("0")) | |
colnames(temp)[ncol(temp)] <- "Okinawa" | |
temp <- temp %>% mutate_each(funs(gsub(pattern=",|[a-z]|)|-", replacement="", .)) ) %>% mutate_each(funs(as.integer)) %>% filter(!is.na(year), year >=1950) | |
ts.pop <- ts(as.matrix(temp[,-1]), start = temp$year[1]) | |
# interpolate missing values | |
ts.pop <- ts(na.approx(zoo(ts.pop)), start=temp$year[1]) | |
df.pop <- as.data.frame(ts.pop) | |
row.names(df.pop) <- temp$year | |
# plot correlograms (too many) | |
acf(ts.pop) | |
temp <- df.pop | |
temp$t <- row.names(temp) | |
for( i in seq(from=1, to=47, by=4)){ | |
print( | |
ggplot(temp %>% gather(key=prefecture, value=pop, -t) %>% filter(prefecture %in% unique(prefecture)[i:(i+5) ]) ) + | |
geom_line(aes(x=t, y=pop, group=prefecture)) + facet_wrap(~prefecture, ncol = 1) | |
) | |
} | |
plot.stan.pred <- function(data, stanout, p, d, q, span=NULL, time.offset=NULL, ser=NULL){ | |
Time <- nrow(data) | |
T_end <- dim(rstan::extract(stanout, "y_pred")$y_pred)[2] | |
T_forecast <- T_end - Time + d; | |
N <- ncol(data) | |
if(is.null(time.offset) ) time.offset <- as.integer(row.names(data)[1]) | |
if( is.na(time.offset) | is.nan(time.offset) ) time.offset <- 0 | |
if (is.null(span) ) span <- 1:(Time + nrow(y_pred)) | |
y_pred <- data.frame() | |
for ( i in 1:N){ | |
m.pred <- rstan::extract(stanout, "y_pred")$y_pred[,(Time-d+1):T_end,i] | |
# arima(p,1,d) case | |
if( d==1){ | |
m.pred[,1] <- m.pred[,1] + data[nrow(data),i] | |
for( t in 2:ncol(m.pred)) | |
m.pred[,t] <- m.pred[,t] + m.pred[,t-1] | |
} | |
# pred. interval | |
temp <- data.frame(t = Time-d +1:T_forecast, | |
series = rep(i, T_forecast), | |
y90_l = apply(m.pred, 2, quantile, probs=.05, na.rm=T), | |
y90_u = apply(m.pred, 2, quantile, probs=.95, na.rm=T), | |
y80_l = apply(m.pred, 2, quantile, probs=.1, na.rm=T), | |
y80_u = apply(m.pred, 2, quantile, probs=.9, na.rm=T), | |
y_med = apply(m.pred, 2, median), | |
y_mean = apply(m.pred, 2, mean, rm.na=T) | |
) | |
y_pred <- rbind(y_pred, temp) | |
} | |
temp <- data.frame(t=1:Time, data) | |
colnames(temp)[1+1:N] <- seq(1:N) | |
# transpose y_* in single column and join | |
y_pred <- temp %>% gather(key=series, value=y_mean, -t) %>% mutate(series=as.integer(series)) %>% bind_rows(y_pred) | |
y_pred <- y_pred %>% filter(t %in% span) %>% mutate(t=t+time.offset) | |
# merge with header | |
df.header <- data.frame( series=1:N, name=colnames(data)) | |
y_pred <- y_pred %>% left_join(y = df.header, by = "series") %>% mutate(series=as.character(name)) | |
if (!is.null(ser)){ | |
y_pred <- y_pred %>% filter(series %in% ser) | |
} | |
ggplot(y_pred) + geom_line(aes(x=t, y=y_mean)) + geom_line(aes(x=t, y_med), linetype=2) + | |
geom_ribbon(aes(x=t, ymin=y90_l, ymax=y90_u), alpha=.2, fill="blue") + geom_ribbon(aes(x=t, ymin=y80_l, ymax=y80_u), alpha=.5, fill="grey") + | |
labs(title=paste0("ARMA(", p, ",", d, ",", q, ") Forecasts by MCMC"), y="y") + | |
scale_y_continuous(labels = function(x){formatC(x, format="d", big.mark = "," )} ) + facet_wrap(~series) | |
} | |
### estimate by stan ### | |
rstan_options(auto_write = TRUE) | |
options(mc.cores = parallel::detectCores()) | |
stan.varmapq <- stan_model(file="~/Documents/blog/20160327_VARIMA2/varma.stan") # compile | |
stan.output <- function(data, T_forecast, p, d, q, chain=1, series=c("Hokkaido", "Tokyo", "Osaka") ){ | |
seq.span <- as.integer(row.names(data)) | |
if( d > 0) data.d <- diff(ts(data), differences = d) | |
else data.d <- data | |
res.stan.pop <- sampling(stan.varmapq, data=list(T=nrow(data.d), N=ncol(data.d), y=data.d, T_forecast=T_forecast, p=p, q=q, d=d), chain=chain ) | |
w <- waic(extract_log_lik(res.stan.pop)) | |
g <- plot.stan.pred(data = data, stanout = res.stan.pop, p=p, d=d, q=q, | |
ser= series, time.offset = 1950 ) | |
print(g) | |
print(w) | |
return( list( | |
stan.out=res.stan.pop, | |
graph= g, | |
waic = w | |
)) | |
} | |
# VARIMA(1,0,1) | |
result.1 <- stan.output(data = df.pop, T_forecast = 30, p = 1, d = 0, q = 1) | |
print(result.1$stan.out, pars="Psi") | |
ggplot(temp %>% gather(key=prefecture, value=pop, -t) %>% filter(prefecture %in% c("Tokyo", "Osaka", "Hokkaido") ) ) + | |
geom_line(aes(x=t, y=pop, group=prefecture)) + facet_wrap(~prefecture, ncol = 1) | |
# VARMA(1,1,1) | |
plot(diff(ts.pop[,1:10])) | |
plot(diff(ts.pop[,11:20])) | |
plot(diff(ts.pop[,21:30])) | |
plot(diff(ts.pop[,31:40])) | |
plot(diff(ts.pop[,41:47])) | |
result.2 <- stan.output(data = df.pop, T_forecast = 30, p = 1, d = 1, q = 1) | |
# semi-auto (p,q) selection version | |
stan.output.semiauto <- function(data, T_forecast, d=1, chain=1, series=c("Hokkaido", "Tokyo", "Osaka") ){ | |
seq.span <- as.integer(row.names(data)) | |
if( d > 0) data.d <- diff(ts(data), differences = d) | |
else data.d <- data | |
w <- Inf | |
for ( pq in list(c(1,1), c(2,2), c(1,0), c(0,1), c(0,0) ) ){ | |
print(paste("p=", pq[[1]], ",", "d=", d, "q=", pq[[2]] )) | |
res.temp <- sampling(stan.varmapq, data=list(T=nrow(data.d), N=ncol(data.d), y=data.d, T_forecast=T_forecast, p=pq[[1]], q=pq[[2]], d=d), chain=1 ) | |
w.temp <- waic(extract_log_lik(res.temp)) | |
if( w.temp$waic < w) { | |
w <- w.temp$waic | |
w.info <- w.temp | |
p <- pq[[1]] | |
q <- pq[[2]] | |
res.stan <- res.temp | |
} | |
print(w.temp) | |
} | |
w <- w.info | |
g <- plot.stan.pred(data = data, stanout = res.stan.pop, p=p, d=d, q=q, | |
ser= series, time.offset = 1950 ) | |
print(g) | |
print(paste("selected:", "p=", p, "d=", d, "q=", q)) | |
print(w) | |
return( list( | |
stan.out=res.stan.pop, | |
graph= g, | |
waic = w | |
)) | |
} | |
# Only Kantou district | |
df.pop.k <- df.pop %>% dplyr::select(Tokyo, Kanagawa, Saitama, Chiba, Ibaraki, Tochigi, Gumma) | |
result.k <- stan.output.semiauto(data = df.pop.k, T_forecast = 30, d = 1, series = NULL) | |
result.all.1 <- stan.output.semiauto(data = df.pop, T_forecast = 30, d = 0, series = NULL) | |
result.all.0 <- stan.output.semiauto(data = df.pop, T_forecast = 30, d = 1, series = NULL) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment