Skip to content

Instantly share code, notes, and snippets.

@Gedevan-Aleksizde
Last active April 12, 2016 13:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Gedevan-Aleksizde/93ba87d1ede8ff49fdfbde17ce9867d6 to your computer and use it in GitHub Desktop.
Save Gedevan-Aleksizde/93ba87d1ede8ff49fdfbde17ce9867d6 to your computer and use it in GitHub Desktop.
library(tidyr) # ver. 0.3.1.
library(dplyr) # ver. 0.4.3
library(rstan) # ver. 2.9.0
library(forecast) # ver. 6.2
library(zoo) # ver. 1.7-11
library(gdata) # ver. 2.13.3
library(loo) # ver. 0.1.6
# read the xls file
# http://www.stat.go.jp/data/chouki/zuhyou/02-05.xls
temp <- read.xls("~/Documents/blog/20160327_VARIMA2/02-05.xls", skip=5, header=T, dec=",")
header <- as.character(read.xls("~/Documents/blog/20160327_VARIMA2/02-05.xls", skip=3, nrows=1, header=F, stringsAsFactors =F))
colnames(temp) <- ifelse(header != "NA", header, 1:length(header)*10)
colnames(temp)[2] <- "year"
temp <- temp %>% dplyr::select(-starts_with("Japan"), -ends_with("0"))
colnames(temp)[ncol(temp)] <- "Okinawa"
temp <- temp %>% mutate_each(funs(gsub(pattern=",|[a-z]|)|-", replacement="", .)) ) %>% mutate_each(funs(as.integer)) %>% filter(!is.na(year), year >=1950)
ts.pop <- ts(as.matrix(temp[,-1]), start = temp$year[1])
# interpolate missing values
ts.pop <- ts(na.approx(zoo(ts.pop)), start=temp$year[1])
df.pop <- as.data.frame(ts.pop)
row.names(df.pop) <- temp$year
# plot correlograms (too many)
acf(ts.pop)
temp <- df.pop
temp$t <- row.names(temp)
for( i in seq(from=1, to=47, by=4)){
print(
ggplot(temp %>% gather(key=prefecture, value=pop, -t) %>% filter(prefecture %in% unique(prefecture)[i:(i+5) ]) ) +
geom_line(aes(x=t, y=pop, group=prefecture)) + facet_wrap(~prefecture, ncol = 1)
)
}
plot.stan.pred <- function(data, stanout, p, d, q, span=NULL, time.offset=NULL, ser=NULL){
Time <- nrow(data)
T_end <- dim(rstan::extract(stanout, "y_pred")$y_pred)[2]
T_forecast <- T_end - Time + d;
N <- ncol(data)
if(is.null(time.offset) ) time.offset <- as.integer(row.names(data)[1])
if( is.na(time.offset) | is.nan(time.offset) ) time.offset <- 0
if (is.null(span) ) span <- 1:(Time + nrow(y_pred))
y_pred <- data.frame()
for ( i in 1:N){
m.pred <- rstan::extract(stanout, "y_pred")$y_pred[,(Time-d+1):T_end,i]
# arima(p,1,d) case
if( d==1){
m.pred[,1] <- m.pred[,1] + data[nrow(data),i]
for( t in 2:ncol(m.pred))
m.pred[,t] <- m.pred[,t] + m.pred[,t-1]
}
# pred. interval
temp <- data.frame(t = Time-d +1:T_forecast,
series = rep(i, T_forecast),
y90_l = apply(m.pred, 2, quantile, probs=.05, na.rm=T),
y90_u = apply(m.pred, 2, quantile, probs=.95, na.rm=T),
y80_l = apply(m.pred, 2, quantile, probs=.1, na.rm=T),
y80_u = apply(m.pred, 2, quantile, probs=.9, na.rm=T),
y_med = apply(m.pred, 2, median),
y_mean = apply(m.pred, 2, mean, rm.na=T)
)
y_pred <- rbind(y_pred, temp)
}
temp <- data.frame(t=1:Time, data)
colnames(temp)[1+1:N] <- seq(1:N)
# transpose y_* in single column and join
y_pred <- temp %>% gather(key=series, value=y_mean, -t) %>% mutate(series=as.integer(series)) %>% bind_rows(y_pred)
y_pred <- y_pred %>% filter(t %in% span) %>% mutate(t=t+time.offset)
# merge with header
df.header <- data.frame( series=1:N, name=colnames(data))
y_pred <- y_pred %>% left_join(y = df.header, by = "series") %>% mutate(series=as.character(name))
if (!is.null(ser)){
y_pred <- y_pred %>% filter(series %in% ser)
}
ggplot(y_pred) + geom_line(aes(x=t, y=y_mean)) + geom_line(aes(x=t, y_med), linetype=2) +
geom_ribbon(aes(x=t, ymin=y90_l, ymax=y90_u), alpha=.2, fill="blue") + geom_ribbon(aes(x=t, ymin=y80_l, ymax=y80_u), alpha=.5, fill="grey") +
labs(title=paste0("ARMA(", p, ",", d, ",", q, ") Forecasts by MCMC"), y="y") +
scale_y_continuous(labels = function(x){formatC(x, format="d", big.mark = "," )} ) + facet_wrap(~series)
}
### estimate by stan ###
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
stan.varmapq <- stan_model(file="~/Documents/blog/20160327_VARIMA2/varma.stan") # compile
stan.output <- function(data, T_forecast, p, d, q, chain=1, series=c("Hokkaido", "Tokyo", "Osaka") ){
seq.span <- as.integer(row.names(data))
if( d > 0) data.d <- diff(ts(data), differences = d)
else data.d <- data
res.stan.pop <- sampling(stan.varmapq, data=list(T=nrow(data.d), N=ncol(data.d), y=data.d, T_forecast=T_forecast, p=p, q=q, d=d), chain=chain )
w <- waic(extract_log_lik(res.stan.pop))
g <- plot.stan.pred(data = data, stanout = res.stan.pop, p=p, d=d, q=q,
ser= series, time.offset = 1950 )
print(g)
print(w)
return( list(
stan.out=res.stan.pop,
graph= g,
waic = w
))
}
# VARIMA(1,0,1)
result.1 <- stan.output(data = df.pop, T_forecast = 30, p = 1, d = 0, q = 1)
print(result.1$stan.out, pars="Psi")
ggplot(temp %>% gather(key=prefecture, value=pop, -t) %>% filter(prefecture %in% c("Tokyo", "Osaka", "Hokkaido") ) ) +
geom_line(aes(x=t, y=pop, group=prefecture)) + facet_wrap(~prefecture, ncol = 1)
# VARMA(1,1,1)
plot(diff(ts.pop[,1:10]))
plot(diff(ts.pop[,11:20]))
plot(diff(ts.pop[,21:30]))
plot(diff(ts.pop[,31:40]))
plot(diff(ts.pop[,41:47]))
result.2 <- stan.output(data = df.pop, T_forecast = 30, p = 1, d = 1, q = 1)
# semi-auto (p,q) selection version
stan.output.semiauto <- function(data, T_forecast, d=1, chain=1, series=c("Hokkaido", "Tokyo", "Osaka") ){
seq.span <- as.integer(row.names(data))
if( d > 0) data.d <- diff(ts(data), differences = d)
else data.d <- data
w <- Inf
for ( pq in list(c(1,1), c(2,2), c(1,0), c(0,1), c(0,0) ) ){
print(paste("p=", pq[[1]], ",", "d=", d, "q=", pq[[2]] ))
res.temp <- sampling(stan.varmapq, data=list(T=nrow(data.d), N=ncol(data.d), y=data.d, T_forecast=T_forecast, p=pq[[1]], q=pq[[2]], d=d), chain=1 )
w.temp <- waic(extract_log_lik(res.temp))
if( w.temp$waic < w) {
w <- w.temp$waic
w.info <- w.temp
p <- pq[[1]]
q <- pq[[2]]
res.stan <- res.temp
}
print(w.temp)
}
w <- w.info
g <- plot.stan.pred(data = data, stanout = res.stan.pop, p=p, d=d, q=q,
ser= series, time.offset = 1950 )
print(g)
print(paste("selected:", "p=", p, "d=", d, "q=", q))
print(w)
return( list(
stan.out=res.stan.pop,
graph= g,
waic = w
))
}
# Only Kantou district
df.pop.k <- df.pop %>% dplyr::select(Tokyo, Kanagawa, Saitama, Chiba, Ibaraki, Tochigi, Gumma)
result.k <- stan.output.semiauto(data = df.pop.k, T_forecast = 30, d = 1, series = NULL)
result.all.1 <- stan.output.semiauto(data = df.pop, T_forecast = 30, d = 0, series = NULL)
result.all.0 <- stan.output.semiauto(data = df.pop, T_forecast = 30, d = 1, series = NULL)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment