Gedevan-Aleksizde/varma_pop.R

## varma_pop.R
library(tidyr)    # ver. 0.3.1.
library(dplyr)    # ver. 0.4.3
library(rstan)    # ver. 2.9.0
library(forecast) # ver. 6.2
library(zoo)      # ver. 1.7-11
library(gdata)    # ver. 2.13.3
library(loo)      # ver. 0.1.6


# read the xls file
# http://www.stat.go.jp/data/chouki/zuhyou/02-05.xls
temp <- read.xls("~/Documents/blog/20160327_VARIMA2/02-05.xls", skip=5, header=T, dec=",")
header <- as.character(read.xls("~/Documents/blog/20160327_VARIMA2/02-05.xls", skip=3, nrows=1, header=F, stringsAsFactors =F))
colnames(temp) <- ifelse(header != "NA", header, 1:length(header)*10)
colnames(temp)[2] <- "year"
temp <- temp %>% dplyr::select(-starts_with("Japan"), -ends_with("0"))
colnames(temp)[ncol(temp)] <- "Okinawa"
temp <- temp %>% mutate_each(funs(gsub(pattern=",|[a-z]|)|-", replacement="", .)) ) %>% mutate_each(funs(as.integer)) %>% filter(!is.na(year), year >=1950)
ts.pop <- ts(as.matrix(temp[,-1]), start = temp$year[1])

# interpolate missing values
ts.pop <- ts(na.approx(zoo(ts.pop)), start=temp$year[1])
df.pop <- as.data.frame(ts.pop)
row.names(df.pop) <- temp$year

# plot correlograms (too many)
acf(ts.pop)

temp <- df.pop
temp$t <- row.names(temp)
for( i in seq(from=1, to=47, by=4)){
  print(
    ggplot(temp %>% gather(key=prefecture, value=pop, -t) %>% filter(prefecture %in% unique(prefecture)[i:(i+5) ]) ) +
      geom_line(aes(x=t, y=pop, group=prefecture)) + facet_wrap(~prefecture, ncol = 1)
    )
}

plot.stan.pred <- function(data, stanout, p, d, q, span=NULL, time.offset=NULL, ser=NULL){
  Time <- nrow(data)
  T_end <- dim(rstan::extract(stanout, "y_pred")$y_pred)[2]
  T_forecast <- T_end - Time + d;
  N <- ncol(data)
  if(is.null(time.offset) ) time.offset <- as.integer(row.names(data)[1])
  if( is.na(time.offset) | is.nan(time.offset) ) time.offset <- 0
  if (is.null(span) ) span <- 1:(Time + nrow(y_pred))

  y_pred <- data.frame()
  for ( i in 1:N){
    m.pred <- rstan::extract(stanout, "y_pred")$y_pred[,(Time-d+1):T_end,i]

    # arima(p,1,d) case
    if( d==1){
      m.pred[,1] <- m.pred[,1] + data[nrow(data),i]
      for( t in 2:ncol(m.pred))
      m.pred[,t] <- m.pred[,t] + m.pred[,t-1]
    }
    # pred. interval
    temp <- data.frame(t = Time-d +1:T_forecast,
                       series = rep(i, T_forecast),
                       y90_l  = apply(m.pred, 2, quantile, probs=.05, na.rm=T),
                       y90_u = apply(m.pred, 2, quantile, probs=.95, na.rm=T),
                       y80_l  = apply(m.pred, 2, quantile, probs=.1, na.rm=T),
                       y80_u = apply(m.pred, 2, quantile, probs=.9, na.rm=T),
                       y_med  = apply(m.pred, 2, median),
                       y_mean = apply(m.pred, 2, mean, rm.na=T)
    )
    y_pred <- rbind(y_pred, temp)
  }

  temp <- data.frame(t=1:Time, data)
  colnames(temp)[1+1:N] <- seq(1:N)
  # transpose y_* in single column and join
  y_pred <- temp %>% gather(key=series, value=y_mean, -t) %>% mutate(series=as.integer(series)) %>% bind_rows(y_pred)
  y_pred <- y_pred %>% filter(t %in% span) %>% mutate(t=t+time.offset)

  # merge with header
  df.header <- data.frame( series=1:N, name=colnames(data))
  y_pred <- y_pred %>% left_join(y = df.header, by = "series") %>% mutate(series=as.character(name))


  if (!is.null(ser)){
    y_pred <- y_pred %>% filter(series %in% ser)
  }

  ggplot(y_pred) + geom_line(aes(x=t, y=y_mean)) + geom_line(aes(x=t, y_med), linetype=2) +
    geom_ribbon(aes(x=t, ymin=y90_l, ymax=y90_u), alpha=.2, fill="blue") +  geom_ribbon(aes(x=t, ymin=y80_l, ymax=y80_u), alpha=.5, fill="grey") +
    labs(title=paste0("ARMA(", p, ",", d, ",", q, ") Forecasts by MCMC"), y="y") +
    scale_y_continuous(labels = function(x){formatC(x, format="d", big.mark = "," )} ) + facet_wrap(~series)
}

### estimate by stan ###
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
stan.varmapq <- stan_model(file="~/Documents/blog/20160327_VARIMA2/varma.stan") # compile

stan.output <- function(data, T_forecast, p, d, q, chain=1, series=c("Hokkaido", "Tokyo", "Osaka") ){
  seq.span <- as.integer(row.names(data))
  if( d > 0) data.d <- diff(ts(data), differences = d)
  else data.d <- data
  res.stan.pop <- sampling(stan.varmapq, data=list(T=nrow(data.d), N=ncol(data.d), y=data.d, T_forecast=T_forecast, p=p, q=q, d=d), chain=chain )
  w <- waic(extract_log_lik(res.stan.pop))
  g <- plot.stan.pred(data = data, stanout = res.stan.pop, p=p, d=d, q=q,
                      ser= series, time.offset = 1950 )
  print(g)
  print(w)
  return( list(
    stan.out=res.stan.pop,
    graph= g,
    waic = w
    ))
}

# VARIMA(1,0,1)
result.1 <- stan.output(data = df.pop, T_forecast = 30, p = 1, d = 0, q = 1)
print(result.1$stan.out, pars="Psi")
ggplot(temp %>% gather(key=prefecture, value=pop, -t) %>% filter(prefecture %in% c("Tokyo", "Osaka", "Hokkaido") ) ) +
  geom_line(aes(x=t, y=pop, group=prefecture)) + facet_wrap(~prefecture, ncol = 1)

# VARMA(1,1,1)
plot(diff(ts.pop[,1:10]))
plot(diff(ts.pop[,11:20]))
plot(diff(ts.pop[,21:30]))
plot(diff(ts.pop[,31:40]))
plot(diff(ts.pop[,41:47]))
result.2 <- stan.output(data = df.pop, T_forecast = 30, p = 1, d = 1, q = 1)

# semi-auto (p,q) selection version
stan.output.semiauto <- function(data, T_forecast, d=1, chain=1, series=c("Hokkaido", "Tokyo", "Osaka") ){
  seq.span <- as.integer(row.names(data))
  if( d > 0) data.d <- diff(ts(data), differences = d)
  else data.d <- data
  w <- Inf

  for ( pq in list(c(1,1), c(2,2), c(1,0), c(0,1), c(0,0) ) ){
    print(paste("p=", pq[[1]], ",", "d=", d, "q=", pq[[2]] ))
    res.temp <- sampling(stan.varmapq, data=list(T=nrow(data.d), N=ncol(data.d), y=data.d, T_forecast=T_forecast, p=pq[[1]], q=pq[[2]], d=d), chain=1 )
    w.temp <- waic(extract_log_lik(res.temp))
    if( w.temp$waic < w) {
      w <- w.temp$waic
      w.info <- w.temp
      p <- pq[[1]]
      q <- pq[[2]]
      res.stan <- res.temp
    }
    print(w.temp)
  }

  w <- w.info
  g <- plot.stan.pred(data = data, stanout = res.stan.pop, p=p, d=d, q=q,
                      ser= series, time.offset = 1950 )
  print(g)
  print(paste("selected:", "p=", p, "d=", d, "q=", q))
  print(w)
  return( list(
    stan.out=res.stan.pop,
    graph= g,
    waic = w
  ))
}

# Only Kantou district
df.pop.k <- df.pop %>% dplyr::select(Tokyo, Kanagawa, Saitama, Chiba, Ibaraki, Tochigi, Gumma)
result.k <- stan.output.semiauto(data = df.pop.k, T_forecast = 30, d = 1, series = NULL)


result.all.1 <- stan.output.semiauto(data = df.pop, T_forecast = 30, d = 0, series = NULL)
result.all.0 <- stan.output.semiauto(data = df.pop, T_forecast = 30, d = 1, series = NULL)
	library(tidyr) # ver. 0.3.1.
	library(dplyr) # ver. 0.4.3
	library(rstan) # ver. 2.9.0
	library(forecast) # ver. 6.2
	library(zoo) # ver. 1.7-11
	library(gdata) # ver. 2.13.3
	library(loo) # ver. 0.1.6



	# read the xls file
	# http://www.stat.go.jp/data/chouki/zuhyou/02-05.xls
	temp <- read.xls("~/Documents/blog/20160327_VARIMA2/02-05.xls", skip=5, header=T, dec=",")
	header <- as.character(read.xls("~/Documents/blog/20160327_VARIMA2/02-05.xls", skip=3, nrows=1, header=F, stringsAsFactors =F))
	colnames(temp) <- ifelse(header != "NA", header, 1:length(header)*10)
	colnames(temp)[2] <- "year"
	temp <- temp %>% dplyr::select(-starts_with("Japan"), -ends_with("0"))
	colnames(temp)[ncol(temp)] <- "Okinawa"
	temp <- temp %>% mutate_each(funs(gsub(pattern=",\|[a-z]\|)\|-", replacement="", .)) ) %>% mutate_each(funs(as.integer)) %>% filter(!is.na(year), year >=1950)
	ts.pop <- ts(as.matrix(temp[,-1]), start = temp$year[1])

	# interpolate missing values
	ts.pop <- ts(na.approx(zoo(ts.pop)), start=temp$year[1])
	df.pop <- as.data.frame(ts.pop)
	row.names(df.pop) <- temp$year

	# plot correlograms (too many)
	acf(ts.pop)

	temp <- df.pop
	temp$t <- row.names(temp)
	for( i in seq(from=1, to=47, by=4)){
	print(
	ggplot(temp %>% gather(key=prefecture, value=pop, -t) %>% filter(prefecture %in% unique(prefecture)[i:(i+5) ]) ) +
	geom_line(aes(x=t, y=pop, group=prefecture)) + facet_wrap(~prefecture, ncol = 1)
	)
	}

	plot.stan.pred <- function(data, stanout, p, d, q, span=NULL, time.offset=NULL, ser=NULL){
	Time <- nrow(data)
	T_end <- dim(rstan::extract(stanout, "y_pred")$y_pred)[2]
	T_forecast <- T_end - Time + d;
	N <- ncol(data)
	if(is.null(time.offset) ) time.offset <- as.integer(row.names(data)[1])
	if( is.na(time.offset) \| is.nan(time.offset) ) time.offset <- 0
	if (is.null(span) ) span <- 1:(Time + nrow(y_pred))

	y_pred <- data.frame()
	for ( i in 1:N){
	m.pred <- rstan::extract(stanout, "y_pred")$y_pred[,(Time-d+1):T_end,i]

	# arima(p,1,d) case
	if( d==1){
	m.pred[,1] <- m.pred[,1] + data[nrow(data),i]
	for( t in 2:ncol(m.pred))
	m.pred[,t] <- m.pred[,t] + m.pred[,t-1]
	}
	# pred. interval
	temp <- data.frame(t = Time-d +1:T_forecast,
	series = rep(i, T_forecast),
	y90_l = apply(m.pred, 2, quantile, probs=.05, na.rm=T),
	y90_u = apply(m.pred, 2, quantile, probs=.95, na.rm=T),
	y80_l = apply(m.pred, 2, quantile, probs=.1, na.rm=T),
	y80_u = apply(m.pred, 2, quantile, probs=.9, na.rm=T),
	y_med = apply(m.pred, 2, median),
	y_mean = apply(m.pred, 2, mean, rm.na=T)
	)
	y_pred <- rbind(y_pred, temp)
	}

	temp <- data.frame(t=1:Time, data)
	colnames(temp)[1+1:N] <- seq(1:N)
	# transpose y_* in single column and join
	y_pred <- temp %>% gather(key=series, value=y_mean, -t) %>% mutate(series=as.integer(series)) %>% bind_rows(y_pred)
	y_pred <- y_pred %>% filter(t %in% span) %>% mutate(t=t+time.offset)

	# merge with header
	df.header <- data.frame( series=1:N, name=colnames(data))
	y_pred <- y_pred %>% left_join(y = df.header, by = "series") %>% mutate(series=as.character(name))


	if (!is.null(ser)){
	y_pred <- y_pred %>% filter(series %in% ser)
	}

	ggplot(y_pred) + geom_line(aes(x=t, y=y_mean)) + geom_line(aes(x=t, y_med), linetype=2) +
	geom_ribbon(aes(x=t, ymin=y90_l, ymax=y90_u), alpha=.2, fill="blue") + geom_ribbon(aes(x=t, ymin=y80_l, ymax=y80_u), alpha=.5, fill="grey") +
	labs(title=paste0("ARMA(", p, ",", d, ",", q, ") Forecasts by MCMC"), y="y") +
	scale_y_continuous(labels = function(x){formatC(x, format="d", big.mark = "," )} ) + facet_wrap(~series)
	}

	### estimate by stan ###
	rstan_options(auto_write = TRUE)
	options(mc.cores = parallel::detectCores())
	stan.varmapq <- stan_model(file="~/Documents/blog/20160327_VARIMA2/varma.stan") # compile

	stan.output <- function(data, T_forecast, p, d, q, chain=1, series=c("Hokkaido", "Tokyo", "Osaka") ){
	seq.span <- as.integer(row.names(data))
	if( d > 0) data.d <- diff(ts(data), differences = d)
	else data.d <- data
	res.stan.pop <- sampling(stan.varmapq, data=list(T=nrow(data.d), N=ncol(data.d), y=data.d, T_forecast=T_forecast, p=p, q=q, d=d), chain=chain )
	w <- waic(extract_log_lik(res.stan.pop))
	g <- plot.stan.pred(data = data, stanout = res.stan.pop, p=p, d=d, q=q,
	ser= series, time.offset = 1950 )
	print(g)
	print(w)
	return( list(
	stan.out=res.stan.pop,
	graph= g,
	waic = w
	))
	}

	# VARIMA(1,0,1)
	result.1 <- stan.output(data = df.pop, T_forecast = 30, p = 1, d = 0, q = 1)
	print(result.1$stan.out, pars="Psi")
	ggplot(temp %>% gather(key=prefecture, value=pop, -t) %>% filter(prefecture %in% c("Tokyo", "Osaka", "Hokkaido") ) ) +
	geom_line(aes(x=t, y=pop, group=prefecture)) + facet_wrap(~prefecture, ncol = 1)

	# VARMA(1,1,1)
	plot(diff(ts.pop[,1:10]))
	plot(diff(ts.pop[,11:20]))
	plot(diff(ts.pop[,21:30]))
	plot(diff(ts.pop[,31:40]))
	plot(diff(ts.pop[,41:47]))
	result.2 <- stan.output(data = df.pop, T_forecast = 30, p = 1, d = 1, q = 1)

	# semi-auto (p,q) selection version
	stan.output.semiauto <- function(data, T_forecast, d=1, chain=1, series=c("Hokkaido", "Tokyo", "Osaka") ){
	seq.span <- as.integer(row.names(data))
	if( d > 0) data.d <- diff(ts(data), differences = d)
	else data.d <- data
	w <- Inf

	for ( pq in list(c(1,1), c(2,2), c(1,0), c(0,1), c(0,0) ) ){
	print(paste("p=", pq[[1]], ",", "d=", d, "q=", pq[[2]] ))
	res.temp <- sampling(stan.varmapq, data=list(T=nrow(data.d), N=ncol(data.d), y=data.d, T_forecast=T_forecast, p=pq[[1]], q=pq[[2]], d=d), chain=1 )
	w.temp <- waic(extract_log_lik(res.temp))
	if( w.temp$waic < w) {
	w <- w.temp$waic
	w.info <- w.temp
	p <- pq[[1]]
	q <- pq[[2]]
	res.stan <- res.temp
	}
	print(w.temp)
	}

	w <- w.info
	g <- plot.stan.pred(data = data, stanout = res.stan.pop, p=p, d=d, q=q,
	ser= series, time.offset = 1950 )
	print(g)
	print(paste("selected:", "p=", p, "d=", d, "q=", q))
	print(w)
	return( list(
	stan.out=res.stan.pop,
	graph= g,
	waic = w
	))
	}

	# Only Kantou district
	df.pop.k <- df.pop %>% dplyr::select(Tokyo, Kanagawa, Saitama, Chiba, Ibaraki, Tochigi, Gumma)
	result.k <- stan.output.semiauto(data = df.pop.k, T_forecast = 30, d = 1, series = NULL)


	result.all.1 <- stan.output.semiauto(data = df.pop, T_forecast = 30, d = 0, series = NULL)
	result.all.0 <- stan.output.semiauto(data = df.pop, T_forecast = 30, d = 1, series = NULL)