timriffe/GraduateCounts.R

## GraduateCounts.R
library(ungroup)
library(devtools)
install_github("timriffe/DemoTools")
library(DemoTools)


datapoints <- c(117,
		382,
		418,
		490,
		333,
		408)
dat <- matrix(datapoints, ncol = 1, dimnames=list(c(0,5,10,15,20,25)))

# basic Sprague (should be close to spreadhseet result)
spr  <- spragueSimple(dat,Age=c(0,5,10,15,20,25),OAG=FALSE,closeout=FALSE)
# Grabill-- smoother in middle, not a solution
gra  <- grabill(dat,Age=c(0,5,10,15,20,25),OAG=FALSE)
# retry Sprague, with a hack: make bins artificially wider, then group back.
spr2 <- spragueSimple(matrix(rep(datapoints/2,each=2),ncol = 1,dimnames=list(seq(0,55,by=5))),OAG=FALSE,closeout=FALSE)
spr2 <- groupAges(spr2,N=2) # this groups back to original
# oscillatory using Sprague
spo  <- splitOscillate(rep(datapoints/5,each=5),OAG=FALSE,closeout=FALSE)
# oscillatory using Beers
spob <- splitOscillate(rep(datapoints/5,each=5),OAG=FALSE,closeout=FALSE,splitfun=beersSimple)
# beers (similar to Sprague, an alternative)
brs  <- beersSimple(dat,Age=c(0,5,10,15,20,25),OAG=FALSE)

# pclm, with and without hacks. All free of zeros, last is best though.
riz0 <- ungroup::pclm(x= seq(0,25,by=5), y=c(dat),nlast=5,control=list(lambda=.1,deg=3,kr=1))
riz1 <- ungroup::pclm(x= seq(0,35,by=5), y=c(dat,408,408),nlast=5,control=list(lambda=.1,deg=3,kr=1))
riz2 <- ungroup::pclm(x= seq(0,45,by=5), y=c(117,117,dat,408,408),nlast=5,control=list(lambda=.1,deg=3,kr=1))

# show that constrained (or approximately so)
groupAges(spr)- c(dat)
groupAges(spr2)- c(dat)
groupAges(brs) - c(dat)
groupAges(riz0$fitted)[1:6]- c(dat)
groupAges(riz1$fitted)[1:6]- c(dat)
groupAges(riz2$fitted)[3:8]- c(dat)


barplot(t(dat)/5,width=5,space=0,col=gray(.95),ylim=c(-20,120))
lines(0:29,spr,col="red")          # negative at age 0
lines(0:29,spr2,col="red",lty=2)   # solved, but not optimal
#lines(0:29,gra,col="red",lty=2,lwd=2) # negative at age 0, not constrained in middle
lines(0:29,brs,col="blue")         # negative at age 0
lines(0:29,spo,col="blue",lty=2)   # value = 0 at age 0
#lines(0:29,spob,col="blue",lty=2) # value = 0 at age 0
lines(0:29,riz0$fitted,col="green") # constrained, not zero, but very wavy
lines(0:39,riz1$fitted,col="green",lwd=2,lty=3) # constrained, not zero, and tame(r) upper end

# best one IMO
lines(0:29,riz2$fitted[11:40],col="green",lty=2) # constrained, not zero, and tame both ends

# how about a wrapper to make it generally applicable?
pclm_wrap <- function(y,pad=2){
	n      <- length(y)
	Age    <- (1:n - 1) * 5
	AgePad <- seq(0,max(Age) + (pad*2*5), by = 5)
	ypad   <- c(rep(y[1],pad),y,rep(y[n],pad))
	yout   <- ungroup::pclm(x = AgePad, y = ypad, nlast = 5, control = list(lambda = .1, deg = 3, kr = 1))$fitted
	Age1   <- 0:(max(Age) + 4)
	yout   <- yout[Age1 + (pad*5+1)]
	names(yout) <- Age1
	yout
}
# easy to apply.
pclm_wrap(datapoints)


# data dimensions were modified and therefore broke: changes here:


#define the function

pclm_wrap <- function(y,constrain = TRUE,lambda=.1, OAG = 111){
  n      <- length(y)
  Age    <- (1:n - 1) * 5
  maxA   <- max(Age)
  nlast  <- length((maxA + 1):OAG)
  # AgePad <- seq(0,max(Age) + (pad*2*5), by = 5)
  # ypad   <- c(rep(y[1],pad),y,rep(y[n],pad))
  # cbind(AgePad,ypad)

  yout   <- ungroup::pclm(x = Age,
  						y = y,
  						nlast = nlast,
  						control = list(lambda = lambda, deg = 3, kr = 1))$fitted
  #set the length of last interval is 26, the oldest person would be 111

  if (constrain){
    # groupAges comes from DemoTools
    y2   <- DemoTools::groupAges(yout, OAG = maxA)

    y2   <- c(rep(y2[1:(n - 1)], each = 5), rep(y2[n], nlast))
    yin  <- c(rep(y[1:(n - 1)], each = 5), rep(y[n], nlast))

    yout <- yout * (yin / y2)
  }
  yout
}

#apply the function pclm_wrap to the vector of columns x1-x5 to generate SYOA (single years of age)
#and store the results in a new data frame.
#install.packages("xlsx")
library(xlsx)
df            <- read.xlsx("/home/tim/Downloads/B01001_AGE.xlsx", sheetIndex = 1)
mat           <- as.matrix(df[, 4:21])
rownames(mat) <- df$GEOID

#apply() runs until it hits an error, likely due to
# an interaction with low lambda and 0 value cells.
# experiment with catching the errors and increasing
# lambda where applicable.
# syoa          <- apply(mat, 1, pclm_wrap)
syoa          <- matrix(0,nrow=nrow(mat),ncol=111)
for (i in 1:nrow(mat)){
	yout      <- try(pclm_wrap(y = mat[i, ], OAG = 111))
	if (class(yout) == "try-error"){
		# increase lambda?
		yout      <- try(pclm_wrap(y = mat[i, ], lambda = 1, OAG = 111))
	}
	if (class(yout)!= "try-error"){

		syoa[i, ] <- yout
	}
}
# rows where fit was unsuccessful have sums of 0
# the other rows should be constrained both in row
# margin and in age groups.


# ----------------------------------
# 20 July code

# reduce console clutter (saw code in twitter)
shhh <- function(expr){
	capture.output(x <- suppressPackageStartupMessages(
					suppressMessages(suppressWarnings(expr))))
	invisible(x)
}

pclm_wrap <- function(y,constrain = TRUE,lambda=.1, OAG = 111){
	n      <- length(y)
	Age    <- (1:n - 1) * 5
	maxA   <- max(Age)
	nlast  <- length((maxA + 1):OAG)
	# only difference here is shhh()
	yout   <- shhh(ungroup::pclm(x = Age,
			y = y,
			nlast = nlast,
			control = list(lambda = lambda, deg = 3, kr = 1))$fitted )
	#set the length of last interval is 26, the oldest person would be 111

	if (constrain){
		# groupAges comes from DemoTools
		y2   <- DemoTools::groupAges(yout, OAG = maxA)

		y2   <- c(rep(y2[1:(n - 1)], each = 5), rep(y2[n], nlast))
		yin  <- c(rep(y[1:(n - 1)], each = 5), rep(y[n], nlast))

		yout <- yout * (yin / y2)
	}
	yout
}

#apply the function pclm_wrap to the vector of columns x1-x5 to generate SYOA (single years of age)
#and store the results in a new data frame.
#install.packages("xlsx")
library(xlsx)
df            <- read.xlsx("/home/tim/Downloads/B01001_AGE.xlsx", sheetIndex = 1)
# notice there was an error in column selection before
# row indeces are read in as a column, so we start at 5
mat           <- as.matrix(df[, 5:22])
rownames(mat) <- df$GEOID

#apply() runs until it hits an error, likely due to
# an interaction with low lambda and 0 value cells.
# experiment with catching the errors and increasing
# lambda where applicable.
# syoa          <- apply(mat, 1, pclm_wrap)
syoa          <- matrix(0,nrow=nrow(mat),ncol=111)
probs <- c()
for (i in 1:nrow(mat)){
	# same, with shhh() on first error
	yout      <- shhh(try(pclm_wrap(y = mat[i, ], OAG = 111)))
	if (class(yout) == "try-error"){
		# increase lambda?
		# but maybe instead try decreasing it?
		yout      <- try(pclm_wrap(y = mat[i, ], lambda = 1, OAG = 111))
	}
	if (class(yout)!= "try-error"){

		syoa[i, ] <- yout
	}
	# collect indices of problematic cases
	if (class(yout) == "try-error"){
		probs <- c(probs,i)
	}
}
# are problematic cases small pops?
sum(rowSums(mat[probs,]) < 10)    # holy shit, some of these were less than 10 people
sum(rowSums(mat[probs,]) < 100)
sum(rowSums(mat[probs,]) < 200)
sum(rowSums(mat[probs,]) > 1000)

# lets look at the age pattern of input data that are problematic:
#------------------
# problematic cases are probematic because they have lots of 0s,
# and are otherwise erratic. Does it make sense to smooth these?
# Splitting pop counts is usually a convenience operation, not a matter of
# finding a true underlying pattern. We usually assume that underlying
# rate patterns over age are truly smooth, but counts can be anything.
matplot(t(mat[probs,][rowSums(mat[probs,]) > 1000,][1:100,]),type='l')
# first 100 problematic cases of populations over 1000

# and if we regroup split data to the original bin sizes?
# regroup results:
regrouped <- t(apply(syoa,1,DemoTools::groupAges, OAG = 85))

# among those cases that fit there are no instances
# of absolute cell differences greater than .1
sum(abs(regrouped[-probs, ] - mat[-probs,]) > .1)

# % of cases that are problematic.
length(probs) / nrow(mat)

# so you'll need to figure out how you want to deal with erratic or small
# populations.

# you might try changing lambda (smaller?), distributing uniformly in age groups?
# grouping to 10-year ages then splitting. Different method altogether. Lots of options.
	library(ungroup)
	library(devtools)
	install_github("timriffe/DemoTools")
	library(DemoTools)


	datapoints <- c(117,
	382,
	418,
	490,
	333,
	408)
	dat <- matrix(datapoints, ncol = 1, dimnames=list(c(0,5,10,15,20,25)))

	# basic Sprague (should be close to spreadhseet result)
	spr <- spragueSimple(dat,Age=c(0,5,10,15,20,25),OAG=FALSE,closeout=FALSE)
	# Grabill-- smoother in middle, not a solution
	gra <- grabill(dat,Age=c(0,5,10,15,20,25),OAG=FALSE)
	# retry Sprague, with a hack: make bins artificially wider, then group back.
	spr2 <- spragueSimple(matrix(rep(datapoints/2,each=2),ncol = 1,dimnames=list(seq(0,55,by=5))),OAG=FALSE,closeout=FALSE)
	spr2 <- groupAges(spr2,N=2) # this groups back to original
	# oscillatory using Sprague
	spo <- splitOscillate(rep(datapoints/5,each=5),OAG=FALSE,closeout=FALSE)
	# oscillatory using Beers
	spob <- splitOscillate(rep(datapoints/5,each=5),OAG=FALSE,closeout=FALSE,splitfun=beersSimple)
	# beers (similar to Sprague, an alternative)
	brs <- beersSimple(dat,Age=c(0,5,10,15,20,25),OAG=FALSE)

	# pclm, with and without hacks. All free of zeros, last is best though.
	riz0 <- ungroup::pclm(x= seq(0,25,by=5), y=c(dat),nlast=5,control=list(lambda=.1,deg=3,kr=1))
	riz1 <- ungroup::pclm(x= seq(0,35,by=5), y=c(dat,408,408),nlast=5,control=list(lambda=.1,deg=3,kr=1))
	riz2 <- ungroup::pclm(x= seq(0,45,by=5), y=c(117,117,dat,408,408),nlast=5,control=list(lambda=.1,deg=3,kr=1))

	# show that constrained (or approximately so)
	groupAges(spr)- c(dat)
	groupAges(spr2)- c(dat)
	groupAges(brs) - c(dat)
	groupAges(riz0$fitted)[1:6]- c(dat)
	groupAges(riz1$fitted)[1:6]- c(dat)
	groupAges(riz2$fitted)[3:8]- c(dat)


	barplot(t(dat)/5,width=5,space=0,col=gray(.95),ylim=c(-20,120))
	lines(0:29,spr,col="red") # negative at age 0
	lines(0:29,spr2,col="red",lty=2) # solved, but not optimal
	#lines(0:29,gra,col="red",lty=2,lwd=2) # negative at age 0, not constrained in middle
	lines(0:29,brs,col="blue") # negative at age 0
	lines(0:29,spo,col="blue",lty=2) # value = 0 at age 0
	#lines(0:29,spob,col="blue",lty=2) # value = 0 at age 0
	lines(0:29,riz0$fitted,col="green") # constrained, not zero, but very wavy
	lines(0:39,riz1$fitted,col="green",lwd=2,lty=3) # constrained, not zero, and tame(r) upper end

	# best one IMO
	lines(0:29,riz2$fitted[11:40],col="green",lty=2) # constrained, not zero, and tame both ends

	# how about a wrapper to make it generally applicable?
	pclm_wrap <- function(y,pad=2){
	n <- length(y)
	Age <- (1:n - 1) * 5
	AgePad <- seq(0,max(Age) + (pad25), by = 5)
	ypad <- c(rep(y[1],pad),y,rep(y[n],pad))
	yout <- ungroup::pclm(x = AgePad, y = ypad, nlast = 5, control = list(lambda = .1, deg = 3, kr = 1))$fitted
	Age1 <- 0:(max(Age) + 4)
	yout <- yout[Age1 + (pad*5+1)]
	names(yout) <- Age1
	yout
	}
	# easy to apply.
	pclm_wrap(datapoints)


	# data dimensions were modified and therefore broke: changes here:


	#define the function

	pclm_wrap <- function(y,constrain = TRUE,lambda=.1, OAG = 111){
	n <- length(y)
	Age <- (1:n - 1) * 5
	maxA <- max(Age)
	nlast <- length((maxA + 1):OAG)
	# AgePad <- seq(0,max(Age) + (pad25), by = 5)
	# ypad <- c(rep(y[1],pad),y,rep(y[n],pad))
	# cbind(AgePad,ypad)

	yout <- ungroup::pclm(x = Age,
	y = y,
	nlast = nlast,
	control = list(lambda = lambda, deg = 3, kr = 1))$fitted
	#set the length of last interval is 26, the oldest person would be 111

	if (constrain){
	# groupAges comes from DemoTools
	y2 <- DemoTools::groupAges(yout, OAG = maxA)

	y2 <- c(rep(y2[1:(n - 1)], each = 5), rep(y2[n], nlast))
	yin <- c(rep(y[1:(n - 1)], each = 5), rep(y[n], nlast))

	yout <- yout * (yin / y2)
	}
	yout
	}

	#apply the function pclm_wrap to the vector of columns x1-x5 to generate SYOA (single years of age)
	#and store the results in a new data frame.
	#install.packages("xlsx")
	library(xlsx)
	df <- read.xlsx("/home/tim/Downloads/B01001_AGE.xlsx", sheetIndex = 1)
	mat <- as.matrix(df[, 4:21])
	rownames(mat) <- df$GEOID

	#apply() runs until it hits an error, likely due to
	# an interaction with low lambda and 0 value cells.
	# experiment with catching the errors and increasing
	# lambda where applicable.
	# syoa <- apply(mat, 1, pclm_wrap)
	syoa <- matrix(0,nrow=nrow(mat),ncol=111)
	for (i in 1:nrow(mat)){
	yout <- try(pclm_wrap(y = mat[i, ], OAG = 111))
	if (class(yout) == "try-error"){
	# increase lambda?
	yout <- try(pclm_wrap(y = mat[i, ], lambda = 1, OAG = 111))
	}
	if (class(yout)!= "try-error"){

	syoa[i, ] <- yout
	}
	}
	# rows where fit was unsuccessful have sums of 0
	# the other rows should be constrained both in row
	# margin and in age groups.



	# ----------------------------------
	# 20 July code

	# reduce console clutter (saw code in twitter)
	shhh <- function(expr){
	capture.output(x <- suppressPackageStartupMessages(
	suppressMessages(suppressWarnings(expr))))
	invisible(x)
	}

	pclm_wrap <- function(y,constrain = TRUE,lambda=.1, OAG = 111){
	n <- length(y)
	Age <- (1:n - 1) * 5
	maxA <- max(Age)
	nlast <- length((maxA + 1):OAG)
	# only difference here is shhh()
	yout <- shhh(ungroup::pclm(x = Age,
	y = y,
	nlast = nlast,
	control = list(lambda = lambda, deg = 3, kr = 1))$fitted )
	#set the length of last interval is 26, the oldest person would be 111

	if (constrain){
	# groupAges comes from DemoTools
	y2 <- DemoTools::groupAges(yout, OAG = maxA)

	y2 <- c(rep(y2[1:(n - 1)], each = 5), rep(y2[n], nlast))
	yin <- c(rep(y[1:(n - 1)], each = 5), rep(y[n], nlast))

	yout <- yout * (yin / y2)
	}
	yout
	}

	#apply the function pclm_wrap to the vector of columns x1-x5 to generate SYOA (single years of age)
	#and store the results in a new data frame.
	#install.packages("xlsx")
	library(xlsx)
	df <- read.xlsx("/home/tim/Downloads/B01001_AGE.xlsx", sheetIndex = 1)
	# notice there was an error in column selection before
	# row indeces are read in as a column, so we start at 5
	mat <- as.matrix(df[, 5:22])
	rownames(mat) <- df$GEOID

	#apply() runs until it hits an error, likely due to
	# an interaction with low lambda and 0 value cells.
	# experiment with catching the errors and increasing
	# lambda where applicable.
	# syoa <- apply(mat, 1, pclm_wrap)
	syoa <- matrix(0,nrow=nrow(mat),ncol=111)
	probs <- c()
	for (i in 1:nrow(mat)){
	# same, with shhh() on first error
	yout <- shhh(try(pclm_wrap(y = mat[i, ], OAG = 111)))
	if (class(yout) == "try-error"){
	# increase lambda?
	# but maybe instead try decreasing it?
	yout <- try(pclm_wrap(y = mat[i, ], lambda = 1, OAG = 111))
	}
	if (class(yout)!= "try-error"){

	syoa[i, ] <- yout
	}
	# collect indices of problematic cases
	if (class(yout) == "try-error"){
	probs <- c(probs,i)
	}
	}
	# are problematic cases small pops?
	sum(rowSums(mat[probs,]) < 10) # holy shit, some of these were less than 10 people
	sum(rowSums(mat[probs,]) < 100)
	sum(rowSums(mat[probs,]) < 200)
	sum(rowSums(mat[probs,]) > 1000)

	# lets look at the age pattern of input data that are problematic:
	#------------------
	# problematic cases are probematic because they have lots of 0s,
	# and are otherwise erratic. Does it make sense to smooth these?
	# Splitting pop counts is usually a convenience operation, not a matter of
	# finding a true underlying pattern. We usually assume that underlying
	# rate patterns over age are truly smooth, but counts can be anything.
	matplot(t(mat[probs,][rowSums(mat[probs,]) > 1000,][1:100,]),type='l')
	# first 100 problematic cases of populations over 1000

	# and if we regroup split data to the original bin sizes?
	# regroup results:
	regrouped <- t(apply(syoa,1,DemoTools::groupAges, OAG = 85))

	# among those cases that fit there are no instances
	# of absolute cell differences greater than .1
	sum(abs(regrouped[-probs, ] - mat[-probs,]) > .1)

	# % of cases that are problematic.
	length(probs) / nrow(mat)

	# so you'll need to figure out how you want to deal with erratic or small
	# populations.

	# you might try changing lambda (smaller?), distributing uniformly in age groups?
	# grouping to 10-year ages then splitting. Different method altogether. Lots of options.