Skip to content

Instantly share code, notes, and snippets.

@timriffe
Last active July 20, 2018 07:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save timriffe/734f6b143cae4e2238a8cc1ab27ae038 to your computer and use it in GitHub Desktop.
Save timriffe/734f6b143cae4e2238a8cc1ab27ae038 to your computer and use it in GitHub Desktop.
Some code to help solve the problem of negatives, which can happen in Sprague if the bottom age has a relatively low value.
library(ungroup)
library(devtools)
install_github("timriffe/DemoTools")
library(DemoTools)
datapoints <- c(117,
382,
418,
490,
333,
408)
dat <- matrix(datapoints, ncol = 1, dimnames=list(c(0,5,10,15,20,25)))
# basic Sprague (should be close to spreadhseet result)
spr <- spragueSimple(dat,Age=c(0,5,10,15,20,25),OAG=FALSE,closeout=FALSE)
# Grabill-- smoother in middle, not a solution
gra <- grabill(dat,Age=c(0,5,10,15,20,25),OAG=FALSE)
# retry Sprague, with a hack: make bins artificially wider, then group back.
spr2 <- spragueSimple(matrix(rep(datapoints/2,each=2),ncol = 1,dimnames=list(seq(0,55,by=5))),OAG=FALSE,closeout=FALSE)
spr2 <- groupAges(spr2,N=2) # this groups back to original
# oscillatory using Sprague
spo <- splitOscillate(rep(datapoints/5,each=5),OAG=FALSE,closeout=FALSE)
# oscillatory using Beers
spob <- splitOscillate(rep(datapoints/5,each=5),OAG=FALSE,closeout=FALSE,splitfun=beersSimple)
# beers (similar to Sprague, an alternative)
brs <- beersSimple(dat,Age=c(0,5,10,15,20,25),OAG=FALSE)
# pclm, with and without hacks. All free of zeros, last is best though.
riz0 <- ungroup::pclm(x= seq(0,25,by=5), y=c(dat),nlast=5,control=list(lambda=.1,deg=3,kr=1))
riz1 <- ungroup::pclm(x= seq(0,35,by=5), y=c(dat,408,408),nlast=5,control=list(lambda=.1,deg=3,kr=1))
riz2 <- ungroup::pclm(x= seq(0,45,by=5), y=c(117,117,dat,408,408),nlast=5,control=list(lambda=.1,deg=3,kr=1))
# show that constrained (or approximately so)
groupAges(spr)- c(dat)
groupAges(spr2)- c(dat)
groupAges(brs) - c(dat)
groupAges(riz0$fitted)[1:6]- c(dat)
groupAges(riz1$fitted)[1:6]- c(dat)
groupAges(riz2$fitted)[3:8]- c(dat)
barplot(t(dat)/5,width=5,space=0,col=gray(.95),ylim=c(-20,120))
lines(0:29,spr,col="red") # negative at age 0
lines(0:29,spr2,col="red",lty=2) # solved, but not optimal
#lines(0:29,gra,col="red",lty=2,lwd=2) # negative at age 0, not constrained in middle
lines(0:29,brs,col="blue") # negative at age 0
lines(0:29,spo,col="blue",lty=2) # value = 0 at age 0
#lines(0:29,spob,col="blue",lty=2) # value = 0 at age 0
lines(0:29,riz0$fitted,col="green") # constrained, not zero, but very wavy
lines(0:39,riz1$fitted,col="green",lwd=2,lty=3) # constrained, not zero, and tame(r) upper end
# best one IMO
lines(0:29,riz2$fitted[11:40],col="green",lty=2) # constrained, not zero, and tame both ends
# how about a wrapper to make it generally applicable?
pclm_wrap <- function(y,pad=2){
n <- length(y)
Age <- (1:n - 1) * 5
AgePad <- seq(0,max(Age) + (pad*2*5), by = 5)
ypad <- c(rep(y[1],pad),y,rep(y[n],pad))
yout <- ungroup::pclm(x = AgePad, y = ypad, nlast = 5, control = list(lambda = .1, deg = 3, kr = 1))$fitted
Age1 <- 0:(max(Age) + 4)
yout <- yout[Age1 + (pad*5+1)]
names(yout) <- Age1
yout
}
# easy to apply.
pclm_wrap(datapoints)
# data dimensions were modified and therefore broke: changes here:
#define the function
pclm_wrap <- function(y,constrain = TRUE,lambda=.1, OAG = 111){
n <- length(y)
Age <- (1:n - 1) * 5
maxA <- max(Age)
nlast <- length((maxA + 1):OAG)
# AgePad <- seq(0,max(Age) + (pad*2*5), by = 5)
# ypad <- c(rep(y[1],pad),y,rep(y[n],pad))
# cbind(AgePad,ypad)
yout <- ungroup::pclm(x = Age,
y = y,
nlast = nlast,
control = list(lambda = lambda, deg = 3, kr = 1))$fitted
#set the length of last interval is 26, the oldest person would be 111
if (constrain){
# groupAges comes from DemoTools
y2 <- DemoTools::groupAges(yout, OAG = maxA)
y2 <- c(rep(y2[1:(n - 1)], each = 5), rep(y2[n], nlast))
yin <- c(rep(y[1:(n - 1)], each = 5), rep(y[n], nlast))
yout <- yout * (yin / y2)
}
yout
}
#apply the function pclm_wrap to the vector of columns x1-x5 to generate SYOA (single years of age)
#and store the results in a new data frame.
#install.packages("xlsx")
library(xlsx)
df <- read.xlsx("/home/tim/Downloads/B01001_AGE.xlsx", sheetIndex = 1)
mat <- as.matrix(df[, 4:21])
rownames(mat) <- df$GEOID
#apply() runs until it hits an error, likely due to
# an interaction with low lambda and 0 value cells.
# experiment with catching the errors and increasing
# lambda where applicable.
# syoa <- apply(mat, 1, pclm_wrap)
syoa <- matrix(0,nrow=nrow(mat),ncol=111)
for (i in 1:nrow(mat)){
yout <- try(pclm_wrap(y = mat[i, ], OAG = 111))
if (class(yout) == "try-error"){
# increase lambda?
yout <- try(pclm_wrap(y = mat[i, ], lambda = 1, OAG = 111))
}
if (class(yout)!= "try-error"){
syoa[i, ] <- yout
}
}
# rows where fit was unsuccessful have sums of 0
# the other rows should be constrained both in row
# margin and in age groups.
# ----------------------------------
# 20 July code
# reduce console clutter (saw code in twitter)
shhh <- function(expr){
capture.output(x <- suppressPackageStartupMessages(
suppressMessages(suppressWarnings(expr))))
invisible(x)
}
pclm_wrap <- function(y,constrain = TRUE,lambda=.1, OAG = 111){
n <- length(y)
Age <- (1:n - 1) * 5
maxA <- max(Age)
nlast <- length((maxA + 1):OAG)
# only difference here is shhh()
yout <- shhh(ungroup::pclm(x = Age,
y = y,
nlast = nlast,
control = list(lambda = lambda, deg = 3, kr = 1))$fitted )
#set the length of last interval is 26, the oldest person would be 111
if (constrain){
# groupAges comes from DemoTools
y2 <- DemoTools::groupAges(yout, OAG = maxA)
y2 <- c(rep(y2[1:(n - 1)], each = 5), rep(y2[n], nlast))
yin <- c(rep(y[1:(n - 1)], each = 5), rep(y[n], nlast))
yout <- yout * (yin / y2)
}
yout
}
#apply the function pclm_wrap to the vector of columns x1-x5 to generate SYOA (single years of age)
#and store the results in a new data frame.
#install.packages("xlsx")
library(xlsx)
df <- read.xlsx("/home/tim/Downloads/B01001_AGE.xlsx", sheetIndex = 1)
# notice there was an error in column selection before
# row indeces are read in as a column, so we start at 5
mat <- as.matrix(df[, 5:22])
rownames(mat) <- df$GEOID
#apply() runs until it hits an error, likely due to
# an interaction with low lambda and 0 value cells.
# experiment with catching the errors and increasing
# lambda where applicable.
# syoa <- apply(mat, 1, pclm_wrap)
syoa <- matrix(0,nrow=nrow(mat),ncol=111)
probs <- c()
for (i in 1:nrow(mat)){
# same, with shhh() on first error
yout <- shhh(try(pclm_wrap(y = mat[i, ], OAG = 111)))
if (class(yout) == "try-error"){
# increase lambda?
# but maybe instead try decreasing it?
yout <- try(pclm_wrap(y = mat[i, ], lambda = 1, OAG = 111))
}
if (class(yout)!= "try-error"){
syoa[i, ] <- yout
}
# collect indices of problematic cases
if (class(yout) == "try-error"){
probs <- c(probs,i)
}
}
# are problematic cases small pops?
sum(rowSums(mat[probs,]) < 10) # holy shit, some of these were less than 10 people
sum(rowSums(mat[probs,]) < 100)
sum(rowSums(mat[probs,]) < 200)
sum(rowSums(mat[probs,]) > 1000)
# lets look at the age pattern of input data that are problematic:
#------------------
# problematic cases are probematic because they have lots of 0s,
# and are otherwise erratic. Does it make sense to smooth these?
# Splitting pop counts is usually a convenience operation, not a matter of
# finding a true underlying pattern. We usually assume that underlying
# rate patterns over age are truly smooth, but counts can be anything.
matplot(t(mat[probs,][rowSums(mat[probs,]) > 1000,][1:100,]),type='l')
# first 100 problematic cases of populations over 1000
# and if we regroup split data to the original bin sizes?
# regroup results:
regrouped <- t(apply(syoa,1,DemoTools::groupAges, OAG = 85))
# among those cases that fit there are no instances
# of absolute cell differences greater than .1
sum(abs(regrouped[-probs, ] - mat[-probs,]) > .1)
# % of cases that are problematic.
length(probs) / nrow(mat)
# so you'll need to figure out how you want to deal with erratic or small
# populations.
# you might try changing lambda (smaller?), distributing uniformly in age groups?
# grouping to 10-year ages then splitting. Different method altogether. Lots of options.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment