Last active
July 20, 2018 07:29
-
-
Save timriffe/734f6b143cae4e2238a8cc1ab27ae038 to your computer and use it in GitHub Desktop.
Some code to help solve the problem of negatives, which can happen in Sprague if the bottom age has a relatively low value.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ungroup) | |
library(devtools) | |
install_github("timriffe/DemoTools") | |
library(DemoTools) | |
datapoints <- c(117, | |
382, | |
418, | |
490, | |
333, | |
408) | |
dat <- matrix(datapoints, ncol = 1, dimnames=list(c(0,5,10,15,20,25))) | |
# basic Sprague (should be close to spreadhseet result) | |
spr <- spragueSimple(dat,Age=c(0,5,10,15,20,25),OAG=FALSE,closeout=FALSE) | |
# Grabill-- smoother in middle, not a solution | |
gra <- grabill(dat,Age=c(0,5,10,15,20,25),OAG=FALSE) | |
# retry Sprague, with a hack: make bins artificially wider, then group back. | |
spr2 <- spragueSimple(matrix(rep(datapoints/2,each=2),ncol = 1,dimnames=list(seq(0,55,by=5))),OAG=FALSE,closeout=FALSE) | |
spr2 <- groupAges(spr2,N=2) # this groups back to original | |
# oscillatory using Sprague | |
spo <- splitOscillate(rep(datapoints/5,each=5),OAG=FALSE,closeout=FALSE) | |
# oscillatory using Beers | |
spob <- splitOscillate(rep(datapoints/5,each=5),OAG=FALSE,closeout=FALSE,splitfun=beersSimple) | |
# beers (similar to Sprague, an alternative) | |
brs <- beersSimple(dat,Age=c(0,5,10,15,20,25),OAG=FALSE) | |
# pclm, with and without hacks. All free of zeros, last is best though. | |
riz0 <- ungroup::pclm(x= seq(0,25,by=5), y=c(dat),nlast=5,control=list(lambda=.1,deg=3,kr=1)) | |
riz1 <- ungroup::pclm(x= seq(0,35,by=5), y=c(dat,408,408),nlast=5,control=list(lambda=.1,deg=3,kr=1)) | |
riz2 <- ungroup::pclm(x= seq(0,45,by=5), y=c(117,117,dat,408,408),nlast=5,control=list(lambda=.1,deg=3,kr=1)) | |
# show that constrained (or approximately so) | |
groupAges(spr)- c(dat) | |
groupAges(spr2)- c(dat) | |
groupAges(brs) - c(dat) | |
groupAges(riz0$fitted)[1:6]- c(dat) | |
groupAges(riz1$fitted)[1:6]- c(dat) | |
groupAges(riz2$fitted)[3:8]- c(dat) | |
barplot(t(dat)/5,width=5,space=0,col=gray(.95),ylim=c(-20,120)) | |
lines(0:29,spr,col="red") # negative at age 0 | |
lines(0:29,spr2,col="red",lty=2) # solved, but not optimal | |
#lines(0:29,gra,col="red",lty=2,lwd=2) # negative at age 0, not constrained in middle | |
lines(0:29,brs,col="blue") # negative at age 0 | |
lines(0:29,spo,col="blue",lty=2) # value = 0 at age 0 | |
#lines(0:29,spob,col="blue",lty=2) # value = 0 at age 0 | |
lines(0:29,riz0$fitted,col="green") # constrained, not zero, but very wavy | |
lines(0:39,riz1$fitted,col="green",lwd=2,lty=3) # constrained, not zero, and tame(r) upper end | |
# best one IMO | |
lines(0:29,riz2$fitted[11:40],col="green",lty=2) # constrained, not zero, and tame both ends | |
# how about a wrapper to make it generally applicable? | |
pclm_wrap <- function(y,pad=2){ | |
n <- length(y) | |
Age <- (1:n - 1) * 5 | |
AgePad <- seq(0,max(Age) + (pad*2*5), by = 5) | |
ypad <- c(rep(y[1],pad),y,rep(y[n],pad)) | |
yout <- ungroup::pclm(x = AgePad, y = ypad, nlast = 5, control = list(lambda = .1, deg = 3, kr = 1))$fitted | |
Age1 <- 0:(max(Age) + 4) | |
yout <- yout[Age1 + (pad*5+1)] | |
names(yout) <- Age1 | |
yout | |
} | |
# easy to apply. | |
pclm_wrap(datapoints) | |
# data dimensions were modified and therefore broke: changes here: | |
#define the function | |
pclm_wrap <- function(y,constrain = TRUE,lambda=.1, OAG = 111){ | |
n <- length(y) | |
Age <- (1:n - 1) * 5 | |
maxA <- max(Age) | |
nlast <- length((maxA + 1):OAG) | |
# AgePad <- seq(0,max(Age) + (pad*2*5), by = 5) | |
# ypad <- c(rep(y[1],pad),y,rep(y[n],pad)) | |
# cbind(AgePad,ypad) | |
yout <- ungroup::pclm(x = Age, | |
y = y, | |
nlast = nlast, | |
control = list(lambda = lambda, deg = 3, kr = 1))$fitted | |
#set the length of last interval is 26, the oldest person would be 111 | |
if (constrain){ | |
# groupAges comes from DemoTools | |
y2 <- DemoTools::groupAges(yout, OAG = maxA) | |
y2 <- c(rep(y2[1:(n - 1)], each = 5), rep(y2[n], nlast)) | |
yin <- c(rep(y[1:(n - 1)], each = 5), rep(y[n], nlast)) | |
yout <- yout * (yin / y2) | |
} | |
yout | |
} | |
#apply the function pclm_wrap to the vector of columns x1-x5 to generate SYOA (single years of age) | |
#and store the results in a new data frame. | |
#install.packages("xlsx") | |
library(xlsx) | |
df <- read.xlsx("/home/tim/Downloads/B01001_AGE.xlsx", sheetIndex = 1) | |
mat <- as.matrix(df[, 4:21]) | |
rownames(mat) <- df$GEOID | |
#apply() runs until it hits an error, likely due to | |
# an interaction with low lambda and 0 value cells. | |
# experiment with catching the errors and increasing | |
# lambda where applicable. | |
# syoa <- apply(mat, 1, pclm_wrap) | |
syoa <- matrix(0,nrow=nrow(mat),ncol=111) | |
for (i in 1:nrow(mat)){ | |
yout <- try(pclm_wrap(y = mat[i, ], OAG = 111)) | |
if (class(yout) == "try-error"){ | |
# increase lambda? | |
yout <- try(pclm_wrap(y = mat[i, ], lambda = 1, OAG = 111)) | |
} | |
if (class(yout)!= "try-error"){ | |
syoa[i, ] <- yout | |
} | |
} | |
# rows where fit was unsuccessful have sums of 0 | |
# the other rows should be constrained both in row | |
# margin and in age groups. | |
# ---------------------------------- | |
# 20 July code | |
# reduce console clutter (saw code in twitter) | |
shhh <- function(expr){ | |
capture.output(x <- suppressPackageStartupMessages( | |
suppressMessages(suppressWarnings(expr)))) | |
invisible(x) | |
} | |
pclm_wrap <- function(y,constrain = TRUE,lambda=.1, OAG = 111){ | |
n <- length(y) | |
Age <- (1:n - 1) * 5 | |
maxA <- max(Age) | |
nlast <- length((maxA + 1):OAG) | |
# only difference here is shhh() | |
yout <- shhh(ungroup::pclm(x = Age, | |
y = y, | |
nlast = nlast, | |
control = list(lambda = lambda, deg = 3, kr = 1))$fitted ) | |
#set the length of last interval is 26, the oldest person would be 111 | |
if (constrain){ | |
# groupAges comes from DemoTools | |
y2 <- DemoTools::groupAges(yout, OAG = maxA) | |
y2 <- c(rep(y2[1:(n - 1)], each = 5), rep(y2[n], nlast)) | |
yin <- c(rep(y[1:(n - 1)], each = 5), rep(y[n], nlast)) | |
yout <- yout * (yin / y2) | |
} | |
yout | |
} | |
#apply the function pclm_wrap to the vector of columns x1-x5 to generate SYOA (single years of age) | |
#and store the results in a new data frame. | |
#install.packages("xlsx") | |
library(xlsx) | |
df <- read.xlsx("/home/tim/Downloads/B01001_AGE.xlsx", sheetIndex = 1) | |
# notice there was an error in column selection before | |
# row indeces are read in as a column, so we start at 5 | |
mat <- as.matrix(df[, 5:22]) | |
rownames(mat) <- df$GEOID | |
#apply() runs until it hits an error, likely due to | |
# an interaction with low lambda and 0 value cells. | |
# experiment with catching the errors and increasing | |
# lambda where applicable. | |
# syoa <- apply(mat, 1, pclm_wrap) | |
syoa <- matrix(0,nrow=nrow(mat),ncol=111) | |
probs <- c() | |
for (i in 1:nrow(mat)){ | |
# same, with shhh() on first error | |
yout <- shhh(try(pclm_wrap(y = mat[i, ], OAG = 111))) | |
if (class(yout) == "try-error"){ | |
# increase lambda? | |
# but maybe instead try decreasing it? | |
yout <- try(pclm_wrap(y = mat[i, ], lambda = 1, OAG = 111)) | |
} | |
if (class(yout)!= "try-error"){ | |
syoa[i, ] <- yout | |
} | |
# collect indices of problematic cases | |
if (class(yout) == "try-error"){ | |
probs <- c(probs,i) | |
} | |
} | |
# are problematic cases small pops? | |
sum(rowSums(mat[probs,]) < 10) # holy shit, some of these were less than 10 people | |
sum(rowSums(mat[probs,]) < 100) | |
sum(rowSums(mat[probs,]) < 200) | |
sum(rowSums(mat[probs,]) > 1000) | |
# lets look at the age pattern of input data that are problematic: | |
#------------------ | |
# problematic cases are probematic because they have lots of 0s, | |
# and are otherwise erratic. Does it make sense to smooth these? | |
# Splitting pop counts is usually a convenience operation, not a matter of | |
# finding a true underlying pattern. We usually assume that underlying | |
# rate patterns over age are truly smooth, but counts can be anything. | |
matplot(t(mat[probs,][rowSums(mat[probs,]) > 1000,][1:100,]),type='l') | |
# first 100 problematic cases of populations over 1000 | |
# and if we regroup split data to the original bin sizes? | |
# regroup results: | |
regrouped <- t(apply(syoa,1,DemoTools::groupAges, OAG = 85)) | |
# among those cases that fit there are no instances | |
# of absolute cell differences greater than .1 | |
sum(abs(regrouped[-probs, ] - mat[-probs,]) > .1) | |
# % of cases that are problematic. | |
length(probs) / nrow(mat) | |
# so you'll need to figure out how you want to deal with erratic or small | |
# populations. | |
# you might try changing lambda (smaller?), distributing uniformly in age groups? | |
# grouping to 10-year ages then splitting. Different method altogether. Lots of options. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment