Skip to content

Instantly share code, notes, and snippets.

@krz krz/tw.R Secret
Last active Jan 6, 2020

Embed
What would you like to do?
code for the paper "Tweedie distributions for fitting semicontinuous health care utilization cost data"
library(sampleSelection) # for RandHIE data
library(dplyr)
data(RandHIE)
# combined costs
RandHIE$cost <- RandHIE$outpdol +
RandHIE$drugdol +
RandHIE$suppdol +
RandHIE$mentdol +
RandHIE$inpdol
# select relevant variables
rh <- RandHIE %>% group_by(zper) %>% summarise(cost = first(cost),
age=first(xage),
disea=first(disea),
physlm=first(physlm),
logc=first(logc), #lc
idp=first(idp),#
lpi=first(lpi),#
fmde=first(fmde),
linc=first(linc),
lfam=first(lfam),
female=first(female),
black=first(black),
educdec=first(educdec),
hlthg=first(hlthg)
)
# select only people >= 18 and remove missing values
rh <- rh %>% filter(age >= 18 & !is.na(educdec))
rh$zper <- NULL
# remove implausible values
rh <- rh %>% filter(physlm == 0 | physlm == 1 | black == 0 | black == 1)
length(which(rh$cost ==0))/nrow(rh) # 18.1 % zeros
############## tobit
library(VGAM)
tob <- vglm(cost ~ ., tobit(Lower = 0, type.fitted = "censored"), data = rh, maxit=100)
summary(tob)
AICc(tob)
############## tweedie
library(cplm)
tw2 <- cpglm(cost ~ ., data=rh)
summary(tw2)
(AIC(tw2) - 2*length(coef(tw2)) ) / -2
############## gamma twopart
rh$zero <- ifelse(rh$cost == 0, 0, 1) # 0 if zero costs
h1 <- glm(zero ~ . -cost, family=binomial(link=logit), data=rh)
summary(h1)
logLik(h1)
h2 <- glm(cost ~ . -zero, family = Gamma(link = log), data=subset(rh, zero == 1))
summary(h2)
logLik(h2)
AIC(h1) + AIC(h2)
#######################
# rmse estimation
# split into train and test
set.seed(41)
idx <- sample(1:nrow(rh), 500, replace=F)
train.x <- rh[-idx, ]
test.x <- rh[idx,]
### tobit
tob.rmse <- vglm(cost ~ . -zero, tobit(Lower = 0, type.fitted = c("censored")), data = train.x, maxit=100)
summary(tob.rmse)
preds <- predict(tob.rmse, test.x, type="response")
#rmse
sqrt(mean((preds - test.x$cost)^2))
### gamma twopart
h1.rmse <- glm(zero ~ . -cost, family=binomial(link=logit), data=train.x)
pred2 <- predict(h1.rmse, test.x, type="response")
#pred2 <- ifelse(pred2 < 0.5, 0, 1)
h2.rmse <- glm(cost ~ . -zero, family = Gamma(link = log), data=subset(train.x, zero == 1))
# new
preds2 <- pred2 * predict(h2.rmse, test.x, type="response")
#rmse
sqrt(mean((preds2 - test.x$cost)^2))
### tweedie
train.x$zero <- NULL
tw.rmse <- cpglm(cost ~., data=train.x)
preds3 <- predict(tw.rmse, test.x, type="response")
# rmse
sqrt(mean((preds3 - test.x$cost)^2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.