|
library(foreign) |
|
library(survey) |
|
|
|
# Make folder to contain all the output CSVs |
|
dir.create('output', showWarnings = FALSE) |
|
|
|
# Download file and unzip into data frame |
|
download.file("https://meps.ahrq.gov/mepsweb/data_files/pufs/h171ssp.zip",temp <- tempfile()) |
|
data = read.xport(unzip(temp)) |
|
|
|
# Add broad age categorization (1 is 17 and under, 2 is 17-64, 3 is 65+) |
|
data$agecategories[data$AGELAST <= 17] = 1 |
|
data$agecategories[data$AGELAST >= 18 & data$AGELAST < 65] = 2 |
|
data$agecategories[data$AGELAST >= 65] = 3 |
|
|
|
# Build survey object using person weights |
|
full_population_survey = svydesign( |
|
ids=~VARPSU, |
|
strata=~VARSTR, |
|
data=data, |
|
weights=~PERWT14F, |
|
nest=T |
|
) |
|
|
|
# Answer 1: Get total amount spent on healthcare |
|
svytotal(~TOTEXP14, full_population_survey) |
|
|
|
# Answer 2: Get median amount spent on healthcare |
|
svyquantile(~TOTEXP14, full_population_survey, c(.5 )) |
|
|
|
# Answer 3: Export a CSV that breaks out distributions of total cost, average cost and median cost in 5-percent blocks |
|
# First, make blank distribution object |
|
distributions <- data.frame(start=numeric(), |
|
end=numeric(), |
|
total_spent=numeric(), |
|
median_spent=numeric(), |
|
average_spent=numeric()) |
|
for(block in 0:95 ){ |
|
# Only continue if this percentile is divisible by 5 |
|
if(block %% 5 != 0) |
|
next() |
|
block = block / 100 |
|
# Get the low expenditure cutoff for this 5-percentile |
|
block_cutoff_low = svyquantile(~TOTEXP14, full_population_survey, c(block))[1] |
|
# Get the high expenditure cutoff for this 5-percentile (aka, 5 percentage points higher) |
|
block_cutoff_high = svyquantile(~TOTEXP14, full_population_survey, c(block + .05 ))[1] |
|
# Filter down to this 5-percentile block |
|
blocksegment <- subset(full_population_survey, TOTEXP14 >= block_cutoff_low & TOTEXP14 <= block_cutoff_high ) |
|
# Add characteristics of this block to the distribution object |
|
distributions = rbind(distributions, data.frame( |
|
start = block, |
|
end = block + .05, |
|
total_spent = svytotal(~TOTEXP14, blocksegment)[1], |
|
median_spent = svyquantile(~TOTEXP14, blocksegment, c(.05))[1], |
|
average_spent = svymean(~TOTEXP14, blocksegment)[1] |
|
)) |
|
write.table(distributions, "output/histogram-distribution.csv", sep=',', row.names=F) |
|
} |
|
|
|
|
|
# Build a counter variable |
|
full_population_survey <- update(full_population_survey, one=1) |
|
|
|
# Answer 4: Find cutoff for top 5 percent of spenders |
|
top_five_percentile_cutoff = svyquantile(~TOTEXP14, full_population_survey, c(.95 ))[1] |
|
|
|
# Get subset of survey that's just the top 5 percent of spenders |
|
fivepercent <- subset(full_population_survey, TOTEXP14 >= top_five_percentile_cutoff) |
|
|
|
# Answer 5: Get mean expenditures for top five percent |
|
svymean(~TOTEXP14, fivepercent) |
|
|
|
# Answer 6: I want to slice and dice the 5 percent popuation and see what it looks like demographically |
|
# This outputs a set of CSVs with categories, percentage of the whole and margin of error |
|
for( property in c('SEX', |
|
'INSCOV14', |
|
'POVCAT14', |
|
'RACETHX', |
|
'RTHLTH53', |
|
'LOCATN42', |
|
'agecategories')){ |
|
name <- property |
|
property <- make.formula(property) |
|
|
|
# Get weighted totals |
|
totals = svyby(~one, property, fivepercent, svytotal, vartype="ci") |
|
|
|
# Get percentage of the whole |
|
final <- data.frame("category" <- totals[name], |
|
"percent" = (totals$one / sum(totals$one) * 100), |
|
"MoE"=(totals$ci_u - totals$one) / sum(totals$one) * 100) |
|
|
|
write.table(final, paste('output/', name, '.csv', sep=""), sep=',', row.names=F) |
|
} |