arm5077/generate_insights.r

## readme.md

      
    Raw
  

              readme.md
            
          
    What this is

We're using MEPS data to describe the five percent of Americans responsible for fifty percent of the U.S.'s annual health care expenditures.
The specific questions we're trying to answer


How much was spent on health care in 2014 in total?
What was the median amount spent on health care in 2014?
How does the total amount spent on health care vary for each decile of the population? (OK, we iterate by 5%, but I can't find the word for that.)
How much do you have to spend to enter the "5 percent"?
What's the average amount a person in the 5 percent spends?
How does the 5 percent break down in terms of:
a. gender?
b. insurance coverage
c. poverty status?
d. race?
e. self-reported health status?
f. type of primary care (ER v. doctors office)
g. age?


## generate_insights.r
library(foreign)
library(survey)

# Make folder to contain all the output CSVs
dir.create('output', showWarnings = FALSE)

# Download file and unzip into data frame
download.file("https://meps.ahrq.gov/mepsweb/data_files/pufs/h171ssp.zip",temp <- tempfile())
data = read.xport(unzip(temp))

# Add broad age categorization (1 is 17 and under, 2 is 17-64, 3 is 65+)
data$agecategories[data$AGELAST <= 17] = 1
data$agecategories[data$AGELAST >= 18 & data$AGELAST < 65] = 2
data$agecategories[data$AGELAST >= 65] = 3

# Build survey object using person weights
full_population_survey = svydesign(
  ids=~VARPSU,
  strata=~VARSTR,
  data=data,
  weights=~PERWT14F,
  nest=T
)

# Answer 1: Get total amount spent on healthcare
svytotal(~TOTEXP14, full_population_survey)

# Answer 2: Get median amount spent on healthcare
svyquantile(~TOTEXP14, full_population_survey, c(.5 ))

# Answer 3: Export a CSV that breaks out distributions of total cost, average cost and median cost in 5-percent blocks
# First, make blank distribution object
distributions <- data.frame(start=numeric(),
                            end=numeric(),
                            total_spent=numeric(),
                            median_spent=numeric(),
                            average_spent=numeric())
for(block in 0:95 ){
  # Only continue if this percentile is divisible by 5
  if(block %% 5 != 0)
    next()
  block = block / 100
  # Get the low expenditure cutoff for this 5-percentile
  block_cutoff_low = svyquantile(~TOTEXP14, full_population_survey, c(block))[1]
  # Get the high expenditure cutoff for this 5-percentile (aka, 5 percentage points higher)
  block_cutoff_high = svyquantile(~TOTEXP14, full_population_survey, c(block + .05 ))[1]
  # Filter down to this 5-percentile block
  blocksegment <- subset(full_population_survey, TOTEXP14 >= block_cutoff_low & TOTEXP14 <= block_cutoff_high )
  # Add characteristics of this block to  the distribution object
  distributions = rbind(distributions, data.frame(
    start = block,
    end = block + .05,
    total_spent = svytotal(~TOTEXP14, blocksegment)[1],
    median_spent = svyquantile(~TOTEXP14, blocksegment, c(.05))[1],
    average_spent = svymean(~TOTEXP14, blocksegment)[1]
  ))
  write.table(distributions, "output/histogram-distribution.csv", sep=',', row.names=F)
}


# Build a counter variable
full_population_survey <- update(full_population_survey, one=1)

# Answer 4: Find cutoff for top 5 percent of spenders
top_five_percentile_cutoff = svyquantile(~TOTEXP14, full_population_survey, c(.95 ))[1]

# Get subset of survey that's just the top 5 percent of spenders
fivepercent <- subset(full_population_survey, TOTEXP14 >= top_five_percentile_cutoff)

# Answer 5: Get mean expenditures for top five percent
svymean(~TOTEXP14, fivepercent)

# Answer 6: I want to slice and dice the 5 percent popuation and see what it looks like demographically
# This outputs a set of CSVs with categories, percentage of the whole and margin of error
for( property in c('SEX',
                   'INSCOV14',
                   'POVCAT14',
                   'RACETHX',
                   'RTHLTH53',
                   'LOCATN42',
                   'agecategories')){
  name <- property
  property <- make.formula(property)

  # Get weighted totals
  totals = svyby(~one, property, fivepercent, svytotal, vartype="ci")

  # Get percentage of the whole
  final <- data.frame("category" <- totals[name],
                      "percent" = (totals$one / sum(totals$one) * 100),
                      "MoE"=(totals$ci_u - totals$one) / sum(totals$one) * 100)

  write.table(final, paste('output/', name, '.csv', sep=""), sep=',', row.names=F)
}
	library(foreign)
	library(survey)

	# Make folder to contain all the output CSVs
	dir.create('output', showWarnings = FALSE)

	# Download file and unzip into data frame
	download.file("https://meps.ahrq.gov/mepsweb/data_files/pufs/h171ssp.zip",temp <- tempfile())
	data = read.xport(unzip(temp))

	# Add broad age categorization (1 is 17 and under, 2 is 17-64, 3 is 65+)
	data$agecategories[data$AGELAST <= 17] = 1
	data$agecategories[data$AGELAST >= 18 & data$AGELAST < 65] = 2
	data$agecategories[data$AGELAST >= 65] = 3

	# Build survey object using person weights
	full_population_survey = svydesign(
	ids=~VARPSU,
	strata=~VARSTR,
	data=data,
	weights=~PERWT14F,
	nest=T
	)

	# Answer 1: Get total amount spent on healthcare
	svytotal(~TOTEXP14, full_population_survey)

	# Answer 2: Get median amount spent on healthcare
	svyquantile(~TOTEXP14, full_population_survey, c(.5 ))

	# Answer 3: Export a CSV that breaks out distributions of total cost, average cost and median cost in 5-percent blocks
	# First, make blank distribution object
	distributions <- data.frame(start=numeric(),
	end=numeric(),
	total_spent=numeric(),
	median_spent=numeric(),
	average_spent=numeric())
	for(block in 0:95 ){
	# Only continue if this percentile is divisible by 5
	if(block %% 5 != 0)
	next()
	block = block / 100
	# Get the low expenditure cutoff for this 5-percentile
	block_cutoff_low = svyquantile(~TOTEXP14, full_population_survey, c(block))[1]
	# Get the high expenditure cutoff for this 5-percentile (aka, 5 percentage points higher)
	block_cutoff_high = svyquantile(~TOTEXP14, full_population_survey, c(block + .05 ))[1]
	# Filter down to this 5-percentile block
	blocksegment <- subset(full_population_survey, TOTEXP14 >= block_cutoff_low & TOTEXP14 <= block_cutoff_high )
	# Add characteristics of this block to the distribution object
	distributions = rbind(distributions, data.frame(
	start = block,
	end = block + .05,
	total_spent = svytotal(~TOTEXP14, blocksegment)[1],
	median_spent = svyquantile(~TOTEXP14, blocksegment, c(.05))[1],
	average_spent = svymean(~TOTEXP14, blocksegment)[1]
	))
	write.table(distributions, "output/histogram-distribution.csv", sep=',', row.names=F)
	}


	# Build a counter variable
	full_population_survey <- update(full_population_survey, one=1)

	# Answer 4: Find cutoff for top 5 percent of spenders
	top_five_percentile_cutoff = svyquantile(~TOTEXP14, full_population_survey, c(.95 ))[1]

	# Get subset of survey that's just the top 5 percent of spenders
	fivepercent <- subset(full_population_survey, TOTEXP14 >= top_five_percentile_cutoff)

	# Answer 5: Get mean expenditures for top five percent
	svymean(~TOTEXP14, fivepercent)

	# Answer 6: I want to slice and dice the 5 percent popuation and see what it looks like demographically
	# This outputs a set of CSVs with categories, percentage of the whole and margin of error
	for( property in c('SEX',
	'INSCOV14',
	'POVCAT14',
	'RACETHX',
	'RTHLTH53',
	'LOCATN42',
	'agecategories')){
	name <- property
	property <- make.formula(property)

	# Get weighted totals
	totals = svyby(~one, property, fivepercent, svytotal, vartype="ci")

	# Get percentage of the whole
	final <- data.frame("category" <- totals[name],
	"percent" = (totals$one / sum(totals$one) * 100),
	"MoE"=(totals$ci_u - totals$one) / sum(totals$one) * 100)

	write.table(final, paste('output/', name, '.csv', sep=""), sep=',', row.names=F)
	}