Skip to content

Instantly share code, notes, and snippets.

@andrewbtran
Created August 3, 2018 19:43
top 1 percent ipums
# File # 1
# https://drive.google.com/open?id=1Lhz23JP4gRW4p_01D7OzaL60CT1wfem-
# File # 2
# https://drive.google.com/open?id=1XOWD8COTdDx30HNLZgrp57WzWUGOtaKA
# NOTE: To load data, you must download both the extract's data and the DDI
# and also set the working directory to the folder with these files (or change the path below).
#install.packages("tidyverse")
library(tidyverse)
install.packages("ipumsr")
library(ipumsr)
# Put the two files (usa_00008.dat.gz and usa_00008.xml) into your working directory
ddi <- read_ipums_ddi("usa_00008.xml")
data <- read_ipums_micro(ddi)
install.packages("Hmisc")
library(Hmisc)
View(data)
data %>%
select_if(is.labelled)
ipums_val_labels(data$STATEFIP)
# Convert the labels to factors (and drop the unused levels)
data <- data %>%
mutate(STATE_factor = as_factor(lbl_clean(STATEFIP)),
COUNTY_factor = as_factor(lbl_clean(COUNTY)),
COUNTYFIPS_factor = as_factor(lbl_clean(GQ)),
HHINCOME_factor = as_factor(lbl_clean(HHINCOME)))
md <- filter(data, STATE_factor=="Maryland")
View(md)
mont <- filter(data, COUNTYFIPS==31)
View(mont)
# wrong
wtd.quantile(mont$HHINCOME, mont$HHWT, probs=c(.99))
range(mont$HHINCOME)
# let's fix
mont2 <- mont %>% mutate(
HHINCOME = as.numeric(as.character(HHINCOME)),
HHINCOME=case_when(
HHINCOME==9999999 ~ 0,
TRUE ~ HHINCOME
))
# What's the range now?
range(mont2$HHINCOME)
wtd.quantile(mont2$HHINCOME, mont2$HHWT, probs=c(.99))
# Now let's see where the county ranks in the country
ranked <- filter(data, HHINCOME!=9999999) %>%
group_by(STATE_factor) %>%
summarize(one_percent=wtd.quantile(HHINCOME, HHWT, probs=c(.99))) %>%
mutate(rank=rank(desc(one_percent)))
View(ranked)
# Prep the data
data <- mutate(data,
statef=as.character(STATEFIP),
countyf=as.character(COUNTYFIPS),
statef=case_when(
nchar(statef)==1 ~ paste0("0", statef),
TRUE ~ statef),
countyf=case_when(
nchar(countyf)==1 ~ paste0("00", countyf),
nchar(countyf)==2 ~ paste0("0", countyf),
TRUE ~ countyf),
fips=paste0(statef, countyf))
# Rank by all available counties
# PERNUM is persons per household.
# We want this to be 1 so it's not double counting
# https://usa.ipums.org/usa-action/variables/PERNUM#description_section [usa.ipums.org]
# and GQ stands for Group Housing status
# https://usa.ipums.org/usa-action/variables/PERNUM#description_section [usa.ipums.org]
# Everything higher than 3 is institutions and dorms or group homes, etc
ranked <- filter(data, HHINCOME!=9999999) %>%
filter(PERNUM==1) %>%
filter(GQ<3) %>%
group_by(STATE_factor, fips) %>%
summarize(one_percent=wtd.quantile(HHINCOME, HHWT, probs=c(.99)),
count=n()) %>%
ungroup() %>%
mutate(rank=rank(-one_percent))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment