andrewbtran/top_1_percent

## top_1_percent
# File # 1
# https://drive.google.com/open?id=1Lhz23JP4gRW4p_01D7OzaL60CT1wfem-

# File # 2
# https://drive.google.com/open?id=1XOWD8COTdDx30HNLZgrp57WzWUGOtaKA

# NOTE: To load data, you must download both the extract's data and the DDI
# and also set the working directory to the folder with these files (or change the path below).

#install.packages("tidyverse")
library(tidyverse)

install.packages("ipumsr")
library(ipumsr)

# Put the two files (usa_00008.dat.gz and usa_00008.xml) into your working directory

ddi <- read_ipums_ddi("usa_00008.xml")
data <- read_ipums_micro(ddi)

install.packages("Hmisc")
library(Hmisc)

View(data)

data %>%
  select_if(is.labelled)

ipums_val_labels(data$STATEFIP)

# Convert the labels to factors (and drop the unused levels)
data <- data %>%
  mutate(STATE_factor = as_factor(lbl_clean(STATEFIP)),
         COUNTY_factor = as_factor(lbl_clean(COUNTY)),
         COUNTYFIPS_factor = as_factor(lbl_clean(GQ)),
         HHINCOME_factor = as_factor(lbl_clean(HHINCOME)))


md <- filter(data, STATE_factor=="Maryland")

View(md)

mont <- filter(data, COUNTYFIPS==31)


View(mont)

# wrong


wtd.quantile(mont$HHINCOME, mont$HHWT, probs=c(.99))

range(mont$HHINCOME)

# let's fix

mont2 <- mont %>%  mutate(
  HHINCOME = as.numeric(as.character(HHINCOME)),
  HHINCOME=case_when(
    HHINCOME==9999999 ~ 0,
    TRUE ~ HHINCOME
))

# What's the range now?

range(mont2$HHINCOME)

wtd.quantile(mont2$HHINCOME, mont2$HHWT, probs=c(.99))

# Now let's see where the county ranks in the country

ranked <- filter(data, HHINCOME!=9999999) %>%
  group_by(STATE_factor) %>%
  summarize(one_percent=wtd.quantile(HHINCOME, HHWT, probs=c(.99))) %>%
  mutate(rank=rank(desc(one_percent)))

View(ranked)

# Prep the data

data <- mutate(data,
               statef=as.character(STATEFIP),
               countyf=as.character(COUNTYFIPS),
               statef=case_when(
                 nchar(statef)==1 ~ paste0("0", statef),
                 TRUE ~ statef),
               countyf=case_when(
                 nchar(countyf)==1 ~ paste0("00", countyf),
                 nchar(countyf)==2 ~ paste0("0", countyf),
                 TRUE ~ countyf),
               fips=paste0(statef, countyf))

# Rank by all available counties

# PERNUM is persons per household.
# We want this to be 1 so it's not double counting
# https://usa.ipums.org/usa-action/variables/PERNUM#description_section [usa.ipums.org]
# and GQ stands for Group Housing status
# https://usa.ipums.org/usa-action/variables/PERNUM#description_section [usa.ipums.org]
# Everything higher than 3 is institutions and dorms or group homes, etc

ranked <- filter(data, HHINCOME!=9999999) %>%
  filter(PERNUM==1) %>%
  filter(GQ<3) %>%
  group_by(STATE_factor, fips) %>%
  summarize(one_percent=wtd.quantile(HHINCOME, HHWT, probs=c(.99)),

            count=n()) %>%
  ungroup() %>%
  mutate(rank=rank(-one_percent))
	# File # 1
	# https://drive.google.com/open?id=1Lhz23JP4gRW4p_01D7OzaL60CT1wfem-

	# File # 2
	# https://drive.google.com/open?id=1XOWD8COTdDx30HNLZgrp57WzWUGOtaKA

	# NOTE: To load data, you must download both the extract's data and the DDI
	# and also set the working directory to the folder with these files (or change the path below).

	#install.packages("tidyverse")
	library(tidyverse)

	install.packages("ipumsr")
	library(ipumsr)

	# Put the two files (usa_00008.dat.gz and usa_00008.xml) into your working directory

	ddi <- read_ipums_ddi("usa_00008.xml")
	data <- read_ipums_micro(ddi)

	install.packages("Hmisc")
	library(Hmisc)

	View(data)

	data %>%
	select_if(is.labelled)

	ipums_val_labels(data$STATEFIP)

	# Convert the labels to factors (and drop the unused levels)
	data <- data %>%
	mutate(STATE_factor = as_factor(lbl_clean(STATEFIP)),
	COUNTY_factor = as_factor(lbl_clean(COUNTY)),
	COUNTYFIPS_factor = as_factor(lbl_clean(GQ)),
	HHINCOME_factor = as_factor(lbl_clean(HHINCOME)))



	md <- filter(data, STATE_factor=="Maryland")

	View(md)

	mont <- filter(data, COUNTYFIPS==31)


	View(mont)

	# wrong


	wtd.quantile(mont$HHINCOME, mont$HHWT, probs=c(.99))

	range(mont$HHINCOME)

	# let's fix

	mont2 <- mont %>% mutate(
	HHINCOME = as.numeric(as.character(HHINCOME)),
	HHINCOME=case_when(
	HHINCOME==9999999 ~ 0,
	TRUE ~ HHINCOME
	))

	# What's the range now?

	range(mont2$HHINCOME)

	wtd.quantile(mont2$HHINCOME, mont2$HHWT, probs=c(.99))

	# Now let's see where the county ranks in the country

	ranked <- filter(data, HHINCOME!=9999999) %>%
	group_by(STATE_factor) %>%
	summarize(one_percent=wtd.quantile(HHINCOME, HHWT, probs=c(.99))) %>%
	mutate(rank=rank(desc(one_percent)))

	View(ranked)

	# Prep the data

	data <- mutate(data,
	statef=as.character(STATEFIP),
	countyf=as.character(COUNTYFIPS),
	statef=case_when(
	nchar(statef)==1 ~ paste0("0", statef),
	TRUE ~ statef),
	countyf=case_when(
	nchar(countyf)==1 ~ paste0("00", countyf),
	nchar(countyf)==2 ~ paste0("0", countyf),
	TRUE ~ countyf),
	fips=paste0(statef, countyf))

	# Rank by all available counties

	# PERNUM is persons per household.
	# We want this to be 1 so it's not double counting
	# https://usa.ipums.org/usa-action/variables/PERNUM#description_section [usa.ipums.org]
	# and GQ stands for Group Housing status
	# https://usa.ipums.org/usa-action/variables/PERNUM#description_section [usa.ipums.org]
	# Everything higher than 3 is institutions and dorms or group homes, etc

	ranked <- filter(data, HHINCOME!=9999999) %>%
	filter(PERNUM==1) %>%
	filter(GQ<3) %>%
	group_by(STATE_factor, fips) %>%
	summarize(one_percent=wtd.quantile(HHINCOME, HHWT, probs=c(.99)),

	count=n()) %>%
	ungroup() %>%
	mutate(rank=rank(-one_percent))