Skip to content

Instantly share code, notes, and snippets.

@jhofman
Last active August 29, 2015 14:22
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save jhofman/9b0a8bb88fb2c99b0f75 to your computer and use it in GitHub Desktop.
Save jhofman/9b0a8bb88fb2c99b0f75 to your computer and use it in GitHub Desktop.
You Draw It
#!/bin/bash
#
# Scrape income distribution data from whatsmypercent.com
#
# Output is in incomes.csv (percentile,income)
#
# Each iteration asks the site where the next percentile begins, then looks up
# the percentile reported for that income, walking upward from the start value.

base_url='http://whatsmypercent.com/incomeRank.php'

# start at $100 / year
income=100

# loop over all 100 percentiles; rows whose percentile is 0 are filtered out
# and everything else is written to incomes.csv
for _ in {1..100}
do
    # grab the bottom of the next percentile
    # (use --silent: "-silent" is read by curl as the bundled short options
    # -s -i -l -e nt, which only silences output by accident)
    income=$(curl --silent "${base_url}?income=${income}&status=All+Filers" \
        | grep 'The next percentile begins at:' \
        | awk -F"[<>]" '{print $9}')

    # strip the dollar sign, every thousands separator, and stray whitespace
    income=${income/\$/}
    income=${income//,/}
    income=${income//[[:space:]]/}

    # stop instead of querying the site with an empty income
    if [ -z "$income" ]; then
        echo "no next-percentile value found; stopping" >&2
        break
    fi

    # grab the percentile
    percentile=$(curl --silent "${base_url}?income=${income}&status=All+Filers" \
        | grep 'Your percentile is:' \
        | awk -F"[<>]" '{print $9}')
    percentile=${percentile/\%/}
    percentile=${percentile//[[:space:]]/}

    echo "${percentile},${income}"
done | grep -v '^0,' > incomes.csv
#
# Compare various plots of child college attendance by parent income
#
# Inspired by the interactive NYT piece "You Draw It" at http://www.nytimes.com/interactive/2015/05/28/upshot/you-draw-it-how-family-income-affects-childrens-college-chances.html
#
library(dplyr)
library(ggplot2)
library(scales)
# income distribution data (2010) from scrape_income_dist.sh:
# one row per percentile (2 through 99) with the income in dollars
incomes <- data.frame(
  percentile = 2:99,
  dollars = c(
    2451L, 4134L, 5184L,
    6028L, 6922L, 7626L, 8226L, 8764L, 9235L, 9832L, 10482L, 11366L,
    12207L, 12999L, 13732L, 14447L, 15064L, 15736L, 16358L, 16992L,
    17659L, 18204L, 18768L, 19375L, 19964L, 20860L, 22013L, 23034L,
    23873L, 24675L, 25505L, 26311L, 27033L, 27811L, 28560L, 29306L,
    29999L, 30999L, 32188L, 33281L, 34272L, 35295L, 36253L, 37194L,
    38051L, 39064L, 39953L, 41113L, 42327L, 43564L, 44769L, 45871L,
    46956L, 48095L, 49225L, 50353L, 51922L, 54282L, 57213L, 59670L,
    61654L, 63469L, 65192L, 66639L, 68140L, 69658L, 71150L, 72539L,
    73866L, 75296L, 77160L, 79838L, 83011L, 85811L, 88317L, 90794L,
    93165L, 95174L, 97298L, 99424L, 102060L, 106770L, 117025L, 125260L,
    131032L, 136231L, 141453L, 147725L, 154131L, 160864L, 168227L,
    177123L, 187412L, 200026L, 235687L, 290860L, 360435L, 506553L
  )
)

# add the percent of children who attend college at each percentile,
# modelled as a straight line in the parent income percentile
# (slope and intercept guesstimated from Chetty et al.)
incomes$college <- 2 / 3 * incomes$percentile + 27
# plot college attendance vs parent income percentile
# (scatter plot built with ggplot() + geom_point(); qplot() is deprecated)
ggplot(incomes, aes(x = percentile, y = college)) +
  geom_point() +
  xlab("Parent income percentile") +
  ylab("Percent of children who attend college") +
  ylim(c(0, 100))
# ggsave() writes the most recently created plot
ggsave("percentile_college.png", width = 4, height = 4)
# plot college attendance vs parent income (linear dollar axis)
# (scatter plot built with ggplot() + geom_point(); qplot() is deprecated)
ggplot(incomes, aes(x = dollars, y = college)) +
  geom_point() +
  xlab("Parent income") +
  ylab("Percent of children who attend college") +
  ylim(c(0, 100)) +
  scale_x_continuous(labels = comma)
# ggsave() writes the most recently created plot
ggsave("dollars_college.png", width = 4, height = 4)
# plot college attendance vs parent income, with log scale on the dollar axis
# (scatter plot built with ggplot() + geom_point(); qplot() is deprecated)
ggplot(incomes, aes(x = dollars, y = college)) +
  geom_point() +
  xlab("Parent income") +
  ylab("Percent of children who attend college") +
  ylim(c(0, 100)) +
  scale_x_log10(labels = comma)
# ggsave() writes the most recently created plot
ggsave("dollars_college_log10.png", width = 4, height = 4)
# plot college attendance vs parent income, showing population at each income
# Percentiles are grouped into bins of income rounded to the nearest $10,000;
# each point's size is the number of percentiles in its bin and its height is
# the mean college rate of those percentiles.
# (uses dplyr verbs with the %>% pipe; ggplot() + geom_point() replaces the
# deprecated qplot())
incomes %>%
  mutate(dollars_bin = round(dollars / 10000) * 10000) %>%
  group_by(dollars_bin) %>%
  summarize(size = n(), college = mean(college)) %>%
  ggplot(aes(x = dollars_bin, y = college, size = size)) +
  geom_point() +
  xlab("Parent income") +
  ylab("Percent of children who attend college") +
  ylim(c(0, 100)) +
  theme(legend.position = "none") +
  scale_x_continuous(labels = comma)
# ggsave() writes the most recently created plot
ggsave("dollars_college_sized.png", width = 4, height = 4)
# plot college attendance vs parent income, with log scale showing population
# at each income
# Percentiles are grouped into bins of income rounded to the nearest $10,000;
# each point's size is the number of percentiles in its bin and its height is
# the mean college rate of those percentiles.
# (uses dplyr verbs with the %>% pipe; ggplot() + geom_point() replaces the
# deprecated qplot())
incomes %>%
  mutate(dollars_bin = round(dollars / 10000) * 10000) %>%
  group_by(dollars_bin) %>%
  summarize(size = n(), college = mean(college)) %>%
  ggplot(aes(x = dollars_bin, y = college, size = size)) +
  geom_point() +
  xlab("Parent income") +
  ylab("Percent of children who attend college") +
  ylim(c(0, 100)) +
  theme(legend.position = "none") +
  scale_x_log10(labels = comma)
# ggsave() writes the most recently created plot
ggsave("dollars_college_log10_sized.png", width = 4, height = 4)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment