Last active
August 29, 2015 14:26
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#population density | |
#Dan Goldstein 2015 | |
#LIBS | |
library("rvest") | |
library("ggplot2") | |
library("dplyr") | |
library("animation") | |
### HARDCODES
# Side length, in meters, of the square patch of land drawn in each frame.
EDGE_M <- 1000
# Dot size handed to geom_point(): small dots for the 1000 m patch,
# larger dots for any other patch size.
PS <- if (EDGE_M == 1000) 0.2 else 2
# How many of the top-ranked counties get animated and charted.
CUTOFF <- 20
# PULL DOWN DATA IF YOU HAVE TO
# The scraped table is cached as a CSV. One path is used to test for, write,
# and read the cache, so the file that is checked is always the file that is
# loaded (the read used to point at a machine-specific absolute path).
counties_csv <- "counties.csv"
if (!file.exists(counties_csv)) {
  # H/T http://blog.corynissen.com/2015/01/using-rvest-to-scrape-html-table.html
  counties <-
    "https://en.wikipedia.org/wiki/List_of_the_most_populous_counties_in_the_United_States" %>%
    read_html() %>% # read_html() replaces rvest's deprecated html()
    html_nodes(xpath = '//*[@id="mw-content-text"]/table[2]') %>%
    html_table()
  # html_table() on a node set returns a list; keep the single matched table
  counties <- counties[[1]]
  # First 100 rows, and only the columns named below
  df <- counties[1:100, c(2:5, 12:14, 17)]
  names(df) <- c(
    "county",
    "state",
    "sq_km",
    "sq_mi",
    "pop_2014",
    "density_km",
    "density_mi",
    "city"
  )
  # Strip the comma separators from columns 3-7 and convert them to numbers
  cols <- 3:7
  df[, cols] <- lapply(df[, cols], function(x) {
    as.numeric(gsub(",", "", x))
  })
  # Was a little bug in the wikipedia today
  df[5, 7] <- 776
  write.csv(df, counties_csv, row.names = FALSE)
} else {
  df <- read.csv(counties_csv, stringsAsFactors = FALSE)
}
# Help people with some county names: show "Manhattan" as the city for
# New York County.
is_new_york_county <- df$county == "New York County"
df[is_new_york_county, "city"] <- "Manhattan"
# Order the counties from highest to lowest density per square mile
df <- arrange(df, desc(density_mi))
# Row coordinate of a cell id on a square grid.
#   index: cell id(s); the integer quotient by the edge length is returned
#   edge:  number of cells per grid side; defaults to the global EDGE_M so
#          existing one-argument calls behave exactly as before
# Returns index %/% edge, vectorized over index.
get_row <- function(index, edge = EDGE_M) {
  index %/% edge
}
# Column coordinate of a cell id on a square grid.
#   index: cell id(s); the remainder after division by the edge length is
#          returned
#   edge:  number of cells per grid side; defaults to the global EDGE_M so
#          existing one-argument calls behave exactly as before
# Returns index %% edge, vectorized over index.
get_column <- function(index, edge = EDGE_M) {
  index %% edge
}
# Do the densest county (row 1 of df) and progressively downsample from it.
# density_km is people per square kilometer; dividing by the number of
# EDGE_M x EDGE_M meter patches in a square kilometer converts it to people
# per patch (hectare, etc., depending on EDGE_M).
patches_per_km2 <- 1000^2 / EDGE_M^2
gpeople_to_draw <- round(df[1, "density_km"] / patches_per_km2)
# Pick that many distinct cells from the EDGE_M x EDGE_M grid, ids 0..n-1
n_cells <- EDGE_M * EDGE_M
people_ids <- sample(0:(n_cells - 1), gpeople_to_draw, replace = FALSE)
# Turn each cell id into plot coordinates
pdf <- data.frame(
  x = get_row(people_ids),
  y = get_column(people_ids)
)
# Graph one county
# Builds a dot plot for the county stored in row `row` of `df`: one dot per
# person on an EDGE_M x EDGE_M meter patch. The dots are the first
# `people_to_draw` rows of the pre-sampled `pdf`, so every county reuses a
# prefix of the same sample.
#   row: row index into df
# Returns the ggplot object; the caller is responsible for printing it.
draw_a_row <- function(row) {
  # Convert people per square km to people per patch (hectare, etc.)
  people_to_draw <- round(df[row, "density_km"] / (1000^2 / EDGE_M^2))
  # EDGE_M is a side length, so the patch is reported as "E m x E m"
  # rather than as "E square meters", which understated the area.
  title <- sprintf(
    "#%d: %s, %s\nHome of: %s\n%d people per %d m x %d m square",
    row,
    df[row, "county"],
    df[row, "state"],
    df[row, "city"],
    people_to_draw,
    EDGE_M,
    EDGE_M
  )
  # head() yields zero rows for a count of 0 (1:0 would select row 1) and
  # never pads with NA rows when the count exceeds the sample size.
  spdf <- head(pdf, people_to_draw)
  p <- ggplot(spdf, aes(x = x, y = y)) +
    geom_point(size = PS) +
    xlim(c(0, EDGE_M)) +
    ylim(c(0, EDGE_M)) +
    theme(
      aspect.ratio = 1,
      plot.title = element_text(size = rel(1))
    ) +
    labs(x = NULL, y = NULL, title = title)
  # Optional per-county PNG export (disabled):
  # filen <- sprintf("figures/%03d_%s_%s.png",
  #                  row, df[row, "county"], df[row, "state"])
  # ggsave(filename = filen, plot = p, width = 4, height = 4, dpi = 250)
  p
}
# Animate thanks to Yihui (animation package)
# Only point the animation package at this ImageMagick binary when it really
# exists on this machine; otherwise leave the `convert` option alone so the
# package can look for ImageMagick itself.
convert_exe <- "C:\\ImageMagick\\convert.exe"
if (file.exists(convert_exe)) {
  ani.options(convert = convert_exe)
}
# One second per frame
ani.options(interval = 1)
saveGIF({
  # One frame per county, never running past the end of df
  for (i in seq_len(min(CUTOFF, nrow(df)))) {
    print(draw_a_row(i))
    ani.pause()
  }
},
movie.name = paste(EDGE_M, "pop_dens.gif", sep = "_"),
img.name = "onehui",
clean = FALSE
)
# Create the leading graph
# Plot the first CUTOFF counties of df (fewer if df is shorter), in their
# current order, against people per square mile.
top_counties <- head(df, CUTOFF)
top_counties$density_rank <- seq_len(nrow(top_counties))
p <- ggplot(
  top_counties,
  aes(x = density_rank, y = density_mi, color = state, group = 1)
) +
  geom_point(size = 4) +
  # The rank is numeric, so label it through a continuous scale with one
  # break per county; a discrete scale shows no labels for numeric data.
  scale_x_continuous(
    breaks = top_counties$density_rank,
    labels = top_counties$county,
    minor_breaks = NULL
  ) +
  scale_y_continuous(breaks = seq(10000, 80000, by = 10000)) +
  theme(
    plot.title = element_text(size = rel(1)),
    legend.position = "none",
    axis.text.x = element_text(angle = 90, vjust = 0.5)
  ) +
  labs(x = "Counties Ranked by Density", y = "People Per Square Mile")
p
ggsave(
  filename = "pop_rank.png", plot = p, width = 6, height = 6
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment