# dggoldst/populationdensity.R

## populationdensity.R
#population density
#Dan Goldstein 2015

#LIBS
library("rvest")
library("ggplot2")
library("dplyr")
library("animation")

### HARDCODES
# Side length, in meters, of the square patch that gets plotted.
EDGE_M <- 1000
# Point size handed to geom_point(): small dots for the 1000 m patch,
# larger ones for any other edge length.
PS <- if (EDGE_M == 1000) 0.2 else 2
# How many of the top-ranked counties are animated and charted.
CUTOFF <- 20

#PULL DOWN DATA IF YOU HAVE TO
# The scraped table is cached as counties.csv in the working directory. The
# same path is used for the existence check, the write and the read, so a
# cache written by one run is the file that the next run loads.
COUNTIES_FILE <- "counties.csv"
if (!file.exists(COUNTIES_FILE)) {
  #H/T http://blog.corynissen.com/2015/01/using-rvest-to-scrape-html-table.html
  # read_html() replaces the deprecated rvest::html() reader.
  counties <-
    "https://en.wikipedia.org/wiki/List_of_the_most_populous_counties_in_the_United_States" %>%
    read_html() %>%
    html_nodes(xpath = '//*[@id="mw-content-text"]/table[2]') %>%
    html_table()
  if (length(counties) == 0) {
    stop("No table matched the XPath on the Wikipedia page", call. = FALSE)
  }
  counties <- counties[[1]]

  # Keep the first 100 rows and the eight columns named below.
  df <- counties[1:100, c(2:5, 12:14, 17)]

  names(df) <- c("county",
                 "state",
                 "sq_km",
                 "sq_mi",
                 "pop_2014",
                 "density_km",
                 "density_mi",
                 "city")

  # Columns 3 to 7 arrive as text with thousands separators; strip the
  # commas and convert to numeric.
  cols <- 3:7
  df[, cols] <- lapply(df[, cols], function(x) {
    as.numeric(gsub(",", "", x))
  })

  #Was a little bug in the wikipedia today
  df[5, 7] <- 776
  write.csv(df, COUNTIES_FILE, row.names = FALSE)
} else {
  df <- read.csv(COUNTIES_FILE, stringsAsFactors = FALSE)
}

# Show the familiar name for New York County in the "Home of" line.
df[df[["county"]] == "New York County", "city"] <- "Manhattan"

# Order the table from the highest to the lowest density per square mile.
df <- arrange(df, -density_mi)

# Row of a cell id on the grid: the integer quotient of the id by the
# grid width EDGE_M. Works element-wise on a vector of ids.
get_row <- function(index) {
  grid_row <- index %/% EDGE_M
  grid_row
}

# Column of a cell id on the grid: the remainder of the id divided by the
# grid width EDGE_M. Works element-wise on a vector of ids.
get_column <- function(index) {
  grid_column <- index %% EDGE_M
  grid_column
}

# Scatter the residents of the top-ranked county (row 1 of df) over distinct
# grid cells; counties further down the table are drawn from a prefix of
# these same points.
# Convert people per square km to people per EDGE_M x EDGE_M meter patch.
gpeople_to_draw <- round(df[1, "density_km"] / (1000 ^ 2 / EDGE_M ^ 2), 0)
# Zero-based cell ids drawn without replacement from the whole grid.
people_ids <- sample.int(EDGE_M * EDGE_M, gpeople_to_draw, replace = FALSE) - 1L
# Split each id into grid coordinates (quotient and remainder by EDGE_M).
pdf <- data.frame(x = people_ids %/% EDGE_M,
                  y = people_ids %% EDGE_M)

# Graph one county
# Builds the dot plot for the county in row `row` of the sorted table `df`:
# one point per person living on an EDGE_M x EDGE_M meter patch.
#   row: row index into df (the county's rank after sorting).
# Reads the globals df, pdf (the pre-sampled point positions), EDGE_M and PS.
# Returns the ggplot object; the caller is responsible for printing it.
draw_a_row <- function(row) {
  # Convert people per square km to people per EDGE_M x EDGE_M meter patch.
  people_to_draw <- round(df[row, "density_km"] / (1000 ^ 2 / EDGE_M ^ 2), 0)
  # The patch is EDGE_M meters on each side, so the title states both sides
  # rather than calling it "EDGE_M square meters".
  title <- sprintf(
    "#%d: %s, %s\nHome of: %s\n%d people per %d x %d meters",
    row,
    df[row, "county"],
    df[row, "state"],
    df[row, "city"],
    people_to_draw,
    EDGE_M,
    EDGE_M
  )
  # seq_len() selects no rows when the count rounds to 0 (1:0 would pick
  # row 1), and min() keeps the request within the sampled points so no
  # all-NA rows are produced.
  n_points <- min(people_to_draw, nrow(pdf))
  spdf <- pdf[seq_len(n_points), ]
  p <- ggplot(spdf, aes(x = x, y = y)) +
    geom_point(size = PS) +
    xlim(c(0, EDGE_M)) +
    ylim(c(0, EDGE_M)) +
    theme(aspect.ratio = 1) +
    theme(plot.title = element_text(size = rel(1))) +
    labs(x = NULL, y = NULL, title = title)
  # To save each frame as its own file:
  #filen = sprintf("figures/%03d_%s_%s.png", row, df[row,"county"], df[row,"state"])
  #ggsave(filename=filen,plot=p,width=4, height=4,dpi=250)
  p
}

#Animate thanks to Yuhui
# One second per frame.
ani.options(interval = 1)
# Use the local ImageMagick install only when it is actually there; on other
# machines the package's existing `convert` setting is left untouched.
IMAGEMAGICK_CONVERT <- "C:\\ImageMagick\\convert.exe"
if (file.exists(IMAGEMAGICK_CONVERT)) {
  ani.options(convert = IMAGEMAGICK_CONVERT)
}

# One frame per county, in rank order; seq_len() gives no frames (rather
# than rows 1 and 0) if CUTOFF is ever set to 0.
saveGIF(
  {
    for (i in seq_len(CUTOFF)) {
      print(draw_a_row(i))
      ani.pause()
    }
  },
  movie.name = paste(EDGE_M, "pop_dens.gif", sep = "_"),
  img.name = "onehui",
  clean = FALSE
)

#Create the leading graph
# Density per square mile of the first CUTOFF counties, in rank order.
# The rank is a numeric position, so it is placed on a continuous x scale
# with one labelled break per county; a discrete scale has no levels to
# attach the county names to when it is given numbers.
top_counties <- head(df, CUTOFF)
top_counties$density_rank <- seq_len(nrow(top_counties))
p <- ggplot(
  top_counties,
  aes(x = density_rank, y = density_mi, color = state, group = 1)
) +
  geom_point(size = 4) +
  scale_x_continuous(
    breaks = top_counties$density_rank,
    labels = top_counties$county,
    minor_breaks = NULL
  ) +
  scale_y_continuous(breaks = seq(10000, 80000, by = 10000)) +
  theme(
    plot.title = element_text(size = rel(1)),
    legend.position = "none",
    axis.text.x = element_text(angle = 90, vjust = 0.5)
  ) +
  labs(x = "Counties Ranked by Density", y = "People Per Square Mile")
p
ggsave(
  filename = "pop_rank.png", plot = p, width = 6, height = 6
)
	#population density
	#Dan Goldstein 2015

	#LIBS
	library("rvest")
	library("ggplot2")
	library("dplyr")
	library("animation")

	### HARDCODES
	# Side length, in meters, of the square patch that gets plotted.
	EDGE_M <- 1000
	# Point size handed to geom_point(): small dots for the 1000 m patch,
	# larger ones for any other edge length.
	PS <- if (EDGE_M == 1000) 0.2 else 2
	# How many of the top-ranked counties are animated and charted.
	CUTOFF <- 20

	#PULL DOWN DATA IF YOU HAVE TO
	# The scraped table is cached as counties.csv in the working directory. The
	# same path is used for the existence check, the write and the read, so a
	# cache written by one run is the file that the next run loads.
	COUNTIES_FILE <- "counties.csv"
	if (!file.exists(COUNTIES_FILE)) {
	  #H/T http://blog.corynissen.com/2015/01/using-rvest-to-scrape-html-table.html
	  # read_html() replaces the deprecated rvest::html() reader.
	  counties <-
	    "https://en.wikipedia.org/wiki/List_of_the_most_populous_counties_in_the_United_States" %>%
	    read_html() %>%
	    html_nodes(xpath = '//*[@id="mw-content-text"]/table[2]') %>%
	    html_table()
	  if (length(counties) == 0) {
	    stop("No table matched the XPath on the Wikipedia page", call. = FALSE)
	  }
	  counties <- counties[[1]]

	  # Keep the first 100 rows and the eight columns named below.
	  df <- counties[1:100, c(2:5, 12:14, 17)]

	  names(df) <- c("county",
	                 "state",
	                 "sq_km",
	                 "sq_mi",
	                 "pop_2014",
	                 "density_km",
	                 "density_mi",
	                 "city")

	  # Columns 3 to 7 arrive as text with thousands separators; strip the
	  # commas and convert to numeric.
	  cols <- 3:7
	  df[, cols] <- lapply(df[, cols], function(x) {
	    as.numeric(gsub(",", "", x))
	  })

	  #Was a little bug in the wikipedia today
	  df[5, 7] <- 776
	  write.csv(df, COUNTIES_FILE, row.names = FALSE)
	} else {
	  df <- read.csv(COUNTIES_FILE, stringsAsFactors = FALSE)
	}

	# Show the familiar name for New York County in the "Home of" line.
	df[df[["county"]] == "New York County", "city"] <- "Manhattan"

	# Order the table from the highest to the lowest density per square mile.
	df <- arrange(df, -density_mi)

	# Row of a cell id on the grid: the integer quotient of the id by the
	# grid width EDGE_M. Works element-wise on a vector of ids.
	get_row <- function(index) {
	  grid_row <- index %/% EDGE_M
	  grid_row
	}

	# Column of a cell id on the grid: the remainder of the id divided by the
	# grid width EDGE_M. Works element-wise on a vector of ids.
	get_column <- function(index) {
	  grid_column <- index %% EDGE_M
	  grid_column
	}

	# Scatter the residents of the top-ranked county (row 1 of df) over distinct
	# grid cells; counties further down the table are drawn from a prefix of
	# these same points.
	# Convert people per square km to people per EDGE_M x EDGE_M meter patch.
	gpeople_to_draw <- round(df[1, "density_km"] / (1000 ^ 2 / EDGE_M ^ 2), 0)
	# Zero-based cell ids drawn without replacement from the whole grid.
	people_ids <- sample.int(EDGE_M * EDGE_M, gpeople_to_draw, replace = FALSE) - 1L
	# Split each id into grid coordinates (quotient and remainder by EDGE_M).
	pdf <- data.frame(x = people_ids %/% EDGE_M,
	                  y = people_ids %% EDGE_M)

	# Graph one county
	# Builds the dot plot for the county in row `row` of the sorted table `df`:
	# one point per person living on an EDGE_M x EDGE_M meter patch.
	#   row: row index into df (the county's rank after sorting).
	# Reads the globals df, pdf (the pre-sampled point positions), EDGE_M and PS.
	# Returns the ggplot object; the caller is responsible for printing it.
	draw_a_row <- function(row) {
	  # Convert people per square km to people per EDGE_M x EDGE_M meter patch.
	  people_to_draw <- round(df[row, "density_km"] / (1000 ^ 2 / EDGE_M ^ 2), 0)
	  # The patch is EDGE_M meters on each side, so the title states both sides
	  # rather than calling it "EDGE_M square meters".
	  title <- sprintf(
	    "#%d: %s, %s\nHome of: %s\n%d people per %d x %d meters",
	    row,
	    df[row, "county"],
	    df[row, "state"],
	    df[row, "city"],
	    people_to_draw,
	    EDGE_M,
	    EDGE_M
	  )
	  # seq_len() selects no rows when the count rounds to 0 (1:0 would pick
	  # row 1), and min() keeps the request within the sampled points so no
	  # all-NA rows are produced.
	  n_points <- min(people_to_draw, nrow(pdf))
	  spdf <- pdf[seq_len(n_points), ]
	  p <- ggplot(spdf, aes(x = x, y = y)) +
	    geom_point(size = PS) +
	    xlim(c(0, EDGE_M)) +
	    ylim(c(0, EDGE_M)) +
	    theme(aspect.ratio = 1) +
	    theme(plot.title = element_text(size = rel(1))) +
	    labs(x = NULL, y = NULL, title = title)
	  # To save each frame as its own file:
	  #filen = sprintf("figures/%03d_%s_%s.png", row, df[row,"county"], df[row,"state"])
	  #ggsave(filename=filen,plot=p,width=4, height=4,dpi=250)
	  p
	}

	#Animate thanks to Yuhui
	# One second per frame.
	ani.options(interval = 1)
	# Use the local ImageMagick install only when it is actually there; on other
	# machines the package's existing `convert` setting is left untouched.
	IMAGEMAGICK_CONVERT <- "C:\\ImageMagick\\convert.exe"
	if (file.exists(IMAGEMAGICK_CONVERT)) {
	  ani.options(convert = IMAGEMAGICK_CONVERT)
	}

	# One frame per county, in rank order; seq_len() gives no frames (rather
	# than rows 1 and 0) if CUTOFF is ever set to 0.
	saveGIF(
	  {
	    for (i in seq_len(CUTOFF)) {
	      print(draw_a_row(i))
	      ani.pause()
	    }
	  },
	  movie.name = paste(EDGE_M, "pop_dens.gif", sep = "_"),
	  img.name = "onehui",
	  clean = FALSE
	)

	#Create the leading graph
	# Density per square mile of the first CUTOFF counties, in rank order.
	# The rank is a numeric position, so it is placed on a continuous x scale
	# with one labelled break per county; a discrete scale has no levels to
	# attach the county names to when it is given numbers.
	top_counties <- head(df, CUTOFF)
	top_counties$density_rank <- seq_len(nrow(top_counties))
	p <- ggplot(
	  top_counties,
	  aes(x = density_rank, y = density_mi, color = state, group = 1)
	) +
	  geom_point(size = 4) +
	  scale_x_continuous(
	    breaks = top_counties$density_rank,
	    labels = top_counties$county,
	    minor_breaks = NULL
	  ) +
	  scale_y_continuous(breaks = seq(10000, 80000, by = 10000)) +
	  theme(
	    plot.title = element_text(size = rel(1)),
	    legend.position = "none",
	    axis.text.x = element_text(angle = 90, vjust = 0.5)
	  ) +
	  labs(x = "Counties Ranked by Density", y = "People Per Square Mile")
	p
	ggsave(
	  filename = "pop_rank.png", plot = p, width = 6, height = 6
	)