Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ikashnitsky/09618405045ea86aca3484881a9094dc to your computer and use it in GitHub Desktop.
Save ikashnitsky/09618405045ea86aca3484881a9094dc to your computer and use it in GitHub Desktop.
Replication of the results in the post https://habrahabr.ru/post/312694/
################################################################################
#
# Sex ratios 11-11-2016
# Analysis - US counties.
# Check the effect of urb/rur of the link between homicide rate and adult sex
# ratio. Doubts raised by https://dx.doi.org/10.1007/s12110-016-9271-x
# Ilya Kashnitsky, ilya.kashnitsky@gmail.com
#
################################################################################
# load required packages
# The code is written and tested on a PC-win7-x64
# R version 3.3.1
# load required packages
library(tidyverse) # version 1.0.0
library(ggthemes) # version 3.2.0
library(extrafont) # version 0.17
library(RColorBrewer) # version 1.1-2
library(viridis) # version 0.3.4
library(cowplot) # version 0.7.0
library(choroplethr) # version 3.5.2
library(tmap) # version 1.6-1
library(rgdal) # version 1.1-10
library(rgeos) # version 0.3-21
library(maptools) # version 0.8-39
# set working directory to the one where you have this file
setwd()
# create sub-directories
ifelse(!dir.exists('data'),dir.create('data'),paste("Directory already exists"))
ifelse(!dir.exists('geodata'),dir.create('geodata'),paste("Directory already exists"))
ifelse(!dir.exists('out'),dir.create('out'),paste("Directory already exists"))
################################################################################
# Get US geodata
# source solution (http://stackoverflow.com/a/34453890/4638884)
f <- tempfile()
download.file("http://www2.census.gov/geo/tiger/GENZ2010/gz_2010_us_050_00_20m.zip", destfile = f)
unzip(f, exdir = "geodata/.")
US <- read_shape("geodata/gz_2010_us_050_00_20m.shp")
# reproject geodata
US_prj <- spTransform(US, CRS('+init=epsg:2163'))
names(US_prj@data) <- str_to_lower(names(US_prj@data))
US_prj@data <- US_prj@data %>% mutate(id = str_sub(geo_id,10,14))
# Alaska, Hawaii
# https://rpubs.com/technocrat/thematic-alaska-hawaii
alaska <- US_prj[US_prj$state=="02",]
alaska <- elide(alaska, rotate=-36)
alaska <- elide(alaska, scale=max(apply(bbox(alaska), 1, diff)) / 2.5)
alaska <- elide(alaska, shift=c(-2500000, -2200000))
proj4string(alaska) <- proj4string(US_prj)
hawaii <- US_prj[US_prj$state=="15",]
hawaii <- elide(hawaii, rotate=-35)
hawaii <- elide(hawaii, shift=c(5100000, -1300000))
proj4string(hawaii) <- proj4string(US_prj)
plot(US_prj)
plot(alaska, add=T, col='red')
plot(hawaii, add=T, col='red')
US_prj <- US_prj[!US_prj$state %in% c("02","15","72"),]
US_prj <- rbind(US_prj, alaska, hawaii)
# fortify
gd_county <- fortify(US_prj, region = 'id')
# states
f <- tempfile()
download.file("http://www2.census.gov/geo/tiger/GENZ2010/gz_2010_us_040_00_20m.zip", destfile = f)
unzip(f, exdir = "geodata/.")
US_st <- read_shape("geodata/gz_2010_us_040_00_20m.shp")
# reproject geodata
US_st_prj <- spTransform(US_st, CRS('+init=epsg:2163'))
names(US_st_prj@data) <- str_to_lower(names(US_st_prj@data))
# Alaska, Hawaii
# https://rpubs.com/technocrat/thematic-alaska-hawaii
alaska_st <- US_st_prj[US_st_prj$state=="02",]
alaska_st <- elide(alaska_st, rotate=-36)
alaska_st <- elide(alaska_st, scale=max(apply(bbox(alaska_st), 1, diff)) / 2.5)
alaska_st <- elide(alaska_st, shift=c(-2500000, -2200000))
proj4string(alaska_st) <- proj4string(US_st_prj)
hawaii_st <- US_st_prj[US_st_prj$state=="15",]
hawaii_st <- elide(hawaii_st, rotate=-35)
hawaii_st <- elide(hawaii_st, shift=c(5100000, -1300000))
proj4string(hawaii_st) <- proj4string(US_st_prj)
plot(US_st_prj)
plot(alaska_st, add=T, col='red')
plot(hawaii_st, add=T, col='red')
US_st_prj <- US_st_prj[!US_st_prj$state %in% c("02","15","72"),]
US_st_prj <- rbind(US_st_prj, alaska_st, hawaii_st)
# fortify
gd_state <- fortify(US_st_prj, region = 'state')
# get states borders
US_st_borders <- ik_map_identify_borders(US_st_prj)
gd_state_borders <- fortify(US_st_borders)
# major cities
f <- tempfile()
download.file("https://prd-tnm.s3.amazonaws.com/StagedProducts/Small-scale/data/Structures/citiesx020_nt00007.tar.gz", destfile = f)
dir.create('geodata/us_cities')
untar(f, exdir = "geodata/us_cities/.")
cities <- readOGR(dsn = 'geodata/us_cities', layer = 'citiesx020')
cities_sub <- cities[cities$FEATURE%in%c("State Capital","State Capital County Seat") |
cities$POP_RANGE%in%c("1,000,000 - 9,999,999","500,000 - 999,999"),]
cities_sub@data <- cities_sub@data %>% droplevels()
proj4string(cities_sub) <- CRS('+proj=longlat')
cities_prj <- spTransform(cities_sub, CRS('+init=epsg:2163'))
gd_cities <- data.frame(cities_prj) %>%
transmute(id = CITIESX020, long = coords.x1, lat = coords.x2,
name = NAME, fips = FIPS, state = STATE,
huge = POP_RANGE%in%c("1,000,000 - 9,999,999","500,000 - 999,999"),
capital = FEATURE%in%c("State Capital","State Capital County Seat"))
# adjust positions for Juneau (AK) and Honolulu (HI)
gd_cities[gd_cities$state=='AK','long'] <- -1270000
gd_cities[gd_cities$state=='AK','lat'] <- -2030000
gd_cities[gd_cities$state=='HI','long'] <- -775000
gd_cities[gd_cities$state=='HI','lat'] <- -1900000
# SAVE GEODATA
save(gd_state_borders, gd_state, gd_county, gd_cities,
file = 'geodata/us_geodata.RData')
################################################################################
# Create templates for maps
# my favorite palette
brbg <- RColorBrewer::brewer.pal(11,'BrBG')
basemap_cont <- ggplot()+
geom_polygon(data = gd_county, aes(x=long, y=lat, group=group), fill='grey50')+
guides(fill = guide_colorbar(barwidth = 1, barheight = 10))+
coord_equal(xlim = c(-2100000,3300000),ylim = c(-2400000,800000),expand = c(0,0))+
theme_map()+
theme(panel.border=element_rect(color = 'black',size=.5,fill = NA),
panel.background=element_rect(fill='grey15'),
legend.position = c(1, 0),
legend.justification = c(1, 0),
legend.background = element_rect(colour = NA, fill = 'grey95'),
legend.title = element_text(size=15),
legend.text = element_text(size=15))+
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
labs(x = NULL, y = NULL)
basemap_disc <- ggplot()+
geom_polygon(data = gd_county, aes(x=long, y=lat, group=group), fill='grey50')+
coord_equal(xlim = c(-2100000,3300000),ylim = c(-2400000,800000),expand = c(0,0))+
theme_map()+
theme(panel.border=element_rect(color = 'black',size=.5,fill = NA),
panel.background=element_rect(fill='grey15'),
legend.position = c(1, 0),
legend.justification = c(1, 0),
legend.background = element_rect(colour = NA, fill = 'grey95'),
legend.title = element_text(size=15),
legend.text = element_text(size=15))+
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
labs(x = NULL, y = NULL)
################################################################################
# County Characteristics, 2000-2007 ICPSR 20660 dataset
# http://www.icpsr.umich.edu/icpsrweb/NACJD/studies/20660
df_cc <- haven::read_sav('data/us_crime/ICPSR_20660/DS0001/20660-0001-Data.sav')
# extract and calculate sex ratio
df <- df_cc %>% select(FIPS, State, Division, CBSA_Status, Pop04, Male15_44_05, Fmale15_44_05,
PctBlack05, CrimeRate04, Muder04, PerstPov04, RuralUrban03,
UnempRate05, MedianAge05)
names(df) <- tolower(names(df))
df <- df %>% mutate(asr = male15_44_05 / fmale15_44_05,
murder_rate = muder04 / pop04 * 10e5,
metro = ifelse(ruralurban03 <= 3, 'Metro', 'Non-metro'),
south = ifelse(division==3, 'South', 'Non-South'))
df[is.na(df$murder_rate),'murder_rate'] <- 0
df$fips <- str_pad(df$fips, 5, 'left', 0)
df$state <- str_pad(paste(df$state),width = 2,side = 'left',pad = '0')
################################################################################
# EXPLORATORY PLOTS
# Adult sex rates
gg_asr_dens <- ggplot(df)+
geom_density(aes(asr,color=metro)) +
scale_color_manual('Type of county', values = brbg[c(2,8)])+
coord_cartesian(xlim = c(.8,1.4))+
theme_minimal(base_size = 15) +
theme(legend.position = c(.8,.6))+
ylab('Density')+
xlab('Adult sex ratio, males to females')
gg_asr_ecdf <- ggplot(df)+
stat_ecdf(aes(asr,color=metro)) +
scale_color_manual('Type of county', values = brbg[c(2,8)])+
coord_cartesian(xlim = c(.8,1.4))+
theme_minimal(base_size = 15) +
theme(legend.position = c(.8,.6))+
ylab('Empirical cumulative density')+
xlab('Adult sex ratio, males to females')
gg_asr <- plot_grid(gg_asr_dens,gg_asr_ecdf)
ggsave('out/gg_asr.png', gg_asr, width = 12, height = 5, type="cairo-png")
# murder rate
gg_mr_dens <- ggplot(df)+
geom_density(aes(murder_rate,color=metro)) +
scale_color_manual('Type of county', values = brbg[c(2,8)])+
coord_cartesian(xlim = c(0,200))+
#scale_x_continuous(trans = 'log') +
theme_minimal(base_size = 15) +
theme(legend.position = c(.8,.6))+
ylab('Density')+
xlab('Murder rate per 100K population')
gg_mr_ecdf <- ggplot(df)+
stat_ecdf(aes(murder_rate,color=metro)) +
scale_color_manual('Type of county', values = brbg[c(2,8)])+
coord_cartesian(xlim = c(0,200))+
#scale_x_continuous(trans = 'log') +
theme_minimal(base_size = 15) +
theme(legend.position = c(.8,.6))+
ylab('Empirical cumulative density')+
xlab('Murder rate per 100K population')
gg_mr <- plot_grid(gg_mr_dens,gg_mr_ecdf)
ggsave('out/gg_mr.png', gg_mr, width = 12, height = 5, type="cairo-png")
# meadian age
gg_ma_dens <- ggplot(df)+
geom_density(aes(medianage05,color=metro)) +
scale_color_manual('Type of county', values = brbg[c(2,8)])+
theme_minimal(base_size = 15) +
theme(legend.position = c(.8,.6))+
ylab('Density')+
xlab('Median age of the population')
gg_ma_ecdf <-ggplot(df)+
stat_ecdf(aes(medianage05,color=metro)) +
scale_color_manual('Type of county', values = brbg[c(2,8)])+
theme_minimal(base_size = 15) +
theme(legend.position = c(.8,.6))+
ylab('Empirical cumulative density')+
xlab('Median age of the population')
gg_ma <- plot_grid(gg_ma_dens,gg_ma_ecdf)
ggsave('out/gg_ma.png', gg_ma, width = 12, height = 5, type="cairo-png")
################################################################################
# MAPS
# summarize by states
df_st <- df %>%
filter(!state%in%c('11')) %>%
group_by(state, metro) %>%
summarise_all(.funs = mean, na.rm = T) %>%
ungroup()
# asr metro / non-metro
df_st_asr_rr <- df_st %>% select(state, metro,asr) %>%
spread(metro,asr) %>%
mutate(asrrr = Metro/`Non-metro`)
map_asr <- basemap_cont +
geom_map(map = gd_state, data = df_st_asr_rr, aes(map_id=state, fill=asrrr))+
geom_path(data = gd_state_borders, aes(x=long, y=lat, group=group),
color='grey50', size = .5)+
scale_fill_gradient2('Metro to\nNon-metro\nASR\nrate ratio\n',
low = brbg[1:5], mid = brbg[6], high = brbg[7:11], midpoint = 1)
ggsave('out/map_asr.png', map_asr, width = 12, height = 7, type="cairo-png")
# homicide metro / non-metro
df_st_hr_rr <- df_st %>% select(state, metro,murder_rate) %>%
spread(metro,murder_rate) %>%
mutate(hrrr = Metro/`Non-metro`,
hrrr_int = cut(hrrr, breaks = c(0,.5,.9,1.1,2,10)))
# # continious scale map
# basemap_cont +
# geom_map(map = gd_state, data = df_st_hr_rr, aes(map_id=state, fill=hrrr))+
# geom_path(data = gd_state_borders, aes(x=long, y=lat, group=group),
# color='grey50', size = .5)+
# scale_fill_gradient2('Metro to\nNon-metro\nhomicide\nrate ratio\n',
# low = brbg[1:5], mid = brbg[6], high = brbg[7:11], midpoint = 1)
map_hr <- basemap_disc +
geom_map(map = gd_state, data = df_st_hr_rr, aes(map_id=state, fill=hrrr_int))+
geom_path(data = gd_state_borders, aes(x=long, y=lat, group=group),
color='grey50', size = .5)+
scale_fill_manual('Metro to\nNon-metro\nhomicide\nrate ratio\n',
values = (brewer.pal(5,"BrBG")),
labels = c('less than 0.5','from 0.5 to 0.9','from 0.9 to 1.1',
'from 1.1 to 2','more than 2'))
ggsave('out/map_hr.png', map_hr, width = 12, height = 7, type="cairo-png")
# blacks
map_black <- basemap_cont +
geom_map(map = gd_county, data = df, aes(map_id=fips, fill=pctblack05))+
geom_path(data = gd_state_borders, aes(x=long, y=lat, group=group),
color='grey50', size = .5)+
scale_fill_viridis('Percentage\nof black\npopulation\n', option = 'B')
ggsave('out/map_black.png', map_black, width = 12, height = 7, type="cairo-png")
# median age
map_ma <- basemap_cont +
geom_map(map = gd_county, data = df, aes(map_id=fips, fill=medianage05))+
geom_path(data = gd_state_borders, aes(x=long, y=lat, group=group),
color='grey50', size = .5)+
geom_point(data = gd_cities %>% filter(capital==T, huge==F),
aes(x=long, y=lat), color = 'red', size = 3, pch=1)+
geom_point(data = gd_cities %>% filter(capital==T, huge==T),
aes(x=long, y=lat), color = 'red', size = 5, pch=1)+
geom_point(data = gd_cities %>% filter(capital==F, huge==T),
aes(x=long, y=lat), color = 'gold', size = 5, pch=1)+
scale_fill_viridis('Median\nage of the\npopulation,\nyears\n', option = 'D', direction = -1)
ggsave('out/map_ma.png', map_ma, width = 12, height = 7, type="cairo-png")
# unemployment
basemap_cont +
geom_map(map = gd_county, data = df, aes(map_id=fips, fill=unemprate05))+
geom_path(data = gd_state_borders, aes(x=long, y=lat, group=group),
color='grey50', size = .5)+
scale_fill_viridis('Unemployment\nrate, %\n', option = 'B')
map_unemp <- basemap_disc +
geom_map(map = gd_county, data = df, aes(map_id=fips, fill=cut_number(unemprate05,5)))+
geom_path(data = gd_state_borders, aes(x=long, y=lat, group=group),
color='grey50', size = .5)+
scale_fill_viridis('Unemployment\nrate, %\n', option = 'D',discrete = T)
ggsave('out/map_unemp.png', map_unemp, width = 12, height = 7, type="cairo-png")
# metro
map_urb <- basemap_disc +
geom_map(map = gd_county, data = df, aes(map_id=fips, fill=factor(ruralurban03)))+
geom_path(data = gd_state_borders, aes(x=long, y=lat, group=group),
color='grey50', size = .5)+
scale_fill_viridis('Urban\nRural\ncounty\nclassification\n', option = 'B', discrete = T, direction = -1)+
geom_point(data = gd_cities %>% filter(capital==T, huge==F),
aes(x=long, y=lat), color = 'red', size = 3, pch=1)+
geom_point(data = gd_cities %>% filter(capital==T, huge==T),
aes(x=long, y=lat), color = 'red', size = 5, pch=1)+
geom_point(data = gd_cities %>% filter(capital==F, huge==T),
aes(x=long, y=lat), color = 'purple4', size = 5, pch=1)
ggsave('out/map_urb.png', map_urb, width = 12, height = 7, type="cairo-png")
################################################################################
# STAT MODELS
mod1 <- glm(family = 'poisson', data = df,
murder_rate ~ asr)
mod2 <- glm(family = 'poisson', data = df,
murder_rate ~ asr + perstpov04)
mod3 <- glm(family = 'poisson', data = df,
murder_rate ~ asr + perstpov04 + pctblack05)
mod4 <- glm(family = 'poisson', data = df,
murder_rate ~ asr + perstpov04 + pctblack05 + south)
mod5 <- glm(family = 'poisson', data = df,
murder_rate ~ asr + perstpov04 + pctblack05 + south + metro)
mod6 <- glm(family = 'poisson', data = df,
murder_rate ~ asr + perstpov04 + pctblack05 + south + ruralurban03)
mod7 <- glm(family = 'poisson', data = df,
murder_rate ~ asr + perstpov04 + pctblack05 + south + ruralurban03 + unemprate05)
mod8 <- glm(family = 'poisson', data = df,
murder_rate ~ asr + perstpov04 + pctblack05 + south + ruralurban03 + unemprate05 + medianage05)
texreg::htmlreg(list(mod1, mod2, mod3, mod4, mod5, mod6, mod7, mod8),
file = 'out/us_cnt_poisson_models.html',
single.row = T)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment