Last active
August 29, 2015 14:02
-
-
Save geotheory/9b03e9c5d56595e7e8a1 to your computer and use it in GitHub Desktop.
This script demo's how to process the 2011 UK Census shapefiles and data to produce graphics used in http://geotheory.co.uk/blog/2014/06/23/gender-in-urban-workplaces/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require(maptools) | |
require(fields) | |
require(ggplot2) | |
require(stringr) | |
bng = "+init=epsg:27700" | |
workdir = '[change to working directory path]' | |
setwd(workdir) | |
R.utils::mkdirs('wz') | |
workdir = paste0(workdir, '/wz') | |
setwd(workdir) | |
# download & unzip (may have to do manually) | |
wz_centroids_url = 'https://geoportal.statistics.gov.uk/Docs/Boundaries/Workplace_zones_(E+W)_2011_Population_Weighted_Centroids.zip' | |
wz_boundaries_url = 'https://geoportal.statistics.gov.uk/Docs/Boundaries/Workplace_zones_(E+W)_2011_Boundaries_(Generalised_Clipped).zip' | |
download.file(wz_centroids_url, paste0(workdir, "/workplace_zones_centroids.zip"), method="internal", mode="wb") | |
download.file(wz_boundaries_url, paste0(workdir, "/workplace_zones_boundaries.zip"), method="internal", mode="wb") | |
unzip('workplace_zones_centroids.zip'); unzip('workplace_zones_boundaries.zip') | |
unlink(c('workplace_zones_centroids.zip','workplace_zones_boundaries.zip')) # delete zips | |
# read in and tidy up shapefiles | |
wz = readShapePoints("WZ_2011_EW_PWC.shp", proj4string=CRS(bng)) # centroids | |
wz = wz[order(wz$WZ11CD),] # reorder by zone code | |
names(wz) = 'code' | |
head(wz@data) | |
wzb = readShapePoly("WZ_2011_EW_BGC.shp", proj4string=CRS(bng)) # boundaries | |
wzb = wzb[order(wzb$WZ11CD),] # reorder | |
dim(wzb) # data table dimensions | |
head(wzb@data) | |
all(wzb$WZ11CD == wz$code) # check codes match other data | |
wzb = wzb[,c(1,3)] # remove non-required data columns | |
names(wzb) = c('code','name') | |
head(wzb@data) | |
# download file 'census workplace zones 2011' from http://www.nomisweb.co.uk/census/2011/wp1101ew | |
# copy file ('bulk.csv') to working directory | |
datain = read.csv('bulk.csv', stringsAsFactors=F) | |
View(datain) | |
names(datain)[1:6] = c('year','geog','code','total','men','women') | |
datain[1:6,3:6] # data we will use | |
datain = datain[order(datain$code),] # reorder | |
# check data and shapefile zones match up | |
all(datain$code == wz$code) | |
# extract centroid coordinates from wz spatial object | |
wz_centroids = data.frame(coordinates(wz)) | |
names(wz_centroids) = c('e','n') # eastings and northings | |
head(wz_centroids) | |
# working data | |
d = cbind(datain[,3:6], wz_centroids) | |
# round coordinates and aggregate to 1km resolution | |
d$e_round = round(d$e, -3) | |
d$n_round = round(d$n, -3) | |
d_agg = aggregate(total ~ e_round + n_round, d, sum) | |
d_agg = d_agg[order(d_agg$total, decreasing=T),] | |
dim(d_agg); head(d_agg) | |
# algorithm to find 28 highest density km squares not within a 20 km | |
# radius of a higher-density square | |
topcities = d_agg[1,] # first entry is top of results | |
k=1 # to increase to 2 before first use | |
for(i in 2:28){ | |
while(TRUE){ | |
k = k+1 | |
p = d_agg[k,] # new d_agg row | |
tst = FALSE # test variable for checking if new d_agg row is within dist radius of an existing entry | |
# iterate through results already compiled | |
for(j in 1:nrow(topcities)){ | |
m = t(matrix(c(p[,1:2], topcities[j,1:2]),2)) # matrix of 2 point coordinates | |
dis = as.numeric(dist(m)) | |
if(dis < 20000) tst = TRUE # distance threshold in meters | |
} | |
if(tst) next else { # next moves while on to next loop | |
topcities[i,] = d_agg[k,] # new entry has been found | |
break # fully breaks while loop | |
} | |
} | |
} | |
symbols(topcities[,1:2], squares=sqrt(topcities$total), bg='#ff000050', asp=T) # see how they distribute | |
names(topcities) = c('e','n','pop') | |
# get city names from boundaries data (I then re-named some manually) | |
for(i in 1:nrow(topcities)){ | |
# calc distances to centroids | |
dists = t(rdist(topcities[i,1:2], wz_centroids))[,1] | |
mindist = min(dists) | |
topcities$name[i] = as.character(wzb$name[grep(mindist, dists)]) | |
} | |
# let's see where they are in relation to the data | |
par(mai=c(0,0,0,0)) # remove plot margins | |
plot(d$e, d$n, pch='.', asp=T) | |
points(topcities$e, topcities$n, col='red', pch=16) | |
text(topcities[,1:2], labels=topcities$name, col='blue', cex=.7) | |
# calculate quantile categories for colour scaling later on | |
N = 7 | |
d$femprop = 100 * d$women / d$total | |
brks = quantile(d$femprop, probs = seq(0, 1, length.out=N+1)) | |
qtls = cut(d$femprop, brks, include.lowest=T, labels = 1:N) | |
qtl_labs = levels(cut(d$femprop, brks, include.lowest=T)) | |
d$women_labs = qtl_labs[qtls] | |
d$women_labs = ordered(d$women_labs, levels=rev(qtl_labs)) | |
d_uk = d # save full England & Wales dataset | |
# filter 'd' table to records within 3.5kms of a top city | |
# apply vectorises the code to avoid slow loop (i.e. MUCH quicker..) | |
d$city = apply(d, 1, function(row){ | |
distx = abs(as.numeric(row[5]) - topcities$e) | |
disty = abs(as.numeric(row[6]) - topcities$n) | |
if(any(distx <= 3500 & disty <= 3500)) { | |
dists = sqrt(distx^2 + disty^2) | |
n = match(min(dists), dists) | |
return(topcities$name[n]) | |
} else return("") | |
}) | |
d = d[d$city != '',] | |
plot(d$e, d$n, pch='.', asp=T) | |
# create custom theme | |
customtheme = theme_bw() | |
customtheme$axis.text = element_blank() | |
customtheme$axis.text = element_blank() | |
customtheme$axis.ticks = element_blank() | |
customtheme$axis.ticks = element_blank() | |
customtheme$aspect.ratio = 1 | |
# create a colour scale | |
cols = colorRampPalette(c("navy", "grey70", 'pink'))(N+1) | |
# run the plot | |
ggplot(d, aes(e, n, col=factor(women_labs))) + geom_point(size=1.5) + | |
scale_colour_manual(values=cols[as.numeric(d$women_labs)]) + | |
facet_wrap(~ city, scales='free', nrow=4) + customtheme | |
# density plot (not correct as calculating on square areas not circles, but to demo method..) | |
d$dist = apply(d, 1, function(row) rdist(topcities[topcities$name == as.character(row[11]), 1:2], t(as.numeric(row[5:6])))[1,1]) | |
ggplot(d, aes(dist, femprop)) + geom_smooth(method="loess") | |
ggplot(d, aes(dist, femprop)) + geom_smooth(method="loess") + facet_wrap(~ city, scales='free_y') | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment