geotheory/urban_gender_analysis.R

## urban_gender_analysis.R
require(maptools)
require(fields)
require(ggplot2)
require(stringr)

bng = "+init=epsg:27700"

workdir = '[change to working directory path]'
setwd(workdir)
R.utils::mkdirs('wz')
workdir = paste0(workdir, '/wz')
setwd(workdir)

# download & unzip (may have to do manually)
wz_centroids_url = 'https://geoportal.statistics.gov.uk/Docs/Boundaries/Workplace_zones_(E+W)_2011_Population_Weighted_Centroids.zip'
wz_boundaries_url = 'https://geoportal.statistics.gov.uk/Docs/Boundaries/Workplace_zones_(E+W)_2011_Boundaries_(Generalised_Clipped).zip'

download.file(wz_centroids_url, paste0(workdir, "/workplace_zones_centroids.zip"), method="internal", mode="wb")
download.file(wz_boundaries_url, paste0(workdir, "/workplace_zones_boundaries.zip"), method="internal", mode="wb")
unzip('workplace_zones_centroids.zip'); unzip('workplace_zones_boundaries.zip')
unlink(c('workplace_zones_centroids.zip','workplace_zones_boundaries.zip')) # delete zips

# read in and tidy up shapefiles
wz = readShapePoints("WZ_2011_EW_PWC.shp", proj4string=CRS(bng)) # centroids
wz = wz[order(wz$WZ11CD),]  # reorder by zone code
names(wz) = 'code'
head(wz@data)

wzb = readShapePoly("WZ_2011_EW_BGC.shp", proj4string=CRS(bng)) # boundaries
wzb = wzb[order(wzb$WZ11CD),]  # reorder
dim(wzb)                       # data table dimensions
head(wzb@data)

all(wzb$WZ11CD == wz$code)     # check codes match other data
wzb = wzb[,c(1,3)]             # remove non-required data columns
names(wzb) = c('code','name')
head(wzb@data)

# download file 'census workplace zones 2011' from http://www.nomisweb.co.uk/census/2011/wp1101ew
# copy file ('bulk.csv') to working directory

datain = read.csv('bulk.csv', stringsAsFactors=F)
View(datain)
names(datain)[1:6] = c('year','geog','code','total','men','women')
datain[1:6,3:6] # data we will use

datain = datain[order(datain$code),] # reorder

# check data and shapefile zones match up
all(datain$code == wz$code)

# extract centroid coordinates from wz spatial object
wz_centroids = data.frame(coordinates(wz))
names(wz_centroids) = c('e','n')  # eastings and northings
head(wz_centroids)

# working data
d = cbind(datain[,3:6], wz_centroids)

# round coordinates and aggregate to 1km resolution
d$e_round = round(d$e, -3)
d$n_round = round(d$n, -3)
d_agg = aggregate(total ~ e_round + n_round, d, sum)
d_agg = d_agg[order(d_agg$total, decreasing=T),]
dim(d_agg); head(d_agg)

# algorithm to find 28 highest density km squares not within a 20 km
# radius of a higher-density square

topcities = d_agg[1,]  # first entry is top of results
k=1                    # to increase to 2 before first use

for(i in 2:28){
  while(TRUE){
    k = k+1
    p = d_agg[k,]  # new d_agg row
    tst = FALSE # test variable for checking if new d_agg row is within dist radius of an existing entry
    # iterate through results already compiled
    for(j in 1:nrow(topcities)){
      m = t(matrix(c(p[,1:2], topcities[j,1:2]),2)) # matrix of 2 point coordinates
      dis = as.numeric(dist(m))
      if(dis < 20000) tst = TRUE   # distance threshold in meters
    }
    if(tst) next else { # next moves while on to next loop
      topcities[i,] = d_agg[k,]  # new entry has been found
      break # fully breaks while loop
    }
  }
}

symbols(topcities[,1:2], squares=sqrt(topcities$total), bg='#ff000050', asp=T) # see how they distribute

names(topcities) = c('e','n','pop')

# get city names from boundaries data (I then re-named some manually)
for(i in 1:nrow(topcities)){
  # calc distances to centroids
  dists = t(rdist(topcities[i,1:2], wz_centroids))[,1]
  mindist = min(dists)
  topcities$name[i] = as.character(wzb$name[grep(mindist, dists)])
}

# let's see where they are in relation to the data
par(mai=c(0,0,0,0)) # remove plot margins
plot(d$e, d$n, pch='.', asp=T)
points(topcities$e, topcities$n, col='red', pch=16)
text(topcities[,1:2], labels=topcities$name, col='blue', cex=.7)

# calculate quantile categories for colour scaling later on
N = 7
d$femprop = 100 * d$women / d$total
brks = quantile(d$femprop, probs = seq(0, 1, length.out=N+1))
qtls = cut(d$femprop, brks, include.lowest=T, labels = 1:N)
qtl_labs = levels(cut(d$femprop, brks, include.lowest=T))
d$women_labs = qtl_labs[qtls]
d$women_labs = ordered(d$women_labs, levels=rev(qtl_labs))

d_uk = d   # save full England & Wales dataset

# filter 'd' table to records within 3.5kms of a top city
# apply vectorises the code to avoid slow loop (i.e. MUCH quicker..)
d$city = apply(d, 1, function(row){
  distx = abs(as.numeric(row[5]) - topcities$e)
  disty = abs(as.numeric(row[6]) - topcities$n)
  if(any(distx <= 3500 & disty <= 3500)) {
    dists = sqrt(distx^2 + disty^2)
    n = match(min(dists), dists)
    return(topcities$name[n])
  } else return("")
})

d = d[d$city != '',]
plot(d$e, d$n, pch='.', asp=T)

# create custom theme
customtheme = theme_bw()
customtheme$axis.text = element_blank()
customtheme$axis.text = element_blank()
customtheme$axis.ticks = element_blank()
customtheme$axis.ticks = element_blank()
customtheme$aspect.ratio = 1

# create a colour scale
cols = colorRampPalette(c("navy", "grey70", 'pink'))(N+1)

# run the plot
ggplot(d, aes(e, n, col=factor(women_labs))) + geom_point(size=1.5) +
  scale_colour_manual(values=cols[as.numeric(d$women_labs)]) +
  facet_wrap(~ city, scales='free', nrow=4) + customtheme


# density plot (not correct as calculating on square areas not circles, but to demo method..)

d$dist = apply(d, 1, function(row) rdist(topcities[topcities$name == as.character(row[11]), 1:2], t(as.numeric(row[5:6])))[1,1])

ggplot(d, aes(dist, femprop)) + geom_smooth(method="loess")

ggplot(d, aes(dist, femprop)) + geom_smooth(method="loess") + facet_wrap(~ city, scales='free_y')
	require(maptools)
	require(fields)
	require(ggplot2)
	require(stringr)

	bng = "+init=epsg:27700"

	workdir = '[change to working directory path]'
	setwd(workdir)
	R.utils::mkdirs('wz')
	workdir = paste0(workdir, '/wz')
	setwd(workdir)

	# download & unzip (may have to do manually)
	wz_centroids_url = 'https://geoportal.statistics.gov.uk/Docs/Boundaries/Workplace_zones_(E+W)_2011_Population_Weighted_Centroids.zip'
	wz_boundaries_url = 'https://geoportal.statistics.gov.uk/Docs/Boundaries/Workplace_zones_(E+W)_2011_Boundaries_(Generalised_Clipped).zip'

	download.file(wz_centroids_url, paste0(workdir, "/workplace_zones_centroids.zip"), method="internal", mode="wb")
	download.file(wz_boundaries_url, paste0(workdir, "/workplace_zones_boundaries.zip"), method="internal", mode="wb")
	unzip('workplace_zones_centroids.zip'); unzip('workplace_zones_boundaries.zip')
	unlink(c('workplace_zones_centroids.zip','workplace_zones_boundaries.zip')) # delete zips

	# read in and tidy up shapefiles
	wz = readShapePoints("WZ_2011_EW_PWC.shp", proj4string=CRS(bng)) # centroids
	wz = wz[order(wz$WZ11CD),] # reorder by zone code
	names(wz) = 'code'
	head(wz@data)

	wzb = readShapePoly("WZ_2011_EW_BGC.shp", proj4string=CRS(bng)) # boundaries
	wzb = wzb[order(wzb$WZ11CD),] # reorder
	dim(wzb) # data table dimensions
	head(wzb@data)

	all(wzb$WZ11CD == wz$code) # check codes match other data
	wzb = wzb[,c(1,3)] # remove non-required data columns
	names(wzb) = c('code','name')
	head(wzb@data)

	# download file 'census workplace zones 2011' from http://www.nomisweb.co.uk/census/2011/wp1101ew
	# copy file ('bulk.csv') to working directory

	datain = read.csv('bulk.csv', stringsAsFactors=F)
	View(datain)
	names(datain)[1:6] = c('year','geog','code','total','men','women')
	datain[1:6,3:6] # data we will use

	datain = datain[order(datain$code),] # reorder

	# check data and shapefile zones match up
	all(datain$code == wz$code)

	# extract centroid coordinates from wz spatial object
	wz_centroids = data.frame(coordinates(wz))
	names(wz_centroids) = c('e','n') # eastings and northings
	head(wz_centroids)

	# working data
	d = cbind(datain[,3:6], wz_centroids)

	# round coordinates and aggregate to 1km resolution
	d$e_round = round(d$e, -3)
	d$n_round = round(d$n, -3)
	d_agg = aggregate(total ~ e_round + n_round, d, sum)
	d_agg = d_agg[order(d_agg$total, decreasing=T),]
	dim(d_agg); head(d_agg)

	# algorithm to find 28 highest density km squares not within a 20 km
	# radius of a higher-density square

	topcities = d_agg[1,] # first entry is top of results
	k=1 # to increase to 2 before first use

	for(i in 2:28){
	while(TRUE){
	k = k+1
	p = d_agg[k,] # new d_agg row
	tst = FALSE # test variable for checking if new d_agg row is within dist radius of an existing entry
	# iterate through results already compiled
	for(j in 1:nrow(topcities)){
	m = t(matrix(c(p[,1:2], topcities[j,1:2]),2)) # matrix of 2 point coordinates
	dis = as.numeric(dist(m))
	if(dis < 20000) tst = TRUE # distance threshold in meters
	}
	if(tst) next else { # next moves while on to next loop
	topcities[i,] = d_agg[k,] # new entry has been found
	break # fully breaks while loop
	}
	}
	}

	symbols(topcities[,1:2], squares=sqrt(topcities$total), bg='#ff000050', asp=T) # see how they distribute

	names(topcities) = c('e','n','pop')

	# get city names from boundaries data (I then re-named some manually)
	for(i in 1:nrow(topcities)){
	# calc distances to centroids
	dists = t(rdist(topcities[i,1:2], wz_centroids))[,1]
	mindist = min(dists)
	topcities$name[i] = as.character(wzb$name[grep(mindist, dists)])
	}

	# let's see where they are in relation to the data
	par(mai=c(0,0,0,0)) # remove plot margins
	plot(d$e, d$n, pch='.', asp=T)
	points(topcities$e, topcities$n, col='red', pch=16)
	text(topcities[,1:2], labels=topcities$name, col='blue', cex=.7)

	# calculate quantile categories for colour scaling later on
	N = 7
	d$femprop = 100 * d$women / d$total
	brks = quantile(d$femprop, probs = seq(0, 1, length.out=N+1))
	qtls = cut(d$femprop, brks, include.lowest=T, labels = 1:N)
	qtl_labs = levels(cut(d$femprop, brks, include.lowest=T))
	d$women_labs = qtl_labs[qtls]
	d$women_labs = ordered(d$women_labs, levels=rev(qtl_labs))

	d_uk = d # save full England & Wales dataset

	# filter 'd' table to records within 3.5kms of a top city
	# apply vectorises the code to avoid slow loop (i.e. MUCH quicker..)
	d$city = apply(d, 1, function(row){
	distx = abs(as.numeric(row[5]) - topcities$e)
	disty = abs(as.numeric(row[6]) - topcities$n)
	if(any(distx <= 3500 & disty <= 3500)) {
	dists = sqrt(distx^2 + disty^2)
	n = match(min(dists), dists)
	return(topcities$name[n])
	} else return("")
	})

	d = d[d$city != '',]
	plot(d$e, d$n, pch='.', asp=T)

	# create custom theme
	customtheme = theme_bw()
	customtheme$axis.text = element_blank()
	customtheme$axis.text = element_blank()
	customtheme$axis.ticks = element_blank()
	customtheme$axis.ticks = element_blank()
	customtheme$aspect.ratio = 1

	# create a colour scale
	cols = colorRampPalette(c("navy", "grey70", 'pink'))(N+1)

	# run the plot
	ggplot(d, aes(e, n, col=factor(women_labs))) + geom_point(size=1.5) +
	scale_colour_manual(values=cols[as.numeric(d$women_labs)]) +
	facet_wrap(~ city, scales='free', nrow=4) + customtheme


	# density plot (not correct as calculating on square areas not circles, but to demo method..)

	d$dist = apply(d, 1, function(row) rdist(topcities[topcities$name == as.character(row[11]), 1:2], t(as.numeric(row[5:6])))[1,1])

	ggplot(d, aes(dist, femprop)) + geom_smooth(method="loess")

	ggplot(d, aes(dist, femprop)) + geom_smooth(method="loess") + facet_wrap(~ city, scales='free_y')