davidaarmstrong/can_census_fed.r

## can_census_fed.r
library(dplyr)
library(sf)
library(rgeos)
library(rgdal)

## Download: https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/bound-limit-2016-eng.cfm
## choose ArcGIS (.shp) and Federal Electoral Districts (2013 Representation Order)
## read in electoral district boundaries.
eb <- read_sf("FED_CA_2_2_ENG.shp")

## Download the digital map boundaries for the 2016 Census Subdivisions from here: https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/bound-limit-2016-eng.cfm
## Choose digital boundary files for census subdivisions
cb <- read_sf("lcsd000b16a_e/lcsd000b16a_e.shp")

## Download the Census Data
## I used this API: https://github.com/mountainMath/cancensus
## Look at the Readme page for the GitHub Repository to see how to obtain and incorporate the API Key.

library(cancensus)

## this is an example set of vectors
v <- c("v_CA16_5054", "v_CA16_5060", "v_CA16_5078", "v_CA16_5081", "v_CA16_5090", "v_CA16_3957", "v_CA16_401",  "v_CA16_407", "v_CA16_2207")

## list the regions and filter them to the CSDs
regs <- list_census_regions("CA16") %>% filter(level == "CSD")

## initialize a list
cd <- list()

## Force R to download 100 regions at a time
s <- seq(1,5201,by=100)
s[1] <- 0
s[length(s)] <- nrow(regs)
for(i in 2:length(s)){
  cd[[i]] <- get_census('CA16', regions = list(CSD=regs$region[(s[(i-1)]+1):s[i]]), level="CSD", vectors=v)
}

## Put all of the regions together
cd <- do.call(rbind, cd)

## Join the downloaded census data to the data in the census geographic data
## first, remove columns that will be duplicates
cd <- cd[,-(8:10)]
names(cd)[1] <- "CSDUID"

## merge datasets
cb <- left_join(cb, cd)

## Initialize a matrix to hold the mapping results
mat <- array(0, dim = c(nrow(eb), nrow(cb)))

## create an observation counter in each dataset
eb$obs <- 1:nrow(eb)
cb$obs <- 1:nrow(cb)

## convert both objects to spatial objects
eb <- as(eb, "Spatial")
cb <- as(cb, "Spatial")

## save the data from both objects
ebd <- eb@data
cbd <- cb@data

## Simplify and buffer to clear up problems
## with rGEOS errors
eb <- gSimplify(eb, tol=0.00001)
eb <- gBuffer(eb, byid=TRUE, width=0)
cb <- gSimplify(cb, tol=0.00001)
cb <- gBuffer(cb, byid=TRUE, width=0)

## NOTE: the loop below would take about 20 hours to run
## on a single process.
## loop over electoral boundaries
for(i in 1:length(eb)){
## loop over census subdivision boundaries
   for(j in 1:length(cb)){
   ## subset boundaries so that each contains a single polygon set
        tmp.eb <- eb[i]
        tmp.csd <- cb[j]
        ## Find the intersection of the two polygons
        inter <- gIntersection(tmp.eb, tmp.csd)
        ## If the intersection isn't null, create find the area of the intersectiono
        if(!is.null(inter)){
            r <- try(gArea(inter))
            if(class(r) != "try-error"){
            ## if the area function didn't fail, put the result in the appropriate
            ## cell in the matrix
              mat[i,j] <- r
            }
        }
    }
  # print a counter for electoral boundaries (347 in total)
  cat(i, ", ", sep="")
  }

## Next, calculate the area of the original census divisions
csd.areas <- gArea(cb, byid=TRUE)

## Find the proportion of overlap from each census subdivision in each electoral district
mat2 <- t(apply(mat, 1, function(x)x/csd.areas))

## take the census boundary data from above and use
## only those columns that represent the vectors downloaded
## from the census API
cd.mat <- as.matrix(cbd[,21:33])

## change missing data to 0.  There were 14
## subdivisions that were in the map boundary file
## that were not in the data downloaded from the census.
## These are the only missing observations
cd.mat[which(is.na(cd.mat), arr.ind=TRUE)] <- 0

## Reapportion the data
FED.data <- mat2 %*% cd.mat

## Make the object a data frame and set the variable names
FED.data <- as.data.frame(FED.data)
names(FED.data) <- names(cbd)[21:33]

## Connect the new census-based FED data to the original data from the
## geographic boundary file.
FED_census <- cbind(ebd, FED.data)
	library(dplyr)
	library(sf)
	library(rgeos)
	library(rgdal)

	## Download: https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/bound-limit-2016-eng.cfm
	## choose ArcGIS (.shp) and Federal Electoral Districts (2013 Representation Order)
	## read in electoral district boundaries.
	eb <- read_sf("FED_CA_2_2_ENG.shp")

	## Download the digital map boundaries for the 2016 Census Subdivisions from here: https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/bound-limit-2016-eng.cfm
	## Choose digital boundary files for census subdivisions
	cb <- read_sf("lcsd000b16a_e/lcsd000b16a_e.shp")

	## Download the Census Data
	## I used this API: https://github.com/mountainMath/cancensus
	## Look at the Readme page for the GitHub Repository to see how to obtain and incorporate the API Key.

	library(cancensus)

	## this is an example set of vectors
	v <- c("v_CA16_5054", "v_CA16_5060", "v_CA16_5078", "v_CA16_5081", "v_CA16_5090", "v_CA16_3957", "v_CA16_401", "v_CA16_407", "v_CA16_2207")

	## list the regions and filter them to the CSDs
	regs <- list_census_regions("CA16") %>% filter(level == "CSD")

	## initialize a list
	cd <- list()

	## Force R to download 100 regions at a time
	s <- seq(1,5201,by=100)
	s[1] <- 0
	s[length(s)] <- nrow(regs)
	for(i in 2:length(s)){
	cd[[i]] <- get_census('CA16', regions = list(CSD=regs$region[(s[(i-1)]+1):s[i]]), level="CSD", vectors=v)
	}

	## Put all of the regions together
	cd <- do.call(rbind, cd)

	## Join the downloaded census data to the data in the census geographic data
	## first, remove columns that will be duplicates
	cd <- cd[,-(8:10)]
	names(cd)[1] <- "CSDUID"

	## merge datasets
	cb <- left_join(cb, cd)

	## Initialize a matrix to hold the mapping results
	mat <- array(0, dim = c(nrow(eb), nrow(cb)))

	## create an observation counter in each dataset
	eb$obs <- 1:nrow(eb)
	cb$obs <- 1:nrow(cb)

	## convert both objects to spatial objects
	eb <- as(eb, "Spatial")
	cb <- as(cb, "Spatial")

	## save the data from both objects
	ebd <- eb@data
	cbd <- cb@data

	## Simplify and buffer to clear up problems
	## with rGEOS errors
	eb <- gSimplify(eb, tol=0.00001)
	eb <- gBuffer(eb, byid=TRUE, width=0)
	cb <- gSimplify(cb, tol=0.00001)
	cb <- gBuffer(cb, byid=TRUE, width=0)

	## NOTE: the loop below would take about 20 hours to run
	## on a single process.
	## loop over electoral boundaries
	for(i in 1:length(eb)){
	## loop over census subdivision boundaries
	for(j in 1:length(cb)){
	## subset boundaries so that each contains a single polygon set
	tmp.eb <- eb[i]
	tmp.csd <- cb[j]
	## Find the intersection of the two polygons
	inter <- gIntersection(tmp.eb, tmp.csd)
	## If the intersection isn't null, create find the area of the intersectiono
	if(!is.null(inter)){
	r <- try(gArea(inter))
	if(class(r) != "try-error"){
	## if the area function didn't fail, put the result in the appropriate
	## cell in the matrix
	mat[i,j] <- r
	}
	}
	}
	# print a counter for electoral boundaries (347 in total)
	cat(i, ", ", sep="")
	}

	## Next, calculate the area of the original census divisions
	csd.areas <- gArea(cb, byid=TRUE)

	## Find the proportion of overlap from each census subdivision in each electoral district
	mat2 <- t(apply(mat, 1, function(x)x/csd.areas))

	## take the census boundary data from above and use
	## only those columns that represent the vectors downloaded
	## from the census API
	cd.mat <- as.matrix(cbd[,21:33])

	## change missing data to 0. There were 14
	## subdivisions that were in the map boundary file
	## that were not in the data downloaded from the census.
	## These are the only missing observations
	cd.mat[which(is.na(cd.mat), arr.ind=TRUE)] <- 0

	## Reapportion the data
	FED.data <- mat2 %*% cd.mat

	## Make the object a data frame and set the variable names
	FED.data <- as.data.frame(FED.data)
	names(FED.data) <- names(cbd)[21:33]

	## Connect the new census-based FED data to the original data from the
	## geographic boundary file.
	FED_census <- cbind(ebd, FED.data)