Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bohdanszymanik/ebae4fd663be37f9cbaa0dd79e00ee2e to your computer and use it in GitHub Desktop.
Save bohdanszymanik/ebae4fd663be37f9cbaa0dd79e00ee2e to your computer and use it in GitHub Desktop.
nominatum openstreetmap address lookup for the liquor licencing register
##################################################################################
# Import liquor licencing data, geocode and plot
# Plot on a NZ map
# Show a regional view - some demographics eg population density at SA2
# Import the public NZ Police aggregated crime data
# Look at crime harm regional cf liquor outlet distribution
#
library(readxl)
library(rgdal)
library(broom)
library(dplyr)
library(ggplot2)
library(ggmap)
library(memoise)
library(naniar)
library(stringr)
library(purrr)
library(tmaptools)
library(logger)
library(cowplot)
library(ggrepel)
library(sf)
library(nngeo)
library(rlist)
library(bench) # https://cran.r-project.org/web/packages/bench/bench.pdf
library(pryr)
library(stringdist)
library(ngram)
library(areal)
setwd("h:\\Documents\\wd\\erp")
################################################################################
# Bringing in liquour licensing data
#
# read the liquor licensing licences register in two steps
# first get the column names
cNames <- read_excel("February-2021-Licences.xlsx", skip=1, n_max=1) %>% names()
# then the raw data
llr <- read_excel("February-2021-Licences.xlsx", skip=2, col_names=cNames)
# getting a comma separated address - bit pointless having its own function?
#createAddress <- function(`Premises Street`, `Premises Suburb`, `Premises City`, ...) {
# paste(`Premises Street`, `Premises Suburb`, `Premises City`, sep=',')
#}
# eg
#llr %>% rowwise() %>% pmap_chr(createAddress)
llr2 <- llr %>% dplyr::mutate(address = paste(`Premises Street`, `Premises Suburb`, `Premises City`, c("New Zealand"), sep=',') )
llr2 %>% slice_sample(n=50)
llr2 %>% slice_head(n=5)
geocode_OSM("Klondike Ale House, 5 Gillies St, Kawakawa, new zealand", server = "https://nominatim.openstreetmap.org")
geocode_OSM("5 Gillies St, Kawakawa, new zealand", server = "https://nominatim.openstreetmap.org")
#geocode_OSM("5 Gillies St, Kawakawa, new zealand")
#geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords
#typeof(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
#length(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
#str(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
#is.vector(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
#geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords["x"]
#log_layout(layout_glue_colors) # warning ... unreadable in the log with default text editors
log_formatter(formatter_glue)
log_layout(layout_simple)
log_threshold(TRACE)
#t <- tempfile()
t <- "mylogger.log"
log_appender(appender_file(t))
#readLines(t)
#unlink(t)
cf <- cache_filesystem(".rcache", algo = "xxhash64", compress = FALSE)
# cache_memory(algo="sha512")
# geocode_OSM defaults to EPSG 4326 lat lon
# st_crs can be used to set coordinate reference system on underlying sf objects
# not sure if nztm is supported by sf
lookupAddress <- memoise (
function(address) {
geocode <- geocode_OSM(address) # returns either the address if not found, or a vector with lat lon
if (is.null(geocode)) {
log_trace('{address}')
NA
} else {
log_trace('{address}\t{geocode$coords["x"]}\t{geocode$coords["y"]}')
geocode$coords
}
},
cache = cf
)
lookupAddress("5 Gillies St,Kawakawa,New Zealand")
# has_cache(lookupAddress)("5 Gillies St,Kawakawa,New Zealand")
# drop_cache(lookupAddress)("5 Gillies St,Kawakawa,New Zealand")
# forget(lookupAddress)
#geocode_OSM("5 Gillies St,Kawakawa,New Zealand")
# it doesn't like the presence of a # char in a column in the 'address'Premises Street' data so we need to filter that out
llr3 <-
llr2 %>%
# slice_head(n=20) %>%
select(LicenceType, `Date Licence is Valid From`, `Date Licence Expires`, address) %>%
rowwise %>%
dplyr::mutate(addressFiltered = str_replace(address, "#", "")) %>%
dplyr::mutate(lat = lookupAddress(address)["y"], lon = lookupAddress(address)["x"])
# we really need to be doing this for licences that are currently active
# that means date today is after `Date Licence is Valid From` and before `Date Licence Expires`
llr3Active <-
llr3 %>%
# slice_head(n=20) %>%
filter(Sys.time() >= `Date Licence is Valid From` & Sys.time() < `Date Licence Expires`)
# down from 10719 obs to 9547 observations
################################################################################
# Plotting
# first we run against the natural earth data which gives a simple global map with countries
# then we'll go to an NZ map with regional boundaries
library("rnaturalearth")
library("rnaturalearthdata")
world <- ne_countries(scale = "medium", returnclass = "sf")
class(world)
ggplot(data = world) +
geom_sf() +
coord_sf(expand=F)
ggplot(data = world) +
geom_sf() +
label_graticule = "EW" +
xlab("Longitude") + ylab("Latitude") +
ggtitle("World map", subtitle = paste0("(", length(unique(world$NAME)), " countries)"))
# # example of colouring the earth map using an aesthetic based off population data
# ggplot(data = world) +
# geom_sf(aes(fill = pop_est)) +
# scale_fill_viridis_c(option = "plasma", trans = "sqrt")
# # plotting just nz out of the rnaturalearth data set
# nz <- world[world$name == 'New Zealand',]
#
# ggplot(data = nz) +
# geom_sf() +
# # test by plotting one or two points
# two ways to create tibbles on the fly - tribble is quite nice
# tibble( lat = -35.11244, lon = 173.26267)
#
# tribble (
# ~lat, ~lon,
# -35.11244, 173.26267,
# -35.73572, 174.33096,
# )
#
#
# ggplot(data = world) +
# geom_sf() +
# geom_point(data = tibble( lat = -35.11244, lon = 173.26267),
# aes(x = lon, y = lat),
# size = 2,
# shape = 23,
# fill = "darkred") +
# coord_sf(
# crs = 4326, #"EPSG:4326", or equivalently "WGS84"
# xlim = c(164, 179),
# ylim = c(-48, -33),
# expand = FALSE)
ggplot(data = world) +
geom_sf() +
geom_point(data = llr3, aes(x = lon, y = lat), size = 1,
shape = 23, fill = "darkred") +
coord_sf(
crs = 4326, #"EPSG:4326", or equivalently "WGS84"
xlim = c(164, 179),
ylim = c(-48, -33),
expand = FALSE)
################################################################################
# Time to bring in an NZ map with regions
# We can use the annual boundaries published with population data collected during the census
# With meshblocks it gets real tough without decent memory so let's use a large grain size
# estimated resident population by Statistical Area 2
# https://datafinder.stats.govt.nz/layer/105008-estimated-resident-population-at-30-june-2018-by-statistical-area-2/data/
# In the download dialog box you get a choice of geodatabase or shapefile for the download
# In this example I'm using shapefile
# unzip the shapefile to a directory and then load it like this
# the zipped shape file doesn't make it fully clear here that it's actually SA2 regions
# if you open it up in explorer and take a look at the txt or pdf files
# they explain what's contained... ERP, sex ratio, age groups, ethnicity, median age by sa2
shp <- readOGR(dsn ="h:\\Downloads\\statsnzestimated-resident-population-at-30-june-2018-by-statistical-SHP", layer="estimated-resident-population-at-30-june-2018-by-statistical")
tidy_nzdf <- tidy(shp)
# Pulling out the polygon id to match on when trying to merge with meshblock id
shp$polyID <- sapply(slot(shp, "polygons"), function(x) slot(x, "ID"))
# Merge to get meshblock ids (again can take a couple mins)
nz_df <- merge(tidy_nzdf, shp, by.x = "id", by.y="polyID")
map <- ggplot() +
geom_path(data = nz_df,
aes(x = long, y = lat, group = group),
colour = 'grey', fill = 'white', size = 0.2)
print(map)
populationMap <- ggplot(data=nz_df, aes(x=long, y=lat, group=group)) +
geom_polygon(aes(fill=erp96), colour='gray', size=0.1) +
theme(line = element_blank(), axis.text=element_blank(),axis.title=element_blank(), panel.background = element_blank()) +
scale_fill_gradient2("#4d4dff", mid = "white", high = "#ff4d4d") +
guides(fill=guide_legend(title="Estimated resident population"))
# plots out with some colour aesthetics for the regions
populationMap
# all very nice but now we want to sum up the number of liquor licenses sitting within each region
# then, with the population being known we can generate a normalised liquor license measure
# then we want to compare to
# crime harm measure: https://www.police.govt.nz/about-us/publications-statistics/data-and-statistics/policedatanz/victimisation-time-and-place
# police demand measure: https://www.police.govt.nz/about-us/statistics-and-publications/data-and-statistics/demand-and-activity
# todo - filter out missing values for coords in llr3
llr4 <-
llr3 %>%
# slice_head(n=20) %>%
filter(!is.na(lat)) %>%
filter(!is.na(lon)) %>%
filter(between(lat, -48, -33 )) %>% # reasonableness clipping
filter(between(lon, 164, 179 )) # reasonableness clipping
licence_sf_points <- st_as_sf( llr4 %>% select(lat,lon), coords = c("lon","lat"), crs = 4326)
lsp <- st_as_sf( llr4 , coords = c("lon","lat"), crs = 4326)
ggplot() + geom_sf(data=licence_sf_points) # plot the licence locations
# my first attempt at working with nz region data is via the spData package
# library(spData)
# licenceSum <- aggregate(x = licence_sf_points, by = nz, FUN = sum)
#
# licenceSumPlt <- ggplot(data = licenses)
# licenceSumPlt
#
# plot(licenceSum)
#
# class(licence_sf_points)
# head(licence_sf_points)
# class(licenceSum)
# head(licenceSum)
#
# class(nz)
# head(nz)
#
# ggplot() + geom_sf(data=nz)
# plot(nz)
#
# nz %>% filter(Name == "Canterbury") # that's unexpected, no name in the df
#
# m1 = cbind(c(0, 0, 1, 0), c(0, 1, 1, 0))
# m2 = cbind(c(0, 1, 1, 0), c(0, 0, 1, 0))
# pol = st_sfc(st_polygon(list(m1)), st_polygon(list(m2)))
# set.seed(1985)
# d = data.frame(matrix(runif(15), ncol = 3))
# p = st_as_sf(x = d, coords = 1:2)
# head(p)
# plot(pol)
# plot(p, add = TRUE)
#
# (p_ag1 = aggregate(p, pol, mean))
#
# this isn't working the way I thought.
# how about this
class(nz_df)
# nz_df is just a dataframe - I want a sf class
# I'm following approach of what's shown in the spData documentation for the nz data set: https://cran.r-project.org/web/packages/spData/spData.pdf
library(rmapshaper)
# I've downloaded the SA2 dataset as a geopackage this time
unzip("h:\\downloads\\statsnzestimated-resident-population-at-30-june-2018-by-statistical-FGDB.zip")
nz_full = st_read("estimated-resident-population-at-30-june-2018-by-statistical.gdb")
print(object.size(nz_full), units= "Kb") # 30784.2 Kb
# make our memory issues go away... refer to line 435 for better examples
# # we can use other native r packages for the feature simplification
# library(rgeos)
# # rgeos doens't work with sf objects, only sp, so you have to cast
#
# nz_simple <- gSimplify(as(nz_full, 'Spatial'), tol = 1, topologyPreserve = T) %>% st_as_sf()
# print(object.size(nz_simple), units = "Kb") # 30782.6 Kb - no big change :( , maybe don't worry about this atm
class(nz_full) # good - it's an sf dataframe
# I need to get the crs the same on each sf data structure - maybe consolidate on WGS 84
st_crs(licence_sf_points) # WGS 84, EPSG:4326
st_crs(nz_full) # NZGD2000, EPSG:2193
(nz_full_WGS84 <- st_transform(nz_full, 4326))
# Bit confused over aggregate but I think I have it figured now
# I'm using 'length' as the function but it's just going to generate the same length for every field
licenceSum <- aggregate(x = licence_sf_points, by = nz_full_WGS84, FUN = length)
# alternative approach refer https://geocompr.robinlovelace.net/spatial-operations.html#spatial-vec
# where I've purposely got a field in licence_sf_points that just contains 1s
(licenseRegionCount <- nz_full_WGS84 %>%
st_join(licence_sf_points1) %>%
group_by(SA22020_V1_00_NAME) %>%
summarize(count = sum(rowone, na.rm = TRUE)))
head(licenseRegionCount)
# this is better I think
# The count is an explicit field
####################
# I think this is somewhat working!
ggplot() +
geom_sf(data = licenseDensity, aes(fill=count)) +
coord_sf(
crs = 4326, #"EPSG:4326", or equivalently "WGS84"
xlim = c(164, 179),
ylim = c(-48, -33),
expand = FALSE)
# what about intersection as in this stackexchange answer https://gis.stackexchange.com/a/270479
(intersection <- st_intersection(x = st_transform(nz_full, 4326), y = licence_sf_points)) # nice, this has counts up front
ggplot() +
geom_sf(data = st_transform(nz_full, 4326)) +
geom_sf(data = intersection[1]) +
coord_sf(
crs = 4326, #"EPSG:4326", or equivalently "WGS84"
xlim = c(164, 179),
ylim = c(-48, -33),
expand = FALSE)
licenceSum1 <- aggregate(x = lsp, by = st_transform(nz_full, 4326), FUN = length)
###########################################
nashp <- readOGR(dsn ="H:\\Documents\\wd\\erp\\alcoshp", layer="NA_Alco")
head(nashp)
class(nashp)
# generate an sf dataframe
(nalicences <- st_as_sf( nashp, coords = c("EASTING","NORTHING"), crs = 2193))
# transform to wgs84
(nalicences_WGS84 <- st_transform(nalicences, 4326))
# plotting like this ggplot wont know how to create a legend refer https://stackoverflow.com/questions/38907371/ggplot-legends-when-plot-is-built-from-two-data-frames
ggplot() +
geom_sf(data = st_transform(nz_full, 4326)) +
geom_sf(data = intersection[1], colour="blue", size=2) +
geom_sf(data = nalicences_WGS84[1], colour="red", size=0.5, alpha=0.5) +
coord_sf(
crs = 4326, #"EPSG:4326", or equivalently "WGS84"
xlim = c(164, 179),
ylim = c(-48, -33),
expand = FALSE)
# the answer is to do it like this
ggplot() +
geom_sf(data = st_transform(nz_full, 4326)) +
geom_sf(data = intersection[1], aes(color="MoJ LLR"), size=2) +
geom_sf(data = nalicences_WGS84[1], aes(color="NA LLR"), size=0.5, alpha=0.5) +
coord_sf(
crs = 4326, #"EPSG:4326", or equivalently "WGS84"
xlim = c(164, 179),
ylim = c(-48, -33),
expand = FALSE)
# this is quite slow to plot out but we can investigate what's going on
# bench_time(expr)
# bench_memory(expr)
# bench::mark( some series of functions )
benchMarkResults <- bench::mark( ggplot() +
geom_sf(data = st_transform(nz_full, 4326)) +
geom_sf(data = intersection[1], aes(color="MoJ LLR"), size=2) +
geom_sf(data = nalicences_WGS84[1], aes(color="NA LLR"), size=0.5, alpha=0.5) +
coord_sf(
crs = 4326, #"EPSG:4326", or equivalently "WGS84"
xlim = c(164, 179),
ylim = c(-48, -33),
expand = FALSE)
)
View(benchMarkResults)
# the plot appears to show that all/most MoJ records have a matching NA record - and differences?
# maybe for each item in the NA dataset look for a nearby matching MoJ record
# and can I put a boundary region on this so I don't need to test every point to point distance?
# possible approaches in
# https://stackoverflow.com/questions/22121742/calculate-the-distance-between-two-points-of-two-datasets-nearest-neighbor
# https://gis.stackexchange.com/questions/127244/is-there-an-r-equivalent-of-near-in-esri-arcgis
# or we can just do a pair wise distance calc using st_distance
# (sf::st_distance(intersection, nalicences_WGS84, by_element=F, tolerance=100)) # this is way too slow - what can we do to make this better?
# this still calculates over the full product of points so we need a way to just get the nearest neighbour features
# maybe just take a fraction of points to start with
(sf::st_distance(intersection %>% slice_sample(n=100), nalicences_WGS84 %>% slice_sample(n=100), by_element=F, tolerance=100)) # this is slow - what can we do to make this better?
# I just popped back to look at simplification again to see if I can save some memory on the polygon features
# two examples of simplification: one using st_simplify; the other ms_simplify
ggplot() +
geom_sf(data = st_transform(st_simplify(nz_full, dTolerance = 4000), 4326)) + # st_simplify needs to work on a project crs not lat lon where a unit of longitude changes with latitude
coord_sf(
crs = 4326, #"EPSG:4326", or equivalently "WGS84"
xlim = c(164, 179),
ylim = c(-48, -33),
expand = FALSE)
ggplot() +
geom_sf(data = st_transform(rmapshaper::ms_simplify(nz_full, keep = 0.001, keep_shapes = T), 4326)) + # keeps 0.1% of vertices
coord_sf(
crs = 4326, #"EPSG:4326", or equivalently "WGS84"
xlim = c(164, 179),
ylim = c(-48, -33),
expand = FALSE)
# let's try nngeo::st_nn nearest neighbour search for Simple Features - sounds like what we want
p2p <- st_nn(x = intersection, y = nalicences_WGS84, k = 1, maxdist = 1000, returnDist = T, progress = T) #, parallel = 16)
# using parallel option with all 16 cores destabilises the system and when I try the line 'ggplot() + geom_histogram(aes(distanceSample))' - it dies
# let's get the structure of the created list
str(p2p)
# List of two, $nn and $dist
# I'm guessing $nn identifies the neighbour somehow
# lets histogram the distribution of distances - sample first
distanceSample <- unlist(list.sample(p2p$dist, 100))
ggplot() + geom_histogram(aes(distanceSample))
# may as well do the lot
ggplot() + geom_histogram(aes(unlist(p2p$dist)))
# so we have vast majority of nearest neighbours matching within just a few metres
# do we have duplicated records?
nalicences %>%
group_by(ORG_ID) %>%
filter(n() > 1 )
# answer = no
# filtering out the Not Applicable TYPE from nalicences
nalicencesApplicable <- nalicences %>%
filter(TYPE != "Not Applicable" ) %>%
length()
# next problem, we've matched by geocoordinate - now let's try and match by address using string similarity measures
# n-grams work well but we have others like cosine/jacquard etc
# https://cran.r-project.org/web/packages/ngram/vignettes/ngram-guide.pdf
# https://cran.r-project.org/web/packages/stringdist/stringdist.pdf
# small aside - might try and check how apportionment works in R for two data sets we can do the exact same
# thing as I've done in ArcGIS - Census Estimated Resident Population apportioned to Police Areas - police boundaries are available on koordinates
# https://koordinates.com/layer/3825-nz-police-area-boundaries/
# looks like I can use sf::st_interpolate_aw or, more complete, the areal package
PoliceAreasShp <- readOGR(dsn ="h:\\Documents\\wd\\erp\\Police_Areas_Clipped__2021_04", layer="police_area_clipped_with_maori")
nrow(shp) # features in source
nrow(PoliceAreasShp) # features in target
nrow(suppressWarnings(sf::st_intersection(st_as_sf(shp), st_as_sf(PoliceAreasShp))))
(sf::st_intersection(st_as_sf(shp), st_as_sf(PoliceAreasShp) ) )
sourceERP <- st_as_sf(shp)
targetPoliceAreas <- st_as_sf(PoliceAreasShp)
apportioned <- areal::aw_interpolate(targetPoliceAreas,
tid = "AREA_ID",
source = sourceERP,
sid = "SA22020_V1",
weight = "sum",
output = "sf",
extensive = c("erp18"))
head(apportioned)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment