bohdanszymanik/LiquorLicenceRegisterAddressLookup.R

## LiquorLicenceRegisterAddressLookup.R
##################################################################################
# Import liquor licencing data, geocode and plot
# Plot on a NZ map
# Show a regional view - some demographics eg population density at SA2
# Import the public NZ Police aggregated crime data
# Look at crime harm regional cf liquor outlet distribution
#

library(readxl)
library(rgdal)
library(broom)
library(dplyr)
library(ggplot2)
library(ggmap)
library(memoise)
library(naniar)
library(stringr)
library(purrr)
library(tmaptools)
library(logger)
library(cowplot)
library(ggrepel)
library(sf)
library(nngeo)
library(rlist)
library(bench) # https://cran.r-project.org/web/packages/bench/bench.pdf
library(pryr)
library(stringdist)
library(ngram)
library(areal)

setwd("h:\\Documents\\wd\\erp")

################################################################################
# Bringing in liquour licensing data
#
# read the liquor licensing licences register in two steps
# first get the column names
cNames <- read_excel("February-2021-Licences.xlsx", skip=1, n_max=1) %>% names()
# then the raw data
llr <- read_excel("February-2021-Licences.xlsx", skip=2, col_names=cNames)

# getting a comma separated address - bit pointless having its own function?
#createAddress <- function(`Premises Street`, `Premises Suburb`, `Premises City`, ...) {
#  paste(`Premises Street`, `Premises Suburb`, `Premises City`, sep=',')
#}
# eg
#llr %>% rowwise() %>% pmap_chr(createAddress)

llr2 <- llr %>% dplyr::mutate(address = paste(`Premises Street`, `Premises Suburb`, `Premises City`, c("New Zealand"), sep=',') )
llr2 %>% slice_sample(n=50)
llr2 %>% slice_head(n=5)


geocode_OSM("Klondike Ale House, 5 Gillies St, Kawakawa, new zealand", server = "https://nominatim.openstreetmap.org")
geocode_OSM("5 Gillies St, Kawakawa, new zealand", server = "https://nominatim.openstreetmap.org")
#geocode_OSM("5 Gillies St, Kawakawa, new zealand")
#geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords
#typeof(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
#length(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
#str(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
#is.vector(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
#geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords["x"]


#log_layout(layout_glue_colors) # warning ... unreadable in the log with default text editors
log_formatter(formatter_glue)
log_layout(layout_simple)

log_threshold(TRACE)

#t <- tempfile()
t <- "mylogger.log"
log_appender(appender_file(t))

#readLines(t)
#unlink(t)


cf <- cache_filesystem(".rcache", algo = "xxhash64", compress = FALSE)
# cache_memory(algo="sha512")

# geocode_OSM defaults to EPSG 4326 lat lon
# st_crs can be used to set coordinate reference system on underlying sf objects
# not sure if nztm is supported by sf

lookupAddress <- memoise (
  function(address) {
    geocode <- geocode_OSM(address) # returns either the address if not found, or a vector with lat lon

    if (is.null(geocode)) {
      log_trace('{address}')
      NA
    } else {

      log_trace('{address}\t{geocode$coords["x"]}\t{geocode$coords["y"]}')
      geocode$coords
    }
  },
  cache = cf
)


lookupAddress("5 Gillies St,Kawakawa,New Zealand")
# has_cache(lookupAddress)("5 Gillies St,Kawakawa,New Zealand")
# drop_cache(lookupAddress)("5 Gillies St,Kawakawa,New Zealand")
# forget(lookupAddress)
#geocode_OSM("5 Gillies St,Kawakawa,New Zealand")

# it doesn't like the presence of a # char in a column in the 'address'Premises Street' data so we need to filter that out
llr3 <-
  llr2 %>%
  # slice_head(n=20) %>%
  select(LicenceType, `Date Licence is Valid From`, `Date Licence Expires`, address) %>%
  rowwise %>%
  dplyr::mutate(addressFiltered = str_replace(address, "#", "")) %>%
  dplyr::mutate(lat = lookupAddress(address)["y"], lon = lookupAddress(address)["x"])

# we really need to be doing this for licences that are currently active
# that means date today is after `Date Licence is Valid From` and before `Date Licence Expires`

llr3Active <-
  llr3 %>%
  # slice_head(n=20) %>%
  filter(Sys.time() >= `Date Licence is Valid From` & Sys.time() < `Date Licence Expires`)
# down from 10719 obs to 9547 observations


################################################################################

# Plotting

# first we run against the natural earth data which gives a simple global map with countries
# then we'll go to an NZ map with regional boundaries

library("rnaturalearth")
library("rnaturalearthdata")

world <- ne_countries(scale = "medium", returnclass = "sf")
class(world)

ggplot(data = world) +
  geom_sf() +
  coord_sf(expand=F)

ggplot(data = world) +
  geom_sf() +
  label_graticule = "EW" +
  xlab("Longitude") + ylab("Latitude") +
  ggtitle("World map", subtitle = paste0("(", length(unique(world$NAME)), " countries)"))

# # example of colouring the earth map using an aesthetic based off population data
# ggplot(data = world) +
#   geom_sf(aes(fill = pop_est)) +
#   scale_fill_viridis_c(option = "plasma", trans = "sqrt")

# # plotting just nz out of the rnaturalearth data set
# nz <- world[world$name == 'New Zealand',]
#
# ggplot(data = nz) +
#   geom_sf() +


# # test by plotting one or two points
# two ways to create tibbles on the fly - tribble is quite nice
# tibble( lat = -35.11244, lon = 173.26267)
#
# tribble (
#   ~lat, ~lon,
#   -35.11244, 173.26267,
#   -35.73572, 174.33096,
# )
#
#
# ggplot(data = world) +
#   geom_sf() +
#   geom_point(data = tibble( lat = -35.11244, lon = 173.26267),
#              aes(x = lon, y = lat),
#              size = 2,
#              shape = 23,
#              fill = "darkred") +
#   coord_sf(
#     crs = 4326, #"EPSG:4326", or equivalently "WGS84"
#     xlim = c(164, 179),
#     ylim = c(-48, -33),
#     expand = FALSE)

ggplot(data = world) +
  geom_sf() +
  geom_point(data = llr3, aes(x = lon, y = lat), size = 1,
             shape = 23, fill = "darkred") +
  coord_sf(
    crs = 4326, #"EPSG:4326", or equivalently "WGS84"
    xlim = c(164, 179),
    ylim = c(-48, -33),
    expand = FALSE)


################################################################################

# Time to bring in an NZ map with regions

# We can use the annual boundaries published with population data collected during the census
# With meshblocks it gets real tough without decent memory so let's use a large grain size
# estimated resident population by Statistical Area 2
# https://datafinder.stats.govt.nz/layer/105008-estimated-resident-population-at-30-june-2018-by-statistical-area-2/data/

# In the download dialog box you get a choice of geodatabase or shapefile for the download
# In this example I'm using shapefile

# unzip the shapefile to a directory and then load it like this
# the zipped shape file doesn't make it fully clear here that it's actually SA2 regions
# if you open it up in explorer and take a look at the txt or pdf files
# they explain what's contained... ERP, sex ratio, age groups, ethnicity, median age by sa2
shp <- readOGR(dsn ="h:\\Downloads\\statsnzestimated-resident-population-at-30-june-2018-by-statistical-SHP", layer="estimated-resident-population-at-30-june-2018-by-statistical")

tidy_nzdf <- tidy(shp)

# Pulling out the polygon id to match on when trying to merge with meshblock id
shp$polyID <- sapply(slot(shp, "polygons"), function(x) slot(x, "ID"))

# Merge to get meshblock ids (again can take a couple mins)
nz_df <- merge(tidy_nzdf, shp, by.x = "id", by.y="polyID")


map <- ggplot() +
  geom_path(data = nz_df,
            aes(x = long, y = lat, group = group),
            colour = 'grey', fill = 'white', size = 0.2)

print(map)

populationMap <- ggplot(data=nz_df, aes(x=long, y=lat, group=group)) +
  geom_polygon(aes(fill=erp96), colour='gray', size=0.1) +
  theme(line = element_blank(), axis.text=element_blank(),axis.title=element_blank(), panel.background = element_blank()) +
  scale_fill_gradient2("#4d4dff", mid = "white", high = "#ff4d4d") +
  guides(fill=guide_legend(title="Estimated resident population"))

# plots out with some colour aesthetics for the regions
populationMap


# all very nice but now we want to sum up the number of liquor licenses sitting within each region
# then, with the population being known we can generate a normalised liquor license measure
# then we want to compare to
#   crime harm measure: https://www.police.govt.nz/about-us/publications-statistics/data-and-statistics/policedatanz/victimisation-time-and-place
#   police demand measure: https://www.police.govt.nz/about-us/statistics-and-publications/data-and-statistics/demand-and-activity

# todo - filter out missing values for coords in llr3
llr4 <-
  llr3 %>%
  # slice_head(n=20) %>%
  filter(!is.na(lat)) %>%
  filter(!is.na(lon)) %>%
  filter(between(lat, -48, -33 )) %>% # reasonableness clipping
  filter(between(lon, 164, 179 ))     # reasonableness clipping


licence_sf_points <- st_as_sf( llr4 %>% select(lat,lon), coords = c("lon","lat"), crs = 4326)
lsp <- st_as_sf( llr4 , coords = c("lon","lat"), crs = 4326)

ggplot() + geom_sf(data=licence_sf_points)  # plot the licence locations

# my first attempt at working with nz region data is via the spData package
# library(spData)
# licenceSum <- aggregate(x = licence_sf_points, by = nz, FUN = sum)
#
# licenceSumPlt <- ggplot(data = licenses)
# licenceSumPlt
#
# plot(licenceSum)
#
# class(licence_sf_points)
# head(licence_sf_points)
# class(licenceSum)
# head(licenceSum)
#
# class(nz)
# head(nz)
#
# ggplot() + geom_sf(data=nz)
# plot(nz)
#
# nz %>% filter(Name == "Canterbury") # that's unexpected, no name in the df
#
# m1 = cbind(c(0, 0, 1, 0), c(0, 1, 1, 0))
# m2 = cbind(c(0, 1, 1, 0), c(0, 0, 1, 0))
# pol = st_sfc(st_polygon(list(m1)), st_polygon(list(m2)))
# set.seed(1985)
# d = data.frame(matrix(runif(15), ncol = 3))
# p = st_as_sf(x = d, coords = 1:2)
# head(p)
# plot(pol)
# plot(p, add = TRUE)
#
# (p_ag1 = aggregate(p, pol, mean))
#
# this isn't working the way I thought.
# how about this
class(nz_df)
# nz_df is just a dataframe - I want a sf class

# I'm following approach of what's shown in the spData documentation for the nz data set: https://cran.r-project.org/web/packages/spData/spData.pdf
library(rmapshaper)

# I've downloaded the SA2 dataset as a geopackage this time
unzip("h:\\downloads\\statsnzestimated-resident-population-at-30-june-2018-by-statistical-FGDB.zip")
nz_full = st_read("estimated-resident-population-at-30-june-2018-by-statistical.gdb")
print(object.size(nz_full), units= "Kb") # 30784.2 Kb

# make our memory issues go away... refer to line 435 for better examples

# # we can use other native r packages for the feature simplification
# library(rgeos)
# # rgeos doens't work with sf objects, only sp, so you have to cast
#
# nz_simple <- gSimplify(as(nz_full, 'Spatial'), tol = 1, topologyPreserve = T) %>% st_as_sf()
# print(object.size(nz_simple), units = "Kb") # 30782.6 Kb - no big change :( , maybe don't worry about this atm

class(nz_full) # good - it's an sf dataframe

# I need to get the crs the same on each sf data structure - maybe consolidate on WGS 84
st_crs(licence_sf_points) # WGS 84, EPSG:4326
st_crs(nz_full) # NZGD2000, EPSG:2193

(nz_full_WGS84 <- st_transform(nz_full, 4326))

# Bit confused over aggregate but I think I have it figured now
# I'm using 'length' as the function but it's just going to generate the same length for every field
licenceSum <- aggregate(x = licence_sf_points, by = nz_full_WGS84, FUN = length)

# alternative approach refer https://geocompr.robinlovelace.net/spatial-operations.html#spatial-vec
# where I've purposely got a field in licence_sf_points that just contains 1s
(licenseRegionCount <- nz_full_WGS84 %>%
    st_join(licence_sf_points1) %>%
    group_by(SA22020_V1_00_NAME) %>%
    summarize(count = sum(rowone, na.rm = TRUE)))
head(licenseRegionCount)


# this is better I think
# The count is an explicit field

####################
# I think this is somewhat working!
ggplot() +
  geom_sf(data = licenseDensity, aes(fill=count)) +
  coord_sf(
    crs = 4326, #"EPSG:4326", or equivalently "WGS84"
    xlim = c(164, 179),
    ylim = c(-48, -33),
    expand = FALSE)


# what about intersection as in this stackexchange answer https://gis.stackexchange.com/a/270479
(intersection <- st_intersection(x = st_transform(nz_full, 4326), y = licence_sf_points)) # nice, this has counts up front

ggplot() +
  geom_sf(data = st_transform(nz_full, 4326)) +
  geom_sf(data = intersection[1]) +
  coord_sf(
    crs = 4326, #"EPSG:4326", or equivalently "WGS84"
    xlim = c(164, 179),
    ylim = c(-48, -33),
    expand = FALSE)


licenceSum1 <- aggregate(x = lsp, by = st_transform(nz_full, 4326), FUN = length)


###########################################
nashp <- readOGR(dsn ="H:\\Documents\\wd\\erp\\alcoshp", layer="NA_Alco")

head(nashp)
class(nashp)

# generate an sf dataframe
(nalicences <- st_as_sf( nashp, coords = c("EASTING","NORTHING"), crs = 2193))

# transform to wgs84
(nalicences_WGS84 <- st_transform(nalicences, 4326))

# plotting like this ggplot wont know how to create a legend refer https://stackoverflow.com/questions/38907371/ggplot-legends-when-plot-is-built-from-two-data-frames
ggplot() +
  geom_sf(data = st_transform(nz_full, 4326)) +
  geom_sf(data = intersection[1], colour="blue", size=2) +
  geom_sf(data = nalicences_WGS84[1], colour="red", size=0.5, alpha=0.5) +
  coord_sf(
    crs = 4326, #"EPSG:4326", or equivalently "WGS84"
    xlim = c(164, 179),
    ylim = c(-48, -33),
    expand = FALSE)

# the answer is to do it like this
ggplot() +
  geom_sf(data = st_transform(nz_full, 4326)) +
  geom_sf(data = intersection[1], aes(color="MoJ LLR"), size=2) +
  geom_sf(data = nalicences_WGS84[1], aes(color="NA LLR"), size=0.5, alpha=0.5) +
  coord_sf(
    crs = 4326, #"EPSG:4326", or equivalently "WGS84"
    xlim = c(164, 179),
    ylim = c(-48, -33),
    expand = FALSE)

# this is quite slow to plot out but we can investigate what's going on
# bench_time(expr)
# bench_memory(expr)
# bench::mark( some series of functions )

benchMarkResults <- bench::mark( ggplot() +
                                   geom_sf(data = st_transform(nz_full, 4326)) +
                                   geom_sf(data = intersection[1], aes(color="MoJ LLR"), size=2) +
                                   geom_sf(data = nalicences_WGS84[1], aes(color="NA LLR"), size=0.5, alpha=0.5) +
                                   coord_sf(
                                     crs = 4326, #"EPSG:4326", or equivalently "WGS84"
                                     xlim = c(164, 179),
                                     ylim = c(-48, -33),
                                     expand = FALSE)
)
View(benchMarkResults)


# the plot appears to show that all/most MoJ records have a matching NA record - and differences?
# maybe for each item in the NA dataset look for a nearby matching MoJ record
# and can I put a boundary region on this so I don't need to test every point to point distance?

# possible approaches in
# https://stackoverflow.com/questions/22121742/calculate-the-distance-between-two-points-of-two-datasets-nearest-neighbor
# https://gis.stackexchange.com/questions/127244/is-there-an-r-equivalent-of-near-in-esri-arcgis

# or we can just do a pair wise distance calc using st_distance

# (sf::st_distance(intersection, nalicences_WGS84, by_element=F, tolerance=100)) # this is way too slow - what can we do to make this better?
# this still calculates over the full product of points so we need a way to just get the nearest neighbour features

# maybe just take a fraction of points to start with
(sf::st_distance(intersection %>% slice_sample(n=100), nalicences_WGS84 %>% slice_sample(n=100), by_element=F, tolerance=100)) # this is slow - what can we do to make this better?

# I just popped back to look at simplification again to see if I can save some memory on the polygon features
# two examples of simplification: one using st_simplify; the other ms_simplify
ggplot() +
  geom_sf(data = st_transform(st_simplify(nz_full, dTolerance = 4000), 4326)) + # st_simplify needs to work on a project crs not lat lon where a unit of longitude changes with latitude
  coord_sf(
    crs = 4326, #"EPSG:4326", or equivalently "WGS84"
    xlim = c(164, 179),
    ylim = c(-48, -33),
    expand = FALSE)

ggplot() +
  geom_sf(data = st_transform(rmapshaper::ms_simplify(nz_full, keep = 0.001, keep_shapes = T), 4326)) + # keeps 0.1% of vertices
  coord_sf(
    crs = 4326, #"EPSG:4326", or equivalently "WGS84"
    xlim = c(164, 179),
    ylim = c(-48, -33),
    expand = FALSE)

# let's try nngeo::st_nn nearest neighbour search for Simple Features - sounds like what we want
p2p <- st_nn(x = intersection, y = nalicences_WGS84, k = 1, maxdist = 1000, returnDist = T, progress = T) #, parallel = 16)
# using parallel option with all 16 cores destabilises the system and when I try the line 'ggplot() + geom_histogram(aes(distanceSample))' - it dies

# let's get the structure of the created list
str(p2p)
# List of two, $nn and $dist
# I'm guessing $nn identifies the neighbour somehow

# lets histogram the distribution of distances - sample first
distanceSample <- unlist(list.sample(p2p$dist, 100))
ggplot() + geom_histogram(aes(distanceSample))

# may as well do the lot
ggplot() + geom_histogram(aes(unlist(p2p$dist)))
# so we have vast majority of nearest neighbours matching within just a few metres

# do we have duplicated records?
nalicences %>%
  group_by(ORG_ID) %>%
  filter(n() > 1 )
# answer = no

# filtering out the Not Applicable TYPE from nalicences
nalicencesApplicable <- nalicences %>%
                          filter(TYPE != "Not Applicable" ) %>%
                          length()


# next problem, we've matched by geocoordinate - now let's try and match by address using string similarity measures
# n-grams work well but we have others like cosine/jacquard etc
# https://cran.r-project.org/web/packages/ngram/vignettes/ngram-guide.pdf
# https://cran.r-project.org/web/packages/stringdist/stringdist.pdf


# small aside - might try and check how apportionment works in R for two data sets we can do the exact same
# thing as I've done in ArcGIS - Census Estimated Resident Population apportioned to Police Areas - police boundaries are available on koordinates
# https://koordinates.com/layer/3825-nz-police-area-boundaries/
# looks like I can use sf::st_interpolate_aw or, more complete, the areal package
PoliceAreasShp <- readOGR(dsn ="h:\\Documents\\wd\\erp\\Police_Areas_Clipped__2021_04", layer="police_area_clipped_with_maori")

nrow(shp) # features in source
nrow(PoliceAreasShp) # features in target

nrow(suppressWarnings(sf::st_intersection(st_as_sf(shp), st_as_sf(PoliceAreasShp))))
(sf::st_intersection(st_as_sf(shp), st_as_sf(PoliceAreasShp) ) )

sourceERP <- st_as_sf(shp)
targetPoliceAreas <- st_as_sf(PoliceAreasShp)

apportioned <- areal::aw_interpolate(targetPoliceAreas,
                                     tid = "AREA_ID",
                                     source = sourceERP,
                                     sid = "SA22020_V1",
                                     weight = "sum",
                                     output = "sf",
                                     extensive = c("erp18"))

head(apportioned)
	##################################################################################
	# Import liquor licencing data, geocode and plot
	# Plot on a NZ map
	# Show a regional view - some demographics eg population density at SA2
	# Import the public NZ Police aggregated crime data
	# Look at crime harm regional cf liquor outlet distribution
	#

	library(readxl)
	library(rgdal)
	library(broom)
	library(dplyr)
	library(ggplot2)
	library(ggmap)
	library(memoise)
	library(naniar)
	library(stringr)
	library(purrr)
	library(tmaptools)
	library(logger)
	library(cowplot)
	library(ggrepel)
	library(sf)
	library(nngeo)
	library(rlist)
	library(bench) # https://cran.r-project.org/web/packages/bench/bench.pdf
	library(pryr)
	library(stringdist)
	library(ngram)
	library(areal)

	setwd("h:\\Documents\\wd\\erp")

	################################################################################
	# Bringing in liquour licensing data
	#
	# read the liquor licensing licences register in two steps
	# first get the column names
	cNames <- read_excel("February-2021-Licences.xlsx", skip=1, n_max=1) %>% names()
	# then the raw data
	llr <- read_excel("February-2021-Licences.xlsx", skip=2, col_names=cNames)

	# getting a comma separated address - bit pointless having its own function?
	#createAddress <- function(`Premises Street`, `Premises Suburb`, `Premises City`, ...) {
	# paste(`Premises Street`, `Premises Suburb`, `Premises City`, sep=',')
	#}
	# eg
	#llr %>% rowwise() %>% pmap_chr(createAddress)

	llr2 <- llr %>% dplyr::mutate(address = paste(`Premises Street`, `Premises Suburb`, `Premises City`, c("New Zealand"), sep=',') )
	llr2 %>% slice_sample(n=50)
	llr2 %>% slice_head(n=5)


	geocode_OSM("Klondike Ale House, 5 Gillies St, Kawakawa, new zealand", server = "https://nominatim.openstreetmap.org")
	geocode_OSM("5 Gillies St, Kawakawa, new zealand", server = "https://nominatim.openstreetmap.org")
	#geocode_OSM("5 Gillies St, Kawakawa, new zealand")
	#geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords
	#typeof(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
	#length(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
	#str(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
	#is.vector(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
	#geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords["x"]


	#log_layout(layout_glue_colors) # warning ... unreadable in the log with default text editors
	log_formatter(formatter_glue)
	log_layout(layout_simple)

	log_threshold(TRACE)

	#t <- tempfile()
	t <- "mylogger.log"
	log_appender(appender_file(t))

	#readLines(t)
	#unlink(t)


	cf <- cache_filesystem(".rcache", algo = "xxhash64", compress = FALSE)
	# cache_memory(algo="sha512")

	# geocode_OSM defaults to EPSG 4326 lat lon
	# st_crs can be used to set coordinate reference system on underlying sf objects
	# not sure if nztm is supported by sf

	lookupAddress <- memoise (
	function(address) {
	geocode <- geocode_OSM(address) # returns either the address if not found, or a vector with lat lon

	if (is.null(geocode)) {
	log_trace('{address}')
	NA
	} else {

	log_trace('{address}\t{geocode$coords["x"]}\t{geocode$coords["y"]}')
	geocode$coords
	}
	},
	cache = cf
	)


	lookupAddress("5 Gillies St,Kawakawa,New Zealand")
	# has_cache(lookupAddress)("5 Gillies St,Kawakawa,New Zealand")
	# drop_cache(lookupAddress)("5 Gillies St,Kawakawa,New Zealand")
	# forget(lookupAddress)
	#geocode_OSM("5 Gillies St,Kawakawa,New Zealand")

	# it doesn't like the presence of a # char in a column in the 'address'Premises Street' data so we need to filter that out
	llr3 <-
	llr2 %>%
	# slice_head(n=20) %>%
	select(LicenceType, `Date Licence is Valid From`, `Date Licence Expires`, address) %>%
	rowwise %>%
	dplyr::mutate(addressFiltered = str_replace(address, "#", "")) %>%
	dplyr::mutate(lat = lookupAddress(address)["y"], lon = lookupAddress(address)["x"])

	# we really need to be doing this for licences that are currently active
	# that means date today is after `Date Licence is Valid From` and before `Date Licence Expires`

	llr3Active <-
	llr3 %>%
	# slice_head(n=20) %>%
	filter(Sys.time() >= `Date Licence is Valid From` & Sys.time() < `Date Licence Expires`)
	# down from 10719 obs to 9547 observations


	################################################################################

	# Plotting

	# first we run against the natural earth data which gives a simple global map with countries
	# then we'll go to an NZ map with regional boundaries

	library("rnaturalearth")
	library("rnaturalearthdata")

	world <- ne_countries(scale = "medium", returnclass = "sf")
	class(world)

	ggplot(data = world) +
	geom_sf() +
	coord_sf(expand=F)

	ggplot(data = world) +
	geom_sf() +
	label_graticule = "EW" +
	xlab("Longitude") + ylab("Latitude") +
	ggtitle("World map", subtitle = paste0("(", length(unique(world$NAME)), " countries)"))

	# # example of colouring the earth map using an aesthetic based off population data
	# ggplot(data = world) +
	# geom_sf(aes(fill = pop_est)) +
	# scale_fill_viridis_c(option = "plasma", trans = "sqrt")

	# # plotting just nz out of the rnaturalearth data set
	# nz <- world[world$name == 'New Zealand',]
	#
	# ggplot(data = nz) +
	# geom_sf() +


	# # test by plotting one or two points
	# two ways to create tibbles on the fly - tribble is quite nice
	# tibble( lat = -35.11244, lon = 173.26267)
	#
	# tribble (
	# ~lat, ~lon,
	# -35.11244, 173.26267,
	# -35.73572, 174.33096,
	# )
	#
	#
	# ggplot(data = world) +
	# geom_sf() +
	# geom_point(data = tibble( lat = -35.11244, lon = 173.26267),
	# aes(x = lon, y = lat),
	# size = 2,
	# shape = 23,
	# fill = "darkred") +
	# coord_sf(
	# crs = 4326, #"EPSG:4326", or equivalently "WGS84"
	# xlim = c(164, 179),
	# ylim = c(-48, -33),
	# expand = FALSE)

	ggplot(data = world) +
	geom_sf() +
	geom_point(data = llr3, aes(x = lon, y = lat), size = 1,
	shape = 23, fill = "darkred") +
	coord_sf(
	crs = 4326, #"EPSG:4326", or equivalently "WGS84"
	xlim = c(164, 179),
	ylim = c(-48, -33),
	expand = FALSE)


	################################################################################

	# Time to bring in an NZ map with regions

	# We can use the annual boundaries published with population data collected during the census
	# With meshblocks it gets real tough without decent memory so let's use a large grain size
	# estimated resident population by Statistical Area 2
	# https://datafinder.stats.govt.nz/layer/105008-estimated-resident-population-at-30-june-2018-by-statistical-area-2/data/

	# In the download dialog box you get a choice of geodatabase or shapefile for the download
	# In this example I'm using shapefile

	# unzip the shapefile to a directory and then load it like this
	# the zipped shape file doesn't make it fully clear here that it's actually SA2 regions
	# if you open it up in explorer and take a look at the txt or pdf files
	# they explain what's contained... ERP, sex ratio, age groups, ethnicity, median age by sa2
	shp <- readOGR(dsn ="h:\\Downloads\\statsnzestimated-resident-population-at-30-june-2018-by-statistical-SHP", layer="estimated-resident-population-at-30-june-2018-by-statistical")

	tidy_nzdf <- tidy(shp)

	# Pulling out the polygon id to match on when trying to merge with meshblock id
	shp$polyID <- sapply(slot(shp, "polygons"), function(x) slot(x, "ID"))

	# Merge to get meshblock ids (again can take a couple mins)
	nz_df <- merge(tidy_nzdf, shp, by.x = "id", by.y="polyID")


	map <- ggplot() +
	geom_path(data = nz_df,
	aes(x = long, y = lat, group = group),
	colour = 'grey', fill = 'white', size = 0.2)

	print(map)

	populationMap <- ggplot(data=nz_df, aes(x=long, y=lat, group=group)) +
	geom_polygon(aes(fill=erp96), colour='gray', size=0.1) +
	theme(line = element_blank(), axis.text=element_blank(),axis.title=element_blank(), panel.background = element_blank()) +
	scale_fill_gradient2("#4d4dff", mid = "white", high = "#ff4d4d") +
	guides(fill=guide_legend(title="Estimated resident population"))

	# plots out with some colour aesthetics for the regions
	populationMap


	# all very nice but now we want to sum up the number of liquor licenses sitting within each region
	# then, with the population being known we can generate a normalised liquor license measure
	# then we want to compare to
	# crime harm measure: https://www.police.govt.nz/about-us/publications-statistics/data-and-statistics/policedatanz/victimisation-time-and-place
	# police demand measure: https://www.police.govt.nz/about-us/statistics-and-publications/data-and-statistics/demand-and-activity

	# todo - filter out missing values for coords in llr3
	llr4 <-
	llr3 %>%
	# slice_head(n=20) %>%
	filter(!is.na(lat)) %>%
	filter(!is.na(lon)) %>%
	filter(between(lat, -48, -33 )) %>% # reasonableness clipping
	filter(between(lon, 164, 179 )) # reasonableness clipping


	licence_sf_points <- st_as_sf( llr4 %>% select(lat,lon), coords = c("lon","lat"), crs = 4326)
	lsp <- st_as_sf( llr4 , coords = c("lon","lat"), crs = 4326)

	ggplot() + geom_sf(data=licence_sf_points) # plot the licence locations

	# my first attempt at working with nz region data is via the spData package
	# library(spData)
	# licenceSum <- aggregate(x = licence_sf_points, by = nz, FUN = sum)
	#
	# licenceSumPlt <- ggplot(data = licenses)
	# licenceSumPlt
	#
	# plot(licenceSum)
	#
	# class(licence_sf_points)
	# head(licence_sf_points)
	# class(licenceSum)
	# head(licenceSum)
	#
	# class(nz)
	# head(nz)
	#
	# ggplot() + geom_sf(data=nz)
	# plot(nz)
	#
	# nz %>% filter(Name == "Canterbury") # that's unexpected, no name in the df
	#
	# m1 = cbind(c(0, 0, 1, 0), c(0, 1, 1, 0))
	# m2 = cbind(c(0, 1, 1, 0), c(0, 0, 1, 0))
	# pol = st_sfc(st_polygon(list(m1)), st_polygon(list(m2)))
	# set.seed(1985)
	# d = data.frame(matrix(runif(15), ncol = 3))
	# p = st_as_sf(x = d, coords = 1:2)
	# head(p)
	# plot(pol)
	# plot(p, add = TRUE)
	#
	# (p_ag1 = aggregate(p, pol, mean))
	#
	# this isn't working the way I thought.
	# how about this
	class(nz_df)
	# nz_df is just a dataframe - I want a sf class

	# I'm following approach of what's shown in the spData documentation for the nz data set: https://cran.r-project.org/web/packages/spData/spData.pdf
	library(rmapshaper)

	# I've downloaded the SA2 dataset as a geopackage this time
	unzip("h:\\downloads\\statsnzestimated-resident-population-at-30-june-2018-by-statistical-FGDB.zip")
	nz_full = st_read("estimated-resident-population-at-30-june-2018-by-statistical.gdb")
	print(object.size(nz_full), units= "Kb") # 30784.2 Kb

	# make our memory issues go away... refer to line 435 for better examples

	# # we can use other native r packages for the feature simplification
	# library(rgeos)
	# # rgeos doens't work with sf objects, only sp, so you have to cast
	#
	# nz_simple <- gSimplify(as(nz_full, 'Spatial'), tol = 1, topologyPreserve = T) %>% st_as_sf()
	# print(object.size(nz_simple), units = "Kb") # 30782.6 Kb - no big change :( , maybe don't worry about this atm

	class(nz_full) # good - it's an sf dataframe

	# I need to get the crs the same on each sf data structure - maybe consolidate on WGS 84
	st_crs(licence_sf_points) # WGS 84, EPSG:4326
	st_crs(nz_full) # NZGD2000, EPSG:2193

	(nz_full_WGS84 <- st_transform(nz_full, 4326))

	# Bit confused over aggregate but I think I have it figured now
	# I'm using 'length' as the function but it's just going to generate the same length for every field
	licenceSum <- aggregate(x = licence_sf_points, by = nz_full_WGS84, FUN = length)

	# alternative approach refer https://geocompr.robinlovelace.net/spatial-operations.html#spatial-vec
	# where I've purposely got a field in licence_sf_points that just contains 1s
	(licenseRegionCount <- nz_full_WGS84 %>%
	st_join(licence_sf_points1) %>%
	group_by(SA22020_V1_00_NAME) %>%
	summarize(count = sum(rowone, na.rm = TRUE)))
	head(licenseRegionCount)


	# this is better I think
	# The count is an explicit field

	####################
	# I think this is somewhat working!
	ggplot() +
	geom_sf(data = licenseDensity, aes(fill=count)) +
	coord_sf(
	crs = 4326, #"EPSG:4326", or equivalently "WGS84"
	xlim = c(164, 179),
	ylim = c(-48, -33),
	expand = FALSE)


	# what about intersection as in this stackexchange answer https://gis.stackexchange.com/a/270479
	(intersection <- st_intersection(x = st_transform(nz_full, 4326), y = licence_sf_points)) # nice, this has counts up front

	ggplot() +
	geom_sf(data = st_transform(nz_full, 4326)) +
	geom_sf(data = intersection[1]) +
	coord_sf(
	crs = 4326, #"EPSG:4326", or equivalently "WGS84"
	xlim = c(164, 179),
	ylim = c(-48, -33),
	expand = FALSE)


	licenceSum1 <- aggregate(x = lsp, by = st_transform(nz_full, 4326), FUN = length)



	###########################################
	nashp <- readOGR(dsn ="H:\\Documents\\wd\\erp\\alcoshp", layer="NA_Alco")

	head(nashp)
	class(nashp)

	# generate an sf dataframe
	(nalicences <- st_as_sf( nashp, coords = c("EASTING","NORTHING"), crs = 2193))

	# transform to wgs84
	(nalicences_WGS84 <- st_transform(nalicences, 4326))

	# plotting like this ggplot wont know how to create a legend refer https://stackoverflow.com/questions/38907371/ggplot-legends-when-plot-is-built-from-two-data-frames
	ggplot() +
	geom_sf(data = st_transform(nz_full, 4326)) +
	geom_sf(data = intersection[1], colour="blue", size=2) +
	geom_sf(data = nalicences_WGS84[1], colour="red", size=0.5, alpha=0.5) +
	coord_sf(
	crs = 4326, #"EPSG:4326", or equivalently "WGS84"
	xlim = c(164, 179),
	ylim = c(-48, -33),
	expand = FALSE)

	# the answer is to do it like this
	ggplot() +
	geom_sf(data = st_transform(nz_full, 4326)) +
	geom_sf(data = intersection[1], aes(color="MoJ LLR"), size=2) +
	geom_sf(data = nalicences_WGS84[1], aes(color="NA LLR"), size=0.5, alpha=0.5) +
	coord_sf(
	crs = 4326, #"EPSG:4326", or equivalently "WGS84"
	xlim = c(164, 179),
	ylim = c(-48, -33),
	expand = FALSE)

	# this is quite slow to plot out but we can investigate what's going on
	# bench_time(expr)
	# bench_memory(expr)
	# bench::mark( some series of functions )

	benchMarkResults <- bench::mark( ggplot() +
	geom_sf(data = st_transform(nz_full, 4326)) +
	geom_sf(data = intersection[1], aes(color="MoJ LLR"), size=2) +
	geom_sf(data = nalicences_WGS84[1], aes(color="NA LLR"), size=0.5, alpha=0.5) +
	coord_sf(
	crs = 4326, #"EPSG:4326", or equivalently "WGS84"
	xlim = c(164, 179),
	ylim = c(-48, -33),
	expand = FALSE)
	)
	View(benchMarkResults)



	# the plot appears to show that all/most MoJ records have a matching NA record - and differences?
	# maybe for each item in the NA dataset look for a nearby matching MoJ record
	# and can I put a boundary region on this so I don't need to test every point to point distance?

	# possible approaches in
	# https://stackoverflow.com/questions/22121742/calculate-the-distance-between-two-points-of-two-datasets-nearest-neighbor
	# https://gis.stackexchange.com/questions/127244/is-there-an-r-equivalent-of-near-in-esri-arcgis

	# or we can just do a pair wise distance calc using st_distance

	# (sf::st_distance(intersection, nalicences_WGS84, by_element=F, tolerance=100)) # this is way too slow - what can we do to make this better?
	# this still calculates over the full product of points so we need a way to just get the nearest neighbour features

	# maybe just take a fraction of points to start with
	(sf::st_distance(intersection %>% slice_sample(n=100), nalicences_WGS84 %>% slice_sample(n=100), by_element=F, tolerance=100)) # this is slow - what can we do to make this better?

	# I just popped back to look at simplification again to see if I can save some memory on the polygon features
	# two examples of simplification: one using st_simplify; the other ms_simplify
	ggplot() +
	geom_sf(data = st_transform(st_simplify(nz_full, dTolerance = 4000), 4326)) + # st_simplify needs to work on a project crs not lat lon where a unit of longitude changes with latitude
	coord_sf(
	crs = 4326, #"EPSG:4326", or equivalently "WGS84"
	xlim = c(164, 179),
	ylim = c(-48, -33),
	expand = FALSE)

	ggplot() +
	geom_sf(data = st_transform(rmapshaper::ms_simplify(nz_full, keep = 0.001, keep_shapes = T), 4326)) + # keeps 0.1% of vertices
	coord_sf(
	crs = 4326, #"EPSG:4326", or equivalently "WGS84"
	xlim = c(164, 179),
	ylim = c(-48, -33),
	expand = FALSE)

	# let's try nngeo::st_nn nearest neighbour search for Simple Features - sounds like what we want
	p2p <- st_nn(x = intersection, y = nalicences_WGS84, k = 1, maxdist = 1000, returnDist = T, progress = T) #, parallel = 16)
	# using parallel option with all 16 cores destabilises the system and when I try the line 'ggplot() + geom_histogram(aes(distanceSample))' - it dies

	# let's get the structure of the created list
	str(p2p)
	# List of two, $nn and $dist
	# I'm guessing $nn identifies the neighbour somehow

	# lets histogram the distribution of distances - sample first
	distanceSample <- unlist(list.sample(p2p$dist, 100))
	ggplot() + geom_histogram(aes(distanceSample))

	# may as well do the lot
	ggplot() + geom_histogram(aes(unlist(p2p$dist)))
	# so we have vast majority of nearest neighbours matching within just a few metres

	# do we have duplicated records?
	nalicences %>%
	group_by(ORG_ID) %>%
	filter(n() > 1 )
	# answer = no

	# filtering out the Not Applicable TYPE from nalicences
	nalicencesApplicable <- nalicences %>%
	filter(TYPE != "Not Applicable" ) %>%
	length()


	# next problem, we've matched by geocoordinate - now let's try and match by address using string similarity measures
	# n-grams work well but we have others like cosine/jacquard etc
	# https://cran.r-project.org/web/packages/ngram/vignettes/ngram-guide.pdf
	# https://cran.r-project.org/web/packages/stringdist/stringdist.pdf



	# small aside - might try and check how apportionment works in R for two data sets we can do the exact same
	# thing as I've done in ArcGIS - Census Estimated Resident Population apportioned to Police Areas - police boundaries are available on koordinates
	# https://koordinates.com/layer/3825-nz-police-area-boundaries/
	# looks like I can use sf::st_interpolate_aw or, more complete, the areal package
	PoliceAreasShp <- readOGR(dsn ="h:\\Documents\\wd\\erp\\Police_Areas_Clipped__2021_04", layer="police_area_clipped_with_maori")

	nrow(shp) # features in source
	nrow(PoliceAreasShp) # features in target

	nrow(suppressWarnings(sf::st_intersection(st_as_sf(shp), st_as_sf(PoliceAreasShp))))
	(sf::st_intersection(st_as_sf(shp), st_as_sf(PoliceAreasShp) ) )

	sourceERP <- st_as_sf(shp)
	targetPoliceAreas <- st_as_sf(PoliceAreasShp)

	apportioned <- areal::aw_interpolate(targetPoliceAreas,
	tid = "AREA_ID",
	source = sourceERP,
	sid = "SA22020_V1",
	weight = "sum",
	output = "sf",
	extensive = c("erp18"))

	head(apportioned)