Skip to content

Instantly share code, notes, and snippets.

@lashlee
Created March 16, 2018 03:24
Show Gist options
  • Save lashlee/a1579c99395a5589d50f9683dd424755 to your computer and use it in GitHub Desktop.
Save lashlee/a1579c99395a5589d50f9683dd424755 to your computer and use it in GitHub Desktop.
library(tidyverse)
library(ggmap)
#Test number must be of the form of a number (ndx) followed by a single letter (version).
is_valid_test_nbr <- function(test_nbr) grepl("^[0-9]+[A-Za-z]?{1}$",test_nbr)
##Test
identical(is_valid_test_nbr(c("A","123","123A","123AB","_123A","_123","_A")),
c(FALSE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE))
get_test_nbr_ndx <- function(test_nbr) {
if (all(is_valid_test_nbr(test_nbr))) {
as.integer(gsub("[A-Za-z]","",test_nbr))
} else {
stop("At least one test number is not valid.")
}
}
get_test_nbr_version <- function(test_nbr) {
if (all(is_valid_test_nbr(test_nbr))) {
gsub("[0-9]","",test_nbr)
} else {
stop("At least one test number is not valid.")
}
}
csv <- read.csv("data.csv",stringsAsFactors=FALSE,na.strings=c(""," ","NA","-"))
dat <- csv %>%
rename(test_nbr = "Test..No.",
hydrant_nbr_pitot = "Pitot.Hydrant.No.",
hydrant_nbr_static_residual = "Static.and.Residual.Hydrant.No.",
test_dt = "Test.Date",
addr = "Approximate.Location",
pressure_static_psi = "Static.Press...psi.",
pressure_residual_psi = "Residual.Press...psi.",
pressure_pitot_psi = "Pitot.Press...psi.",
orifice_diameter_in = "Orifice.Diam...in.",
discharge_obs_gpm = "Observed.Discharge..gpm.",
discharge_est_at_20_psi_gpm = "Est..Discharge..20.psi..gpm.",
pitot_tube_ndx = "Pitot.Tube",
orifice_diameter_2_in = "Orifice.Diam...in..1",
discharge_obs_2_gpm = "Observed.Discharge..gpm..1") %>%
filter(test_nbr != "") %>%
mutate(test_nbr_ndx = get_test_nbr_ndx(test_nbr),
test_nbr_version = get_test_nbr_version(test_nbr),
test_dt = as.Date(test_dt,format="%m/%d/%Y"),
pitot_tube_ndx = if_else(pitot_tube_ndx=="<NA>",NA_integer_,as.integer(pitot_tube_ndx)),
addr = if_else(addr=="Sutro Street and Sean Street","Sutro Street and Dean Street",addr)) %>%
mutate(addr_haywarded = paste(addr, "Hayward CA")) %>%
mutate_at(vars(discharge_obs_gpm,
discharge_est_at_20_psi_gpm,
orifice_diameter_2_in,
discharge_obs_2_gpm),
function(x) {as.numeric(gsub(",","",x))}) %>%
mutate_geocode(addr_haywarded) %>% #This is the very time-expensive call to Google for lat,long pairs per address.
arrange(test_nbr_ndx,test_nbr_version,test_dt) %>%
group_by(test_nbr_ndx) %>% mutate(test_nbr_version_ndx = row_number()) %>% ungroup() %>%
# Manual corrections because Google is dumb
mutate(lat = if_else(addr_haywarded=="Tripaldi Way and Hesperian Blvd Hayward CA",37.614299,lat),
lon = if_else(addr_haywarded=="Tripaldi Way and Hesperian Blvd Hayward CA",-122.086035,lon))
save(dat,file="cleaned_data.RData")
write.table(dat,file="fire_flow.tsv",sep="\t",row.names=FALSE)
#This is really crap code.
dat2 <- dat %>% filter(is.na(lon) | is.na(lat))
missing_geocodes <- geocode(dat2$addr)
dat3 <- data.frame(dat2,missing_geocodes) %>% select(test_nbr_ndx,test_nbr_version_ndx,lat.1,lon.1)
dat4 <- dat %>%
left_join(dat3,by=c("test_nbr_ndx","test_nbr_version_ndx")) %>%
mutate(lat=coalesce(lat,lat.1),lon=coalesce(lon,lon.1)) %>%
select(-c(lat.1,lon.1))
dat5 <- dat4 %>% filter(is.na(lon) | is.na(lat))
missing_geocodes <- geocode(dat5$addr)
dat6 <- data.frame(dat5,missing_geocodes) %>% select(test_nbr_ndx,test_nbr_version_ndx,lat.1,lon.1)
dat7 <- dat4 %>%
left_join(dat6,by=c("test_nbr_ndx","test_nbr_version_ndx")) %>%
mutate(lat=coalesce(lat,lat.1),lon=coalesce(lon,lon.1)) %>%
select(-c(lat.1,lon.1))
dat8 <- dat7 %>% filter(is.na(lon) | is.na(lat))
missing_geocodes <- geocode(dat8$addr)
dat9 <- data.frame(dat8,missing_geocodes) %>% select(test_nbr_ndx,test_nbr_version_ndx,lat.1,lon.1)
dat10 <- dat7 %>%
left_join(dat9,by=c("test_nbr_ndx","test_nbr_version_ndx")) %>%
mutate(lat=coalesce(lat,lat.1),lon=coalesce(lon,lon.1)) %>%
select(-c(lat.1,lon.1))
dat_nonmissing <- dat10
save(dat_nonmissing,file="nonmissing_cleaned_data.RData")
write.table(dat_nonmissing,file="fire_flow.tsv",sep="\t",row.names=FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment