Last active
September 19, 2016 18:43
Star
You must be signed in to star a gist
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(pdftools)

# Read the dispensary list from the PDF and normalise the raw text into
# ";"-separated fields that read.delim() can parse further down.
url <- "http://www.idfpr.com/Forms/MC/ListofLicensedDispensaries.pdf"
text <- pdf_text(url)
t <- text

# strip out info about start date, end date, etc... it is not consistently
# formatted, e.g.
#   13 8/24/2015 8/24/2016 280.000\r\n
# NOTE(review): the "." between the last two digit groups is unescaped, so it
# matches any character, not just a decimal point -- confirm before tightening.
t <- gsub("\\d+\\s+\\d+/\\d+/\\d{4}\\s+\\d+/\\d+/\\d{4}\\s+\\d+.\\d+\\s+", "", t)

# turn line breaks into separators; accept both "\r\n" and a bare "\n" so the
# cleanup does not depend on which line ending the extracted text uses
t <- gsub("\r?\n", ";", t)
# turn 2 or more spaces into separators
t <- gsub("\\s{2,}", ";", t)
# get rid of multiple consecutive separators
t <- gsub(";{2,}", ";", t)
# also remove empty records
t <- gsub(";\\s+;", "", t)

# finally clean up some messiness in the data: a few names are split across
# two fields, so drop the stray fragment and re-attach it to the name
t <- gsub(";Dispensary;", ";", t)
t <- gsub("Salveo Health & Wellness", "Salveo Health & Wellness Dispensary", t)
t <- gsub(";Illinois;", ";", t)
t <- gsub("Healthway Services of West", "Healthway Services of West Illinois", t)
t <- gsub(";Centers;", ";", t)
t <- gsub("Trinity Compassionate Care", "Trinity Compassionate Care Centers", t)
t <- gsub("Maribis of Chicago Chicago", "Maribis of Chicago; Chicago", t)

# some records need ";" in between zip and phone number as of 8/22/16
t <- gsub("(\\d{5})\\s+\\(", "\\1;(", t)

# Remove the text at the top of the pdf (updated on 8/22/16)
top <- ";Illinois Department of Financial and Professional Regulation;Division of Professional Regulation;BRUCE RAUNER;DANIEL KELBER;Governor;Acting Director;The Illinois Department of Financial and Professional Regulation, Division of;Professional Regulation has licensed the following medical cannabis dispensaries under the Illinois Compassion;Cannabis Pilot program Act, 410 ILCS 130/1 et seq., and the regulations adopted pursuant ther;PATIENTS: You must select a dispensary with the Illinois Department of Public Health. BEFORE YOUR FIRST VIS;and ask when it is open for business.;IDFPR - LICENSED MEDICAL CANNABIS DISPENSARIES;Medical;License;License;Cred;Name;Address & Phone Number;Cannabis;Expiration;Issue Date;Nu;District;Date"
# the header is literal text, so match it verbatim rather than as a regex
t <- gsub(top, "", t, fixed = TRUE)
# now read the cleaned text as a data.frame; the text connection is closed
# explicitly because read.delim() does not close a connection it did not open
con <- textConnection(t)
d <- tryCatch(
  read.delim(con, header = FALSE, sep = ";", blank.lines.skip = TRUE,
             stringsAsFactors = FALSE),
  finally = close(con)
)

# each set of 4 vars starting with V2 is a new record.
record_fields <- c("addr", "name", "citystatezip", "phone")
n_fields <- length(record_fields)
# V1 is skipped, so only (ncol(d) - 1) columns can hold record fields; counting
# from ncol(d) would run past the last column when ncol(d) is a multiple of 4
nloops <- floor((ncol(d) - 1) / n_fields)

# build one 4-column data.frame per record group, then stack them in one go
# instead of growing the result with rbind() inside a loop
chunks <- lapply(seq_len(nloops), function(i) {
  start <- 2 + (i - 1) * n_fields
  df <- d[, start:(start + n_fields - 1), drop = FALSE]
  names(df) <- record_fields
  df
})
all <- if (length(chunks) > 0) {
  do.call(rbind, chunks)
} else {
  # no complete record group found: keep an empty frame with the same columns
  data.frame(addr = character(), name = character(),
             citystatezip = character(), phone = character(),
             stringsAsFactors = FALSE)
}

# get rid of all blank rows
all[!is.na(all) & all == ""] <- NA
all <- all[complete.cases(all), ]

# create address for geocoding
all$address <- paste(all$addr, all$citystatezip, sep = ", ")
# now get lat, lon values from the address
# thanks to http://stackoverflow.com/questions/22887833/r-how-to-geocode-a-simple-address-using-data-science-toolbox
# NOTE(review): geo.dsk() below only calls httr and rjson functions; RDSTK is
# kept loaded as in the original script -- confirm whether it is still needed.
require(RDSTK)

# Geocode a single address with the Data Science Toolkit geocoding endpoint.
#
# Args:
#   addr: a single address string, sent as the `address` query parameter.
#
# Returns:
#   A named character vector c(address, long, lat). `long` and `lat` are NA
#   (with a warning) when the request fails at the HTTP level or the response
#   holds no usable result, so every call yields exactly three elements and
#   the per-address results can always be row-bound together.
geo.dsk <- function(addr) {
  url <- "http://www.datasciencetoolkit.org/maps/api/geocode/json"

  # uniform "no coordinates" answer used by every failure path below
  no_match <- function(reason) {
    warning("geo.dsk: ", reason, " for address: ", addr, call. = FALSE)
    c(address = addr, long = NA, lat = NA)
  }

  response <- httr::GET(url, query = list(sensor = "FALSE", address = addr))
  if (httr::http_error(response)) {
    return(no_match(paste("HTTP status", httr::status_code(response))))
  }

  # ask for the body as text with an explicit encoding (avoids httr's
  # "No encoding supplied: defaulting to UTF-8." message)
  body <- httr::content(response, as = "text", encoding = "UTF-8")
  json <- rjson::fromJSON(body)

  # an empty or missing `results` entry used to fail with
  # "subscript out of bounds"; treat it as "no match" instead
  results <- if (is.list(json)) json[["results"]] else NULL
  if (length(results) == 0) {
    return(no_match("no geocoding result"))
  }

  loc <- results[[1]]$geometry$location
  if (is.null(loc$lng) || is.null(loc$lat)) {
    return(no_match("result without coordinates"))
  }

  c(address = addr, long = loc$lng, lat = loc$lat)
}
# geocode every address and stack the per-address vectors row-wise;
# stringsAsFactors = FALSE keeps the coordinates as plain character columns
# on every R version, matching how the records themselves were read
result <- do.call(rbind, lapply(all$address, geo.dsk))
result <- data.frame(result, stringsAsFactors = FALSE)
# now plot it on a map
# create id and LatLong for googleVis map
all$id <- paste(all$name, all$address, all$phone, sep = "; ")
# the geocoding results must line up row-for-row with the records, otherwise
# paste() would silently recycle coordinates onto the wrong dispensary
stopifnot(nrow(result) == nrow(all))
all$LatLong <- paste(result$lat, result$long, sep = ":")

# Now plot with googleVis; library() stops right here if the package is
# missing, instead of failing later with "could not find function"
library(googleVis)
g1 <- gvisMap(
  all, "LatLong", "id",
  options = list(
    showTip = TRUE,
    showLine = TRUE,
    enableScrollWheel = TRUE,
    mapType = "normal",
    width = 400, height = 400
  )
)

# this opens a browser with the plot
plot(g1)

# write the code that will be used on my website
cat(g1$html$chart, file = "dispensariesIL.html")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi,
I tried to follow your code, but I found an error when I ran this line:
result <- do.call(rbind,lapply(all$address,geo.dsk))
And the error is,
No encoding supplied: defaulting to UTF-8.
Error in json["results"][[1]][[1]] : subscript out of bounds