Skip to content

Instantly share code, notes, and snippets.

@daltare
Last active February 9, 2021 22:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save daltare/6185457d3d81ea59fe7c1730a7e40318 to your computer and use it in GitHub Desktop.
Save daltare/6185457d3d81ea59fe7c1730a7e40318 to your computer and use it in GitHub Desktop.
# This example shows one way to access subsets of CEDEN data via the California open data portal API (it's not
# necessarily the only or best way though) - if you find any problems or have questions, please contact: david.altare@waterboards.ca.gov
# This example applies to the following CEDEN datasets on the open data portal:
# Toxicity: https://data.ca.gov/dataset/surface-water-toxicity-results/resource/bd484e9b-426a-4ba6-ba4d-f5f8ce095836
# Benthic Macroinvertebrates: https://data.ca.gov/dataset/surface-water-benthic-macroinvertebrate-results/resource/3dfee140-47d5-4e29-99ae-16b9b12a404f
# (NOTE: other CEDEN datasets are split by year, and cannont be queried across multiple years using this method -- for other CEDEN datasets, see: https://gist.github.com/daltare/2de1517ad1e315c4b1cad01278de96dd)
# This example assumes that you have your own API key saved in a system environment variable
# named: "data_portal_key" (to obtain an API key, create an account at data.ca.gov, then go to your
# account page and look for the "API Key" header in the lower left side of the page)
# NOTE: There appears to be a limit of 32,000 records that can be returned from the API in a single call -- if your API query
# is likely to return more records than that limit, you'll need to break it up into multiple calls to the API
# The function below returns a warning message when this occurs
# load packages
library(ckanr)
library(dplyr)
library(lubridate)
library(httr)
library(jsonlite)
library(glue)
library(readr)
library(purrr)
# set the option to use standard notation rather than scientific notation
options(scipen = 999)
# create a generic function to run an API query and return the formatted results, given a URL for the query
generic_api_query <- function(query_url) {
# encode URL
url_encoded <- URLencode(query_url)
# get the data
query_response <- httr::GET(url_encoded, add_headers(Authorization = Sys.getenv('data_portal_key')))
query_char <- rawToChar(query_response$content)
query_content <- jsonlite::fromJSON(query_char)
resource_records <- query_content$result$records
# get information about the fields included in the dataset (allows for reording the fields to match the original dataset)
resource_fields <- query_content$result$fields %>%
filter(!id %in% c('_id', '_full_text')) %>% # drop these two fields, which are added by the portal and not actually part of the dataset
pull(id)
# format the data- convert to tibble, set the data types, and reorder the fields to match the order of the original dataset
resource_records <- tibble(resource_records) %>%
type_convert(na = c("", "NaN")) %>%
select(all_of(resource_fields))
# print a warning if the API limit was reached by the query
if (nrow(resource_records) == 32000) {
warning('WARNING: the API record limit was reached - it\'s likely that some records matching your query were not returned')
}
# return the results
return(resource_records)
}
# create a function to download data from the API, given an input resource ID and optional filters for date, text, and numeric fields
get_api_data <- function(resource_id, filters_date_start = NA, filters_date_end = NA, filters_text = NA, filters_numeric = NA, filters_numeric_drop_na = NA) {
# NOTES: any of the filters_... variables can be left as NA to ignore them
# for date and text filters, enter inputs as named character vectors (multiple name-value pairs can be enterd) -- e.g: c('field_name' = 'query_value')
# for numeric filters, include the condition (>, <, >=, <=, <>) as part of the value (multiple name-condition pairs can be entered) -- e.g.: c('field_name' = '> query_value')
# for the filters_numeric_drop_na field, just enter the name of the field to drop missing numeric values from (multiple field names can be entered) -- e.g., c('field_name')
# format the filter for minimum/starting dates
if (!is.na(filters_date_start[1])) {
query_date_start <- map2_chr(filters_date_start, names(filters_date_start), ~glue("\"{.y}\" >= '{.x}'")) %>%
unname() %>%
glue_collapse(sep = ' AND ')
} else {
query_date_start <- c()
}
# format the filters for maximum/ending dates
if (!is.na(filters_date_end[1])) {
query_date_end <- map2_chr(filters_date_end, names(filters_date_end), ~glue("\"{.y}\" <= '{.x}'")) %>%
unname() %>%
glue_collapse(sep = ' AND ')
} else {
query_date_end <- c()
}
# format the filters for text values
if (!is.na(filters_text[1])) {
query_text_filter <- map2_chr(filters_text, names(filters_text), ~glue("\"{.y}\" = '{.x}'")) %>%
unname() %>%
glue_collapse(sep = ' AND ')
} else {
query_text_filter <- c()
}
# format the filters for numeric values
if (!is.na(filters_numeric[1])) {
query_numeric_filter <- map2_chr(filters_numeric, names(filters_numeric), ~glue("\"{.y}\" {.x}")) %>%
unname() %>%
glue_collapse(sep = ' AND ')
} else {
query_numeric_filter <- c()
}
# format the filters to drop missing/null numeric values
if (!is.na(filters_numeric_drop_na[1])) {
query_numeric_drop_na <- map_chr(filters_numeric_drop_na, ~glue("\"{.x}\" <> '{NaN}'")) %>%
# unname() %>%
glue_collapse(sep = ' AND ')
} else {
query_numeric_drop_na <- c()
}
# combine all of the different filter types into one statement
filters_combined <- glue_collapse(c(query_date_start, query_date_end, query_text_filter, query_numeric_filter, query_numeric_drop_na), sep = ' AND ')
# create URL for the API call
api_url <- glue("https://data.ca.gov/api/3/action/datastore_search_sql?sql=SELECT * from \"{resource_id}\"")
if (length(filters_combined) > 0) { # if there are additional filters, add them
api_url <- glue("{api_url} WHERE {filters_combined}")
}
# submit the API query using the constructed url, and return the resulting data frame
return(generic_api_query(api_url))
}
#### EXAMPLE QUERY - GET DATA (using the toxicity dataset) ####
# make an API call to get some data (in this case, get samples between 2016-07-01 and 2018-06-30,
# where the analyte name is "Oxygen, Dissolved, Total", the units are "mg/L", and the result value is
# greater than 10 and is not missing/null)
df_toxicity <- get_api_data(resource_id = 'bd484e9b-426a-4ba6-ba4d-f5f8ce095836',
filters_date_start = c('SampleDate' = '2016-07-01'),
filters_date_end = c('SampleDate' = '2018-06-30'),
filters_text = c('Analyte' = 'Oxygen, Dissolved, Total',
'Unit' = 'mg/L'),
filters_numeric = c('Result' = '> 10'),
filters_numeric_drop_na = c('Result'))
#### EXAMPLE QUERY - VERIFY DATA ####
# verify that records with the correct analyte names and units were returned, and get the number of records
df_toxicity %>% count(Analyte, Unit)
# # A tibble: 1 x 3
# Analyte Unit n
# <chr> <chr> <int>
# 1 Oxygen, Dissolved, Total mg/L 1463
# Verify that the sample date range is correct
range(df_toxicity$SampleDate)
# [1] "2016-07-11 UTC" "2018-06-21 UTC"
# Verify that the result range is correct
range(df_toxicity$Result)
# [1] 10.07 88.00
# view a summary of the entire dataset
glimpse(df_toxicity)
# Rows: 1,463
# Columns: 88
# $ Program <chr> "BASMAA Regional Monitoring Coalition", "BASMAA Regional M...
# $ ParentProject <chr> "BASMAA RMC Monitoring in WY2016", "BASMAA RMC Monitoring ...
# $ Project <chr> "ACCWP Creek Status Monitoring in WY2016", "SCVURPPP Pesti...
# $ StationName <chr> "Sausal at E.22nd", "San Tomas Aquino 250m u/s Mission Col...
# $ StationCode <chr> "204SAU030", "205STQ010", "205STQ010", "205STE021", "205ST...
# $ SampleDate <dttm> 2016-07-11, 2016-07-11, 2016-07-11, 2016-07-11, 2016-07-1...
# $ CollectionTime <dttm> 1899-12-30 12:40:00, 1899-12-30 14:30:00, 1899-12-30 14:3...
# $ LocationCode <chr> "Midchannel", "Bank", "Bank", "Bank", "Bank", "Midchannel"...
# $ CollectionDepth <dbl> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1...
# $ UnitCollectionDepth <chr> "m", "m", "m", "m", "m", "m", "m", "m", "m", "m", "m", "m"...
# $ SampleTypeCode <chr> "Grab", "Grab", "Grab", "FieldBLDup_Grab", "Grab", "Grab",...
# $ CollectionReplicate <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
# $ LabReplicate <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
# $ ToxBatch <chr> "AMS_071216PP_C1_W_TOX", "KLI_071216_PP_C2_W_TOX", "KLI_07...
# $ LabSampleID <chr> "204SAU030-W-06", "205STQ010-W-01", "205STQ010-W-01", "250...
# $ MatrixName <chr> "samplewater", "samplewater", "samplewater", "samplewater"...
# $ MethodName <chr> "EPA 821/R-02-013", "EPA 821/R-02-013", "EPA 821/R-02-013"...
# $ ToxTestDurCode <chr> "7 days", "7 days", "4 days", "4 days", "4 days", "4 days"...
# $ OrganismName <chr> "Pimephales promelas", "Pimephales promelas", "Selenastrum...
# $ Analyte <chr> "Oxygen, Dissolved, Total", "Oxygen, Dissolved, Total", "O...
# $ Unit <chr> "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "m...
# $ WQSource <chr> "overlyingwater", "overlyingwater", "overlyingwater", "ove...
# $ TimePointName <chr> "initial", "initial", "final", "initial", "initial", "init...
# $ Result <dbl> 10.2, 12.0, 11.4, 10.1, 10.1, 11.2, 11.0, 11.5, 16.0, 15.0...
# $ ResultQualCode <chr> "=", "=", "=", "=", "=", "=", "=", "=", "=", "=", "=", "="...
# $ ToxResultQACode <chr> "None", "None", "None", "None", "None", "None", "None", "N...
# $ QACode <chr> "None", "None", "None", "None", "None", "None", "None", "N...
# $ BatchVerificationCode <chr> "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR"...
# $ ComplianceCode <chr> "NR", "Com", "Com", "Com", "Com", "Com", "Com", "Com", "NR...
# $ SampleComments <chr> NA, "Same as 205R01411, sampled 2015", "Same as 205R01411,...
# $ CollectionComments <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
# $ ToxTestComments <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
# $ TimePointComments <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
# $ ToxResultComments <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
# $ ToxBatchComments <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
# $ EventCode <chr> "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ"...
# $ ProtocolCode <chr> "BASMAA_RMC_SOP2016v3", "BASMAA_RMC_SOP2016v3", "BASMAA_RM...
# $ SampleAgency <chr> "Applied Marine Sciences, Inc. California", "Santa Clara V...
# $ GroupSamples <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
# $ CollectionMethodName <chr> "Water_Grab", "Water_Grab", "Water_Grab", "Water_Grab", "W...
# $ Latitude <dbl> 37.78566, 37.38654, 37.38654, 37.41056, 37.41056, 37.61729...
# $ Longitude <dbl> -122.2242, -121.9688, -121.9688, -122.0689, -122.0689, -12...
# $ CollectionDeviceDescription <chr> "Not Recorded", "Not Recorded", "Not Recorded", "Not Recor...
# $ PositionWaterColumn <chr> "Not Recorded", "Not Recorded", "Not Recorded", "Not Recor...
# $ TIENarrative <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
# $ Dilution <dbl> 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100...
# $ ToxPointMethod <chr> "ToxWQMeasurement", "ToxWQMeasurement", "ToxWQMeasurement"...
# $ Treatment <chr> "None", "None", "None", "None", "None", "None", "None", "N...
# $ UnitTreatment <chr> "none", "none", "none", "none", "none", "none", "none", "n...
# $ TreatmentConcentration <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
# $ LabAgency <chr> "PER", "PER", "PER", "PER", "PER", "PER", "PER", "PER", "P...
# $ SubmittingAgency <chr> "AMS-CA", "SCVURPPP", "SCVURPPP", "SCVURPPP", "SCVURPPP", ...
# $ ToxBatchStartDate <chr> "Jul 12 2016 2:30PM", "Jul 12 2016 12:00AM", "Jul 12 2016...
# $ RefToxBatch <chr> "25990PPRTPER", "25990PPRTPER", "26019SCRTPER", "26021CHDR...
# $ OrganismAgeAtTestStart <chr> "<48 hrs", "<48 hrs", "6 days", "7 days", "7 days", "7 day...
# $ LabSubmissionCode <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A"...
# $ OccupationMethod <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Walk ...
# $ StartingBank <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "LB", ...
# $ DistanceFromBank <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -88, N...
# $ UnitDistanceFromBank <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "m", N...
# $ StreamWidth <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -88, N...
# $ UnitStreamWidth <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "m", N...
# $ StationWaterDepth <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -88, N...
# $ UnitStationWaterDepth <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "m", N...
# $ HydroMod <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Pipes...
# $ HydroModLoc <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "US", ...
# $ LocationDetailWQComments <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "upstr...
# $ PctControl <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN...
# $ RepCount <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN...
# $ Mean <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN...
# $ StdDev <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN...
# $ StatMethod <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
# $ Probability <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
# $ Alphalevel <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN...
# $ EvalThreshold <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN...
# $ MSD <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN...
# $ SigEffectCode <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
# $ ChannelWidth <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
# $ UpstreamLength <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
# $ DownstreamLength <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
# $ TotalReach <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
# $ CalculatedValue <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN...
# $ PercentEffect <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN...
# $ SampleID <chr> "204SAU030-W-06", "205STQ010-W-01", "205STQ010-W-01", "250...
# $ DW_AnalyteName <chr> "Oxygen, Dissolved", "Oxygen, Dissolved", "Oxygen, Dissolv...
# $ DataQuality <chr> "Unknown data quality", "Unknown data quality", "Unknown d...
# $ DataQualityIndicator <chr> "BatchVerificationCode:NR", "BatchVerificationCode:NR", "B...
# $ Datum <chr> "WGS84", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD...
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment