Skip to content

Instantly share code, notes, and snippets.

@daltare
Last active November 21, 2022 19:52
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save daltare/2de1517ad1e315c4b1cad01278de96dd to your computer and use it in GitHub Desktop.
Save daltare/2de1517ad1e315c4b1cad01278de96dd to your computer and use it in GitHub Desktop.
# This example shows one way to access subsets of CEDEN data via the California Open Data Portal API (it's not
# necessarily the only or best way though) - if you find any problems or have questions, please contact: david.altare@waterboards.ca.gov
# This example applies to the following CEDEN datasets, which have been split into separate
# resources by year (due to file size limitations)
# Water Chemistry: https://data.ca.gov/dataset/surface-water-chemistry-results
# Habitat: https://data.ca.gov/dataset/surface-water-habitat-results
# Tissue: https://data.ca.gov/dataset/surface-water-aquatic-organism-tissue-sample-results
# It assumes that you have your own API key saved in a system environment variable
# named: "data_portal_key" (to obtain an API key, create an account at data.ca.gov, then go to your
# account page and look for the "API Key" header in the lower left side of the page)
# NOTE: There appears to be a limit of 32,000 records that can be returned from the API in a single call -- if your API query
# is likely to return more records than that limit, you'll need to break it up into multiple calls to the API
# The function below returns a warning message when this occurs
# NOTE: to query across all years in this dataset, you'll need to enter the package/dataset ID for the chemistry data,
# rather than the ID for an individual resource (since the data is split into multiple years, need to query across
# multiple resources) - the package IDs for the datasets this example applies to are:
# Water Chemistry: 28d7a81d-6458-47bd-9b79-4fcbfbb88671
# Habitat: f5edfd1b-a9b3-48eb-a33e-9c246ab85adf
# Tissue: 38cb5cca-1500-42e7-b359-e8e3c5d1e087
# load packages
library(ckanr)
library(dplyr)
library(lubridate)
library(httr)
library(jsonlite)
library(glue)
library(readr)
library(stringr)
library(purrr)
# set the option to use standard notation rather than scientific notation
# (prevents large/small numeric results from printing like 1e+05 in the output)
options(scipen = 999)
# create a generic function to run an API query and return the formatted results, given a URL for the query
# Run a single datastore_search_sql query against the data.ca.gov API and
# return the matching records as a tibble.
#
# query_url: the complete (un-encoded) URL for the API call
#
# Returns: a tibble of the matching records, with every column coerced to
#          character (so that results from different yearly resources can be
#          row-bound later) and reordered to match the original dataset.
#          Type detection/conversion happens once in get_api_data(), after
#          the results from all years are combined.
generic_api_query <- function(query_url) {
  # percent-encode the URL (the embedded SQL contains spaces, quotes, commas, etc.)
  url_encoded <- URLencode(query_url)
  # request the data, passing the API key in the Authorization header
  query_response <- httr::GET(url_encoded, add_headers(Authorization = Sys.getenv('data_portal_key')))
  # fail fast on an HTTP error, rather than trying to parse an error page as JSON
  if (httr::http_error(query_response)) {
    stop('API request failed (HTTP status ', httr::status_code(query_response), ') for URL: ', url_encoded, call. = FALSE)
  }
  query_char <- rawToChar(query_response$content)
  query_content <- jsonlite::fromJSON(query_char)
  resource_records <- query_content$result$records
  # if the query matched no records in this resource, return an empty tibble
  # (bind_rows()/map_df() will skip it when combining years)
  if (is.null(resource_records) || NROW(resource_records) == 0) {
    return(tibble())
  }
  # get information about the fields included in the dataset (allows for
  # reordering the fields to match the original dataset)
  resource_fields <- query_content$result$fields %>%
    filter(!id %in% c('_id', '_full_text')) %>% # drop these two fields, which are added by the portal and not actually part of the dataset
    pull(id)
  # format the data - convert to a tibble, coerce all columns to character, and
  # reorder the fields to match the order of the original dataset
  # NOTE: as_tibble() (not tibble()) is required here - tibble() would pack the
  # incoming data frame into a single data-frame column
  resource_records <- as_tibble(resource_records) %>%
    mutate(across(everything(), as.character)) %>% # to make sure the data frames from each year can be combined
    select(all_of(resource_fields))
  # print a warning if the API limit was reached by the query
  if (nrow(resource_records) == 32000) {
    warning('WARNING: the API record limit was reached - it\'s likely that some records matching your query were not returned')
  }
  # return the results
  return(resource_records)
}
# create a function that takes a single year, a dataframe of resource_ids (by year), and the API filters, then returns the correct id
# Build the datastore_search_sql API query URL for a single year's resource.
#
# query_year:              the year whose resource should be queried
# resource_id_table:       a data frame with columns `year` and `resource_id`
# filters_date_start:      named character vector of minimum dates,
#                          e.g. c('SampleDate' = '2016-07-01') (or NA for none)
# filters_date_end:        named character vector of maximum dates (or NA)
# filters_text:            named character vector of exact-match text filters (or NA)
# filters_numeric:         named character vector of numeric conditions, with the
#                          operator in the value, e.g. c('Result' = '> 10') (or NA)
# filters_numeric_drop_na: character vector of numeric field names whose
#                          missing ('NaN') values should be excluded (or NA)
#
# Returns: a character (glue) string containing the un-encoded query URL,
#          with a WHERE clause only if at least one filter applies
format_url_year <- function(query_year, resource_id_table, filters_date_start, filters_date_end, filters_text, filters_numeric, filters_numeric_drop_na) {
  # get the resource ID corresponding to the given year
  resource_id <- resource_id_table %>%
    filter(year == query_year) %>%
    pull(resource_id)
  # get the range of start and end dates (in case there are different dates
  # entered for different fields)
  dates_start <- range(as.Date(unname(filters_date_start)))
  dates_end <- range(as.Date(unname(filters_date_end)))
  # get the years to apply the starting date filters to
  if (!is.na(filters_date_start[1])) {
    use_date_query_years_start <- year(min(dates_start)):year(max(dates_start))
  } else {
    use_date_query_years_start <- c()
  }
  # get the years to apply the ending date filters to (just in case there are
  # multiple start or end dates, make sure the date filters cover all years needed)
  if (!is.na(filters_date_end[1])) {
    use_date_query_years_end <- year(min(dates_end)):year(max(dates_end))
  } else {
    use_date_query_years_end <- c()
  }
  # format the filter for minimum/starting dates, e.g.: "SampleDate" >= '2016-07-01'
  if (!is.na(filters_date_start[1]) && (query_year %in% use_date_query_years_start)) {
    query_date_start <- map2_chr(filters_date_start, names(filters_date_start), ~glue("\"{.y}\" >= '{.x}'")) %>%
      unname() %>%
      glue_collapse(sep = ' AND ')
  } else {
    query_date_start <- c()
  }
  # format the filters for maximum/ending dates, e.g.: "SampleDate" <= '2018-06-30'
  if (!is.na(filters_date_end[1]) && (query_year %in% use_date_query_years_end)) {
    query_date_end <- map2_chr(filters_date_end, names(filters_date_end), ~glue("\"{.y}\" <= '{.x}'")) %>%
      unname() %>%
      glue_collapse(sep = ' AND ')
  } else {
    query_date_end <- c()
  }
  # format the filters for text values, e.g.: "Analyte" = 'Oxygen, Dissolved, Total'
  if (!is.na(filters_text[1])) {
    query_text_filter <- map2_chr(filters_text, names(filters_text), ~glue("\"{.y}\" = '{.x}'")) %>%
      unname() %>%
      glue_collapse(sep = ' AND ')
  } else {
    query_text_filter <- c()
  }
  # format the filters for numeric values (the operator is part of the value),
  # e.g.: "Result" > 10
  if (!is.na(filters_numeric[1])) {
    query_numeric_filter <- map2_chr(filters_numeric, names(filters_numeric), ~glue("\"{.y}\" {.x}")) %>%
      unname() %>%
      glue_collapse(sep = ' AND ')
  } else {
    query_numeric_filter <- c()
  }
  # format the filters to drop missing/null numeric values -- missing numeric
  # values are stored as the literal text 'NaN' in the portal,
  # e.g.: "Result" <> 'NaN'
  if (!is.na(filters_numeric_drop_na[1])) {
    query_numeric_drop_na <- map_chr(filters_numeric_drop_na, ~glue("\"{.x}\" <> 'NaN'")) %>%
      glue_collapse(sep = ' AND ')
  } else {
    query_numeric_drop_na <- c()
  }
  # combine all of the different filter types into one statement (unused
  # filter types are zero-length and drop out of c() automatically)
  filters_combined <- glue_collapse(c(query_date_start, query_date_end, query_text_filter, query_numeric_filter, query_numeric_drop_na), sep = ' AND ')
  # create the URL for the API call
  api_url <- glue("https://data.ca.gov/api/3/action/datastore_search_sql?sql=SELECT * from \"{resource_id}\"")
  if (length(filters_combined) > 0) { # if there are additional filters, add them
    api_url <- glue("{api_url} WHERE {filters_combined}")
  }
  return(api_url)
}
# create a function to download data from the API, given an input resource ID and optional filters for date, text, and numeric fields
# Download data from the CEDEN open data portal API for a given dataset
# (package), optionally filtered by date, text, and numeric conditions.
#
# package_id: the ID of the package/dataset (NOT an individual resource),
#             since the data is split into one resource per year
#
# NOTES: any of the filters_... arguments can be left as NA (the default)
#   - date and text filters: named character vectors (multiple name-value
#     pairs can be entered) -- e.g.: c('field_name' = 'query_value')
#   - numeric filters: include the condition (>, <, >=, <=, <>) as part of
#     the value (multiple name-condition pairs can be entered)
#     -- e.g.: c('field_name' = '> query_value')
#   - filters_numeric_drop_na: just enter the name of the field to drop
#     missing numeric values from (multiple field names can be entered)
#     -- e.g.: c('field_name')
#
# Returns: a tibble combining the matching records from all queried years,
#          with field types detected and converted by readr::type_convert()
get_api_data <- function(package_id, filters_date_start = NA, filters_date_end = NA, filters_text = NA, filters_numeric = NA, filters_numeric_drop_na = NA) {
  # get a table of the resource IDs (one resource per year) in the dataset
  package_info <- package_show(package_id, as = 'table',
                               url = 'https://data.ca.gov/',
                               key = Sys.getenv('data_portal_key'))
  package_resources <- package_info$resources %>%
    filter(format == 'CSV') %>% # filter for just the resources containing csv files
    select(name, id) %>% # drop un-needed fields
    # use 1999 to represent any year prior to 2000, since all pre-2000 data is in one resource
    mutate(name = str_replace(name, pattern = 'CEDEN Water Chemistry Data Prior to 2000', '1999')) %>%
    mutate(name = parse_number(name)) %>% # extract the year from the resource name
    rename(year = name, resource_id = id)
  # determine the range of resources to query, based on the start and end dates
  # (check for multiple/different start or end dates for different fields)
  dates_start <- range(as.Date(unname(filters_date_start)))
  dates_end <- range(as.Date(unname(filters_date_end)))
  # get the first year to query (scalar condition, so use short-circuiting ||)
  if (is.na(filters_date_start[1]) || year(min(dates_start)) < 2000) {
    year_start <- 1999 # treat 1999 as the first year, since all data prior to 2000 is in one resource/file
  } else {
    year_start <- year(min(dates_start))
  }
  # get the last year to query
  if (is.na(filters_date_end[1]) || year(max(dates_end)) > max(package_resources$year)) {
    year_end <- max(package_resources$year) # only query up to the latest year for which data is available
  } else {
    year_end <- year(max(dates_end))
  }
  # get the full range of years to query (minimum of the start dates to the
  # maximum of the end dates), to know which resources to query
  query_years_range <- year_start:year_end
  # pass the query filter data to a separate function to build a URL for each
  # year, run the query for each year, combine the results into a single data
  # frame, then detect and convert the field types
  return(map_df(query_years_range, ~ format_url_year(., package_resources, filters_date_start,
                                                     filters_date_end, filters_text, filters_numeric,
                                                     filters_numeric_drop_na) %>%
                  generic_api_query()) %>%
           type_convert(na = c("", "NaN"))
  )
}
#### EXAMPLE QUERY - GET DATA (using the water chemistry dataset) ####
# make an API call to get some data (in this case, get samples between 2016-07-01 and 2018-06-30,
# where the analyte name is "Oxygen, Dissolved, Total", the units are "mg/L", and the result value is
# greater than 10 and is not missing/null)
# NOTE: this runs live queries against data.ca.gov, and requires the
# 'data_portal_key' environment variable to be set (see the notes at the top of this file)
df_chemistry <- get_api_data(package_id = '28d7a81d-6458-47bd-9b79-4fcbfbb88671',
filters_date_start = c('SampleDate' = '2016-07-01'),
filters_date_end = c('SampleDate' = '2018-06-30'),
filters_text = c('Analyte' = 'Oxygen, Dissolved, Total',
'Unit' = 'mg/L'),
filters_numeric = c('Result' = '> 10'),
filters_numeric_drop_na = c('Result'))
#### EXAMPLE QUERY - VERIFY DATA ####
# verify that records with the correct analyte names and units were returned, and get the number of records
# (the commented lines below show the console output recorded when this example was last run)
df_chemistry %>% count(Analyte, Unit)
# # A tibble: 1 x 3
# Analyte Unit n
# <chr> <chr> <int>
# 1 Oxygen, Dissolved, Total mg/L 13589
# Verify that the sample date range is correct (all dates fall within the requested window)
range(df_chemistry$SampleDate)
# [1] "2016-07-01 UTC" "2018-06-28 UTC"
# Verify that the result range is correct (all results are greater than 10, with no missing values)
range(df_chemistry$Result)
# [1] 10.00012 591.00000
# view a summary of the entire dataset (column names, detected types, and first few values)
glimpse(df_chemistry)
# Rows: 13,589
# Columns: 72
# $ Program <chr> "Central Coast Cooperative Monitoring Program for Agriculture", "Central...
# $ ParentProject <chr> "RWB3 Cooperative Monitoring Program", "RWB3 Cooperative Monitoring Prog...
# $ Project <chr> "RWB 3 Cooperative Monitoring Program", "RWB 3 Cooperative Monitoring Pr...
# $ StationName <chr> "Alisal Slough @ White Barn", "Bradley Channel @ Jones St", "Bradley Cha...
# $ StationCode <chr> "309ASB", "312BCJ", "312BCJ", "312BCJ", "312BCJ", "309BLA", "309BLA", "3...
# $ SampleDate <dttm> 2016-12-13, 2016-08-24, 2016-09-27, 2016-10-24, 2016-11-22, 2016-09-27,...
# $ CollectionTime <dttm> 1899-12-30 13:55:00, 1899-12-30 09:01:00, 1899-12-30 15:25:00, 1899-12-...
# $ LocationCode <chr> "Midchannel", "Midchannel", "Midchannel", "Midchannel", "Midchannel", "M...
# $ CollectionDepth <dbl> 0.12, 0.12, 0.12, 0.12, 0.09, 0.12, 0.12, 0.12, 0.03, 0.12, 0.12, 0.12, ...
# $ UnitCollectionDepth <chr> "m", "m", "m", "m", "m", "m", "m", "m", "m", "m", "m", "m", "m", "m", "m...
# $ SampleTypeCode <chr> "Not Recorded", "Not Recorded", "Not Recorded", "Not Recorded", "Not Rec...
# $ CollectionReplicate <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
# $ ResultsReplicate <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
# $ LabBatch <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ LabSampleID <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ MatrixName <chr> "samplewater", "samplewater", "samplewater", "samplewater", "samplewater...
# $ MethodName <chr> "FieldMeasure", "FieldMeasure", "FieldMeasure", "FieldMeasure", "FieldMe...
# $ Analyte <chr> "Oxygen, Dissolved, Total", "Oxygen, Dissolved, Total", "Oxygen, Dissolv...
# $ Unit <chr> "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", ...
# $ Result <dbl> 10.40, 10.86, 17.25, 11.56, 10.79, 12.60, 13.90, 10.13, 13.37, 11.50, 11...
# $ Observation <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ MDL <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ RL <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ ResultQualCode <chr> "=", "=", "=", "=", "=", "=", "=", "=", "=", "=", "=", "=", "=", "=", "=...
# $ QACode <chr> "None", "None", "None", "None", "None", "None", "None", "None", "None", ...
# $ BatchVerification <chr> "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR", ...
# $ ComplianceCode <chr> "Com", "Com", "Com", "Com", "Com", "Com", "Com", "Com", "Com", "Com", "C...
# $ SampleComments <chr> NA, "Samples were returned to SMX without ever being sent to FGL. Resam...
# $ CollectionComments <chr> NA, NA, NA, "Chlorophyll a sample sent to Sierra", NA, NA, NA, NA, NA, N...
# $ ResultsComments <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ BatchComments <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ EventCode <chr> "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", ...
# $ ProtocolCode <chr> "CMP_QAPP_2012", "CMP_QAPP_2012", "CMP_QAPP_2012", "CMP_QAPP_2012", "CMP...
# $ SampleAgency <chr> "PER", "TetraTech", "TetraTech", "TetraTech", "TetraTech", "PER", "PER",...
# $ GroupSamples <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ CollectionMethodName <chr> "Field Method", "Field Method", "Field Method", "Field Method", "Field M...
# $ Latitude <dbl> 36.72545, 34.94544, 34.94544, 34.94544, 34.94544, 36.70852, 36.70852, 34...
# $ Longitude <dbl> -121.7302, -120.4168, -120.4168, -120.4168, -120.4168, -121.7489, -121.7...
# $ CollectionDeviceDescription <chr> "PER_Hydrolab DS4a", "YSI 6820 v2", "YSI 6820 v2", "YSI 6820 v2", "YSI 6...
# $ CalibrationDate <dttm> 2016-12-13, 2016-08-23, 2016-09-27, 2016-10-24, 2016-11-22, 2016-09-27,...
# $ PositionWaterColumn <chr> "Subsurface", "Subsurface", "Subsurface", "Subsurface", "Subsurface", "S...
# $ PrepPreservationName <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ PrepPreservationDate <dttm> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
# $ DigestExtractMethod <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ DigestExtractDate <dttm> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
# $ AnalysisDate <dttm> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
# $ DilutionFactor <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ ExpectedValue <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ LabAgency <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ SubmittingAgency <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ SubmissionCode <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ OccupationMethod <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ StartingBank <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ DistanceFromBank <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ UnitDistanceFromBank <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ StreamWidth <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ UnitStreamWidth <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ StationWaterDepth <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ UnitStationWaterDepth <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ HydroMod <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ HydroModLoc <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ LocationDetailWQComments <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ ChannelWidth <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ UpstreamLength <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ DownStreamLength <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ TotalReach <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ LocationDetailBAComments <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ SampleID <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ DW_AnalyteName <chr> "Oxygen, Dissolved", "Oxygen, Dissolved", "Oxygen, Dissolved", "Oxygen, ...
# $ DataQuality <chr> "Unknown data quality", "Unknown data quality", "Unknown data quality", ...
# $ DataQualityIndicator <chr> "BatchVerification:NR", "BatchVerification:NR", "BatchVerification:NR", ...
# $ Datum <chr> "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", ...
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment