Skip to content

Instantly share code, notes, and snippets.

@daltare
Last active November 21, 2022 19:52
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save daltare/2de1517ad1e315c4b1cad01278de96dd to your computer and use it in GitHub Desktop.
Save daltare/2de1517ad1e315c4b1cad01278de96dd to your computer and use it in GitHub Desktop.
# This example shows one way to access subsets of CEDEN data via the California Open Data Portal API (it's not
# necessarily the only or best way though) - if you find any problems or have questions, please contact: david.altare@waterboards.ca.gov
# This example applies to the following CEDEN datasets, which have been split into separate
# resources by year (due to file size limitations)
# Water Chemistry: https://data.ca.gov/dataset/surface-water-chemistry-results
# Habitat: https://data.ca.gov/dataset/surface-water-habitat-results
# Tissue: https://data.ca.gov/dataset/surface-water-aquatic-organism-tissue-sample-results
# It assumes that you have your own API key saved in a system environment variable
# named: "data_portal_key" (to obtain an API key, create an account at data.ca.gov, then go to your
# account page and look for the "API Key" header in the lower left side of the page)
# NOTE: There appears to be a limit of 32,000 records that can be returned from the API in a single call -- if your API query
# is likely to return more records than that limit, you'll need to break it up into multiple calls to the API
# The function below returns a warning message when this occurs
# NOTE: to query across all years in this dataset, you'll need to enter the package/dataset ID for the chemistry data,
# rather than the ID for an individual resource (since the data is split into multiple years, need to query across
# multiple resources) - the package IDs for the datasets this example applies to are:
# Water Chemistry: 28d7a81d-6458-47bd-9b79-4fcbfbb88671
# Habitat: f5edfd1b-a9b3-48eb-a33e-9c246ab85adf
# Tissue: 38cb5cca-1500-42e7-b359-e8e3c5d1e087
# load packages
library(ckanr)
library(dplyr)
library(lubridate)
library(httr)
library(jsonlite)
library(glue)
library(readr)
library(stringr)
library(purrr)
# set the option to use standard notation rather than scientific notation
# (prevents large/small numeric results from printing like 1e+05 in the output)
options(scipen = 999)
# create a generic function to run an API query and return the formatted results, given a URL for the query
# Run a single datastore_search_sql query against the data.ca.gov API and
# return the matching records as a tibble.
#
# query_url: the complete (un-encoded) URL for the API call
#
# Returns: a tibble of the matching records, with every column coerced to
#          character (so that results from different yearly resources can be
#          row-bound later) and reordered to match the original dataset.
#          Type detection/conversion happens once in get_api_data(), after
#          the results from all years are combined.
generic_api_query <- function(query_url) {
  # percent-encode the URL (the embedded SQL contains spaces, quotes, commas, etc.)
  url_encoded <- URLencode(query_url)
  # request the data, passing the API key in the Authorization header
  query_response <- httr::GET(url_encoded, add_headers(Authorization = Sys.getenv('data_portal_key')))
  # fail fast on an HTTP error, rather than trying to parse an error page as JSON
  if (httr::http_error(query_response)) {
    stop('API request failed (HTTP status ', httr::status_code(query_response), ') for URL: ', url_encoded, call. = FALSE)
  }
  query_char <- rawToChar(query_response$content)
  query_content <- jsonlite::fromJSON(query_char)
  resource_records <- query_content$result$records
  # if the query matched no records in this resource, return an empty tibble
  # (bind_rows()/map_df() will skip it when combining years)
  if (is.null(resource_records) || NROW(resource_records) == 0) {
    return(tibble())
  }
  # get information about the fields included in the dataset (allows for
  # reordering the fields to match the original dataset)
  resource_fields <- query_content$result$fields %>%
    filter(!id %in% c('_id', '_full_text')) %>% # drop these two fields, which are added by the portal and not actually part of the dataset
    pull(id)
  # format the data - convert to a tibble, coerce all columns to character, and
  # reorder the fields to match the order of the original dataset
  # NOTE: as_tibble() (not tibble()) is required here - tibble() would pack the
  # incoming data frame into a single data-frame column
  resource_records <- as_tibble(resource_records) %>%
    mutate(across(everything(), as.character)) %>% # to make sure the data frames from each year can be combined
    select(all_of(resource_fields))
  # print a warning if the API limit was reached by the query
  if (nrow(resource_records) == 32000) {
    warning('WARNING: the API record limit was reached - it\'s likely that some records matching your query were not returned')
  }
  # return the results
  return(resource_records)
}
# create a function that takes a single year, a dataframe of resource_ids (by year), and the API filters, then returns the correct id
# Build the datastore_search_sql API query URL for a single year's resource.
#
# query_year:              the year whose resource should be queried
# resource_id_table:       a data frame with columns `year` and `resource_id`
# filters_date_start:      named character vector of minimum dates,
#                          e.g. c('SampleDate' = '2016-07-01') (or NA for none)
# filters_date_end:        named character vector of maximum dates (or NA)
# filters_text:            named character vector of exact-match text filters (or NA)
# filters_numeric:         named character vector of numeric conditions, with the
#                          operator in the value, e.g. c('Result' = '> 10') (or NA)
# filters_numeric_drop_na: character vector of numeric field names whose
#                          missing ('NaN') values should be excluded (or NA)
#
# Returns: a character (glue) string containing the un-encoded query URL,
#          with a WHERE clause only if at least one filter applies
format_url_year <- function(query_year, resource_id_table, filters_date_start, filters_date_end, filters_text, filters_numeric, filters_numeric_drop_na) {
  # get the resource ID corresponding to the given year
  resource_id <- resource_id_table %>%
    filter(year == query_year) %>%
    pull(resource_id)
  # get the range of start and end dates (in case there are different dates
  # entered for different fields)
  dates_start <- range(as.Date(unname(filters_date_start)))
  dates_end <- range(as.Date(unname(filters_date_end)))
  # get the years to apply the starting date filters to
  if (!is.na(filters_date_start[1])) {
    use_date_query_years_start <- year(min(dates_start)):year(max(dates_start))
  } else {
    use_date_query_years_start <- c()
  }
  # get the years to apply the ending date filters to (just in case there are
  # multiple start or end dates, make sure the date filters cover all years needed)
  if (!is.na(filters_date_end[1])) {
    use_date_query_years_end <- year(min(dates_end)):year(max(dates_end))
  } else {
    use_date_query_years_end <- c()
  }
  # format the filter for minimum/starting dates, e.g.: "SampleDate" >= '2016-07-01'
  if (!is.na(filters_date_start[1]) && (query_year %in% use_date_query_years_start)) {
    query_date_start <- map2_chr(filters_date_start, names(filters_date_start), ~glue("\"{.y}\" >= '{.x}'")) %>%
      unname() %>%
      glue_collapse(sep = ' AND ')
  } else {
    query_date_start <- c()
  }
  # format the filters for maximum/ending dates, e.g.: "SampleDate" <= '2018-06-30'
  if (!is.na(filters_date_end[1]) && (query_year %in% use_date_query_years_end)) {
    query_date_end <- map2_chr(filters_date_end, names(filters_date_end), ~glue("\"{.y}\" <= '{.x}'")) %>%
      unname() %>%
      glue_collapse(sep = ' AND ')
  } else {
    query_date_end <- c()
  }
  # format the filters for text values, e.g.: "Analyte" = 'Oxygen, Dissolved, Total'
  if (!is.na(filters_text[1])) {
    query_text_filter <- map2_chr(filters_text, names(filters_text), ~glue("\"{.y}\" = '{.x}'")) %>%
      unname() %>%
      glue_collapse(sep = ' AND ')
  } else {
    query_text_filter <- c()
  }
  # format the filters for numeric values (the operator is part of the value),
  # e.g.: "Result" > 10
  if (!is.na(filters_numeric[1])) {
    query_numeric_filter <- map2_chr(filters_numeric, names(filters_numeric), ~glue("\"{.y}\" {.x}")) %>%
      unname() %>%
      glue_collapse(sep = ' AND ')
  } else {
    query_numeric_filter <- c()
  }
  # format the filters to drop missing/null numeric values -- missing numeric
  # values are stored as the literal text 'NaN' in the portal,
  # e.g.: "Result" <> 'NaN'
  if (!is.na(filters_numeric_drop_na[1])) {
    query_numeric_drop_na <- map_chr(filters_numeric_drop_na, ~glue("\"{.x}\" <> 'NaN'")) %>%
      glue_collapse(sep = ' AND ')
  } else {
    query_numeric_drop_na <- c()
  }
  # combine all of the different filter types into one statement (unused
  # filter types are zero-length and drop out of c() automatically)
  filters_combined <- glue_collapse(c(query_date_start, query_date_end, query_text_filter, query_numeric_filter, query_numeric_drop_na), sep = ' AND ')
  # create the URL for the API call
  api_url <- glue("https://data.ca.gov/api/3/action/datastore_search_sql?sql=SELECT * from \"{resource_id}\"")
  if (length(filters_combined) > 0) { # if there are additional filters, add them
    api_url <- glue("{api_url} WHERE {filters_combined}")
  }
  return(api_url)
}
# create a function to download data from the API, given an input resource ID and optional filters for date, text, and numeric fields
# Download data from the CEDEN open data portal API for a given dataset
# (package), optionally filtered by date, text, and numeric conditions.
#
# package_id: the ID of the package/dataset (NOT an individual resource),
#             since the data is split into one resource per year
#
# NOTES: any of the filters_... arguments can be left as NA (the default)
#   - date and text filters: named character vectors (multiple name-value
#     pairs can be entered) -- e.g.: c('field_name' = 'query_value')
#   - numeric filters: include the condition (>, <, >=, <=, <>) as part of
#     the value (multiple name-condition pairs can be entered)
#     -- e.g.: c('field_name' = '> query_value')
#   - filters_numeric_drop_na: just enter the name of the field to drop
#     missing numeric values from (multiple field names can be entered)
#     -- e.g.: c('field_name')
#
# Returns: a tibble combining the matching records from all queried years,
#          with field types detected and converted by readr::type_convert()
get_api_data <- function(package_id, filters_date_start = NA, filters_date_end = NA, filters_text = NA, filters_numeric = NA, filters_numeric_drop_na = NA) {
  # get a table of the resource IDs (one resource per year) in the dataset
  package_info <- package_show(package_id, as = 'table',
                               url = 'https://data.ca.gov/',
                               key = Sys.getenv('data_portal_key'))
  package_resources <- package_info$resources %>%
    filter(format == 'CSV') %>% # filter for just the resources containing csv files
    select(name, id) %>% # drop un-needed fields
    # use 1999 to represent any year prior to 2000, since all pre-2000 data is in one resource
    mutate(name = str_replace(name, pattern = 'CEDEN Water Chemistry Data Prior to 2000', '1999')) %>%
    mutate(name = parse_number(name)) %>% # extract the year from the resource name
    rename(year = name, resource_id = id)
  # determine the range of resources to query, based on the start and end dates
  # (check for multiple/different start or end dates for different fields)
  dates_start <- range(as.Date(unname(filters_date_start)))
  dates_end <- range(as.Date(unname(filters_date_end)))
  # get the first year to query (scalar condition, so use short-circuiting ||)
  if (is.na(filters_date_start[1]) || year(min(dates_start)) < 2000) {
    year_start <- 1999 # treat 1999 as the first year, since all data prior to 2000 is in one resource/file
  } else {
    year_start <- year(min(dates_start))
  }
  # get the last year to query
  if (is.na(filters_date_end[1]) || year(max(dates_end)) > max(package_resources$year)) {
    year_end <- max(package_resources$year) # only query up to the latest year for which data is available
  } else {
    year_end <- year(max(dates_end))
  }
  # get the full range of years to query (minimum of the start dates to the
  # maximum of the end dates), to know which resources to query
  query_years_range <- year_start:year_end
  # pass the query filter data to a separate function to build a URL for each
  # year, run the query for each year, combine the results into a single data
  # frame, then detect and convert the field types
  return(map_df(query_years_range, ~ format_url_year(., package_resources, filters_date_start,
                                                     filters_date_end, filters_text, filters_numeric,
                                                     filters_numeric_drop_na) %>%
                  generic_api_query()) %>%
           type_convert(na = c("", "NaN"))
  )
}
#### EXAMPLE QUERY - GET DATA (using the water chemistry dataset) ####
# make an API call to get some data (in this case, get samples between 2016-07-01 and 2018-06-30,
# where the analyte name is "Oxygen, Dissolved, Total", the units are "mg/L", and the result value is
# greater than 10 and is not missing/null)
# NOTE: this runs live queries against data.ca.gov, and requires the
# 'data_portal_key' environment variable to be set (see the notes at the top of this file)
df_chemistry <- get_api_data(package_id = '28d7a81d-6458-47bd-9b79-4fcbfbb88671',
filters_date_start = c('SampleDate' = '2016-07-01'),
filters_date_end = c('SampleDate' = '2018-06-30'),
filters_text = c('Analyte' = 'Oxygen, Dissolved, Total',
'Unit' = 'mg/L'),
filters_numeric = c('Result' = '> 10'),
filters_numeric_drop_na = c('Result'))
#### EXAMPLE QUERY - VERIFY DATA ####
# verify that records with the correct analyte names and units were returned, and get the number of records
# (the commented lines below show the console output recorded when this example was last run)
df_chemistry %>% count(Analyte, Unit)
# # A tibble: 1 x 3
# Analyte Unit n
# <chr> <chr> <int>
# 1 Oxygen, Dissolved, Total mg/L 13589
# Verify that the sample date range is correct (all dates fall within the requested window)
range(df_chemistry$SampleDate)
# [1] "2016-07-01 UTC" "2018-06-28 UTC"
# Verify that the result range is correct (all results are greater than 10, with no missing values)
range(df_chemistry$Result)
# [1] 10.00012 591.00000
# view a summary of the entire dataset (column names, detected types, and first few values)
glimpse(df_chemistry)
# Rows: 13,589
# Columns: 72
# $ Program <chr> "Central Coast Cooperative Monitoring Program for Agriculture", "Central...
# $ ParentProject <chr> "RWB3 Cooperative Monitoring Program", "RWB3 Cooperative Monitoring Prog...
# $ Project <chr> "RWB 3 Cooperative Monitoring Program", "RWB 3 Cooperative Monitoring Pr...
# $ StationName <chr> "Alisal Slough @ White Barn", "Bradley Channel @ Jones St", "Bradley Cha...
# $ StationCode <chr> "309ASB", "312BCJ", "312BCJ", "312BCJ", "312BCJ", "309BLA", "309BLA", "3...
# $ SampleDate <dttm> 2016-12-13, 2016-08-24, 2016-09-27, 2016-10-24, 2016-11-22, 2016-09-27,...
# $ CollectionTime <dttm> 1899-12-30 13:55:00, 1899-12-30 09:01:00, 1899-12-30 15:25:00, 1899-12-...
# $ LocationCode <chr> "Midchannel", "Midchannel", "Midchannel", "Midchannel", "Midchannel", "M...
# $ CollectionDepth <dbl> 0.12, 0.12, 0.12, 0.12, 0.09, 0.12, 0.12, 0.12, 0.03, 0.12, 0.12, 0.12, ...
# $ UnitCollectionDepth <chr> "m", "m", "m", "m", "m", "m", "m", "m", "m", "m", "m", "m", "m", "m", "m...
# $ SampleTypeCode <chr> "Not Recorded", "Not Recorded", "Not Recorded", "Not Recorded", "Not Rec...
# $ CollectionReplicate <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
# $ ResultsReplicate <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
# $ LabBatch <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ LabSampleID <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ MatrixName <chr> "samplewater", "samplewater", "samplewater", "samplewater", "samplewater...
# $ MethodName <chr> "FieldMeasure", "FieldMeasure", "FieldMeasure", "FieldMeasure", "FieldMe...
# $ Analyte <chr> "Oxygen, Dissolved, Total", "Oxygen, Dissolved, Total", "Oxygen, Dissolv...
# $ Unit <chr> "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", ...
# $ Result <dbl> 10.40, 10.86, 17.25, 11.56, 10.79, 12.60, 13.90, 10.13, 13.37, 11.50, 11...
# $ Observation <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ MDL <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ RL <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ ResultQualCode <chr> "=", "=", "=", "=", "=", "=", "=", "=", "=", "=", "=", "=", "=", "=", "=...
# $ QACode <chr> "None", "None", "None", "None", "None", "None", "None", "None", "None", ...
# $ BatchVerification <chr> "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR", ...
# $ ComplianceCode <chr> "Com", "Com", "Com", "Com", "Com", "Com", "Com", "Com", "Com", "Com", "C...
# $ SampleComments <chr> NA, "Samples were returned to SMX without ever being sent to FGL. Resam...
# $ CollectionComments <chr> NA, NA, NA, "Chlorophyll a sample sent to Sierra", NA, NA, NA, NA, NA, N...
# $ ResultsComments <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ BatchComments <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ EventCode <chr> "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", ...
# $ ProtocolCode <chr> "CMP_QAPP_2012", "CMP_QAPP_2012", "CMP_QAPP_2012", "CMP_QAPP_2012", "CMP...
# $ SampleAgency <chr> "PER", "TetraTech", "TetraTech", "TetraTech", "TetraTech", "PER", "PER",...
# $ GroupSamples <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ CollectionMethodName <chr> "Field Method", "Field Method", "Field Method", "Field Method", "Field M...
# $ Latitude <dbl> 36.72545, 34.94544, 34.94544, 34.94544, 34.94544, 36.70852, 36.70852, 34...
# $ Longitude <dbl> -121.7302, -120.4168, -120.4168, -120.4168, -120.4168, -121.7489, -121.7...
# $ CollectionDeviceDescription <chr> "PER_Hydrolab DS4a", "YSI 6820 v2", "YSI 6820 v2", "YSI 6820 v2", "YSI 6...
# $ CalibrationDate <dttm> 2016-12-13, 2016-08-23, 2016-09-27, 2016-10-24, 2016-11-22, 2016-09-27,...
# $ PositionWaterColumn <chr> "Subsurface", "Subsurface", "Subsurface", "Subsurface", "Subsurface", "S...
# $ PrepPreservationName <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ PrepPreservationDate <dttm> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
# $ DigestExtractMethod <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ DigestExtractDate <dttm> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
# $ AnalysisDate <dttm> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
# $ DilutionFactor <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ ExpectedValue <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ LabAgency <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ SubmittingAgency <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ SubmissionCode <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ OccupationMethod <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ StartingBank <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ DistanceFromBank <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ UnitDistanceFromBank <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ StreamWidth <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ UnitStreamWidth <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ StationWaterDepth <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ UnitStationWaterDepth <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ HydroMod <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ HydroModLoc <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ LocationDetailWQComments <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ ChannelWidth <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ UpstreamLength <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ DownStreamLength <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ TotalReach <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ LocationDetailBAComments <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ SampleID <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ DW_AnalyteName <chr> "Oxygen, Dissolved", "Oxygen, Dissolved", "Oxygen, Dissolved", "Oxygen, ...
# $ DataQuality <chr> "Unknown data quality", "Unknown data quality", "Unknown data quality", ...
# $ DataQualityIndicator <chr> "BatchVerification:NR", "BatchVerification:NR", "BatchVerification:NR", ...
# $ Datum <chr> "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", ...
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment