@tomschenkjr
Last active May 15, 2020 19:55
# This is a conceptual study to leverage the tidyverse syntax and lazy
# evaluation concepts (influenced by sparklyr) to approach API wrapper packages.
#
# Problem statement: API wrappers often require users to download *all* data
# to manipulate and filter the information. Some packages support custom
# queries but often complicate syntax by adding many parameters within the
# function that may be hard for data scientists to formulate.
#
# Potential solution: API queries can be separated into three principal parts.
# First, query the API metadata to understand column names and data types.
# Second, let users specify query parameters without yet downloading the
# data, while still showing them what they should expect to see. Third, move
# the actual download to the last step so data is only downloaded after the
# query has been defined. Each of these steps is tied together using
# pipes (%>%).
#
# Why this solution: First, tidyverse syntax helps break apart major types of
# data manipulation steps and ties them together using pipes. The result is
# simpler code that lets users easily add or remove data manipulation logic.
# Second, API downloads have large upfront costs. Misspecifying the query
# means users may wait a long time before they notice the mistake and need
# to reformulate the query. Often, users opt to just download all the data,
# which shifts the processing burden to local machines. Influenced by lazy
# evaluation in Spark (and the sparklyr syntax), the download of data is the
# final step of the process; the first step downloads only metadata.
# Simple download example
data <- read_api(data = "id") %>% # Fetch columns and data types
  select(cols) %>%                # Select desired columns
  top_n(n) %>%                    # Only download n rows
  download_api()                  # Download the data
# Prior to `download_api()`, evaluating the data frame will present a tibble
# showing column names and expected data types. This explicitly depends on
# the ability to quickly fetch or determine metadata for an API.
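This metadata step can be sketched in plain base R. Everything below is hypothetical scaffolding, not a real package API: `empty_frame_from_metadata()` and the column-name-to-type metadata format are stand-ins for whatever a real metadata endpoint would return.

```r
# Hypothetical sketch: build the typed, zero-row frame that a pre-download
# query would display, given column -> type metadata from an API.
empty_frame_from_metadata <- function(meta) {
  cols <- lapply(meta, function(type) {
    switch(type,
           dbl = numeric(0),
           int = integer(0),
           chr = character(0),
           fct = factor(character(0)))
  })
  as.data.frame(cols, stringsAsFactors = FALSE)
}

meta <- c(Sepal.Length = "dbl", Sepal.Width = "dbl",
          Petal.Length = "dbl", Petal.Width = "dbl", Species = "fct")
df <- empty_frame_from_metadata(meta)
dim(df)  # 0 rows, 5 columns: the shape users inspect before downloading
```

A real implementation would return a tibble and attach the query state as attributes, but a zero-row data frame is enough to show the idea: users see the schema before any rows are transferred.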
# Preparing to download all data
all_data <- read_api(data = "id")
all_data
#> # A tibble: 0 x 5
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> <dbl> <dbl> <dbl> <dbl> <fct>
# Only some columns will be downloaded
select_data <- read_api(data = "id") %>%
  select(Sepal.Length, Sepal.Width)
select_data
#> # A tibble: 0 x 2
#> Sepal.Length Sepal.Width
#> <dbl> <dbl>
# Add download_api() to finally retrieve the data from an API
select_data <- read_api(data = "id") %>%
  select(Sepal.Length, Sepal.Width) %>%
  download_api()
select_data
#> # A tibble: 150 x 2
#> Sepal.Length Sepal.Width
#> <dbl> <dbl>
#> 1 5.1 3.5
#> 2 4.9 3
#> 3 4.7 3.2
#> 4 4.6 3.1
#> 5 5 3.6
#> 6 5.4 3.9
#> 7 4.6 3.4
#> 8 5 3.4
#> 9 4.4 2.9
#> 10 4.9 3.1
#> # … with 140 more rows
# Not all tidy-like functions make sense when preparing to query an API.
# Also, the extent of filtering and reshaping (e.g., spread, gather) depends
# on specific features of the API.
data <- read_api(data = "id") %>% # Base function to retrieve metadata
  select(cols) %>%                # Select columns
  filter(criteria) %>%            # Filter rows
  arrange(criteria) %>%           # Sort rows
  top_n(int)                      # Select number of rows
# Behind the scenes, parameters are being compiled into valid REST calls
data <- read_api(data = "id") %>%
  select(Sepal.Length, Sepal.Width) %>%
  filter(Sepal.Width > 0.1) %>%
  top_n(5) %>%
  download_api(print_url = TRUE)
#> https://www.example.com/api?id=id&$select=Sepal.Length,Sepal.Width&$where=Sepal.Width>0.1&limit=5
#> # A tibble: 5 x 2
#> Sepal.Length Sepal.Width
#> <dbl> <dbl>
#> 1 5.1 3.5
#> 2 4.9 3
#> 3 4.7 3.2
#> 4 4.6 3.1
#> 5 5 3.6
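The clause-accumulation behind the scenes can be sketched in base R. Everything here is hypothetical: `read_api()` below returns a plain query object, the `q_*` helpers stand in for the S3 methods a real package would register for the dplyr generics (`select`, `filter`, `top_n`), and `build_url()` compiles the accumulated clauses into the REST call.

```r
# Hypothetical sketch of the lazy query object and URL compilation.
read_api <- function(data, base_url = "https://www.example.com/api") {
  structure(list(data = data, base_url = base_url,
                 select = NULL, where = NULL, limit = NULL),
            class = "api_query")
}

# Stand-ins for select()/filter()/top_n() methods on an api_query:
# each one records a clause and returns the (still lazy) query object.
q_select <- function(q, ...) { q$select <- c(q$select, ...); q }
q_where  <- function(q, clause) { q$where <- clause; q }
q_limit  <- function(q, n) { q$limit <- n; q }

# Compile the accumulated clauses into a single query URL.
build_url <- function(q) {
  parts <- paste0("id=", q$data)
  if (!is.null(q$select))
    parts <- c(parts, paste0("$select=", paste(q$select, collapse = ",")))
  if (!is.null(q$where))
    parts <- c(parts, paste0("$where=", q$where))
  if (!is.null(q$limit))
    parts <- c(parts, paste0("limit=", q$limit))
  paste0(q$base_url, "?", paste(parts, collapse = "&"))
}

q <- read_api("id")
q <- q_select(q, "Sepal.Length", "Sepal.Width")
q <- q_where(q, "Sepal.Width>0.1")
q <- q_limit(q, 5)
build_url(q)
#> [1] "https://www.example.com/api?id=id&$select=Sepal.Length,Sepal.Width&$where=Sepal.Width>0.1&limit=5"
```

A real package would translate R expressions like `Sepal.Width > 0.1` into the `$where` string automatically; here the clause is passed as text only to keep the sketch short.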
# Results can be combined with dplyr/tidyr functions after fetching data.
# One potential issue is naming conflicts with those packages.
data <- read_api(data = "id") %>%
  select(cols) %>%
  top_n(5) %>%
  download_api() %>%
  dplyr::mutate(Sepal.Width.Inches = Sepal.Width * 0.393701)
data
#> # A tibble: 5 x 3
#> Sepal.Length Sepal.Width Sepal.Width.Inches
#> <dbl> <dbl> <dbl>
#> 1 5.1 3.5 1.3779535
#> 2 4.9 3 1.1811030
#> 3 4.7 3.2 1.2598432
#> 4 4.6 3.1 1.2204731
#> 5 5 3.6 1.4173236
# Two key porcelain functions are needed: read_api() and download_api().
# read_api() focuses on retrieving metadata so users can understand what is
# available, and on building out the data frame where data will be placed.
# download_api() accumulates the arguments, compiles the download URL, and
# then handles the downloading process.
# These will need to be specific to an API source
read_api(data,      # Some unique identifier for a particular data set
         base_url   # Base URL, e.g., www.example.com/api/v2
)
download_api(print_url, # Print the compiled URL on the console
             page_size  # How much data to fetch with each GET
)
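One way `page_size` could work, sketched with a hypothetical helper: the row count comes from the metadata step, and each GET then requests one page via limit/offset-style parameters.

```r
# Hypothetical sketch: compute the starting offset of each paged GET request.
page_offsets <- function(total_rows, page_size) {
  if (total_rows <= 0) return(integer(0))
  seq(0L, total_rows - 1L, by = page_size)
}

# A 150-row data set fetched 50 rows at a time needs three requests,
# starting at offsets 0, 50, and 100.
page_offsets(150L, 50L)
```

`download_api()` would loop over these offsets, issue one GET per page, and bind the pages into the final tibble, keeping any single request small.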
# API wrappers often serve two audiences. Some users are not familiar with
# APIs and appreciate a syntax that is familiar in R. Others can write and
# understand native REST calls, and use a wrapper mainly to handle data
# types and the GET request. The latter group could call `download_api()`
# directly. This is also handy for anyone who finds a fully specified URL
# on the web and wants to copy/paste it directly.
# This is a direct example to download data
data <- download_api(url = "http://www.example.com/api/v2?$id=id&$select=Sepal.Width,Sepal.Length&$where=Sepal.Length>1.0")
# The above is equivalent to the tidy-like query below. Note that the `url`
# argument of `download_api()` cannot be combined with this approach.
data <- read_api(data = "id") %>%
  select(Sepal.Width, Sepal.Length) %>%
  filter(Sepal.Length > 1.0) %>%
  download_api(print_url = TRUE)
#> "http://www.example.com/api/v2?$id=id&$select=Sepal.Width,Sepal.Length&$where=Sepal.Length>1.0"