Skip to content

Instantly share code, notes, and snippets.

@edonnachie
Created July 18, 2022 12:02
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save edonnachie/6e12ef92b9a672e68a036c489e93d0ae to your computer and use it in GitHub Desktop.
Save edonnachie/6e12ef92b9a672e68a036c489e93d0ae to your computer and use it in GitHub Desktop.
library(dplyr)
library(arrow)
library(duckdb)
# install.packages(c("dplyr", "arrpw", "duckdb", "nycflights13"))
## Export the nycflights13 dataset to arrow format ----
nycflights13::flights |>
# Partition by month to speed up and make keep files small
group_by(month) |>
arrow::write_dataset(path = "nyflights/flights")
nycflights13::airlines |>
arrow::write_dataset(path = "nyflights/airlines")
nycflights13::airports |>
arrow::write_dataset(path = "nyflights/airports")
nycflights13::weather |>
arrow::write_dataset(path = "nyflights/weather")
## Open the dataset using arrow ----
flights <- open_dataset("nyflights/flights", partitioning = "month")
airlines <- open_dataset("nyflights/airlines")
weather <- open_dataset("nyflights/weather")
airports <- open_dataset("nyflights/airports")
## DuckDB setup ----
# Initialise a duckDB database connection
duck <- dbConnect(duckdb::duckdb())
# Register the arrow datasets as duckDB views
duckdb::duckdb_register_arrow(duck, "flights", flights)
duckdb::duckdb_register_arrow(duck, "airlines", airlines)
duckdb::duckdb_register_arrow(duck, "airports", airports)
duckdb::duckdb_register_arrow(duck, "weather", weather)
## Do some queries ----
# Using the arrow datasets, without duckDB
flights |>
filter(month == 5 & day == 15) |>
left_join(airports, by = c("origin" = "faa")) |>
collect()
# dbplyr
tbl(duck, "flights") |>
filter(month == 2) |>
left_join(tbl(duck, "airports"), by = c("origin" = "faa"))
# duckDB
dbGetQuery(duck, "
select dest, count(*) n_flights from flights
inner join weather
using (origin, year, month, day, hour)
where wind_speed > 15
group by dest
order by n_flights desc
")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment