grantmcdermott/duckdb-dplyr-ex.R

## duckdb-dplyr-ex.R
# preliminaries -----

# packages you'll need (install these if you don't have them)
# install.packages(c("arrow", "duckdb", "dplyr", "tidyr"))

# write a dummy parquet dataset (i.e., multiple parquet files) to disk
dir.create("airquality")
arrow::write_dataset(
  airquality,
  path = "airquality",
  partitioning = c("Month") # <-- NB: partition your data by the main grouping vars for efficiency
)
# check files (note Hive-style partitioning "month=X", etc)
dir("airquality")

# duckdb + dplyr -----

library(duckdb)
library(dplyr)

# create connection in-memory duckdb database
con = dbConnect(duckdb())
# add parquet dataset to the connection and reference it from R by assigning
# a name (here: aq)
aq = tbl(con, "airquality/**/*.parquet")

# aside: another way of referencing a duckdb table is by going via arrow
# see the example at the bottom of this script

# preview the dataset
aq

# now run dplyr commands as per normal; these will
# automatically be translated to the DuckDB SQL backend

aq |>
  summarise(
    across(Ozone:Temp, mean),
    .by = Month
  )

# Note: the above still just a (lazy) preview. To actually pull the data into
# R's memory, call collect() at the end

aq |>
  summarise(
    across(Ozone:Temp, mean),
    .by = Month
  ) |>
  collect()

# The collect feature is nice because it lets us defer computation for as long
# as possible, allowing you to string together queries

q1 = aq |>
  filter(Month == 7)
q2 = q1 |>
  summarise(
    across(Ozone:Temp, mean),
    .by = Day
  )

# Pro-tip: use explain to see the underlying SQL and DuckDB's optimised query sequence
explain(q2)

collect(q2)


# joins and pivot (reshape) front-end functions all work too. Although you'll
# need to load tidyr for the latter

library(tidyr)
q2 |>
  pivot_longer(Ozone:Temp) |>
  collect()


# ---- duckdb + dplyr + arrow

# As an alternative to the tbl() reference method, you can go via arrow, thanks
# to zero-copy and the to_duckdb() function

library(arrow)

aq2 = arrow::open_dataset("airquality")
aq2 # just shows the arrow schema

aq2 |>
  to_duckdb() |> # <-- key step
  summarise(
    across(Ozone:Temp, mean),
    .by = Month
  ) |>
  collect()

# One nice feature of going through the arrow intermediary is that it correctly
# picks up the partition column types (e.g., numeric etc.)

aq2 |> to_duckdb() |> filter(Month <= 6) |> head()


# If you tried this next query with the original aq tbl object, it still works
# thanks to some internal magic. But note that Month is actually registered as
# a character type (and this can mess up sorting later on)
aq |> filter(Month <= 6) |> head()

# TBF you can fix the latter issue by mutating Month to as.integer first but YMMV.
	# preliminaries -----

	# packages you'll need (install these if you don't have them)
	# install.packages(c("arrow", "duckdb", "dplyr", "tidyr"))

	# write a dummy parquet dataset (i.e., multiple parquet files) to disk
	dir.create("airquality")
	arrow::write_dataset(
	airquality,
	path = "airquality",
	partitioning = c("Month") # <-- NB: partition your data by the main grouping vars for efficiency
	)
	# check files (note Hive-style partitioning "month=X", etc)
	dir("airquality")

	# duckdb + dplyr -----

	library(duckdb)
	library(dplyr)

	# create connection in-memory duckdb database
	con = dbConnect(duckdb())
	# add parquet dataset to the connection and reference it from R by assigning
	# a name (here: aq)
	aq = tbl(con, "airquality/*/.parquet")

	# aside: another way of referencing a duckdb table is by going via arrow
	# see the example at the bottom of this script

	# preview the dataset
	aq

	# now run dplyr commands as per normal; these will
	# automatically be translated to the DuckDB SQL backend

	aq \|>
	summarise(
	across(Ozone:Temp, mean),
	.by = Month
	)

	# Note: the above still just a (lazy) preview. To actually pull the data into
	# R's memory, call collect() at the end

	aq \|>
	summarise(
	across(Ozone:Temp, mean),
	.by = Month
	) \|>
	collect()

	# The collect feature is nice because it lets us defer computation for as long
	# as possible, allowing you to string together queries

	q1 = aq \|>
	filter(Month == 7)
	q2 = q1 \|>
	summarise(
	across(Ozone:Temp, mean),
	.by = Day
	)

	# Pro-tip: use explain to see the underlying SQL and DuckDB's optimised query sequence
	explain(q2)

	collect(q2)


	# joins and pivot (reshape) front-end functions all work too. Although you'll
	# need to load tidyr for the latter

	library(tidyr)
	q2 \|>
	pivot_longer(Ozone:Temp) \|>
	collect()


	# ---- duckdb + dplyr + arrow

	# As an alternative to the tbl() reference method, you can go via arrow, thanks
	# to zero-copy and the to_duckdb() function

	library(arrow)

	aq2 = arrow::open_dataset("airquality")
	aq2 # just shows the arrow schema

	aq2 \|>
	to_duckdb() \|> # <-- key step
	summarise(
	across(Ozone:Temp, mean),
	.by = Month
	) \|>
	collect()

	# One nice feature of going through the arrow intermediary is that it correctly
	# picks up the partition column types (e.g., numeric etc.)

	aq2 \|> to_duckdb() \|> filter(Month <= 6) \|> head()


	# If you tried this next query with the original aq tbl object, it still works
	# thanks to some internal magic. But note that Month is actually registered as
	# a character type (and this can mess up sorting later on)
	aq \|> filter(Month <= 6) \|> head()

	# TBF you can fix the latter issue by mutating Month to as.integer first but YMMV.