duckdb: Retrieving a Postgres table with a json column and unpacking it
library(duckdb)
library(jsonlite)
library(purrr)

con <- dbConnect(duckdb())

# duckdb: install and load the postgres extension to connect
# to a Postgres database via duckdb
dbExecute(con, "INSTALL postgres;")
dbExecute(con, "LOAD postgres;")

# connect to the `ngs` database on the local host
dbExecute(con, "CALL postgres_attach(
  'dbname=ngs user=postgres host=127.0.0.1', source_schema='public', sink_schema='main');"
)
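# Side note (not from the original gist): newer duckdb releases expose the
# same functionality via the ATTACH statement, e.g.
# dbExecute(con, "ATTACH 'dbname=ngs user=postgres host=127.0.0.1' AS pg (TYPE postgres);")
# after which tables are available as pg.<table_name>.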
# show available tables
dbGetQuery(con, "PRAGMA show_tables;") # or dbListTables(con)
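# Optionally, inspect the column names and types of the attached table with a
# DESCRIBE statement, e.g. to check whether `attributes` is stored as VARCHAR
# or JSON (relevant for the casting discussed below):
dbGetQuery(con, "DESCRIBE sample;")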
# retrieve the `name` and `attributes` columns as VARCHAR types
df <- dbGetQuery(
  con,
  "SELECT name, attributes FROM sample LIMIT 5"
)
# When each record can contain different (and possibly unknown) keys, we need
# to parse the key-value pairs with a programming language. (SQL needs to know
# the names and types of the crosstab columns in advance.)
# Let's use R to parse the key-value pairs of the JSON string in the `attributes`
# column into a data.frame with a separate column for each key:
data.frame(
  name = df$name, # keep the other columns
  purrr::map_df(df$attributes, fromJSON) # cross-tab JSON key:value pairs
)
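# Optional sanity check (a sketch, not part of the original gist): fromJSON()
# errors on malformed input, so it can be worth verifying each string with
# jsonlite::validate() before parsing. Returns one TRUE/FALSE per record:
sapply(df$attributes, jsonlite::validate)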
# for future reference: the duckdb json extension enables JSON functions
# in queries https://duckdb.org/docs/extensions/json.html
dbExecute(con, "INSTALL 'json';")
dbExecute(con, "LOAD 'json';")

# examples of JSON queries
# Note: the following code explicitly casts the `attributes`
# column to `json` type. That's only necessary if the column
# is currently of type VARCHAR (= text). If it is already
# of type `json` or `jsonb`, then no casting is needed.
# does the `attributes` text field contain valid JSON (for each record)?
dbGetQuery(
  con,
  "SELECT json_valid(CAST(attributes AS text)) AS valid FROM sample;")

# list the keys in the `attributes` field of each record
# json(attributes) is equivalent to CAST(attributes AS json)
dbGetQuery(
  con,
  "SELECT json_keys(json(attributes)) AS keys FROM sample;")

# same as above, but returns the unique keys used across all records
dbGetQuery(
  con,
  "SELECT DISTINCT unnest(json_keys(json(attributes))) AS keys FROM sample;")
# check for the presence of a specific value
dbGetQuery(
  con,
  "SELECT json_contains(CAST(attributes AS json), '\"C57BL6\"') AS contains_value FROM sample;")
# extract the (scalar) value for each `genotype` key
dbGetQuery(
  con,
  "SELECT json_extract(json(attributes), '$.genotype') AS genotype FROM sample;")