Skip to content

Instantly share code, notes, and snippets.

@tomsing1
Last active September 10, 2023 18:07
Show Gist options
  • Save tomsing1/81bcb4efac364a005fc21e29210f9c4a to your computer and use it in GitHub Desktop.
Save tomsing1/81bcb4efac364a005fc21e29210f9c4a to your computer and use it in GitHub Desktop.
duckdb: Retrieving a Postgres table with a json column and unpacking it
library(duckdb)
library(jsonlite)
library(purrr)
con <- dbConnect(duckdb())
# duckdb: install and load postgres extension to connect
# to a Postgres database via duckdb
dbExecute(con, "INSTALL postgres;")
dbExecute(con, "LOAD postgres;")
# connect to the `ngs` database on the local host
dbExecute(con, "CALL postgres_attach(
'dbname=ngs user=postgres host=127.0.0.1', source_schema='public', sink_schema='main');"
)
# show available tables
dbGetQuery(con, "PRAGMA show_tables;") # or dbListTables(con)
# retrieve the `name` and `attributes` columns as VARCHAR types
df <- dbGetQuery(
con,
"SELECT name, attributes FROM sample LIMIT 5"
)
# When each record can contain different (e.g. unknown) keys, then we need
# to parse key-value pairs with a programming language. (SQL needs to know
# the name and types of crosstab column in advance.)
# Let's use R to parse the key-value pairs of the JSON string in the `attributes`
# column into a data.frame with a separate column for each key:
data.frame(
name = df$name, # keep the other columns
purrr::map_df(df$attributes, fromJSON) # cross-tab JSON key:value pairs
)
# for future reference: the duckdb json extension enables JSON functions
# in queries https://duckdb.org/docs/extensions/json.html
dbExecute(con, "INSTALL 'json';")
dbExecute(con, "LOAD 'json';")
# examples of JSON queries
# Note: the following code explicitly casts the `attributes`
# column to `json` type. That's only necessary if the column
# is currently of type VARCHAR (=text). It if is already
# of type `json` or `jsonb` then no casting is needed.
# does the attribute text field contain valid json (for each record)?
dbGetQuery(
con,
"SELECT json_valid(CAST(attributes AS text)) AS valid FROM sample;")
# list keys in the attribute field of each record
# json(attributes) = equivalent to CAST(attributes as json)
dbGetQuery(
con,
"SELECT json_keys(json(attributes)) AS keys FROM sample;")
# same as above, but returns the unique keys used across all records
dbGetQuery(
con,
"SELECT DISTINCT unnest(json_keys(json(attributes))) AS keys FROM sample;")
# check for presence of a specific value
dbGetQuery(
con,
"SELECT json_contains(CAST(attributes AS json), '\"C57BL6\"') AS keys FROM sample;")
# extract the (scalar) value for each `genotype` key
dbGetQuery(
con,
"SELECT json_extract(json(attributes), '$.genotype') AS genotype FROM sample;")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment