duckdb: Retrieving a Postgres table with a json column and unpacking it
library(duckdb)
library(jsonlite)
library(purrr)

con <- dbConnect(duckdb())

# duckdb: install and load the postgres extension to connect
# to a Postgres database via duckdb
dbExecute(con, "INSTALL postgres;")
dbExecute(con, "LOAD postgres;")

# connect to the `ngs` database on the local host
dbExecute(con, "CALL postgres_attach(
  'dbname=ngs user=postgres host=127.0.0.1', source_schema='public', sink_schema='main');"
)
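# Side note (not from the original gist): newer duckdb releases expose the
# same functionality via the ATTACH statement, e.g.
# dbExecute(con, "ATTACH 'dbname=ngs user=postgres host=127.0.0.1' AS pg (TYPE postgres);")
# after which tables are available as pg.<table_name>.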
# show available tables
dbGetQuery(con, "PRAGMA show_tables;") # or dbListTables(con)
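# Optionally, inspect the column names and types of the attached table with a
# DESCRIBE statement, e.g. to check whether `attributes` is stored as VARCHAR
# or JSON (relevant for the casting discussed below):
dbGetQuery(con, "DESCRIBE sample;")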
# retrieve the `name` and `attributes` columns as VARCHAR types
df <- dbGetQuery(
  con,
  "SELECT name, attributes FROM sample LIMIT 5"
)
# When each record can contain different (and possibly unknown) keys, we need
# to parse the key-value pairs with a programming language. (SQL needs to know
# the names and types of the crosstab columns in advance.)
# Let's use R to parse the key-value pairs of the JSON string in the `attributes`
# column into a data.frame with a separate column for each key:
data.frame(
  name = df$name, # keep the other columns
  purrr::map_df(df$attributes, fromJSON) # cross-tab JSON key:value pairs
)
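# Optional sanity check (a sketch, not part of the original gist): fromJSON()
# errors on malformed input, so it can be worth verifying each string with
# jsonlite::validate() before parsing. Returns one TRUE/FALSE per record:
sapply(df$attributes, jsonlite::validate)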
# for future reference: the duckdb json extension enables JSON functions
# in queries https://duckdb.org/docs/extensions/json.html
dbExecute(con, "INSTALL 'json';")
dbExecute(con, "LOAD 'json';")

# examples of JSON queries
# Note: the following code explicitly casts the `attributes`
# column to `json` type. That's only necessary if the column
# is currently of type VARCHAR (= text). If it is already
# of type `json` or `jsonb`, then no casting is needed.
# does the `attributes` text field contain valid JSON (for each record)?
dbGetQuery(
  con,
  "SELECT json_valid(CAST(attributes AS text)) AS valid FROM sample;")

# list the keys in the `attributes` field of each record
# json(attributes) is equivalent to CAST(attributes AS json)
dbGetQuery(
  con,
  "SELECT json_keys(json(attributes)) AS keys FROM sample;")

# same as above, but returns the unique keys used across all records
dbGetQuery(
  con,
  "SELECT DISTINCT unnest(json_keys(json(attributes))) AS keys FROM sample;")
# check for the presence of a specific value
dbGetQuery(
  con,
  "SELECT json_contains(CAST(attributes AS json), '\"C57BL6\"') AS contains_value FROM sample;")
# extract the (scalar) value for each `genotype` key
dbGetQuery(
  con,
  "SELECT json_extract(json(attributes), '$.genotype') AS genotype FROM sample;")