Skip to content

Instantly share code, notes, and snippets.

@jaeddy
Created April 24, 2020 20:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jaeddy/fd5b3c82cb6b0c7de0a7aaded607e06d to your computer and use it in GitHub Desktop.
Save jaeddy/fd5b3c82cb6b0c7de0a7aaded607e06d to your computer and use it in GitHub Desktop.
title output
List Column Switch
html_notebook
toc toc_float
true
true

Summary

Convert a single column in a Synapse Table with comma-separated lists of values in STRING items into a STRING_LIST column (and save a copy of the original); remove the STRING_LIST column and revert to the original if you need to access the table with synapser.

Setup

Environment

library(reticulate)
library(dccvalidator)
library(tidyverse)

# local conda env with Python 3.7 and synapseclient installed
use_condaenv("csbc-pson-dcc", required = TRUE)

Synapse things

synapseclient <- reticulate::import("synapseclient")
syntab <- reticulate::import("synapseclient.table")
syn <- synapseclient$Synapse()
syn$login()

Functions

Helpers

#' Refresh the rows of a table in Synapse
#'
#' Note: this will overwrite all existing rows, not attempt to merge or
#' update only those rows with changes.
#'
#' @param table_id A string specifying the Synapse Table ID
#' @param update_df A dataframe or tbl representing the new version of the table
#' @param syn A reticulated synapseclient Python object
#' @param syntab A reticulated synapseclient.table module
#'
#' @return The reticulated synapseclient.table.CsvFileTable object representing
#' the updated table

#' @export
#'
#' @examples update_synapse_table("syn21980893", update_df, syn, syntab)
update_synapse_table <- function(table_id, update_df, syn, syntab) {
  current_rows <- syn$tableQuery(glue::glue("SELECT * FROM {table_id}"))
  syn$delete(current_rows)
  tmpfile <- fs::file_temp("rows.csv")
  write_csv(update_df, tmpfile)
  update_rows <- syntab$Table(table_id, tmpfile)
  syn$store(update_rows)
}

#' Convert comma-separated string values into a JSON array string
#'
#' @param str A string with values separated by commas
#'
#' @return The string containing a JSON array of the values
#' @export
#'
#' @examples
#' .csv_to_json("foo, bar, baz")
.csv_str_to_json <- function(str) {
  json_str <- str_split(str, ",")[[1]] %>% 
    str_trim() %>% 
    jsonlite::toJSON()
  if (json_str == "[null]") {
    "[]"
  } else {
    json_str
  }
}

#' Compute the maximum string length for a string list column with JSON arrays
#'
#' @param list_col A character vector with all items in the list column
#'
#' @return The (integer) length of the largest string
#' @export
#'
#' @examples 
#' .max_list_str_length(c('["123", "123456"]', '["1234", "12345"]' )
.max_list_str_length <- function(list_col) {
  map(list_col, function(l) {
  if (!is.na(l)) {
    jsonlite::fromJSON(l) %>% 
      map_int(str_length) %>% 
      max(na.rm = TRUE) %>%
      I
  } else {
    0L
  }
}) %>% 
    discard(is.infinite) %>%
    flatten_int() %>% 
    max()
}

The meat

add_list_column

add_list_column <- function(table_id, column_name, delimiter = ",", 
                            syn, syntab) {
  df <- dccvalidator::get_synapse_table(table_id, syn)
  
  old_name <- str_c(column_name, "asCsv", sep = "_")
  column_name <- rlang::ensym(column_name)
  old_name <- rlang::ensym(old_name)
  
  df <- df %>% 
    mutate(!!old_name := str_trim(!!column_name)) %>% 
    rowwise() %>% 
    mutate(!!column_name := .csv_str_to_json(!!column_name)) %>% 
    ungroup()
  
  max_str_len <- .max_list_str_length(df[[rlang::as_string(column_name)]])*1.5
  max_str_len <- as.integer(round(max_str_len))

  col_id <-reticulate::iterate(syn$getTableColumns(table_id))  %>%
    map(~ jsonlite::fromJSON(.$json())) %>%
    keep(~ .$name == column_name) %>%
    map_chr("id")
  old_column <- syn$getColumn(col_id)


  new_column <- syn$store(
    syntab$Column(
      name = rlang::as_string(column_name),
      columnType = "STRING_LIST",
      maximumSize = max_str_len,
      facetType = "enumeration"
    )
  )

  schema <- syn$get(table_id)
  schema$removeColumn(old_column)
  schema$addColumn(new_column)

  old_column_data <- jsonlite::fromJSON(old_column$json())
  old_column <- syn$store(
    syntab$Column(
      name = rlang::as_string(old_name),
      columnType = old_column_data$columnType,
      maximumSize = old_column_data$maximumSize
    )
  )
  schema$addColumn(old_column)

  schema <- syn$store(schema)

  update_synapse_table(table_id, df, syn, syntab)
}

remove_list_column

remove_list_column <- function(table_id, column_name, delimiter = ",", 
                               syn, syntab) {
  df <- dccvalidator::get_synapse_table(table_id, syn)
  
  new_name <- str_c(column_name, "asCsv", sep = "_")
  column_name <- rlang::ensym(column_name)
  new_name <- rlang::ensym(new_name)
  
  df <- df %>% 
    select(-!!column_name) %>% 
    rename(!!column_name := !!new_name)

  col_id <-reticulate::iterate(syn$getTableColumns(table_id))  %>% 
    map(~ jsonlite::fromJSON(.$json())) %>% 
    keep(~ .$name == column_name) %>% 
    map_chr("id")
  old_column <- syn$getColumn(col_id)
  
  new_col_id <-reticulate::iterate(syn$getTableColumns(table_id))  %>% 
    map(~ jsonlite::fromJSON(.$json())) %>% 
    keep(~ .$name == rlang::as_string(new_name)) %>% 
    map_chr("id")
  new_column <- syn$getColumn(new_col_id)
  
  schema <- syn$get(table_id)
  schema$removeColumn(old_column)
  schema$removeColumn(new_column)
  schema <- syn$store(schema)
  
  new_column_data <- jsonlite::fromJSON(new_column$json())
  new_column <- syn$store(
    syntab$Column(
      name = rlang::as_string(column_name),
      columnType = new_column_data$columnType,
      maximumSize = new_column_data$maximumSize,
      facetType = "enumeration"
    )
  )
  schema$addColumn(new_column)
  
  schema <- syn$store(schema)
  
  update_synapse_table(table_id, df, syn, syntab)
}

Example

Adding a list column

table_id <- "syn21980893"
dccvalidator::get_synapse_table(table_id, syn)
##   string_col       list_col_2
## 1       row1 row12_1, row12_2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment