Skip to content

Instantly share code, notes, and snippets.

@jaeddy
Created April 24, 2020 20:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jaeddy/8b0eace7dc039d63eb1e9916eed07516 to your computer and use it in GitHub Desktop.
Save jaeddy/8b0eace7dc039d63eb1e9916eed07516 to your computer and use it in GitHub Desktop.
title output
List Column Switch
html_notebook
toc toc_float
true
true

Summary

Convert a single column in a Synapse Table with comma-separated lists of values in STRING items into a STRING_LIST column (and save a copy of the original); remove the STRING_LIST column and revert to the original if you need to access the table with synapser.

Setup

Environment

library(reticulate)
library(dccvalidator)
library(tidyverse)

# local conda env with Python 3.7 and synapseclient installed
use_condaenv("csbc-pson-dcc", required = TRUE)

Synapse things

synapseclient <- reticulate::import("synapseclient")
syntab <- reticulate::import("synapseclient.table")
syn <- synapseclient$Synapse()
syn$login()

Functions

Helpers

#' Refresh the rows of a table in Synapse
#'
#' Note: this will overwrite all existing rows, not attempt to merge or
#' update only those rows with changes.
#'
#' @param table_id A string specifying the Synapse Table ID
#' @param update_df A dataframe or tbl representing the new version of the table
#' @param syn A reticulated synapseclient Python object
#' @param syntab A reticulated synapseclient.table module
#'
#' @return The reticulated synapseclient.table.CsvFileTable object representing
#' the updated table

#' @export
#'
#' @examples update_synapse_table("syn21980893", update_df, syn, syntab)
update_synapse_table <- function(table_id, update_df, syn, syntab) {
  current_rows <- syn$tableQuery(glue::glue("SELECT * FROM {table_id}"))
  syn$delete(current_rows)
  tmpfile <- fs::file_temp("rows.csv")
  write_csv(update_df, tmpfile)
  update_rows <- syntab$Table(table_id, tmpfile)
  syn$store(update_rows)
}

#' Convert comma-separated string values into a JSON array string
#'
#' @param str A string with values separated by commas
#'
#' @return The string containing a JSON array of the values
#' @export
#'
#' @examples
#' .csv_to_json("foo, bar, baz")
.csv_str_to_json <- function(str) {
  json_str <- str_split(str, ",")[[1]] %>% 
    str_trim() %>% 
    jsonlite::toJSON()
  if (json_str == "[null]") {
    "[]"
  } else {
    json_str
  }
}

#' Compute the maximum string length for a string list column with JSON arrays
#'
#' @param list_col A character vector with all items in the list column
#'
#' @return The (integer) length of the largest string
#' @export
#'
#' @examples 
#' .max_list_str_length(c('["123", "123456"]', '["1234", "12345"]' )
.max_list_str_length <- function(list_col) {
  map(list_col, function(l) {
  if (!is.na(l)) {
    jsonlite::fromJSON(l) %>% 
      map_int(str_length) %>% 
      max(na.rm = TRUE) %>%
      I
  } else {
    0L
  }
}) %>% 
    discard(is.infinite) %>%
    flatten_int() %>% 
    max()
}

The meat

add_list_column

add_list_column <- function(table_id, column_name, delimiter = ",", 
                            syn, syntab) {
  df <- dccvalidator::get_synapse_table(table_id, syn)
  
  old_name <- str_c(column_name, "asCsv", sep = "_")
  column_name <- rlang::ensym(column_name)
  old_name <- rlang::ensym(old_name)
  
  df <- df %>% 
    mutate(!!old_name := str_trim(!!column_name)) %>% 
    rowwise() %>% 
    mutate(!!column_name := .csv_str_to_json(!!column_name)) %>% 
    ungroup()
  
  max_str_len <- .max_list_str_length(df[[rlang::as_string(column_name)]])*1.5
  max_str_len <- as.integer(round(max_str_len))

  col_id <-reticulate::iterate(syn$getTableColumns(table_id))  %>%
    map(~ jsonlite::fromJSON(.$json())) %>%
    keep(~ .$name == column_name) %>%
    map_chr("id")
  old_column <- syn$getColumn(col_id)


  new_column <- syn$store(
    syntab$Column(
      name = rlang::as_string(column_name),
      columnType = "STRING_LIST",
      maximumSize = max_str_len,
      facetType = "enumeration"
    )
  )

  schema <- syn$get(table_id)
  schema$removeColumn(old_column)
  schema$addColumn(new_column)

  old_column_data <- jsonlite::fromJSON(old_column$json())
  old_column <- syn$store(
    syntab$Column(
      name = rlang::as_string(old_name),
      columnType = old_column_data$columnType,
      maximumSize = old_column_data$maximumSize
    )
  )
  schema$addColumn(old_column)

  schema <- syn$store(schema)

  update_synapse_table(table_id, df, syn, syntab)
}

remove_list_column

remove_list_column <- function(table_id, column_name, delimiter = ",", 
                               syn, syntab) {
  df <- dccvalidator::get_synapse_table(table_id, syn)
  
  new_name <- str_c(column_name, "asCsv", sep = "_")
  column_name <- rlang::ensym(column_name)
  new_name <- rlang::ensym(new_name)
  
  df <- df %>% 
    select(-!!column_name) %>% 
    rename(!!column_name := !!new_name)

  col_id <-reticulate::iterate(syn$getTableColumns(table_id))  %>% 
    map(~ jsonlite::fromJSON(.$json())) %>% 
    keep(~ .$name == column_name) %>% 
    map_chr("id")
  old_column <- syn$getColumn(col_id)
  
  new_col_id <-reticulate::iterate(syn$getTableColumns(table_id))  %>% 
    map(~ jsonlite::fromJSON(.$json())) %>% 
    keep(~ .$name == rlang::as_string(new_name)) %>% 
    map_chr("id")
  new_column <- syn$getColumn(new_col_id)
  
  schema <- syn$get(table_id)
  schema$removeColumn(old_column)
  schema$removeColumn(new_column)
  schema <- syn$store(schema)
  
  new_column_data <- jsonlite::fromJSON(new_column$json())
  new_column <- syn$store(
    syntab$Column(
      name = rlang::as_string(column_name),
      columnType = new_column_data$columnType,
      maximumSize = new_column_data$maximumSize,
      facetType = "enumeration"
    )
  )
  schema$addColumn(new_column)
  
  schema <- syn$store(schema)
  
  update_synapse_table(table_id, df, syn, syntab)
}

Example

Adding a list column

table_id <- "syn21980893"
dccvalidator::get_synapse_table(table_id, syn)
##   string_col       list_col_2
## 1       row1 row12_1, row12_2
table <- add_list_column(
  "syn21980893", column_name = "list_col_1", ",", syn, syntab
)
## Error in stri_trim_both(string): object 'list_col_1' not found
dccvalidator::get_synapse_table(table_id, syn)
##   string_col       list_col_2
## 1       row1 row12_1, row12_2

Removing a list column

table <- remove_list_column(
  "syn21980893", column_name = "list_col_1", ",", syn, syntab
)
## Error in is_character(x): object 'list_col_1' not found
dccvalidator::get_synapse_table(table_id, syn)
##   string_col       list_col_2
## 1       row1 row12_1, row12_2

Fin.

sessionInfo()
## R version 3.5.3 (2019-03-11)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] gistr_0.5.0        forcats_0.4.0      stringr_1.4.0     
##  [4] dplyr_0.8.3        purrr_0.3.4        readr_1.3.1       
##  [7] tidyr_1.0.0        tibble_2.1.3       ggplot2_3.2.0     
## [10] tidyverse_1.2.1    dccvalidator_0.2.0 shinyBS_0.61      
## [13] reticulate_1.14   
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.0           pkgload_1.0.2        jsonlite_1.6        
##  [4] modelr_0.1.4         shiny_1.3.2          assertthat_0.2.1    
##  [7] askpass_1.1          cellranger_1.1.0     yaml_2.2.0          
## [10] remotes_2.0.4        pillar_1.4.2         backports_1.1.4     
## [13] lattice_0.20-38      glue_1.3.1           digest_0.6.20       
## [16] promises_1.0.1       rvest_0.3.4          colorspace_1.4-1    
## [19] htmltools_0.3.6      httpuv_1.5.1         pkgconfig_2.0.2     
## [22] broom_0.5.2          haven_2.1.0          config_0.3          
## [25] xtable_1.8-4         scales_1.0.0         later_0.8.0         
## [28] openssl_1.4          generics_0.0.2       usethis_1.5.0       
## [31] ellipsis_0.2.0.1     withr_2.1.2          lazyeval_0.2.2      
## [34] cli_1.1.0            magrittr_1.5         crayon_1.3.4        
## [37] readxl_1.3.1         mime_0.7             evaluate_0.14       
## [40] golem_0.2.1          fs_1.3.1             dockerfiler_0.1.3   
## [43] fansi_0.4.0          nlme_3.1-139         xml2_1.2.0          
## [46] shinydashboard_0.7.1 tools_3.5.3          hms_0.5.0           
## [49] lifecycle_0.1.0      munsell_0.5.0        packrat_0.5.0       
## [52] compiler_3.5.3       rlang_0.4.0          grid_3.5.3          
## [55] attempt_0.3.0        rstudioapi_0.10      base64enc_0.1-3     
## [58] rmarkdown_1.12       testthat_2.1.1       gtable_0.3.0        
## [61] curl_3.3             roxygen2_7.1.0       R6_2.4.1            
## [64] lubridate_1.7.4      knitr_1.23           zeallot_0.1.0       
## [67] utf8_1.1.4           rprojroot_1.3-2      desc_1.2.0          
## [70] stringi_1.4.3        Rcpp_1.0.1           vctrs_0.2.0         
## [73] tidyselect_0.2.5     xfun_0.8
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment