Skip to content

Instantly share code, notes, and snippets.

@vjcitn
Created March 15, 2024 13:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vjcitn/e30bca744e28d9dbc4388ed57a7f6d04 to your computer and use it in GitHub Desktop.
Save vjcitn/e30bca744e28d9dbc4388ed57a7f6d04 to your computer and use it in GitHub Desktop.
Defines a function probe_lake() that launches a Shiny app for exploring the BiocBuildDB data lake.
# setup
library(aws.s3)
library(DBI)
library(dplyr)
library(duckdb)
library(shiny)
# List the bucket contents as a data.frame "bb" (one row per object)
bb <- get_bucket_df("s3://bioc-builddb-mirror/buildResults")
# The file type is the second "-"-separated field of each key,
# e.g. "info.csv.gz" for "buildResults/<hash>-info.csv.gz"
type <- sapply(strsplit(bb$Key, "-"), "[", 2)
# table(type)
bb$type <- type
# Calendar date on which each object was last modified
bb$repdate <- as.Date(bb$LastModified)
# for testing
devinf <- "buildResults/f9785dba87426695825cc6524dcb82c6-info.csv.gz"
# Reuse a connection named "con" if one already exists in scope
if (!exists("con")) {
  con <- dbConnect(duckdb::duckdb(), read_only = TRUE)
}
# The httpfs extension is installed and loaded so s3:// paths can be queried
dbExecute(con, "install 'httpfs'")
dbExecute(con, "load 'httpfs'")
# File-name suffixes recognised by probe_file()
available_types <- c(
  "build_summary.csv.gz",
  "info.csv.gz",
  "propagation_status.csv.gz",
  "report.tgz"
)
#' Probe a file in the BiocBuildDB mirror bucket
#'
#' Given a bucket file id (key) and a duckdb connection, query the file
#' as CSV from s3://bioc-builddb-mirror and return a list of relevant
#' information, including the first 6 records of the table in `head`.
#'
#' @param key character(1) filename (key) in bucket bioc-builddb-mirror
#' @param con duckdb db connection
#' @return a list with a provisional S3 class corresponding
#' to the file type, with elements fields, nrec, head, key,
#' type, guess, branch. `guess` and `branch` are only filled in for
#' info.csv.gz files (otherwise ""); `guess` is "unknown" when none of the
#' marker packages is present. If `key` matches none of `available_types`,
#' `type` is character(0) and no extra class is added.
#' @export
probe_file <- function(key, con) {
  stopifnot(is.character(key), length(key) == 1L, !is.na(key))
  pa <- sprintf("s3://bioc-builddb-mirror/%s", key)
  # double any single quote so the key cannot terminate the SQL string literal
  sqlstring <- sprintf("FROM read_csv('%s')", gsub("'", "''", pa, fixed = TRUE))
  # acquire the (lazy) table and learn fields and number of records
  tmp <- dplyr::tbl(con, dplyr::sql(sqlstring))
  fields <- colnames(tmp)
  nrec <- tmp |>
    dplyr::count() |>
    as.data.frame() |>
    unlist() |>
    as.numeric()
  # get first 6 records in data.frame for 'probing'
  thead <- tmp |>
    head() |>
    as.data.frame()
  # obtain the file type based on substring of key; fixed = TRUE so that
  # the "." in e.g. "info.csv.gz" is matched literally, not as a wildcard
  matched <- vapply(
    available_types,
    function(ty) grepl(ty, key, fixed = TRUE),
    logical(1)
  )
  type <- names(matched)[matched]
  # guess the content type among experiment, software, annotation, book
  guess <- ""
  branch <- ""
  # specific guess code for info.csv; identical() is safe when `type` has
  # length zero (no known type matched), where `==` inside if() would error
  if (identical(type, "info.csv.gz")) {
    # one marker package per content type
    guesses <- c(
      experiment = "affydata",
      software = "a4Reporting",
      annotation = "AHCytoBands",
      book = "csawBook"
    )
    # a single query for all marker packages instead of one scan per package
    present <- tmp |>
      dplyr::filter(Package %in% !!unname(guesses)) |>
      dplyr::distinct(Package) |>
      dplyr::pull(Package)
    guess <- names(guesses)[guesses %in% present]
    if (length(guess) == 0L) {
      guess <- "unknown"
    }
    branch <- thead$git_branch[1]
  }
  # produce list and add class for some generics like print() and details()
  ans <- list(
    fields = fields, nrec = nrec, head = thead, key = key,
    type = type, guess = guess, branch = branch
  )
  class(ans) <- c(type, class(ans))
  ans
}
# Print method for probe results of info.csv.gz files: a one-line summary
# giving the guessed content type, the git branch and the record count.
# A zero-length `guess` or `branch` is shown as "unknown"; without this,
# sprintf() returns character(0) and nothing at all is printed.
# Returns NULL invisibly (the value of cat()), as the sibling print methods do.
print.info.csv.gz <- function(x, ...) {
  or_unknown <- function(v) if (length(v) == 0L) "unknown" else v
  cat(sprintf(
    "%s info for %s branch with %d records\n",
    or_unknown(x$guess), or_unknown(x$branch), x$nrec
  ))
}
# Print method for probe results of build_summary.csv.gz files:
# writes a one-line summary with the record count.
print.build_summary.csv.gz <- function(x, ...) {
  n_records <- x$nrec
  summary_line <- sprintf("build_summary with %d records\n", n_records)
  cat(summary_line)
}
# Print method for probe results of propagation_status.csv.gz files:
# writes a one-line summary with the record count.
print.propagation_status.csv.gz <- function(x, ...) {
  n_records <- x$nrec
  summary_line <- sprintf("propagation_status with %d records\n", n_records)
  cat(summary_line)
}
# S3 generic: dispatches on the class of x to a details.<class> method.
details <- function(x) {
  UseMethod("details")
}
# details() method for probe results of info.csv.gz files: reports the
# git branch found in the first probed record and the record count.
details.info.csv.gz <- function(x) {
  first_branch <- x$head$git_branch[1]
  msg <- sprintf("info.csv for %s branch, %d records.\n", first_branch, x$nrec)
  cat(msg)
}
# Page layout: a sidebar with the date / mode selectors and a slot ("boxes")
# for the server-generated file picker, and a main panel showing the
# one-line probe summary above the table of probed records.
ui <- fluidPage(
  sidebarLayout(
    sidebarPanel = sidebarPanel(
      helpText("probe BiocBuildDB bucket contents"),
      dateInput(
        inputId = "date",
        label = "date",
        value = "2024-03-07",
        min = "2024-02-29",
        max = "2024-03-15"
      ),
      radioButtons(
        inputId = "mode",
        label = "mode",
        choices = c("info", "propagation", "build_summary"),
        selected = "info"
      ),
      uiOutput(outputId = "boxes")
    ),
    mainPanel = mainPanel(
      tabsetPanel(
        tabPanel(
          title = "main",
          verbatimTextOutput(outputId = "thedate"),
          DT::dataTableOutput(outputId = "pick")
        )
      )
    )
  )
)
# Shiny server logic for the bucket explorer.
# Relies on three objects defined at file level: `bb` (bucket listing
# data.frame), `con` (duckdb connection) and probe_file().
server <- function(input, output) {
  #
  # assuming bb is available as the bucket data.frame, confine
  # attention to records with a specific date and mode
  # (info, propagation, build_summary)
  #
  gettab <- reactive({
    validate(need(input$date, "pick a date"))
    req(input$mode)
    on_date <- which(bb$repdate == as.Date(input$date))
    # fixed = TRUE: the mode is a literal substring of the type, not a regex
    of_mode <- grep(input$mode, bb$type, fixed = TRUE)
    tmp <- bb[intersect(on_date, of_mode), , drop = FALSE]
    # with no rows the file picker would have no choices and an NA selection
    validate(need(nrow(tmp) > 0, "no files for this date and mode"))
    rownames(tmp) <- make.names(
      paste(tmp$type, tmp$repdate, sep = ":"),
      unique = TRUE
    )
    tmp
  })
  #
  # use output of gettab to find a selected file (named in 'Key' field)
  # and probe it
  #
  do_probe <- reactive({
    tmp <- gettab()
    sel <- input$tabs
    validate(need(length(sel) == 1L && nzchar(sel), "waiting for tab"))
    # input$tabs can still hold a selection made for a previous date/mode
    # until the re-rendered radio buttons report back; such a name is not a
    # row of the current table and must not be turned into an NA key
    idx <- match(sel, rownames(tmp))
    validate(need(!is.na(idx), "waiting for tab"))
    # match() gives exact row lookup; character row indexing would allow
    # partial matches
    kk <- tmp[idx, "Key"]
    validate(need(!is.na(kk) && nzchar(kk), "getting content"))
    probe_file(kk, con)
  })
  #
  # provide 6 records from selected table
  #
  output$pick <- DT::renderDataTable({
    do_probe()$head
  })
  #
  # one-line summary of the probed file; print() already writes the text,
  # so its return value is discarded instead of being passed on to cat()
  #
  output$thedate <- renderPrint({
    print(do_probe())
    invisible(NULL)
  })
  #
  # generate buttons for all available files of given date and mode
  #
  output$boxes <- renderUI({
    rn <- rownames(gettab())
    radioButtons("tabs", "tabs", choices = rn, selected = rn[1])
  })
}
#
# run the app: hand the file-level `ui` and `server` objects to runApp()
#
probe_lake <- function() {
  app_parts <- list(ui = ui, server = server)
  runApp(app_parts)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment