Skip to content

Instantly share code, notes, and snippets.

View vjcitn's full-sized avatar

Vince Carey vjcitn

  • Boston
View GitHub Profile
@vjcitn
vjcitn / explore_bert.R
Last active July 17, 2024 13:35
look at 'safetensors' component of bert (use after usebert.R succeeds)
# probe R user cache file system for bert resources
explore_bert = function() {
if (!requireNamespace("tibble")) stop("please install tibble to use this function")
res = dir(tools::R_user_dir("../huggingface/hub/models--google-bert--bert-base-uncased/snapshots/", "cache"), full=TRUE, recursive=TRUE)
if (length(res)<1) stop("not finding huggingface/hub or bert components in cache")
safetpath = grep("model.safetensors", res, value=TRUE)
st = try(reticulate::import("safetensors"))
if (inherits(st, "try-error")) stop("can't import safetensors with reticulate")
oo = st$safe_open(filename=safetpath, framework="pt")
kk = oo$keys()
@vjcitn
vjcitn / usebert.R
Last active July 20, 2024 23:28
demonstration of bert-base-uncased in huggingface
use_bert = function(phrase) {
# use reticulate::py_install(c("torch", "transformers"), pip=TRUE) to set up
# devtools::source_gist() may produce some warnings related to GPU
# note that first run will populate .cache/huggingface/hub with model components
my_bert_template = "
# ensure accessible python has transformers installed
from transformers import AutoTokenizer, BertForMaskedLM, logging
from transformers import pipeline
logging.set_verbosity_error()
@vjcitn
vjcitn / get68proc.R
Last active June 16, 2024 15:38
function that will create a SingleCellExperiment by retrieving a serialized version of 'processed' PBMC 68k from Open Storage Network
get68proc = function(cache=BiocFileCache::BiocFileCache(),
targetfolder=tempdir()) {
zippath = "https://mghp.osn.xsede.org/bir190004-bucket01/BiocMatrixGenerics/pbmc68kproc.zip"
ent = BiocFileCache::bfcquery(cache, "pbmc68kproc.zip")
if (nrow(ent)==0) {
ent = BiocFileCache::bfcadd(cache, rname=zippath, action="copy")
}
refresh = BiocFileCache::bfcquery(cache, "pbmc68kproc.zip")
nzip = nrow(refresh)
ind = 1
library(TENxPBMCData)
p68 = TENxPBMCData("pbmc68k")
rownames(p68) = make.names(rowData(p68)$Symbol_TENx, unique=TRUE)
library(scater)
library(scran)
p68 = logNormCounts(p68)
library(celldex)
hpca = HumanPrimaryCellAtlasData()
library(SingleR)
library(BiocParallel)
@vjcitn
vjcitn / lang.R
Created May 18, 2024 14:42
make silly sentences
# Word pools from which the silly sentences are assembled.
nouns = c("cat", "mat", "cow", "chair", "table", "person", "food")
pastverbs = c("sat", "walked", "rode", "spent", "moved", "wrote", "managed", "went")
preps = c("on", "over", "beyond", "above", "beneath")
indic = c("the", "a", "any", "some")
adjectives = c("cute", "big", "small", "tiny", "huge", "green")

# Draw one element at random from the vector `x`.
#
# The draw is done on positions rather than with sample(x, size=1):
# sample() interprets a single number n >= 1 as the range 1:n, so a
# one-element numeric pool would yield a value that is not in the pool.
# For pools of length > 1 the result (and the RNG stream) is unchanged.
# An empty pool is an error.
get1 = function (x) {
  if (length(x) < 1) stop("cannot draw from an empty vector")
  x[sample.int(length(x), size=1)]
}
sentence = function() {
sprintf("%s %s %s %s %s %s %s\n", get1(indic), get1(nouns), get1(pastverbs),
@vjcitn
vjcitn / inst.R
Created March 25, 2024 10:51
"instrumented" do_SingleR
do_SingleRi = function(sce=NULL, path="/home/vincent/tenx3k.h5ad",
ref=celldex::HumanPrimaryCellAtlasData(),
ref.type = "label.main",
min.common = 1000, assay.type.test=1L, instrument=TRUE,
clprocid=NULL, ...) {
if (instrument == TRUE && is.null(clprocid)) stop("clprocid not set")
cl_timestamp(clprocid, "init")
stopifnot(ref.type %in% c("label.main", "label.fine"))
if (is.null(sce)) {
is_h5ad = length(grep("h5ad$", path)==1)
@vjcitn
vjcitn / testi
Last active March 25, 2024 10:51
use 'instrumented' do_SingleR(i)
# Script that brackets loading of the pbmc3k dataset with Rcollectl timestamps.
# Statement order matters: each cl_timestamp() call marks the point reached so far.

# Evaluate inst.R from the working directory, echoing each expression as it runs.
source("inst.R", echo=TRUE)
library(Rcollectl)
library(AnVILBestPractices)
library(SingleR)
library(BiocParallel)
# clid is the handle handed to every cl_timestamp() call below.
# NOTE(review): presumably cl_start() launches the collection process that the
# timestamps annotate -- confirm against the Rcollectl documentation.
clid = cl_start()
# Mark the moment just before the dataset is requested.
Rcollectl::cl_timestamp(clid, "pre-data")
p3k = TENxPBMCData::TENxPBMCData("pbmc3k")
# Mark the moment the dataset object is available.
Rcollectl::cl_timestamp(clid, "3k loaded")
# Replace row names with the Symbol column of the row data; make.names() with
# unique=TRUE turns them into syntactically valid, de-duplicated names.
# NOTE(review): rowData() is not namespace-qualified here; presumably it is
# exported by one of the packages attached above -- confirm.
rownames(p3k) = make.names(rowData(p3k)$Symbol, unique=TRUE)
@vjcitn
vjcitn / probe_lake.R
Created March 15, 2024 13:26
Defines a function probe_lake() that produces a Shiny app for exploring the BiocBuildDB data lake
# setup
library(aws.s3)
library(DBI)
library(dplyr)
library(duckdb)
library(shiny)
# get bucket content metadata into a data.frame "bb"
@vjcitn
vjcitn / dohist.R
Created March 11, 2024 11:56
histogram of commit times from an info.csv
# Set up a DuckDB connection and build the query string that reads one
# info.csv.gz file from the build-results bucket.
library(ggplot2)

# The DBI generics are namespace-qualified because this script never attaches
# DBI (duckdb is likewise only reached through `duckdb::`), so the unqualified
# calls would fail in a fresh session.
con <- DBI::dbConnect(duckdb::duckdb(), read_only=TRUE)
# Install and load DuckDB's httpfs extension before the s3:// path is queried.
# NOTE(review): presumably this is what lets read_csv() open the s3:// URL -- confirm.
DBI::dbExecute(con, "install 'httpfs'")
DBI::dbExecute(con, "load 'httpfs'")

# Object key of the info file, its full s3 URL, and the query that reads it.
devinf = "buildResults/f9785dba87426695825cc6524dcb82c6-info.csv.gz"
pa = sprintf('s3://bioc-builddb-mirror/%s', devinf)
sqlstring = sprintf("FROM read_csv('%s')", pa)
@vjcitn
vjcitn / pull_info.R
Created March 11, 2024 11:08
code to explore info.csv collected by BiocBuildDB process
allinf = c(
"buildResults/08150976a8cce9f7bf2d08a6ff86160c-info.csv.gz",
"buildResults/19dfe90f431098a035502d632404e0f2-info.csv.gz",
"buildResults/27dc71f7942ea33a660ca2495d2701d0-info.csv.gz",
"buildResults/2a0158be35acdff034889a8b072b823d-info.csv.gz",
"buildResults/2cc2a659a35d607f71655f3c9c9e4283-info.csv.gz",
"buildResults/3ebd0a185863d3d036726be357bedc60-info.csv.gz",
"buildResults/4104e02fcdf0685bf51735c7abb2c06e-info.csv.gz",
"buildResults/519f35883478df30fd90189f094770d7-info.csv.gz",
"buildResults/64744d6015f8f931cae0c13d1e50b092-info.csv.gz",