# klprint/single_cell_functions.R

## single_cell_functions.R
# Taken from Simon
# compute variances across column or rows for column-sparse matrices
library(Matrix)
# Column-wise sample variances of a column-compressed sparse matrix.
#
# Only the stored values are visited; the contribution of the implicit zeros
# is added analytically, so the matrix is never densified.
#
# spm: a "dgCMatrix".
# Returns a numeric vector with one variance (denominator nrow - 1) per
# column, named by the column names of `spm` (unnamed if there are none).
colVars_spm <- function(spm) {
  stopifnot(is(spm, "dgCMatrix"))
  n_row <- spm@Dim[1]
  ptr <- spm@p
  ans <- vapply(seq_len(spm@Dim[2]), function(j) {
    # seq_len() gives an empty index for an all-zero column, whereas
    # (p[j] + 1):p[j + 1] would count backwards and pick up values that
    # belong to neighbouring columns.
    vals <- spm@x[ptr[j] + seq_len(ptr[j + 1] - ptr[j])]
    col_mean <- sum(vals) / n_row
    # Squared deviations of the stored entries plus those of the zeros.
    sum((vals - col_mean)^2) + col_mean^2 * (n_row - length(vals))
  }, numeric(1)) / (n_row - 1)
  names(ans) <- spm@Dimnames[[2]]
  ans
}
# Row-wise sample variances of a sparse matrix: the matrix is transposed and
# handed to colVars_spm().
rowVars_spm <- function(spm) {
  transposed <- t(spm)
  colVars_spm(transposed)
}

# Select over-dispersed ("informative") genes and run a truncated PCA on them.
#
# counts: gene x cell count matrix (rowVars_spm() requires the normalised
#         matrix to be a "dgCMatrix").
# nDim:   number of principal components to compute.
# Returns a list with `pca` (the score matrix `$x` of prcomp_irlba) and
# `info.genes` (names of the selected genes).
pca_on_informative_genes <- function(counts, nDim = 20) {
  # Per-cell totals serve both the normalisation and the Poisson reference.
  cell_totals <- colSums(counts)
  fractions <- t(t(counts) / cell_totals)

  # Variance-to-mean ratio per gene, compared against 1.5x the Poisson
  # reference value.
  vmr <- rowVars_spm(fractions) / rowMeans(fractions)
  vmr_cutoff <- 1.5 * mean(1 / cell_totals)
  informative_genes <- names(which(vmr > vmr_cutoff))

  scores <- irlba::prcomp_irlba(
    t(sqrt(fractions[informative_genes, ])),
    n = nDim
  )$x

  list(pca = scores, info.genes = informative_genes)
}

# Names of genes whose variance-to-mean ratio (on per-cell fractions) exceeds
# 1.5x the Poisson reference value.
#
# counts: gene x cell count matrix; must have dimensions.
# Returns a character vector of gene names.
get.info.genes <- function(counts) {
  library(Matrix)
  stopifnot(!is.null(dim(counts)))

  cell_totals <- Matrix::colSums(counts)
  fractions <- t(t(counts) / cell_totals)

  vmr <- rowVars_spm(fractions) / rowMeans(fractions)
  vmr_cutoff <- 1.5 * mean(1 / cell_totals)

  names(which(vmr > vmr_cutoff))
}

# Run get.info.genes() separately on the cells of every batch and return the
# union of the selected genes. The number of genes found per batch is printed.
#
# all_counts:     gene x cell count matrix.
# per.cell.batch: batch label for every column of `all_counts`.
get.info.genes.batchwise <- function(all_counts, per.cell.batch) {
  batch_ids <- unique(per.cell.batch)
  genes_per_batch <- lapply(batch_ids, function(batch_id) {
    in_batch <- per.cell.batch == batch_id
    get.info.genes(all_counts[, in_batch])
  })

  print(lapply(genes_per_batch, length))

  unique(do.call("c", genes_per_batch))
}

###################


# Smooth the expression of one gene over PCA space with a local Poisson
# regression (locfit), using log total counts per cell as the base term.
#
# pca:   cell x PC matrix; at most the first 15 columns are used.
# raw:   gene x cell count matrix.
# gene:  row of `raw` to smooth.
# alpha: locfit smoothing parameter.
# Returns the predictions of the fit at the data points (`ev = dat()`).
do.smooth <- function(pca, raw, gene, alpha = .1) {
  library(locfit)
  max_pcs <- 15
  if (ncol(pca) > max_pcs) {
    cat("Reducing the number of PCs to 15\nto prevent locfit bug.")
  }
  # Cap at 15 columns without assuming that 15 exist; indexing 1:15
  # unconditionally fails for narrower inputs.
  pca <- pca[, seq_len(min(ncol(pca), max_pcs)), drop = FALSE]

  predict(locfit.raw(
    pca,
    raw[gene, ],
    base = log(colSums(raw)),
    alpha = alpha, deg = 1, family = "poisson", ev = dat()
  ))
}


# Read a 10x directory with Seurat::Read10X() and wrap the result in a list
# under the element `raw`.
get.10x.data <- function(path) {
  list(raw = Seurat::Read10X(path))
}

# Read the "umi/ft" count matrix (counts, genes, cells) from an HDF5 file and
# return it as list(raw = <Matrix>).
get.h5.data <- function(path) {
  hf <- hdf5r::H5File$new(path, mode = "r")
  # Release the file handle on every exit path, including read errors.
  on.exit(hf$close_all(), add = TRUE)

  cnts <- Matrix::Matrix(
    data = hf[["umi/ft/counts"]][, ],
    dimnames = list(hf[["umi/ft/genes"]][], hf[["umi/ft/cells"]][])
  )

  list(raw = cnts)
}


# Filter cells and genes, normalise, and run PCA on informative genes.
#
# datalist: list with a gene x cell count matrix in `$raw`.
# nGenes:   minimum number of detected genes (count > 0) per cell.
# nCells:   threshold applied to the row sums of the count matrix.
#           NOTE(review): the message says "cells per gene" but the filter
#           uses total counts per gene - confirm which is intended.
# nDim:     number of principal components.
# Returns `datalist` with `$raw` filtered and `$nrm`, `$pca`, `$info.genes`
# added.
do.preprocess <- function(datalist, nGenes = 500, nCells = 1, nDim = 20) {
  cat("Subsetting the data for: ")
  cat(nGenes, "genes per cell and", nCells, "cells per gene minimum\n")
  raw <- datalist$raw
  enough_genes <- colSums(raw > 0) >= nGenes
  raw <- raw[, enough_genes]
  enough_counts <- rowSums(raw) >= nCells
  raw <- raw[enough_counts, ]
  datalist$raw <- raw

  cat("Normalizing\n")
  datalist$nrm <- t(t(raw) / colSums(raw))

  cat("Running PCA\n")
  pca_result <- pca_on_informative_genes(raw, nDim)
  datalist$pca <- pca_result$pca
  datalist$info.genes <- pca_result$info.genes

  datalist
}


# UMAP (cosine metric, 30 neighbours, min_dist 0.3) on the first `nPC`
# columns of `datalist$pca`.
runUMAP <- function(datalist, nPC = 15) {
  leading_pcs <- datalist$pca[, 1:nPC]
  uwot::umap(leading_pcs, n_neighbors = 30, min_dist = .3, metric = "cosine")
}


# Build a Seurat object from `datalist$raw` and run scaling, PCA, neighbour
# search and clustering.
#
# ident.var.genes: if TRUE, variable features are found with Seurat ("vst"),
#                  as many as there are entries in `datalist$info.genes`;
#                  otherwise `datalist$info.genes` is stored directly as the
#                  variable features.
# Returns the processed Seurat object.
run.seurat <- function(datalist, ident.var.genes = F) {
  library(Seurat)
  seu <- CreateSeuratObject(datalist$raw, min.cells = 0, min.features = 0)

  if (!ident.var.genes) {
    seu[["RNA"]]@var.features <- datalist$info.genes
  } else {
    seu <- FindVariableFeatures(
      seu,
      selection.method = "vst",
      nfeatures = length(datalist$info.genes)
    )
  }

  #seu <- NormalizeData(seu)
  seu <- ScaleData(seu)
  seu <- RunPCA(seu, verbose = FALSE)
  seu <- FindNeighbors(seu)
  FindClusters(seu)
}


# Regress batch profiles out of square-root normalised expression.
#
# For every batch a profile sqrt(gene total / grand total) is built. Each
# cell's normalised expression is projected onto these profiles, and the
# projections are used as predictors in a linear model of the expression.
#
# mtx: gene x cell count matrix.
# l:   batch label for every column of `mtx`.
# Returns the residual matrix of the regression (cells in rows).
rem.batch <- function(mtx, l) {
  batch_profiles <- lapply(unique(l), function(batch_id) {
    cat("Collecting batch from", batch_id, "\n")
    gene_totals <- rowSums(as.matrix(mtx[, l == batch_id]))
    sqrt(gene_totals / sum(gene_totals))
  })
  rs.mat <- do.call(cbind, batch_profiles)

  cat("Normalizing\n")
  exprs <- sqrt(t(t(mtx) / colSums(mtx)))

  cat("Calculating dot product\n")
  dp <- t(exprs) %*% rs.mat
  print(head(dp))

  cat("Doing linear regression\n")
  fit <- lm(as.matrix(t(exprs)) ~ as.matrix(dp))
  fit$residuals
}


# Full batch-aware pipeline: informative genes per batch, batch regression,
# PCA (20 components) on the informative genes and a UMAP embedding.
#
# counts:    gene x cell count matrix.
# batch.vec: batch label per cell.
# Returns a list with raw, info.genes, brm, nrm, pca, umap and batch.
do.process.batches <- function(counts, batch.vec) {
  rule <- "-----------------\n"
  # Print a section title framed by two rules.
  open_section <- function(title) {
    cat(rule)
    cat(title)
    cat(rule)
  }
  # Print the closing rule; every section but the last is followed by a
  # blank line.
  close_section <- function(last = FALSE) {
    cat(rule)
    if (!last) {
      cat("\n")
    }
  }

  out <- list()
  out$raw <- counts

  open_section("Getting interesting genes\n")
  out$info.genes <- get.info.genes.batchwise(out$raw, batch.vec)
  close_section()

  open_section("Batch removal\n")
  out$brm <- rem.batch(out$raw, batch.vec)
  out$nrm <- sqrt(t(t(out$raw) / colSums(out$raw)))
  close_section()

  open_section("Running PCA on interesting genes\n")
  out$pca <- irlba::prcomp_irlba(
    out$brm[, out$info.genes],
    n = 20, scale. = F, center = T
  )$x
  close_section()

  open_section("Running UMAP\n")
  out$umap <- uwot::umap(out$pca, metric = "cosine", min_dist = .3, n_neighbors = 30)
  close_section(last = TRUE)

  out$batch <- batch.vec
  out
}


# Create a Seurat object from counts, store a precomputed matrix (restricted
# to `var.genes`) in the data and scale.data slots, then run PCA, neighbour
# search and clustering.
#
# Returns the clustered Seurat object invisibly.
gen.seurat <- function(cnts, brm.mtx, var.genes) {
  cat("Generating Seurat Object\n")
  seu <- CreateSeuratObject(cnts)

  cat("Storing precomputed data\n")
  precomputed <- brm.mtx[var.genes, ]
  seu[["RNA"]]@data <- Matrix(precomputed, sparse = T)
  seu[["RNA"]]@scale.data <- precomputed
  seu[["RNA"]]@var.features <- var.genes

  cat("Running PCA\n")
  seu <- RunPCA(seu)
  seu <- FindNeighbors(seu)
  invisible(FindClusters(seu))
}


# Fit a truncated PCA on `a.brm` and project `b.brm` into the same space,
# using the centre, scale and rotation of the fit.
#
# Returns the scores of `a.brm` stacked on top of the projected `b.brm`.
project.data <- function(a.brm, b.brm, nDim = 30) {
  cat("Running PCA\n")
  ref_pca <- irlba::prcomp_irlba(a.brm, n = nDim)

  cat("Projecting data\n")
  b_standardised <- scale(b.brm, center = ref_pca$center, scale = ref_pca$scale)
  b_scores <- b_standardised %*% ref_pca$rotation

  rbind(ref_pca$x, b_scores)
}

# Transfer annotations from a source data set to a query data set by
# nearest-neighbour voting in a shared PCA space.
#
# source.mtx, querry.mtx: matrices with genes in columns; only shared column
#                         names are used.
# source.annotation:      label per row of `source.mtx`.
# assign.prob:            minimum vote fraction required to assign a label.
# Returns a list with the vote-fraction matrix (`voting.mtx`, NA where a
# label received no vote), the assigned label per query row (`result`, NA if
# below `assign.prob`) and the raw kNN output (`knn.out`).
project.annotation <- function(source.mtx, querry.mtx, source.annotation, assign.prob = .5){
  shared.genes <- intersect(colnames(source.mtx), colnames(querry.mtx))
  cat("Number of shared genes:", length(shared.genes), "\n")

  cat("Running projection PCA\n")
  cmbd.pca <- project.data(source.mtx[, shared.genes], querry.mtx[, shared.genes])

  cat("Finding kNN\n")
  # Source rows come first in the combined matrix, query rows after them.
  n_source <- nrow(source.mtx)
  source_rows <- 1:n_source
  query_rows <- (n_source + 1):nrow(cmbd.pca)
  tst.knn <- FNN::get.knnx(cmbd.pca[source_rows, ], cmbd.pca[query_rows, ])

  cat("Voting\n")
  neighbour_labels <- t(apply(tst.knn$nn.index, 1, function(idx) {
    source.annotation[idx]
  }))

  labels <- unique(source.annotation)
  vote_counts <- matrix(NA, ncol = length(labels), nrow = nrow(neighbour_labels))
  colnames(vote_counts) <- labels
  for (q in seq_len(nrow(neighbour_labels))) {
    tally <- table(neighbour_labels[q, ])
    vote_counts[q, names(tally)] <- as.vector(tally)
  }

  rel.po.cl <- t(apply(vote_counts, 1, function(cnt) {
    cnt / sum(cnt, na.rm = T)
  }))

  voted <- apply(rel.po.cl, 1, function(frac) {
    top <- max(frac, na.rm = T)
    if (top < assign.prob) {
      return(NA)
    }
    # Comparing against NA entries yields NA names; they are removed before
    # the first winner is taken.
    winners <- colnames(rel.po.cl)[frac == top]
    winners[!is.na(winners)][1]
  })

  list(
    voting.mtx = rel.po.cl,
    result = voted,
    knn.out = tst.knn
  )
}

# Combine two matrices into one over the union of their row names and the
# concatenation of their column names; cells absent from an input are 0.
#
# A dense zero matrix of the full size is allocated, filled from both inputs
# and converted to a sparse Matrix at the end.
union.matrix <- function(m1, m2){
  library(Matrix)
  all_rows <- union(rownames(m1), rownames(m2))
  all_cols <- c(colnames(m1), colnames(m2))

  cmat <- Matrix(
    0,
    nrow = length(all_rows), ncol = length(all_cols),
    dimnames = list(all_rows, all_cols), sparse = F
  )
  cat("Generated matrix with", ncol(cmat), "columns and", nrow(cmat), "rows.\n")

  cat("Filling in the values\n")
  for (block in list(m1, m2)) {
    cmat[rownames(block), colnames(block)] <- block
  }

  Matrix(cmat, sparse = T)
}


# Fold union.matrix() over any number of matrices, left to right.
#
# ...: one or more matrices with row and column names.
# Returns the combined matrix; a single input is returned unchanged instead
# of failing on a missing second element.
union.matrix.list <- function(...) {
  library(Matrix)
  mlist <- list(...)
  stopifnot(length(mlist) >= 1)

  Reduce(union.matrix, mlist)
}


# Write a matrix and optional per-cell annotations to a new HDF5 file.
#
# m:        matrix written to "X"; its row/column names go to "genes"/"cells".
# filepath: output file (opened in mode "w").
# batch, cluster, embedding, indiv_embedding: optional objects, each written
#           under its own name when not NULL.
# Returns NULL invisibly.
create.h5 <- function(m,
                      filepath,
                      batch = NULL,
                      cluster = NULL,
                      embedding = NULL,
                      indiv_embedding = NULL){
  library(hdf5r)
  h5file <- H5File$new(filepath, mode = "w")
  # Close the file even when one of the writes below fails.
  on.exit(h5file$close_all(), add = TRUE)

  h5file[["X"]] <- m
  h5file[["genes"]] <- rownames(m)
  h5file[["cells"]] <- colnames(m)

  optional <- list(
    batch = batch,
    cluster = cluster,
    embedding = embedding,
    indiv_embedding = indiv_embedding
  )
  for (key in names(optional)) {
    if (!is.null(optional[[key]])) {
      h5file[[key]] <- optional[[key]]
    }
  }

  invisible(NULL)
}


# Merge any number of named matrices into one sparse matrix over the union of
# their row and column names.
#
# ...: matrices with row and column names.
# Returns a sparse matrix; entries that share a (row, column) name pair across
# inputs are summed by sparseMatrix().
merge.sparse <- function(...) {
  mats <- list(...)
  cnnew <- character()
  rnnew <- character()
  # One slot per input instead of growing the vectors inside the loop.
  i_parts <- vector("list", length(mats))
  j_parts <- vector("list", length(mats))
  x_parts <- vector("list", length(mats))

  cat("Merged matrices: ")
  for (k in seq_along(mats)) {
    cat(k, " ")
    M <- mats[[k]]

    cnnew <- union(cnnew, colnames(M))
    rnnew <- union(rnnew, rownames(M))
    cindnew <- match(colnames(M), cnnew)
    rindnew <- match(rownames(M), rnnew)

    ind <- unname(Matrix::which(M != 0, arr.ind = TRUE))
    i_parts[[k]] <- rindnew[ind[, 1]]
    j_parts[[k]] <- cindnew[ind[, 2]]
    # Values are read at exactly the positions found above. Taking the raw
    # @x slot instead misaligns values and indices when the matrix stores
    # explicit zeros or is not in general column-compressed form.
    x_parts[[k]] <- M[ind]
  }
  cat("\n")

  # c(<empty>, ...) keeps a well-typed empty vector when nothing was merged.
  Matrix::sparseMatrix(
    i = c(numeric(0), unlist(i_parts)),
    j = c(numeric(0), unlist(j_parts)),
    x = c(logical(0), unlist(x_parts)),
    dims = c(length(rnnew), length(cnnew)),
    dimnames = list(rnnew, cnnew)
  )
}

##### New Funcs #####
# Read the count matrix stored under "umi/<slot>/" (counts, genes, cells) in
# an HDF5 file and return it as a sparse Matrix with dimnames.
read.h5 <- function(path, slot = "ft") {
  H5 <- hdf5r::H5File$new(path, mode = "r")
  # Close the file on every exit path, including read errors.
  on.exit(H5$close_all(), add = TRUE)

  group <- paste0("umi/", slot, "/")
  Matrix::Matrix(
    H5[[paste0(group, "counts")]][, ],
    dimnames = list(
      H5[[paste0(group, "genes")]][],
      H5[[paste0(group, "cells")]][]
    ),
    sparse = TRUE
  )
}

# Split the RNA count matrices of one or more objects by `orig.ident`.
#
# ...: objects exposing `$orig.ident` and `@assays$RNA@counts`.
# Returns a list with one count matrix per sample, named by sample; the
# sample names are printed on the way.
generate.data.list <- function(...) {
  dl <- list(...)

  sample_names <- lapply(dl, function(obj) unique(as.character(obj$orig.ident)))
  all.sample.names <- do.call("c", sample_names)
  print(all.sample.names)

  per_object <- lapply(dl, function(obj) {
    ids <- obj$orig.ident
    lapply(unique(ids), function(id) obj@assays$RNA@counts[, ids == id])
  })
  all.samples <- do.call("c", per_object)
  names(all.samples) <- all.sample.names

  all.samples
}

# Query biomaRt for the gene names belonging to Ensembl gene ids.
#
# ids:  Ensembl gene ids used as filter values.
# mart: dataset name passed to useMart().
# Returns the getBM() result with the columns ensembl_gene_id and
# external_gene_name.
get.ensembl.names <- function(ids, mart = "mmusculus_gene_ensembl") {
  library(biomaRt)
  ensembl <- useMart("ensembl", mart)
  wanted <- c("ensembl_gene_id", "external_gene_name")

  getBM(wanted, "ensembl_gene_id", values = ids, mart = ensembl)
}

# Collect the row names of every matrix in `data.list` and look up the unique
# ids with get.ensembl.names().
get.ensembl.names.for.list <- function(data.list, mart = "mmusculus_gene_ensembl") {
  all_ids <- do.call("c", lapply(data.list, rownames))
  get.ensembl.names(unique(all_ids), mart)
}


# Keep only genes annotated as one-to-one orthologs in every matrix of a
# named list, optionally renaming them to their homolog id.
#
# data.list:            named list of matrices with gene ids as row names.
# genes.meta:           table with a column "Gene.stable.ID"; column 5 holds
#                       the homology type, column 1 the ids matched against
#                       the row names and column 3 the replacement names.
# translate.to.homolog: if TRUE, row names are replaced by column 3.
# return.gene.symbol:   currently unused.
# Returns `data.list` with filtered (and possibly renamed) matrices; stops
# with "No genes kept!" if a matrix loses all rows.
filter.one2one <- function(data.list,
                           genes.meta,
                           translate.to.homolog = FALSE,
                           return.gene.symbol = FALSE){
  keep.genes <- genes.meta[genes.meta[, 5] == "ortholog_one2one", "Gene.stable.ID"]

  for (n in names(data.list)) {
    d <- data.list[[n]]
    old.n <- rownames(d)

    # drop = FALSE keeps a single surviving gene as a one-row matrix; a
    # dropped vector has no nrow() and breaks the check below.
    d <- d[old.n %in% keep.genes, , drop = FALSE]

    if (nrow(d) == 0) {
      print(n)
      print(head(old.n))
      print(head(keep.genes))
      stop("No genes kept!")
    }

    if (translate.to.homolog) {
      new.names <- genes.meta[match(rownames(d), genes.meta[, 1]), 3]
      rownames(d) <- new.names
    }

    data.list[[n]] <- d
  }

  data.list
}

# Replace the row names (gene ids) of every matrix in a named list by the
# gene names found in `genes.meta` (columns Gene.stable.ID and Gene.name).
# Ids without a match keep their original value.
translate.geneid.to.name <- function(data.list, genes.meta){
  for (n in names(data.list)) {
    mat <- data.list[[n]]
    ids <- rownames(mat)

    hit <- match(ids, genes.meta$Gene.stable.ID)
    symbols <- as.character(genes.meta$Gene.name[hit])
    unmapped <- is.na(symbols)
    symbols[unmapped] <- ids[unmapped]

    rownames(mat) <- symbols
    data.list[[n]] <- mat
  }

  data.list
}

# Collect the values of one gene across a list of matrices.
#
# dl:   list of matrices with genes in rows and named columns.
# gene: row name to extract.
# Returns one vector named by column (cell) names, concatenated in list
# order; matrices that lack the gene contribute zeros. An empty list yields
# NULL.
get.exprs <- function(dl, gene) {
  # seq_along() copes with an empty list, where 1:length(dl) would index a
  # non-existent first element.
  per_matrix <- lapply(seq_along(dl), function(k) {
    nrm <- dl[[k]]
    vals <- if (gene %in% rownames(nrm)) nrm[gene, ] else rep(0, ncol(nrm))
    names(vals) <- colnames(nrm)
    vals
  })

  do.call("c", per_matrix)
}


# Interactive 3D scatter plot coloured by scaled expression.
#
# coord: matrix with at least three columns (x, y, z).
# exprs: one expression value per row of `coord`.
# size:  point size passed to scatterplot3js().
# Points are drawn in order of increasing (scaled, rounded) expression and
# coloured along the reversed "Spectral" palette.
plot.exprs.3d <- function(coord, exprs, size = .1) {
  # library() stops right away when a package is missing; require() would
  # only return FALSE and let a later call fail obscurely.
  library(threejs)
  library(RColorBrewer)

  exprs <- round(scale(exprs), digits = 2)
  n_levels <- length(sort(unique(exprs)))
  cols <- rev(colorRampPalette(brewer.pal(5, "Spectral"))(n_levels))

  coord <- coord[order(exprs), ]
  exprs <- sort(exprs)
  # One colour per distinct rounded value.
  cols.vec <- cols[as.numeric(as.factor(exprs))]

  scatterplot3js(
    coord[, 1],
    coord[, 2],
    coord[, 3],
    color = cols.vec,
    size = size
  )
}

# Interactive 3D scatter plot coloured by a categorical variable.
#
# coord: matrix with at least three columns (x, y, z).
# color: one category per row of `coord`; converted to a factor.
# size:  point size passed to scatterplot3js().
# Colours are interpolated from the "Set2" palette, one per factor level;
# the category is also used as point label.
plot.factor.3d <- function(coord, color, size = .1) {
  color <- as.factor(color)
  # library() stops right away when a package is missing; require() would
  # only return FALSE and let a later call fail obscurely.
  library(threejs)
  library(RColorBrewer)

  cols <- colorRampPalette(brewer.pal(8, "Set2"))(length(levels(color)))
  cols.vec <- cols[as.numeric(color)]

  scatterplot3js(
    coord[, 1],
    coord[, 2],
    coord[, 3],
    color = cols.vec,
    size = size,
    labels = as.character(color)
  )
}

# Rescale several UMI matrices to a common per-gene reference level, then run
# PCA (75 components) and a 3D UMAP on the combined log-transformed data.
#
# umi:     list of gene x cell count matrices.
# hvg:     genes to use; genes missing from a matrix are added as zero rows.
# n.cores: number of cores for every parallel step.
# Returns list(pca = <prcomp_irlba result>, umap = <embedding matrix>).
scale.batchwise <- function(umi, hvg, n.cores = 10){
  cat("Preprocessing UMI matrices\n")
  d.added <- pbmcapply::pbmclapply(umi, function(x){
    x <- x[rownames(x) %in% hvg, ]
    add.genes <- hvg[!(hvg %in% rownames(x))]

    # Zero rows for the requested genes that this matrix does not contain.
    mg <- Matrix(0, nrow = length(add.genes), ncol = ncol(x))
    rownames(mg) <- add.genes
    colnames(mg) <- colnames(x)

    out <- rbind(x, mg)

    out <- t(t(out) / colSums(out))
    return(out[hvg,])
    # Honour n.cores here as well; this step must not use a fixed core count.
  }, mc.cores = n.cores, ignore.interactive = T)

  cat("Scaling\n")
  avgs <- lapply(d.added, rowMeans)
  # Per-gene reference: the smallest mean over all matrices.
  ref <- do.call(pmin, avgs)

  rescale <- pbmcapply::pbmclapply(seq_along(avgs), function(i){
    out <- ref/avgs[[i]]
    out[!is.finite(out)] <- 0
    return(out)
  }, mc.cores = n.cores, ignore.interactive = T)


  fin <- pbmcapply::pbmclapply(seq_along(d.added), function(i){
    m <- d.added[[i]]
    rs <- rescale[[i]]

    Matrix(log1p(apply(m, 2, function(x) x * rs)), sparse = T)
  }, mc.cores = n.cores, ignore.interactive = T)

  cat("Correcting the matrix\n")
  mtx <- t(do.call(cbind, fin))
  # Constant columns cannot be scaled by the PCA below.
  mtx <- mtx[,apply(mtx, 2, sd) != 0]

  mtx.zscore <- mtx

  cat("PCA\n")
  set.seed(1234)
  pca <- irlba::prcomp_irlba(mtx.zscore, n=75, verbose = T, scale.=T, center=T)
  rownames(pca$x) <- rownames(mtx.zscore)
  rownames(pca$rotation) <- colnames(mtx.zscore)

  cat("UMAP\n")
  set.seed(1234)
  um <- uwot::umap(pca$x, metric="cosine", min_dist = .1, n_neighbors = 15, verbose = T, n_components = 3)
  rownames(um) <- rownames(mtx.zscore)

  return(
    list(
      pca = pca,
      umap = um
    )
  )
}


# Load a merged data set from a directory containing pca_model.rds,
# meta_data.csv, umap_model.rds and umi.hdf5.
#
# Every top-level group of umi.hdf5 is read into a sparse matrix (counts with
# genes/cells as dimnames). `stage` is the third "_"-separated field of the
# first cell name of each matrix.
# Relies on read_csv() and str_split_fixed() being available on the search
# path (presumably readr and stringr - confirm).
# Returns a list with pca, meta.data, umap, umi and stage.
read_merged_data <- function(data){
  library(hdf5r)
  out <- list()

  out$pca <- readRDS(file.path(data, "pca_model.rds"))
  out$meta.data <- read_csv(file.path(data, "meta_data.csv"))
  out$umap <- readRDS(file.path(data, "umap_model.rds"))

  h5 <- H5File$new(file.path(data, "umi.hdf5"), mode = "r")
  # Close the file even if reading one of the groups fails.
  on.exit(h5$close_all(), add = TRUE)

  cat("Reading UMI\n")
  out$umi <- list()
  for (n in names(h5)) {
    print(n)
    out$umi[[n]] <- Matrix(
      h5[[n]][["counts"]][, ],
      dimnames = list(
        h5[[n]][["genes"]][],
        h5[[n]][["cells"]][]
      ),
      sparse = TRUE
    )
  }

  out$stage <- do.call("c", lapply(out$umi, function(x) str_split_fixed(colnames(x)[1], "_", 4)[, 3]))
  out
}


# Turn a neighbour matrix into a two-column edge list.
#
# nn: matrix whose first column is the origin node and whose remaining
#     columns are its neighbours (one row per origin).
# Returns a matrix with columns "from" and "to", one row per
# (origin, neighbour) pair in row order; an input without rows gives an
# empty 0 x 2 matrix.
make.edgelist <- function(nn) {
  if (nrow(nn) == 0) {
    # 1:nrow(nn) would iterate over c(1, 0) here and index a missing row.
    return(matrix(nn[0], nrow = 0, ncol = 2, dimnames = list(NULL, c("from", "to"))))
  }

  edges <- lapply(seq_len(nrow(nn)), function(row_idx) {
    x <- nn[row_idx, ]
    cbind(from = rep(x[1], length(x) - 1), to = x[-1])
  })

  do.call(rbind, edges)
}

# Louvain clustering of a nearest-neighbour graph.
#
# nn: neighbour matrix as accepted by make.edgelist().
# Returns a list with the cluster membership (`clusters`) and the simplified
# undirected graph (`graph`).
get.clusters <- function(nn) {
  cat("Generating graph\n")
  el <- make.edgelist(nn)
  # Namespace-qualified calls: an unqualified simplify() resolves to whatever
  # attached package defines it last, which need not be igraph.
  g <- igraph::simplify(igraph::graph_from_edgelist(el, directed = FALSE))

  cat("Cluster data\n")
  cl <- igraph::cluster_louvain(g)

  list(
    clusters = igraph::membership(cl),
    graph = g
  )
}

# Split cell names at "_" into a character matrix with `n` columns
# (str_split_fixed()).
quick.split.cell.names <- function(cell.names, n=4){
  delimiter <- "_"
  str_split_fixed(cell.names, delimiter, n)
}

# Sum the columns of a matrix within groups (pseudobulk).
#
# m:        matrix with column names, features in rows.
# group.by: group label for every column of `m`.
# Returns a matrix with one column of row sums per non-empty group.
make.pseudobulk <- function(m, group.by) {
  group.by <- as.factor(group.by)

  sums <- tapply(colnames(m), group.by, function(cells) {
    # drop = FALSE lets a group with a single cell go through the same path,
    # so it contributes its own column rather than a fixed one.
    Matrix::rowSums(m[, cells, drop = FALSE])
  }, simplify = FALSE) # always a list, also when m has a single row

  do.call(cbind, sums)
}

# Split every element of `x` at `sep` and return field(s) `n`. The number of
# fields is taken from the first element of `x`.
qsplit <- function(x, n, sep="_"){
  n_fields <- length(strsplit(x[1], sep)[[1]])
  fields <- str_split_fixed(x, sep, n_fields)
  fields[, n]
}


# Louvain clustering through scanpy (via reticulate, conda environment
# "r-reticulate").
#
# umi.list:   a gene x cell matrix, or a list of such matrices that is merged
#             with merge.sparse().
# pca.coords: cell x PC matrix; its row names select and order the cells.
# hvg:        optional gene subset.
# resolution, n_pcs, n_neighbors: passed on to scanpy.
# Returns the first column of `adata$obs` after clustering, named by cell.
scanpy.louvain <- function(umi.list, pca.coords, hvg = NULL, resolution = 1, n_pcs = 75L, n_neighbors = 30L){
  library(reticulate)

  use_condaenv("r-reticulate")
  py_config()

  cat("Clustering using scanpy\n")
  sc <- import("scanpy")

  mat <- if (is.list(umi.list)) do.call(merge.sparse, umi.list) else umi.list
  mat <- mat[, rownames(pca.coords)]
  if (!is.null(hvg)) {
    mat <- mat[rownames(mat) %in% hvg, ]
  }

  adata <- sc$AnnData(
    X = t(mat),
    obsm = list("PCA" = pca.coords)
  )

  cat("\tFinding neighbors\n")
  sc$pp$neighbors(adata, use_rep = "PCA", n_neighbors = n_neighbors, n_pcs = n_pcs)
  cat("\tRunning Louvain algorithm\n")
  sc$tl$louvain(adata, resolution = resolution)

  cl.orig <- adata$obs[, 1]
  names(cl.orig) <- colnames(mat)
  cl.orig
}


# Split a matrix column-wise by batch.
#
# umi:       gene x cell matrix with column names.
# batch.vec: batch label per column.
# prune:     if TRUE, rows whose sum is 0 within a batch are removed from
#            that batch.
# Returns the per-batch matrices; batches with two or fewer cells are
# discarded.
split.umi <- function(umi, batch.vec, prune = T){
  per_batch <- tapply(colnames(umi), as.factor(batch.vec), function(cells) {
    if (length(cells) <= 2) {
      # Marker for batches that are too small; filtered out below.
      return(NA)
    }
    sub <- umi[, cells]
    if (prune) {
      sub <- sub[rowSums(sub) > 0, ]
    }
    sub
  })

  per_batch[!is.na(per_batch)]
}

# Merge several UMI matrices on a feature set, optionally z-score every batch
# separately, and compute a PCA and a UMAP embedding.
#
# umi.list:         list of gene x cell count matrices.
# features:         genes to use.
# n_pcs, n_umaps:   number of principal components / UMAP dimensions.
# subsample_pca:    if not NULL, number of cells sampled for PCA and UMAP.
# n_cores:          cores for the parallel steps.
# batchwise.zscore: z-score the merged matrix per batch before PCA.
# Returns list(umap, pca, feats), where feats are the columns of the merged
# matrix.
do.softmerge <- function(umi.list, features,  n_pcs = 75, n_umaps = 3, subsample_pca = NULL, n_cores = 10, batchwise.zscore = TRUE){

  # Batch-wise z-scored data must not be centred/scaled again by the PCA;
  # without batch-wise z-scoring the PCA standardises globally. Both flags
  # are therefore defined for either setting.
  cntr <- !batchwise.zscore
  scl <- !batchwise.zscore

  cat("Processing a list of matrices with length", length(umi.list), ": ")

  t.out <- pbmcapply::pbmclapply(umi.list, function(x) {
    sf <- colSums(x)

    # Per gene: size-factor normalised values relative to the gene mean,
    # square-root transformed; the result has cells in rows.
    Matrix(apply(t(t(x[rownames(x) %in% features, ]) / sf), 1, function(y) {
      z <- sqrt(y / mean(y))
      z[is.na(z)] <- 0
      z
    }), sparse = TRUE)
  }, mc.cores = n_cores, ignore.interactive = TRUE)

  cat("\nDone preprocessing \n")

  cat("Merging matrices\n")
  t.mat <- do.call(merge.sparse, t.out)
  t.mat <- t.mat[, colSums(t.mat) > 0]

  if (batchwise.zscore) {
    t.mat <- do.call(rbind, pbmcapply::pbmclapply(umi.list, function(u) {
      tmp <- scale(t.mat[colnames(u), ])
      tmp[is.na(tmp)] <- 0
      tmp
    }, mc.cores = n_cores, ignore.interactive = TRUE))
  }

  print(dim(t.mat))

  # Choose the PCA input; everything after this block is shared.
  if (is.null(subsample_pca)) {
    pca.input <- t.mat
    cat("PCA\n")
  } else {
    sel.cells <- sample(rownames(t.mat), subsample_pca)
    pca.input <- t.mat[sel.cells, ]
    pca.input <- pca.input[, colSums(pca.input) > 0]
    cat("Running PCA with a subsampled dataset of size: ", nrow(pca.input), ncol(pca.input), "\n")
  }

  set.seed(1234)
  t.pca <- irlba::prcomp_irlba(pca.input, n = n_pcs, center = cntr, scale. = scl, verbose = TRUE)
  rownames(t.pca$x) <- rownames(pca.input)
  rownames(t.pca$rotation) <- colnames(pca.input)

  cat("UMAP\n")
  set.seed(1234)
  t.umap <- uwot::umap(t.pca$x, metric = "cosine", n_neighbors = 30, min_dist = .1, n_components = n_umaps, verbose = TRUE)
  rownames(t.umap) <- rownames(pca.input)

  list(
    umap = t.umap,
    pca = t.pca,
    feats = colnames(t.mat)
  )
}


# Divide every stored entry of a sparse matrix by the sum of its column.
#
# A: column-compressed sparse matrix, or a triplet-form sparse matrix (which
#    is converted first).
# Returns the column-normalised matrix in column-compressed form.
sparse.columnNorm <- function (A)
{
  if (is(A, "TsparseMatrix")) {
    # as() keeps dimensions and dimnames; rebuilding from (i, j, x) triplets
    # would infer the size from the largest index and drop the names.
    A <- as(A, "CsparseMatrix")
  }
  # diff(A@p) is the number of stored entries per column, so every column sum
  # is repeated once per entry of that column.
  A@x <- A@x / rep.int(Matrix::colSums(A), diff(A@p))
  A
}

# Write a matrix with its row and column names to a new HDF5 file.
#
# mat:  a "dgCMatrix" (stored as its summary() triplet table under "counts")
#       or any other object that can be assigned directly.
# file: output path (opened in mode "w").
# Returns NULL invisibly.
write_matrix_h5 <- function(mat, file) {
  # library() fails right away if hdf5r is missing; require() would not.
  library(hdf5r)
  H5 <- H5File$new(file, "w")
  # Close the file even when a write fails.
  on.exit(H5$close_all(), add = TRUE)

  H5[["counts"]] <- if (is(mat, "dgCMatrix")) summary(mat) else mat
  H5[["colnames"]] <- colnames(mat)
  H5[["rownames"]] <- rownames(mat)

  invisible(NULL)
}


# Read a sparse matrix from an HDF5 file holding a triplet table "counts"
# (fields i, j, x) and the vectors "rownames" and "colnames".
#
# Returns the sparse matrix with dimnames.
read_sparse_h5 <- function(file) {
  # library() fails right away if hdf5r is missing; require() would not.
  library(hdf5r)
  H5 <- H5File$new(file, "r")
  # Close the file on every exit path.
  on.exit(H5$close_all(), add = TRUE)

  cnts <- H5[["counts"]][]
  row_names <- H5[["rownames"]][]
  col_names <- H5[["colnames"]][]

  # Dimensions come from the stored names: inferring them from the largest
  # index loses trailing all-zero rows/columns and then conflicts with the
  # dimnames.
  Matrix::sparseMatrix(
    i = cnts$i, j = cnts$j, x = cnts$x,
    dims = c(length(row_names), length(col_names)),
    dimnames = list(row_names, col_names)
  )
}

#' GO enrichment of selected genes
#'
#' Runs a classic Fisher test on Biological Process terms with topGO.
#'
#' @param interesting.genes Genes to test enrichment for
#' @param gene.universe All genes to test against
#' @param gene2go.list Gene-to-GO mapping, passed to topGO as `gene2GO`
#'   together with `annFUN.gene2GO`
#' @param n.out Number of top GO terms to report (`topNodes`)
#' @return The table produced by `GenTable()`, ordered by the Fisher result
run.topGO <- function(interesting.genes, gene.universe, gene2go.list, n.out = 25){
  # library() stops immediately when topGO is missing; require() would only
  # return FALSE and defer the failure.
  library(topGO)

  # Factor over the universe: 1 marks an interesting gene, 0 the rest.
  is_interesting <- as.numeric(gene.universe %in% interesting.genes)
  gene.list <- as.factor(is_interesting)
  names(gene.list) <- gene.universe

  sampleGO <- new("topGOdata",
                  ontology = "BP",
                  allGenes = gene.list,
                  annot = annFUN.gene2GO,
                  gene2GO = gene2go.list)
  resultFisher <- runTest(sampleGO, algorithm = "classic", statistic = "fisher")

  GenTable(sampleGO,
           classicFisher = resultFisher,
           orderBy = "classicFisher", ranksOf = "classicFisher", topNodes = n.out)
}
	# Taken from Simon
	# compute variances across column or rows for column-sparse matrices
	library(Matrix)
	# Column-wise sample variances of a column-compressed sparse matrix.
	#
	# Only the stored values are visited; the contribution of the implicit
	# zeros is added analytically, so the matrix is never densified.
	#
	# spm: a "dgCMatrix".
	# Returns a numeric vector with one variance (denominator nrow - 1) per
	# column, named by the column names of `spm` (unnamed if there are none).
	colVars_spm <- function(spm) {
	  stopifnot(is(spm, "dgCMatrix"))
	  n_row <- spm@Dim[1]
	  ptr <- spm@p
	  ans <- vapply(seq_len(spm@Dim[2]), function(j) {
	    # seq_len() gives an empty index for an all-zero column, whereas
	    # (p[j] + 1):p[j + 1] would count backwards and pick up values that
	    # belong to neighbouring columns.
	    vals <- spm@x[ptr[j] + seq_len(ptr[j + 1] - ptr[j])]
	    col_mean <- sum(vals) / n_row
	    # Squared deviations of the stored entries plus those of the zeros.
	    sum((vals - col_mean)^2) + col_mean^2 * (n_row - length(vals))
	  }, numeric(1)) / (n_row - 1)
	  names(ans) <- spm@Dimnames[[2]]
	  ans
	}
	# Row-wise sample variances of a sparse matrix: the matrix is transposed
	# and handed to colVars_spm().
	rowVars_spm <- function(spm) {
	  transposed <- t(spm)
	  colVars_spm(transposed)
	}

	# Select over-dispersed ("informative") genes and run a truncated PCA on
	# them.
	#
	# counts: gene x cell count matrix (rowVars_spm() requires the normalised
	#         matrix to be a "dgCMatrix").
	# nDim:   number of principal components to compute.
	# Returns a list with `pca` (the score matrix `$x` of prcomp_irlba) and
	# `info.genes` (names of the selected genes).
	pca_on_informative_genes <- function(counts, nDim = 20) {
	  # Per-cell totals serve both the normalisation and the Poisson reference.
	  cell_totals <- colSums(counts)
	  fractions <- t(t(counts) / cell_totals)

	  # Variance-to-mean ratio per gene, compared against 1.5x the Poisson
	  # reference value.
	  vmr <- rowVars_spm(fractions) / rowMeans(fractions)
	  vmr_cutoff <- 1.5 * mean(1 / cell_totals)
	  informative_genes <- names(which(vmr > vmr_cutoff))

	  scores <- irlba::prcomp_irlba(
	    t(sqrt(fractions[informative_genes, ])),
	    n = nDim
	  )$x

	  list(pca = scores, info.genes = informative_genes)
	}

	# Names of genes whose variance-to-mean ratio (on per-cell fractions)
	# exceeds 1.5x the Poisson reference value.
	#
	# counts: gene x cell count matrix; must have dimensions.
	# Returns a character vector of gene names.
	get.info.genes <- function(counts) {
	  library(Matrix)
	  stopifnot(!is.null(dim(counts)))

	  cell_totals <- Matrix::colSums(counts)
	  fractions <- t(t(counts) / cell_totals)

	  vmr <- rowVars_spm(fractions) / rowMeans(fractions)
	  vmr_cutoff <- 1.5 * mean(1 / cell_totals)

	  names(which(vmr > vmr_cutoff))
	}

	# Run get.info.genes() separately on the cells of every batch and return
	# the union of the selected genes. The number of genes found per batch is
	# printed.
	#
	# all_counts:     gene x cell count matrix.
	# per.cell.batch: batch label for every column of `all_counts`.
	get.info.genes.batchwise <- function(all_counts, per.cell.batch) {
	  batch_ids <- unique(per.cell.batch)
	  genes_per_batch <- lapply(batch_ids, function(batch_id) {
	    in_batch <- per.cell.batch == batch_id
	    get.info.genes(all_counts[, in_batch])
	  })

	  print(lapply(genes_per_batch, length))

	  unique(do.call("c", genes_per_batch))
	}

	###################


	# Smooth the expression of one gene over PCA space with a local Poisson
	# regression (locfit), using log total counts per cell as the base term.
	#
	# pca:   cell x PC matrix; at most the first 15 columns are used.
	# raw:   gene x cell count matrix.
	# gene:  row of `raw` to smooth.
	# alpha: locfit smoothing parameter.
	# Returns the predictions of the fit at the data points (`ev = dat()`).
	do.smooth <- function(pca, raw, gene, alpha = .1) {
	  library(locfit)
	  max_pcs <- 15
	  if (ncol(pca) > max_pcs) {
	    cat("Reducing the number of PCs to 15\nto prevent locfit bug.")
	  }
	  # Cap at 15 columns without assuming that 15 exist; indexing 1:15
	  # unconditionally fails for narrower inputs.
	  pca <- pca[, seq_len(min(ncol(pca), max_pcs)), drop = FALSE]

	  predict(locfit.raw(
	    pca,
	    raw[gene, ],
	    base = log(colSums(raw)),
	    alpha = alpha, deg = 1, family = "poisson", ev = dat()
	  ))
	}


	# Read a 10x directory with Seurat::Read10X() and wrap the result in a
	# list under the element `raw`.
	get.10x.data <- function(path) {
	  list(raw = Seurat::Read10X(path))
	}

	# Read the "umi/ft" count matrix (counts, genes, cells) from an HDF5 file
	# and return it as list(raw = <Matrix>).
	get.h5.data <- function(path) {
	  hf <- hdf5r::H5File$new(path, mode = "r")
	  # Release the file handle on every exit path, including read errors.
	  on.exit(hf$close_all(), add = TRUE)

	  cnts <- Matrix::Matrix(
	    data = hf[["umi/ft/counts"]][, ],
	    dimnames = list(hf[["umi/ft/genes"]][], hf[["umi/ft/cells"]][])
	  )

	  list(raw = cnts)
	}


	# Filter cells and genes, normalise, and run PCA on informative genes.
	#
	# datalist: list with a gene x cell count matrix in `$raw`.
	# nGenes:   minimum number of detected genes (count > 0) per cell.
	# nCells:   threshold applied to the row sums of the count matrix.
	#           NOTE(review): the message says "cells per gene" but the filter
	#           uses total counts per gene - confirm which is intended.
	# nDim:     number of principal components.
	# Returns `datalist` with `$raw` filtered and `$nrm`, `$pca`,
	# `$info.genes` added.
	do.preprocess <- function(datalist, nGenes = 500, nCells = 1, nDim = 20) {
	  cat("Subsetting the data for: ")
	  cat(nGenes, "genes per cell and", nCells, "cells per gene minimum\n")
	  raw <- datalist$raw
	  enough_genes <- colSums(raw > 0) >= nGenes
	  raw <- raw[, enough_genes]
	  enough_counts <- rowSums(raw) >= nCells
	  raw <- raw[enough_counts, ]
	  datalist$raw <- raw

	  cat("Normalizing\n")
	  datalist$nrm <- t(t(raw) / colSums(raw))

	  cat("Running PCA\n")
	  pca_result <- pca_on_informative_genes(raw, nDim)
	  datalist$pca <- pca_result$pca
	  datalist$info.genes <- pca_result$info.genes

	  datalist
	}


	# UMAP (cosine metric, 30 neighbours, min_dist 0.3) on the first `nPC`
	# columns of `datalist$pca`.
	runUMAP <- function(datalist, nPC = 15) {
	  leading_pcs <- datalist$pca[, 1:nPC]
	  uwot::umap(leading_pcs, n_neighbors = 30, min_dist = .3, metric = "cosine")
	}


	# Build a Seurat object from `datalist$raw` and run scaling, PCA,
	# neighbour search and clustering.
	#
	# ident.var.genes: if TRUE, variable features are found with Seurat
	#                  ("vst"), as many as there are entries in
	#                  `datalist$info.genes`; otherwise `datalist$info.genes`
	#                  is stored directly as the variable features.
	# Returns the processed Seurat object.
	run.seurat <- function(datalist, ident.var.genes = F) {
	  library(Seurat)
	  seu <- CreateSeuratObject(datalist$raw, min.cells = 0, min.features = 0)

	  if (!ident.var.genes) {
	    seu[["RNA"]]@var.features <- datalist$info.genes
	  } else {
	    seu <- FindVariableFeatures(
	      seu,
	      selection.method = "vst",
	      nfeatures = length(datalist$info.genes)
	    )
	  }

	  #seu <- NormalizeData(seu)
	  seu <- ScaleData(seu)
	  seu <- RunPCA(seu, verbose = FALSE)
	  seu <- FindNeighbors(seu)
	  FindClusters(seu)
	}


	# Regress batch profiles out of square-root normalised expression.
	#
	# For every batch a profile sqrt(gene total / grand total) is built. Each
	# cell's normalised expression is projected onto these profiles, and the
	# projections are used as predictors in a linear model of the expression.
	#
	# mtx: gene x cell count matrix.
	# l:   batch label for every column of `mtx`.
	# Returns the residual matrix of the regression (cells in rows).
	rem.batch <- function(mtx, l) {
	  batch_profiles <- lapply(unique(l), function(batch_id) {
	    cat("Collecting batch from", batch_id, "\n")
	    gene_totals <- rowSums(as.matrix(mtx[, l == batch_id]))
	    sqrt(gene_totals / sum(gene_totals))
	  })
	  rs.mat <- do.call(cbind, batch_profiles)

	  cat("Normalizing\n")
	  exprs <- sqrt(t(t(mtx) / colSums(mtx)))

	  cat("Calculating dot product\n")
	  dp <- t(exprs) %*% rs.mat
	  print(head(dp))

	  cat("Doing linear regression\n")
	  fit <- lm(as.matrix(t(exprs)) ~ as.matrix(dp))
	  fit$residuals
	}


	# Full batch-aware pipeline: informative genes per batch, batch
	# regression, PCA (20 components) on the informative genes and a UMAP
	# embedding.
	#
	# counts:    gene x cell count matrix.
	# batch.vec: batch label per cell.
	# Returns a list with raw, info.genes, brm, nrm, pca, umap and batch.
	do.process.batches <- function(counts, batch.vec) {
	  rule <- "-----------------\n"
	  # Print a section title framed by two rules.
	  open_section <- function(title) {
	    cat(rule)
	    cat(title)
	    cat(rule)
	  }
	  # Print the closing rule; every section but the last is followed by a
	  # blank line.
	  close_section <- function(last = FALSE) {
	    cat(rule)
	    if (!last) {
	      cat("\n")
	    }
	  }

	  out <- list()
	  out$raw <- counts

	  open_section("Getting interesting genes\n")
	  out$info.genes <- get.info.genes.batchwise(out$raw, batch.vec)
	  close_section()

	  open_section("Batch removal\n")
	  out$brm <- rem.batch(out$raw, batch.vec)
	  out$nrm <- sqrt(t(t(out$raw) / colSums(out$raw)))
	  close_section()

	  open_section("Running PCA on interesting genes\n")
	  out$pca <- irlba::prcomp_irlba(
	    out$brm[, out$info.genes],
	    n = 20, scale. = F, center = T
	  )$x
	  close_section()

	  open_section("Running UMAP\n")
	  out$umap <- uwot::umap(out$pca, metric = "cosine", min_dist = .3, n_neighbors = 30)
	  close_section(last = TRUE)

	  out$batch <- batch.vec
	  out
	}


	# Create a Seurat object from counts, store a precomputed matrix
	# (restricted to `var.genes`) in the data and scale.data slots, then run
	# PCA, neighbour search and clustering.
	#
	# Returns the clustered Seurat object invisibly.
	gen.seurat <- function(cnts, brm.mtx, var.genes) {
	  cat("Generating Seurat Object\n")
	  seu <- CreateSeuratObject(cnts)

	  cat("Storing precomputed data\n")
	  precomputed <- brm.mtx[var.genes, ]
	  seu[["RNA"]]@data <- Matrix(precomputed, sparse = T)
	  seu[["RNA"]]@scale.data <- precomputed
	  seu[["RNA"]]@var.features <- var.genes

	  cat("Running PCA\n")
	  seu <- RunPCA(seu)
	  seu <- FindNeighbors(seu)
	  invisible(FindClusters(seu))
	}


	# Fit a truncated PCA on `a.brm` and project `b.brm` into the same space,
	# using the centre, scale and rotation of the fit.
	#
	# Returns the scores of `a.brm` stacked on top of the projected `b.brm`.
	project.data <- function(a.brm, b.brm, nDim = 30) {
	  cat("Running PCA\n")
	  ref_pca <- irlba::prcomp_irlba(a.brm, n = nDim)

	  cat("Projecting data\n")
	  b_standardised <- scale(b.brm, center = ref_pca$center, scale = ref_pca$scale)
	  b_scores <- b_standardised %*% ref_pca$rotation

	  rbind(ref_pca$x, b_scores)
	}

	# Transfer annotations from a source data set to a query data set by
	# nearest-neighbour voting in a shared PCA space.
	#
	# source.mtx, querry.mtx: matrices with genes in columns; only shared
	#                         column names are used.
	# source.annotation:      label per row of `source.mtx`.
	# assign.prob:            minimum vote fraction required to assign a label.
	# Returns a list with the vote-fraction matrix (`voting.mtx`, NA where a
	# label received no vote), the assigned label per query row (`result`, NA
	# if below `assign.prob`) and the raw kNN output (`knn.out`).
	project.annotation <- function(source.mtx, querry.mtx, source.annotation, assign.prob = .5){
	  shared.genes <- intersect(colnames(source.mtx), colnames(querry.mtx))
	  cat("Number of shared genes:", length(shared.genes), "\n")

	  cat("Running projection PCA\n")
	  cmbd.pca <- project.data(source.mtx[, shared.genes], querry.mtx[, shared.genes])

	  cat("Finding kNN\n")
	  # Source rows come first in the combined matrix, query rows after them.
	  n_source <- nrow(source.mtx)
	  source_rows <- 1:n_source
	  query_rows <- (n_source + 1):nrow(cmbd.pca)
	  tst.knn <- FNN::get.knnx(cmbd.pca[source_rows, ], cmbd.pca[query_rows, ])

	  cat("Voting\n")
	  neighbour_labels <- t(apply(tst.knn$nn.index, 1, function(idx) {
	    source.annotation[idx]
	  }))

	  labels <- unique(source.annotation)
	  vote_counts <- matrix(NA, ncol = length(labels), nrow = nrow(neighbour_labels))
	  colnames(vote_counts) <- labels
	  for (q in seq_len(nrow(neighbour_labels))) {
	    tally <- table(neighbour_labels[q, ])
	    vote_counts[q, names(tally)] <- as.vector(tally)
	  }

	  rel.po.cl <- t(apply(vote_counts, 1, function(cnt) {
	    cnt / sum(cnt, na.rm = T)
	  }))

	  voted <- apply(rel.po.cl, 1, function(frac) {
	    top <- max(frac, na.rm = T)
	    if (top < assign.prob) {
	      return(NA)
	    }
	    # Comparing against NA entries yields NA names; they are removed
	    # before the first winner is taken.
	    winners <- colnames(rel.po.cl)[frac == top]
	    winners[!is.na(winners)][1]
	  })

	  list(
	    voting.mtx = rel.po.cl,
	    result = voted,
	    knn.out = tst.knn
	  )
	}

	# Combine two matrices into one over the union of their row names and the
	# concatenation of their column names; cells absent from an input are 0.
	#
	# A dense zero matrix of the full size is allocated, filled from both
	# inputs and converted to a sparse Matrix at the end.
	union.matrix <- function(m1, m2){
	  library(Matrix)
	  all_rows <- union(rownames(m1), rownames(m2))
	  all_cols <- c(colnames(m1), colnames(m2))

	  cmat <- Matrix(
	    0,
	    nrow = length(all_rows), ncol = length(all_cols),
	    dimnames = list(all_rows, all_cols), sparse = F
	  )
	  cat("Generated matrix with", ncol(cmat), "columns and", nrow(cmat), "rows.\n")

	  cat("Filling in the values\n")
	  for (block in list(m1, m2)) {
	    cmat[rownames(block), colnames(block)] <- block
	  }

	  Matrix(cmat, sparse = T)
	}


	# Fold union.matrix() over any number of matrices, left to right.
	#
	# ...: one or more matrices with row and column names.
	# Returns the combined matrix; a single input is returned unchanged
	# instead of failing on a missing second element.
	union.matrix.list <- function(...) {
	  library(Matrix)
	  mlist <- list(...)
	  stopifnot(length(mlist) >= 1)

	  Reduce(union.matrix, mlist)
	}


	# Write a matrix and optional per-cell annotations to a new HDF5 file.
	#
	# m:        matrix written to "X"; its row/column names go to
	#           "genes"/"cells".
	# filepath: output file (opened in mode "w").
	# batch, cluster, embedding, indiv_embedding: optional objects, each
	#           written under its own name when not NULL.
	# Returns NULL invisibly.
	create.h5 <- function(m,
	                      filepath,
	                      batch = NULL,
	                      cluster = NULL,
	                      embedding = NULL,
	                      indiv_embedding = NULL){
	  library(hdf5r)
	  h5file <- H5File$new(filepath, mode = "w")
	  # Close the file even when one of the writes below fails.
	  on.exit(h5file$close_all(), add = TRUE)

	  h5file[["X"]] <- m
	  h5file[["genes"]] <- rownames(m)
	  h5file[["cells"]] <- colnames(m)

	  optional <- list(
	    batch = batch,
	    cluster = cluster,
	    embedding = embedding,
	    indiv_embedding = indiv_embedding
	  )
	  for (key in names(optional)) {
	    if (!is.null(optional[[key]])) {
	      h5file[[key]] <- optional[[key]]
	    }
	  }

	  invisible(NULL)
	}


	# Merge any number of named matrices into one sparse matrix over the union
	# of their row and column names.
	#
	# ...: matrices with row and column names.
	# Returns a sparse matrix; entries that share a (row, column) name pair
	# across inputs are summed by sparseMatrix().
	merge.sparse <- function(...) {
	  mats <- list(...)
	  cnnew <- character()
	  rnnew <- character()
	  # One slot per input instead of growing the vectors inside the loop.
	  i_parts <- vector("list", length(mats))
	  j_parts <- vector("list", length(mats))
	  x_parts <- vector("list", length(mats))

	  cat("Merged matrices: ")
	  for (k in seq_along(mats)) {
	    cat(k, " ")
	    M <- mats[[k]]

	    cnnew <- union(cnnew, colnames(M))
	    rnnew <- union(rnnew, rownames(M))
	    cindnew <- match(colnames(M), cnnew)
	    rindnew <- match(rownames(M), rnnew)

	    ind <- unname(Matrix::which(M != 0, arr.ind = TRUE))
	    i_parts[[k]] <- rindnew[ind[, 1]]
	    j_parts[[k]] <- cindnew[ind[, 2]]
	    # Values are read at exactly the positions found above. Taking the raw
	    # @x slot instead misaligns values and indices when the matrix stores
	    # explicit zeros or is not in general column-compressed form.
	    x_parts[[k]] <- M[ind]
	  }
	  cat("\n")

	  # c(<empty>, ...) keeps a well-typed empty vector when nothing was merged.
	  Matrix::sparseMatrix(
	    i = c(numeric(0), unlist(i_parts)),
	    j = c(numeric(0), unlist(j_parts)),
	    x = c(logical(0), unlist(x_parts)),
	    dims = c(length(rnnew), length(cnnew)),
	    dimnames = list(rnnew, cnnew)
	  )
	}

	##### New Funcs #####
	# Read the count matrix stored under "umi/<slot>/" (counts, genes, cells)
	# in an HDF5 file and return it as a sparse Matrix with dimnames.
	read.h5 <- function(path, slot = "ft") {
	  H5 <- hdf5r::H5File$new(path, mode = "r")
	  # Close the file on every exit path, including read errors.
	  on.exit(H5$close_all(), add = TRUE)

	  group <- paste0("umi/", slot, "/")
	  Matrix::Matrix(
	    H5[[paste0(group, "counts")]][, ],
	    dimnames = list(
	      H5[[paste0(group, "genes")]][],
	      H5[[paste0(group, "cells")]][]
	    ),
	    sparse = TRUE
	  )
	}

	# Split the RNA count matrices of one or more objects by `orig.ident`.
	#
	# ...: objects exposing `$orig.ident` and `@assays$RNA@counts`.
	# Returns a list with one count matrix per sample, named by sample; the
	# sample names are printed on the way.
	generate.data.list <- function(...) {
	  dl <- list(...)

	  sample_names <- lapply(dl, function(obj) unique(as.character(obj$orig.ident)))
	  all.sample.names <- do.call("c", sample_names)
	  print(all.sample.names)

	  per_object <- lapply(dl, function(obj) {
	    ids <- obj$orig.ident
	    lapply(unique(ids), function(id) obj@assays$RNA@counts[, ids == id])
	  })
	  all.samples <- do.call("c", per_object)
	  names(all.samples) <- all.sample.names

	  all.samples
	}

	# Query biomaRt for the gene names belonging to Ensembl gene ids.
	#
	# ids:  Ensembl gene ids used as filter values.
	# mart: dataset name passed to useMart().
	# Returns the getBM() result with the columns ensembl_gene_id and
	# external_gene_name.
	get.ensembl.names <- function(ids, mart = "mmusculus_gene_ensembl") {
	  library(biomaRt)
	  ensembl <- useMart("ensembl", mart)
	  wanted <- c("ensembl_gene_id", "external_gene_name")

	  getBM(wanted, "ensembl_gene_id", values = ids, mart = ensembl)
	}

	# Collect the row names of every matrix in `data.list` and look up the
	# unique ids with get.ensembl.names().
	get.ensembl.names.for.list <- function(data.list, mart = "mmusculus_gene_ensembl") {
	  all_ids <- do.call("c", lapply(data.list, rownames))
	  get.ensembl.names(unique(all_ids), mart)
	}


	# Keep only genes annotated as one-to-one orthologs in every matrix of a
	# named list, optionally renaming them to their homolog id.
	#
	# data.list:            named list of matrices with gene ids as row names.
	# genes.meta:           table with a column "Gene.stable.ID"; column 5
	#                       holds the homology type, column 1 the ids matched
	#                       against the row names and column 3 the replacement
	#                       names.
	# translate.to.homolog: if TRUE, row names are replaced by column 3.
	# return.gene.symbol:   currently unused.
	# Returns `data.list` with filtered (and possibly renamed) matrices; stops
	# with "No genes kept!" if a matrix loses all rows.
	filter.one2one <- function(data.list,
	                           genes.meta,
	                           translate.to.homolog = FALSE,
	                           return.gene.symbol = FALSE){
	  keep.genes <- genes.meta[genes.meta[, 5] == "ortholog_one2one", "Gene.stable.ID"]

	  for (n in names(data.list)) {
	    d <- data.list[[n]]
	    old.n <- rownames(d)

	    # drop = FALSE keeps a single surviving gene as a one-row matrix; a
	    # dropped vector has no nrow() and breaks the check below.
	    d <- d[old.n %in% keep.genes, , drop = FALSE]

	    if (nrow(d) == 0) {
	      print(n)
	      print(head(old.n))
	      print(head(keep.genes))
	      stop("No genes kept!")
	    }

	    if (translate.to.homolog) {
	      new.names <- genes.meta[match(rownames(d), genes.meta[, 1]), 3]
	      rownames(d) <- new.names
	    }

	    data.list[[n]] <- d
	  }

	  data.list
	}

	# Replace gene IDs in the row names of each matrix by gene names.
	#
	# Args:
	#   data.list:  list of matrices with gene IDs as row names.
	#   genes.meta: table with columns `Gene.stable.ID` and `Gene.name`.
	#
	# Returns:
	#   `data.list` with translated row names. IDs without a match in
	#   `genes.meta` keep their original value.
	translate.geneid.to.name <- function(data.list, genes.meta){
	  known.ids <- genes.meta$Gene.stable.ID
	  known.names <- as.character(genes.meta$Gene.name)

	  # Iterate by position so unnamed or duplicate-named lists are translated
	  # as well.
	  for (k in seq_along(data.list)) {
	    d <- data.list[[k]]
	    old.n <- rownames(d)

	    new.n <- known.names[match(old.n, known.ids)]
	    unmapped <- is.na(new.n)
	    new.n[unmapped] <- old.n[unmapped]

	    rownames(d) <- new.n
	    data.list[[k]] <- d
	  }

	  data.list
	}

	# Collect the values of one gene across a list of matrices.
	#
	# Args:
	#   dl:   list of matrices with genes as row names and cells as column
	#         names.
	#   gene: a single gene name.
	#
	# Returns:
	#   One concatenated vector named by the column names of the matrices.
	#   Matrices that lack the gene contribute zeros. An empty list yields
	#   numeric(0).
	get.exprs <- function(dl, gene){
	  exprs <- vector("list", length(dl))

	  # seq_along() keeps an empty list from being indexed at position 1.
	  for (i in seq_along(dl)) {
	    nrm <- dl[[i]]

	    vals <- if (gene %in% rownames(nrm)) {
	      nrm[gene, ]
	    } else {
	      rep(0, ncol(nrm))
	    }
	    names(vals) <- colnames(nrm)

	    exprs[[i]] <- vals
	  }

	  if (length(exprs) == 0) {
	    return(numeric(0))
	  }

	  do.call("c", exprs)
	}


	# Interactive 3D scatter plot coloured by a numeric value per point.
	#
	# Args:
	#   coord: matrix-like object; columns 1-3 are used as x, y and z.
	#   exprs: numeric vector with one value per row of `coord`.
	#   size:  point size passed to scatterplot3js().
	#
	# Returns:
	#   The widget created by scatterplot3js(). Points are drawn in order of
	#   increasing value, with a reversed "Spectral" palette that has one
	#   colour per distinct rounded z-score.
	plot.exprs.3d <- function(coord, exprs, size=.1){
	  library(threejs)
	  library(RColorBrewer)

	  z <- as.numeric(scale(exprs))
	  # A constant vector has zero variance, so scale() yields NaN; treat all
	  # such points as sitting at the mean so colours still match the
	  # coordinates.
	  z[is.nan(z)] <- 0
	  z <- round(z, digits = 2)

	  # Reorder coordinates and values with the same permutation.
	  ord <- order(z)
	  coord <- coord[ord, ]
	  z <- z[ord]

	  z.levels <- as.factor(z)
	  cols <- rev(colorRampPalette(brewer.pal(5, "Spectral"))(nlevels(z.levels)))
	  cols.vec <- cols[as.numeric(z.levels)]

	  scatterplot3js(
	    coord[,1],
	    coord[,2],
	    coord[,3],
	    color = cols.vec,
	    size = size
	  )
	}

	# Interactive 3D scatter plot coloured by a categorical label per point.
	#
	# Args:
	#   coord: matrix-like object; columns 1-3 are used as x, y and z.
	#   color: labels, one per row of `coord`; converted to a factor.
	#   size:  point size passed to scatterplot3js().
	#
	# Returns:
	#   The widget created by scatterplot3js(), labelled with the factor
	#   values.
	plot.factor.3d <- function(coord, color, size=.1){
	  color <- as.factor(color)
	  require(threejs)
	  require(RColorBrewer)

	  # One colour per factor level, interpolated from the 8-colour "Set2" set.
	  make.palette <- colorRampPalette(brewer.pal(8, "Set2"))
	  level.cols <- make.palette(nlevels(color))
	  point.cols <- level.cols[as.integer(color)]
	  point.labels <- as.character(color)

	  scatterplot3js(
	    coord[,1],
	    coord[,2],
	    coord[,3],
	    color = point.cols,
	    size = size,
	    labels = point.labels
	  )
	}

	# Rescale batches to a shared per-gene reference level, then compute a
	# PCA and a 3D UMAP on the combined cells.
	#
	# Args:
	#   umi:     list of count matrices (genes x cells), one per batch.
	#   hvg:     genes to use; genes a batch lacks are added as zero rows.
	#   n.cores: number of cores used for every parallel step.
	#
	# Returns:
	#   A list with `pca` (result of irlba::prcomp_irlba, 75 components) and
	#   `umap` (3-column embedding from uwot::umap), both with cells as rows.
	scale.batchwise <- function(umi, hvg, n.cores = 10){
	  cat("Preprocessing UMI matrices\n")
	  d.added <- pbmcapply::pbmclapply(umi, function(x){
	    x <- x[rownames(x) %in% hvg, ]
	    add.genes <- hvg[!(hvg %in% rownames(x))]

	    # Zero rows for the requested genes that this batch does not contain.
	    mg <- Matrix(0, nrow = length(add.genes), ncol = ncol(x))
	    rownames(mg) <- add.genes
	    colnames(mg) <- colnames(x)

	    out <- rbind(x, mg)

	    # Divide every cell by its total over the selected genes.
	    out <- t(t(out) / colSums(out))
	    return(out[hvg,])
	  }, mc.cores = n.cores, ignore.interactive = TRUE)  # honours n.cores

	  cat("Scaling\n")
	  avgs <- lapply(d.added, rowMeans)
	  # Reference level per gene: the smallest batch mean.
	  ref <- do.call(pmin, avgs)

	  rescale <- pbmcapply::pbmclapply(seq_along(avgs), function(i){
	    out <- ref/avgs[[i]]
	    # 0/0 and x/0 arise for genes with zero mean in a batch.
	    out[!is.finite(out)] <- 0
	    return(out)
	  }, mc.cores = n.cores, ignore.interactive = TRUE)


	  fin <- pbmcapply::pbmclapply(seq_along(d.added), function(i){
	    m <- d.added[[i]]
	    rs <- rescale[[i]]

	    Matrix(log1p(apply(m, 2, function(x) x * rs)), sparse = TRUE)
	  }, mc.cores = n.cores, ignore.interactive = TRUE)

	  cat("Correcting the matrix\n")
	  mtx <- t(do.call(cbind, fin))
	  # Drop genes without any variation across cells.
	  mtx <- mtx[,apply(mtx, 2, sd) != 0]

	  # No explicit z-scoring here; prcomp_irlba centers and scales below.
	  mtx.zscore <- mtx

	  cat("PCA\n")
	  set.seed(1234)
	  pca <- irlba::prcomp_irlba(mtx.zscore, n=75, verbose = TRUE, scale.=TRUE, center=TRUE)
	  rownames(pca$x) <- rownames(mtx.zscore)
	  rownames(pca$rotation) <- colnames(mtx.zscore)

	  cat("UMAP\n")
	  set.seed(1234)
	  um <- uwot::umap(pca$x, metric="cosine", min_dist = .1, n_neighbors = 15, verbose = TRUE, n_components = 3)
	  rownames(um) <- rownames(mtx.zscore)

	  return(
	    list(
	      pca = pca,
	      umap = um
	    )
	  )
	}


	# Load the files of a merged dataset from one directory.
	#
	# Args:
	#   data: directory containing "pca_model.rds", "meta_data.csv",
	#         "umap_model.rds" and "umi.hdf5".
	#
	# Returns:
	#   A list with `pca`, `meta.data`, `umap`, `umi` (one sparse matrix per
	#   top-level group of the HDF5 file) and `stage` (third "_"-separated
	#   field of the first cell name of each matrix).
	read_merged_data <- function(data){
	  library(hdf5r)
	  out <- list()

	  out$pca <- readRDS(file.path(data, "pca_model.rds"))
	  out$meta.data <- read_csv(file.path(data, "meta_data.csv"))
	  out$umap <- readRDS(file.path(data, "umap_model.rds"))

	  h5 <- H5File$new(file.path(data, "umi.hdf5"), mode="r")
	  # Close the file even if one of the groups cannot be read.
	  on.exit(h5$close_all(), add = TRUE)

	  cat("Reading UMI\n")
	  out$umi <- list()
	  for(n in names(h5)){
	    print(n)
	    out$umi[[n]] <- Matrix(
	      h5[[n]][["counts"]][,],
	      dimnames = list(
	        h5[[n]][["genes"]][],
	        h5[[n]][["cells"]][]
	      ),
	      sparse = TRUE
	    )
	  }

	  out$stage <- do.call("c", lapply(out$umi, function(x) str_split_fixed(colnames(x)[1], "_",4)[,3]))
	  out
	}


	# Turn a neighbour matrix into a two-column edge list.
	#
	# Args:
	#   nn: matrix in which each row holds a source node in column 1 followed
	#       by its target nodes in the remaining columns.
	#
	# Returns:
	#   A matrix with columns "from" and "to": one row per (source, target)
	#   pair, grouped by row of `nn`. Input without rows or without target
	#   columns yields a 0 x 2 matrix.
	make.edgelist <- function(nn){
	  nn <- as.matrix(nn)
	  n.targets <- ncol(nn) - 1L

	  if (nrow(nn) == 0L || n.targets < 1L) {
	    return(matrix(nn[0L], nrow = 0L, ncol = 2L,
	                  dimnames = list(NULL, c("from", "to"))))
	  }

	  # Repeat each source once per target; transposing first makes
	  # as.vector() emit the targets row by row, matching `from`.
	  from <- rep(unname(nn[, 1L]), each = n.targets)
	  to <- as.vector(t(unname(nn[, -1L, drop = FALSE])))

	  cbind(from, to)
	}

	# Build an undirected graph from a neighbour matrix and cluster it with
	# the Louvain method.
	#
	# Args:
	#   nn: neighbour matrix accepted by make.edgelist().
	#
	# Returns:
	#   A list with `clusters` (membership of the Louvain communities) and
	#   `graph` (the simplified graph).
	get.clusters <- function(nn){
	  cat("Generating graph\n")
	  edges <- make.edgelist(nn)
	  raw.graph <- graph_from_edgelist(edges, directed = FALSE)
	  g <- simplify(raw.graph)

	  cat("Cluster data\n")
	  communities <- cluster_louvain(g)

	  result <- list(
	    clusters = membership(communities),
	    graph = g
	  )
	  result
	}

	# Split cell names at "_" into a character matrix with `n` columns.
	#
	# Args:
	#   cell.names: character vector of cell names.
	#   n:          number of pieces (columns) to return.
	quick.split.cell.names <- function(cell.names, n=4){
	  pieces <- str_split_fixed(string = cell.names, pattern = "_", n = n)
	  pieces
	}

	# Sum the columns of a matrix within groups ("pseudobulk").
	#
	# Args:
	#   m:        matrix (dense or sparse) with cells as columns.
	#   group.by: one group label per column of `m`.
	#
	# Returns:
	#   A matrix with one column per non-empty group (named by the group
	#   level) holding the row sums over that group's columns.
	make.pseudobulk <- function(m, group.by){
	  group.by <- as.factor(group.by)
	  stopifnot(length(group.by) == ncol(m))

	  # Columns are addressed by position, so missing or duplicated column
	  # names cannot route a cell to the wrong group.
	  cols.by.group <- split(seq_len(ncol(m)), group.by, drop = TRUE)

	  # drop = FALSE lets a single-cell group go through the same code path
	  # and return that cell's own column.
	  sums <- lapply(cols.by.group, function(idx){
	    Matrix::rowSums(m[, idx, drop = FALSE])
	  })

	  do.call(cbind, sums)
	}

	# Split strings at `sep` and return field number `n` of each.
	#
	# The number of fields is taken from the first element of `x`.
	qsplit <- function(x, n, sep="_"){
	  n.fields <- length(strsplit(x[1], sep)[[1]])
	  fields <- str_split_fixed(x, sep, n.fields)
	  fields[,n]
	}



	# Cluster cells with scanpy's Louvain implementation through reticulate.
	#
	# Args:
	#   umi.list:    a list of matrices (merged with merge.sparse()) or a
	#                single matrix with cells as columns.
	#   pca.coords:  coordinates with cell names as row names; stored in the
	#                AnnData object under obsm "PCA".
	#   hvg:         optional gene names; if given, rows are limited to them.
	#   resolution:  passed to scanpy's tl.louvain.
	#   n_pcs:       passed to scanpy's pp.neighbors.
	#   n_neighbors: passed to scanpy's pp.neighbors.
	#
	# Returns:
	#   The first column of `adata$obs`, named by the cell names.
	#   NOTE(review): this is presumably the Louvain label column - confirm.
	scanpy.louvain <- function(umi.list, pca.coords, hvg = NULL, resolution = 1, n_pcs = 75L, n_neighbors = 30L){
	  library(reticulate)

	  use_condaenv("r-reticulate")
	  py_config()

	  cat("Clustering using scanpy\n")
	  sc <- import("scanpy")

	  mat <- if (is.list(umi.list)) do.call(merge.sparse, umi.list) else umi.list

	  # Put the cells in the same order as the PCA coordinates.
	  mat <- mat[,rownames(pca.coords)]

	  if (!is.null(hvg)) {
	    mat <- mat[rownames(mat) %in% hvg, ]
	  }

	  adata <- sc$AnnData(
	    X = t(mat),
	    obsm = list("PCA" = pca.coords)
	  )

	  cat("\tFinding neighbors\n")
	  sc$pp$neighbors(adata, use_rep = "PCA", n_neighbors = n_neighbors, n_pcs = n_pcs)
	  cat("\tRunning Louvain algorithm\n")
	  sc$tl$louvain(adata, resolution = resolution)

	  labels <- adata$obs[,1]
	  names(labels) <- colnames(mat)

	  labels
	}


	# Split a count matrix into one matrix per batch.
	#
	# Args:
	#   umi:       matrix with cells as (named) columns.
	#   batch.vec: one batch label per column of `umi`.
	#   prune:     if TRUE, rows whose sum within the batch is not positive
	#              are removed from that batch's matrix.
	#
	# Returns:
	#   A list (named by batch) with one matrix per batch that has more than
	#   two cells. Smaller batches and empty factor levels are left out.
	split.umi <- function(umi, batch.vec, prune = TRUE){

	  # simplify = FALSE keeps the result a list even when every batch is
	  # too small and each call returns NULL.
	  umi.list <- tapply(colnames(umi), as.factor(batch.vec), function(i){
	    if (length(i) <= 2) {
	      return(NULL)
	    }

	    out <- umi[, i, drop = FALSE]
	    if (prune) {
	      # drop = FALSE: a single surviving gene must stay a matrix.
	      out <- out[rowSums(out) > 0, , drop = FALSE]
	    }
	    out
	  }, simplify = FALSE)

	  # NULL marks both skipped batches and empty factor levels.
	  keep <- !vapply(umi.list, is.null, logical(1))

	  umi.list[keep]
	}

	# Normalise a list of count matrices, merge them, and compute a PCA and
	# UMAP.
	#
	# Args:
	#   umi.list:         list of count matrices (genes x cells), one per
	#                     batch.
	#   features:         genes to use.
	#   n_pcs:            number of principal components.
	#   n_umaps:          number of UMAP dimensions.
	#   subsample_pca:    NULL, or the number of cells to sample for the PCA
	#                     and UMAP.
	#   n_cores:          cores used for the parallel steps.
	#   batchwise.zscore: if TRUE, every gene is z-scored within each batch
	#                     and the PCA neither centers nor scales.
	#
	# Returns:
	#   A list with `umap` (embedding), `pca` (irlba::prcomp_irlba result)
	#   and `feats` (column names of the merged matrix).
	do.softmerge <- function(umi.list, features, n_pcs = 75, n_umaps = 3, subsample_pca = NULL, n_cores = 10, batchwise.zscore = TRUE){

	  # Batch-wise z-scored data needs no centering in the PCA. Without it,
	  # these settings used to be undefined; they now fall back to the
	  # defaults of prcomp_irlba (center = TRUE, scale. = FALSE).
	  # NOTE(review): confirm these are the intended settings.
	  cntr <- !batchwise.zscore
	  scl <- FALSE

	  cat("Processing a list of matrices with length", length(umi.list), ": ")

	  t.out <- pbmcapply::pbmclapply(umi.list, function(x){
	    sf <- colSums(x)
	    norm <- t(t(x[rownames(x) %in% features, ]) / sf)

	    # apply() over rows returns its results column-wise, so `o` has
	    # cells as rows and genes as columns.
	    o <- Matrix(apply(norm, 1, function(y){
	      z <- sqrt(y / mean(y))
	      # Genes with zero mean give 0/0.
	      z[is.na(z)] <- 0
	      return(z)
	    }), sparse = TRUE)

	    return(o)
	  }, mc.cores = n_cores, ignore.interactive = TRUE)

	  cat("\nDone preprocessing \n")

	  cat("Merging matrices\n")
	  t.mat <- do.call(merge.sparse, t.out)
	  t.mat <- t.mat[,colSums(t.mat) > 0]

	  if (batchwise.zscore) {
	    t.mat <- do.call(rbind, pbmcapply::pbmclapply(umi.list, function(u){
	      cells <- colnames(u)
	      tmp <- scale(t.mat[cells, ])
	      # Columns that are constant within a batch become NaN.
	      tmp[is.na(tmp)] <- 0
	      return(tmp)
	    }, mc.cores = n_cores, ignore.interactive = TRUE))
	  }

	  print(dim(t.mat))

	  # Pick the PCA input; everything after that is shared by both modes.
	  if (is.null(subsample_pca)) {
	    pca.input <- t.mat
	    cat("PCA\n")
	  } else {
	    sel.cells <- sample(rownames(t.mat), subsample_pca)
	    pca.input <- t.mat[sel.cells, ]
	    pca.input <- pca.input[,colSums(pca.input) > 0]
	    cat("Running PCA with a subsampled dataset of size: ", nrow(pca.input), ncol(pca.input), "\n")
	  }

	  set.seed(1234)
	  t.pca <- irlba::prcomp_irlba(pca.input, n=n_pcs, center = cntr, scale. = scl, verbose = TRUE)
	  rownames(t.pca$x) <- rownames(pca.input)
	  rownames(t.pca$rotation) <- colnames(pca.input)

	  cat("UMAP\n")
	  set.seed(1234)
	  t.umap <- uwot::umap(t.pca$x, metric="cosine", n_neighbors = 30, min_dist=.1, n_components = n_umaps, verbose = TRUE)
	  rownames(t.umap) <- rownames(pca.input)

	  return(list(
	    umap = t.umap,
	    pca = t.pca,
	    feats = colnames(t.mat)
	  ))
	}


	# Divide every stored entry of a sparse matrix by its column sum.
	#
	# Args:
	#   A: a sparse matrix; anything that is not column-compressed is
	#      coerced to a CsparseMatrix first.
	#
	# Returns:
	#   A column-compressed matrix with the same dimensions and dimnames as
	#   `A`, with each column scaled by its sum.
	sparse.columnNorm <- function (A)
	{
	  # The `@p` slot used below exists only in column-compressed form.
	  # Coercing with as() keeps dimnames and trailing empty rows/columns.
	  if (!is(A, "CsparseMatrix")) {
	    A <- as(A, "CsparseMatrix")
	  }

	  # diff(A@p) is the number of stored entries per column, so each column
	  # sum is repeated once per stored entry of that column.
	  A@x <- A@x / rep.int(Matrix::colSums(A), diff(A@p))
	  A
	}

	# Write a matrix and its dimnames to a new HDF5 file.
	#
	# Args:
	#   mat:  matrix to store. A dgCMatrix is written as its summary()
	#         triplet table, anything else as is.
	#   file: path of the HDF5 file, opened in mode "w".
	#
	# Returns:
	#   NULL, invisibly. The file receives the datasets "counts", "colnames"
	#   and "rownames".
	write_matrix_h5 <- function(mat, file){
	  library(hdf5r)
	  H5 <- H5File$new(file, "w")
	  # Close the file even if one of the writes fails.
	  on.exit(H5$close_all(), add = TRUE)

	  H5[["counts"]] <- if (is(mat, "dgCMatrix")) summary(mat) else mat

	  H5[["colnames"]] <- colnames(mat)
	  H5[["rownames"]] <- rownames(mat)

	  invisible(NULL)
	}


	# Read a sparse matrix stored as a triplet table in an HDF5 file.
	#
	# Args:
	#   file: path of an HDF5 file holding "counts" (with fields i, j, x),
	#         "colnames" and "rownames".
	#
	# Returns:
	#   A sparse matrix with the stored row and column names.
	read_sparse_h5 <- function(file){
	  library(hdf5r)
	  H5 <- H5File$new(file, "r")
	  # Close the file even if a dataset is missing.
	  on.exit(H5$close_all(), add = TRUE)

	  cnts <- H5[["counts"]][]
	  col.ids <- H5[["colnames"]][]
	  row.ids <- H5[["rownames"]][]

	  # Size the matrix from the stored names rather than the largest index,
	  # so trailing all-zero rows or columns are kept.
	  sparseMatrix(
	    i = cnts$i,
	    j = cnts$j,
	    x = cnts$x,
	    dims = c(length(row.ids), length(col.ids)),
	    dimnames = list(row.ids, col.ids)
	  )
	}

	#' GO enrichment of selected genes
	#'
	#' Builds a `topGOdata` object for the "BP" ontology and runs the classic
	#' Fisher test on it.
	#'
	#' @param interesting.genes Genes to test enrichment for
	#' @param gene.universe All genes to test against
	#' @param gene2go.list Gene-to-GO mapping, passed as `gene2GO` to
	#'   `annFUN.gene2GO`
	#' @param n.out Number of top nodes to report (`topNodes` of `GenTable`)
	#' @return The table produced by `GenTable`, ordered by the classic
	#'   Fisher result
	run.topGO <- function(interesting.genes, gene.universe, gene2go.list, n.out = 25){
	  require(topGO)

	  # Factor with 1 for selected genes and 0 for the rest of the universe.
	  is.selected <- gene.universe %in% interesting.genes
	  gene.list <- as.factor(as.numeric(is.selected))
	  names(gene.list) <- gene.universe

	  go.data <- new(
	    "topGOdata",
	    ontology = "BP",
	    allGenes = gene.list,
	    annot = annFUN.gene2GO,
	    gene2GO = gene2go.list
	  )
	  fisher.result <- runTest(go.data, algorithm = "classic", statistic = "fisher")

	  top.table <- GenTable(
	    go.data,
	    classicFisher = fisher.result,
	    orderBy = "classicFisher",
	    ranksOf = "classicFisher",
	    topNodes = n.out
	  )
	  return(top.table)

	}