Skip to content

Instantly share code, notes, and snippets.

@klprint
Last active January 30, 2020 12:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save klprint/1ab4468eb3c54abcf0422dec6223b8fc to your computer and use it in GitHub Desktop.
Save klprint/1ab4468eb3c54abcf0422dec6223b8fc to your computer and use it in GitHub Desktop.
Widely used single cell functions in R
# Taken from Simon
# compute variances across column or rows for column-sparse matrices
library(Matrix)
colVars_spm <- function( spm ) {
stopifnot( is( spm, "dgCMatrix" ) )
ans <- sapply( seq.int(spm@Dim[2]), function(j) {
mean <- sum( spm@x[ (spm@p[j]+1):spm@p[j+1] ] ) / spm@Dim[1]
sum( ( spm@x[ (spm@p[j]+1):spm@p[j+1] ] - mean )^2 ) +
mean^2 * ( spm@Dim[1] - ( spm@p[j+1] - spm@p[j] ) ) } ) / ( spm@Dim[1] - 1 )
names(ans) <- spm@Dimnames[[2]]
ans
}
rowVars_spm <- function( spm ) {
colVars_spm( t(spm) )
}
pca_on_informative_genes <- function( counts, nDim=20 ) {
norm_counts <- t(t(counts) / colSums(counts))
gene_means <- rowMeans( norm_counts )
gene_vars <- rowVars_spm( norm_counts )
poisson_vmr <- mean( 1 / colSums( counts ) )
informative_genes <- names(which(
gene_vars / gene_means > 1.5 * poisson_vmr ))
return(
list(
pca = irlba::prcomp_irlba( t(sqrt(norm_counts[informative_genes,])), n = nDim)$x,
info.genes = informative_genes
)
)
}
get.info.genes <- function( counts ) {
library(Matrix)
stopifnot(!is.null(dim(counts)))
norm_counts <- t(t(counts) / Matrix::colSums(counts))
gene_means <- rowMeans( norm_counts )
gene_vars <- rowVars_spm( norm_counts )
poisson_vmr <- mean( 1 / Matrix::colSums( counts ) )
informative_genes <- names(which(
gene_vars / gene_means > 1.5 * poisson_vmr ))
return(
informative_genes
)
}
get.info.genes.batchwise <- function( all_counts, per.cell.batch ){
tmp <- lapply(unique(per.cell.batch), function(pcb){
counts <- all_counts[,per.cell.batch == pcb]
get.info.genes(counts)
})
print(lapply(tmp, length))
return(
unique(
do.call("c", tmp)
)
)
}
###################
do.smooth <- function(pca, raw, gene, alpha=.1){
library(locfit)
if(ncol(pca) > 15){
cat("Reducing the number of PCs to 15\nto prevent locfit bug.")
pca <- pca[,1:15]
}
predict( locfit.raw(
pca[,1:15],
raw[gene,],
base = log( colSums(raw) ),
alpha = alpha, deg = 1, family = "poisson", ev = dat() ) )
}
get.10x.data <- function(path){
out <- list()
out$raw <- Seurat::Read10X(path)
return(out)
}
get.h5.data <- function(path){
hf <- hdf5r::H5File$new(path, mode="r")
cnts <- Matrix::Matrix(data = hf[["umi/ft/counts"]][,],
dimnames = list(hf[["umi/ft/genes"]][],
hf[["umi/ft/cells"]][]))
datalist <- list()
datalist$raw <- cnts
return(datalist)
}
do.preprocess <- function(datalist, nGenes = 500, nCells = 1, nDim = 20){
cat("Subsetting the data for: ")
cat(nGenes, "genes per cell and", nCells, "cells per gene minimum\n")
datalist$raw <- datalist$raw[ , colSums( datalist$raw>0 ) >= nGenes ]
datalist$raw <- datalist$raw[ rowSums(datalist$raw) >= nCells, ]
cat("Normalizing\n")
datalist$nrm <- t(t(datalist$raw) / colSums(datalist$raw))
cat("Running PCA\n")
tmp <- pca_on_informative_genes(datalist$raw, nDim)
datalist$pca <- tmp$pca
datalist$info.genes <- tmp$info.genes
return(datalist)
}
runUMAP <- function(datalist, nPC = 15){
uwot::umap( datalist$pca[,1:nPC],
n_neighbors = 30,
min_dist = .3,
metric = "cosine" )
}
run.seurat <- function(datalist, ident.var.genes = F){
library(Seurat)
seu <- CreateSeuratObject( datalist$raw , min.cells = 0, min.features = 0)
if(ident.var.genes){
seu <- FindVariableFeatures( seu, selection.method = "vst", nfeatures = length(datalist$info.genes))
}else{
seu[["RNA"]]@var.features <- datalist$info.genes
}
#seu <- NormalizeData(seu)
seu <- ScaleData( seu )
seu <- RunPCA( seu, verbose=FALSE )
seu <- FindNeighbors( seu )
seu <- FindClusters( seu )
return(seu)
}
rem.batch <- function(mtx, l){
out <- lapply(unique(l), function(x){
cat("Collecting batch from", x, "\n")
tmp.mtx <- as.matrix(mtx[,l == x])
# rs <- rowSums(mtx)
# names(rs) <- rownames(mtx)
rs <- rowSums(tmp.mtx)
# tmp.mtx <- sqrt(t(
# t(tmp.mtx) / colSums(tmp.mtx)
# ))
rs <- sqrt(rs / sum(rs))
#rs <- apply(as.matrix(rs), 2, function(x) (x - min(x)) / (max(x - min(x))))
return(rs)
#lm(as.matrix(tmp.mtx)~rs)$residuals
})
rs.mat <- do.call(cbind, out)
cat("Normalizing\n")
exprs <- sqrt(t(t(mtx) / colSums(mtx)))
#exprs <- apply(as.matrix(exprs), 2, function(x) (x - min(x)) / (max(x - min(x))))
cat("Calculating dot product\n")
dp <- t(exprs) %*% rs.mat
print(head(dp))
cat("Doing linear regression\n")
out <- lm(as.matrix(t(exprs)) ~ as.matrix(dp))$residuals
return(
out
)
}
do.process.batches <- function( counts, batch.vec ){
out <- list()
out$raw <- counts
cat("-----------------\n")
cat("Getting interesting genes\n")
cat("-----------------\n")
out$info.genes <- get.info.genes.batchwise(out$raw, batch.vec)
cat("-----------------\n")
cat("\n")
cat("-----------------\n")
cat("Batch removal\n")
cat("-----------------\n")
out$brm <- rem.batch(out$raw, batch.vec)
out$nrm <- sqrt(t(t(out$raw) / colSums(out$raw)))
cat("-----------------\n")
cat("\n")
cat("-----------------\n")
cat("Running PCA on interesting genes\n")
cat("-----------------\n")
out$pca <- irlba::prcomp_irlba(out$brm[,out$info.genes], n=20, scale. = F, center = T)$x
cat("-----------------\n")
cat("\n")
cat("-----------------\n")
cat("Running UMAP\n")
cat("-----------------\n")
out$umap <- uwot::umap(out$pca, metric="cosine", min_dist=.3, n_neighbors=30)
cat("-----------------\n")
out$batch <- batch.vec
return(out)
}
gen.seurat <- function(cnts, brm.mtx, var.genes){
cat("Generating Seurat Object\n")
seu <- CreateSeuratObject(cnts)
cat("Storing precomputed data\n")
seu[["RNA"]]@data <- Matrix(brm.mtx[var.genes,], sparse = T)
seu[["RNA"]]@scale.data <- brm.mtx[var.genes,]
seu[["RNA"]]@var.features <- var.genes
cat("Running PCA\n")
seu <- RunPCA(seu)
seu <- FindNeighbors(seu)
seu <- FindClusters(seu)
}
project.data <- function(a.brm, b.brm, nDim=30){
cat("Running PCA\n")
pca.a <- irlba::prcomp_irlba(a.brm, n = nDim)
cat("Projecting data\n")
pca.b <- scale(b.brm, center = pca.a$center, scale = pca.a$scale) %*% pca.a$rotation
return(rbind(pca.a$x, pca.b))
}
project.annotation <- function(source.mtx, querry.mtx, source.annotation, assign.prob = .5){
shared.genes <- intersect(colnames(source.mtx), colnames(querry.mtx))
cat("Number of shared genes:", length(shared.genes), "\n")
cat("Running projection PCA\n")
cmbd.pca <- project.data(source.mtx[,shared.genes],
querry.mtx[,shared.genes])
cat("Finding kNN\n")
tst.knn <- FNN::get.knnx(cmbd.pca[1:nrow(source.mtx), ],
cmbd.pca[(nrow(source.mtx)+1):nrow(cmbd.pca), ])
cat("Voting\n")
tst.cl.vote <- t(apply(tst.knn$nn.index, 1, function(rw) {
source.annotation[rw]
}))
po.cl <- matrix(NA, ncol=length(unique(source.annotation)), nrow = nrow(tst.cl.vote))
colnames(po.cl) <- unique(source.annotation)
for(i in 1:nrow(tst.cl.vote)){
tbl <- table(tst.cl.vote[i,])
po.cl[i,names(tbl)] <- as.vector(tbl)
}
rel.po.cl <- t(apply(po.cl, 1, function(rw){
rw / sum(rw, na.rm = T)
}))
voted <- apply(rel.po.cl, 1, function(rw){
if(max(rw, na.rm = T) < assign.prob){
return(NA)
}else{
y <- colnames(rel.po.cl)[rw == max(rw, na.rm = T)]
y[!is.na(y)][1]
}
})
return(list(
voting.mtx = rel.po.cl,
result = voted,
knn.out = tst.knn
))
}
union.matrix <- function(m1, m2){
library(Matrix)
u.rw <- union(rownames(m1), rownames(m2))
u.cl <- c(colnames(m1), colnames(m2))
cmat <- Matrix(0, ncol=length(u.cl), nrow=length(u.rw), dimnames = list(u.rw, u.cl),sparse=F)
cat("Generated matrix with", ncol(cmat), "columns and", nrow(cmat), "rows.\n")
cat("Filling in the values\n")
cmat[rownames(m1), colnames(m1)] <- m1
cmat[rownames(m2), colnames(m2)] <- m2
return(Matrix(cmat, sparse = T))
}
union.matrix.list <- function(...){
library(Matrix)
mlist = list(...)
out = union.matrix(mlist[[1]], mlist[[2]])
if(length(mlist) > 2){
for(i in 3:length(mlist)){
out = union.matrix(out, mlist[[i]])
}
}
return(out)
}
create.h5 <- function(m,
filepath,
batch = NULL,
cluster = NULL,
embedding = NULL,
indiv_embedding = NULL){
library(hdf5r)
h5file <- H5File$new(filepath, mode = "w")
h5file[["X"]] <- m
h5file[["genes"]] <- rownames(m)
h5file[["cells"]] <- colnames(m)
if(!is.null(batch)){
h5file[["batch"]] <- batch
}
if(!is.null(cluster)){
h5file[["cluster"]] <- cluster
}
if(!is.null(embedding)){
h5file[["embedding"]] <- embedding
}
if(!is.null(indiv_embedding)){
h5file[["indiv_embedding"]] <- indiv_embedding
}
h5file$close_all()
}
merge.sparse <- function(...) {
cnnew <- character()
rnnew <- character()
x <- vector()
i <- numeric()
j <- numeric()
icount <- 1
cat("Merged matrices: ")
for (M in list(...)) {
cat(icount, " ")
cnold <- colnames(M)
rnold <- rownames(M)
cnnew <- union(cnnew,cnold)
rnnew <- union(rnnew,rnold)
cindnew <- match(cnold,cnnew)
rindnew <- match(rnold,rnnew)
ind <- unname(Matrix::which(M != 0,arr.ind=T))
i <- c(i,rindnew[ind[,1]])
j <- c(j,cindnew[ind[,2]])
x <- c(x,M@x)
icount <- icount +1
}
cat("\n")
sparseMatrix(i=i,j=j,x=x,dims=c(length(rnnew),length(cnnew)),dimnames=list(rnnew,cnnew))
}
##### New Funcs #####
read.h5 <- function(path, slot = "ft"){
H5 <- H5File$new(path, mode = "r")
mtx <- Matrix(
H5[[paste0("umi/", slot, "/counts")]][,],
dimnames = list(
H5[[paste0("umi/", slot, "/genes")]][],
H5[[paste0("umi/", slot, "/cells")]][]
),
sparse = T
)
H5$close_all()
return(mtx)
}
generate.data.list <- function(...){
dl <- list(...)
all.sample.names <- do.call("c", lapply(dl, function(x) unique(as.character(x$orig.ident))))
print(all.sample.names)
all.samples <- do.call("c",lapply(dl, function(dat){
lapply(unique(dat$orig.ident), function(x) dat@assays$RNA@counts[,dat$orig.ident == x])
}))
names(all.samples) <- all.sample.names
return(all.samples)
}
get.ensembl.names <- function(ids, mart = "mmusculus_gene_ensembl"){
library(biomaRt)
mart <- useMart("ensembl", mart)
getBM(
c("ensembl_gene_id",
"external_gene_name"),
"ensembl_gene_id",
values = ids,
mart=mart
)
}
get.ensembl.names.for.list <- function(data.list, mart = "mmusculus_gene_ensembl"){
get.ensembl.names(
unique(do.call("c", lapply(data.list, rownames))),
mart
)
}
filter.one2one <- function(data.list,
genes.meta,
translate.to.homolog = F,
return.gene.symbol = F){
keep.genes <- genes.meta[genes.meta[,5] == "ortholog_one2one", "Gene.stable.ID"]
for(n in names(data.list)){
d <- data.list[[n]]
old.n <- rownames(d)
d <- d[rownames(d) %in% keep.genes,]
if(nrow(d) == 0){
print(n)
print(head(old.n))
print(head(keep.genes))
stop("No genes kept!")
}
if(translate.to.homolog){
new.names <- genes.meta[match(rownames(d), genes.meta[,1]),3]
rownames(d) <- new.names
}
data.list[[n]] <- d
}
return(data.list)
}
translate.geneid.to.name <- function(data.list, genes.meta){
for(n in names(data.list)){
d <- data.list[[n]]
old.n <- rownames(d)
new.n <- as.character(genes.meta$Gene.name[match(old.n, genes.meta$Gene.stable.ID)])
new.n[is.na(new.n)] <- old.n[is.na(new.n)]
rownames(d) <- new.n
data.list[[n]] <- d
}
return(data.list)
}
get.exprs <- function(dl, gene){
exprs <- list()
for(i in 1:length(dl)){
nrm <- dl[[i]]
if(gene %in% rownames(nrm)){
tmp <- nrm[gene, ]
names(tmp) <- colnames(nrm)
exprs[[i]] <- tmp
}else{
tmp <- rep(0, ncol(nrm))
names(tmp) <- colnames(nrm)
exprs[[i]] <- tmp
}
}
exprs <- do.call("c", exprs)
return(exprs)
}
plot.exprs.3d <- function(coord, exprs, size=.1){
require(threejs)
require(RColorBrewer)
# tmp <- names(exprs)
exprs <- scale(exprs)
# names(exprs) <- tmp
exprs <- round(exprs, digits=2)
cols <- rev(colorRampPalette(brewer.pal(5, "Spectral"))(length(sort(unique(exprs)))))
coord <- coord[order(exprs), ]
exprs <- sort(exprs)
cols.vec <- cols[as.numeric(as.factor(exprs))]
# coord <- coord[names(exprs), ]
scatterplot3js(
coord[,1],
coord[,2],
coord[,3],
color = cols.vec,
size=size
)
}
plot.factor.3d <- function(coord, color, size=.1){
color <- as.factor(color)
require(threejs)
require(RColorBrewer)
cols <- colorRampPalette(brewer.pal(8, "Set2"))(length(levels(color)))
cols.vec <- cols[as.numeric(color)]
scatterplot3js(
coord[,1],
coord[,2],
coord[,3],
color = cols.vec,
size=size,
labels = as.character(color)
)
}
scale.batchwise <- function(umi, hvg, n.cores = 10){
cat("Preprocessing UMI matrices\n")
d.added <- pbmcapply::pbmclapply(umi, function(x){
x <- x[rownames(x) %in% hvg, ]
# print(dim(x))
add.genes <- hvg[!(hvg %in% rownames(x))]
# print(length(add.genes))
mg <- Matrix(0, nrow = length(add.genes), ncol = ncol(x))
rownames(mg) <- add.genes
colnames(mg) <- colnames(x)
# print(dim(mg))
out <- rbind(x, mg)
out <- t(t(out) / colSums(out))
return(out[hvg,])
}, mc.cores = 10, ignore.interactive = T)
cat("Scaling\n")
avgs <- lapply(d.added, rowMeans)
ref <- do.call(pmin, avgs)
rescale <- pbmcapply::pbmclapply(seq_along(avgs), function(i){
out <- ref/avgs[[i]]
out[!is.finite(out)] <- 0
return(out)
}, mc.cores = n.cores, ignore.interactive = T)
fin <- pbmcapply::pbmclapply(seq_along(d.added), function(i){
m <- d.added[[i]]
rs <- rescale[[i]]
Matrix(log1p(apply(m, 2, function(x) x * rs)), sparse = T)
}, mc.cores = n.cores, ignore.interactive = T)
cat("Correcting the matrix\n")
mtx <- t(do.call(cbind, fin))
mtx <- mtx[,apply(mtx, 2, sd) != 0]
# cat("z-scoring\n")
# z.score <- function(x) (x - mean(x)) / sd(x)
# mtx.zscore <- apply(mtx, 2, z.score)
mtx.zscore <- mtx
cat("PCA\n")
set.seed(1234)
pca <- irlba::prcomp_irlba(mtx.zscore, n=75, verbose = T, scale.=T, center=T)
rownames(pca$x) <- rownames(mtx.zscore)
rownames(pca$rotation) <- colnames(mtx.zscore)
cat("UMAP\n")
set.seed(1234)
um <- uwot::umap(pca$x, metric="cosine", min_dist = .1, n_neighbors = 15, verbose = T, n_components = 3)
rownames(um) <- rownames(mtx.zscore)
return(
list(
pca = pca,
umap = um
)
)
}
read_merged_data <- function(data){
library(hdf5r)
out <- list()
out$pca <- readRDS(file.path(data, "pca_model.rds"))
out$meta.data <- read_csv(file.path(data, "meta_data.csv"))
out$umap <- readRDS(file.path(data, "umap_model.rds"))
h5 <- H5File$new(file.path(data, "umi.hdf5"), mode="r")
cat("Reading UMI\n")
out$umi <- list()
for(n in names(h5)){
print(n)
out$umi[[n]] <- Matrix(
h5[[n]][["counts"]][,],
dimnames = list(
h5[[n]][["genes"]][],
h5[[n]][["cells"]][]
),
sparse = T
)
}
h5$close_all()
out$stage <- do.call("c", lapply(out$umi, function(x) str_split_fixed(colnames(x)[1], "_",4)[,3]))
return(out)
}
make.edgelist <- function(nn){
do.call(rbind, lapply(1:nrow(nn), function(i){
x <- nn[i,]
xf <- x[1]
y <- x[-1]
from <- rep(xf, length(y))
to <- y
cbind(from, to)
}))
}
get.clusters <- function(nn){
cat("Generating graph\n")
el <- make.edgelist(nn)
g <- simplify(graph_from_edgelist(el, directed = F))
cat("Cluster data\n")
cl <- cluster_louvain(g)
return(
list(
clusters = membership(cl),
graph = g
)
)
}
quick.split.cell.names <- function(cell.names, n=4){
str_split_fixed(cell.names, "_", n)
}
make.pseudobulk <- function(m, group.by){
group.by <- as.factor(group.by)
out <- tapply(colnames(m), group.by, function(i){
if(length(i) > 1){
Matrix::rowSums(m[,i])
}else{
m[,1]
}
})
out <- do.call(cbind, out)
return(out)
}
qsplit <- function(x, n, sep="_"){
str_split_fixed(x, sep, length(strsplit(x[1], sep)[[1]]))[,n]
}
scanpy.louvain <- function(umi.list, pca.coords, hvg = NULL, resolution = 1, n_pcs = 75L, n_neighbors = 30L){
library(reticulate)
use_condaenv("r-reticulate")
py_config()
cat("Clustering using scanpy\n")
sc <- import("scanpy")
if(is.list(umi.list)){
mat <- do.call(merge.sparse, umi.list)
}else{
mat <- umi.list
}
mat <- mat[,rownames(pca.coords)]
if(!is.null(hvg)){
mat <- mat[rownames(mat) %in% hvg, ]
}
adata <- sc$AnnData(X = t(mat),
obsm = list("PCA" = pca.coords))
cat("\tFinding neighbors\n")
sc$pp$neighbors(adata, use_rep = "PCA", n_neighbors = n_neighbors, n_pcs = n_pcs)
cat("\tRunning Louvain algorithm\n")
sc$tl$louvain(adata, resolution = resolution)
cl.orig <- adata$obs[,1]
cl <- paste0("c", cl.orig)
names(cl.orig) <- colnames(mat)
names(cl) <- colnames(mat)
return(cl.orig)
}
split.umi <- function(umi, batch.vec, prune = T){
umi.list <- tapply(colnames(umi), as.factor(batch.vec), function(i){
if(length(i) > 2){
out <- umi[,i]
if(prune){
out <- out[rowSums(out) > 0, ]
}
}else{
out <- NA
}
return(out)
})
umi.list <- umi.list[!is.na(umi.list)]
return(umi.list)
}
do.softmerge <- function(umi.list, features, n_pcs = 75, n_umaps = 3, subsample_pca = NULL, n_cores = 10, batchwise.zscore = T){
if(batchwise.zscore){
scl <- F
cntr <- F
}
cat("Processing a list of matrices with length", length(umi.list), ": ")
t.out <- pbmcapply::pbmclapply( umi.list, function(x){
sf <- colSums(x)
o <- Matrix(apply(t(t(x[rownames(x) %in% features, ]) / sf), 1, function(y){
z <- sqrt(y / mean(y))
z[is.na(z)] <- 0
return(z)
}),sparse=T)
return(o)
}, mc.cores = n_cores, ignore.interactive = T)
cat("\nDone preprocessing \n")
cat("Merging matrices\n")
t.mat <- do.call(merge.sparse, t.out)
#t.mat[is.na(t.mat)] <- 0
t.mat <- t.mat[,colSums(t.mat) > 0]
if(batchwise.zscore){
t.mat <- do.call(rbind, pbmcapply::pbmclapply(umi.list, function(u){
cells <- colnames(u)
tmp <- t.mat[cells, ]
tmp <- scale(tmp)
tmp[is.na(tmp)] <- 0
return(tmp)
}, mc.cores = n_cores, ignore.interactive = T))
}
print(dim(t.mat))
if(is.null(subsample_pca)){
cat("PCA\n")
set.seed(1234)
t.pca <- irlba::prcomp_irlba(t.mat, n=n_pcs, center = cntr, scale. = scl, verbose=T)
rownames(t.pca$x) <- rownames(t.mat)
rownames(t.pca$rotation) <- colnames(t.mat)
cat("UMAP\n")
set.seed(1234)
t.umap <- uwot::umap(t.pca$x, metric="cosine", n_neighbors = 30, min_dist=.1, n_components = n_umaps, verbose = T)
rownames(t.umap) <- rownames(t.mat)
}else{
sel.cells <- sample(rownames(t.mat), subsample_pca)
sub.mat <- t.mat[sel.cells, ]
sub.mat <- sub.mat[,colSums(sub.mat) > 0]
cat("Running PCA with a subsampled dataset of size: ", nrow(sub.mat), ncol(sub.mat), "\n")
set.seed(1234)
t.pca <- irlba::prcomp_irlba(sub.mat, n=n_pcs, center = cntr, scale. = scl, verbose = T)
rownames(t.pca$x) <- sel.cells
rownames(t.pca$rotation) <- colnames(sub.mat)
cat("UMAP\n")
set.seed(1234)
t.umap <- uwot::umap(t.pca$x, metric="cosine", n_neighbors = 30, min_dist=.1, n_components = n_umaps, verbose = T)
rownames(t.umap) <- sel.cells
}
return(list(
umap = t.umap,
pca = t.pca,
feats = colnames(t.mat)
))
}
sparse.columnNorm <- function (A)
{
if (class(A)[1] == "dgTMatrix") {
temp = summary(A)
A = sparseMatrix(i = temp[, 1], j = temp[, 2], x = temp[,
3])
}
A@x <- A@x/rep.int(Matrix::colSums(A), diff(A@p))
return(A)
}
write_matrix_h5 <- function(mat, file){
require(hdf5r)
H5 <- H5File$new(file, "w")
if(is(mat, "dgCMatrix")){
H5[["counts"]] <- summary(mat)
}else{
H5[["counts"]] <- mat
}
H5[["colnames"]] <- colnames(mat)
H5[["rownames"]] <- rownames(mat)
H5$close_all()
}
read_sparse_h5 <- function(file){
require(hdf5r)
H5 <- H5File$new(file, "r")
cnts <- H5[["counts"]][]
mtx <- sparseMatrix(i = cnts$i, j = cnts$j, x = cnts$x)
colnames(mtx) <- H5[["colnames"]][]
rownames(mtx) <- H5[["rownames"]][]
H5$close_all()
return(mtx)
}
#' GO enrichment of selected genes
#'
#' @param interesting.genes Genes to test enrichment for
#' @param gene.universe All genes to test against
#' @param gene2go.list
run.topGO <- function(interesting.genes, gene.universe, gene2go.list, n.out = 25){
require(topGO)
gene.list <- as.factor(as.numeric(gene.universe %in% interesting.genes))
names(gene.list) <- gene.universe
sampleGO <- new("topGOdata",
ontology = "BP",
allGenes = gene.list,
# geneSel = names(treecut.pt)[treecut.pt == 2],
annot = annFUN.gene2GO,
gene2GO = gene2go.list)
resultFisher <- runTest(sampleGO, algorithm = "classic", statistic = "fisher")
return(
GenTable(sampleGO,
classicFisher = resultFisher,
orderBy = "classicFisher", ranksOf = "classicFisher", topNodes = n.out)
)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment