mikelove/osca_plots.R

## osca_plots.R
library(SingleCellExperiment)
sce <- readRDS("sce.rds")
labels <- readRDS("cellLabels.rds")

sce <- sce[,names(labels)]
colLabels(sce) <- labels

table(labels)
assayNames(sce)
total_count <- rowSums(assay(sce))
hist(log10(total_count+1))
hist(round(total_count[total_count < 10]))
mcols(sce)$total_count <- total_count # save for later in mcols = metadata columns
sce <- sce[mcols(sce)$total_count > 2,] # arbitrary - drop genes with super low counts

# some basic steps for plotting in Bioc
library(scran)
set.seed(1)
clust0 <- quickCluster(sce)
sce <- computeSumFactors(sce, cluster=clust0)
sce <- logNormCounts(sce)
save(sce, file="log_normed_sce.rda")

# PCA
vardf <- modelGeneVar(sce)
top <- getTopHVGs(vardf, n=2000)
set.seed(1)
sce <- fixedPCA(sce, subset.row=top)
library(scater)
# make less labels just for plotting...
sce$label_less <- factor(ifelse(as.numeric(colLabels(sce)) > 8, "else",
                                as.character(colLabels(sce))))
table(sce$label_less)
plotReducedDim(sce, dimred="PCA", colour_by="label_less")

# UMAP
set.seed(1)
sce <- runUMAP(sce, dimred="PCA")
plotReducedDim(sce, dimred="UMAP", colour_by="label_less")

# look at simple plots
assay(sce, "allele") <- assay(sce, "ratio") * assay(sce, "counts")

# look at bulk ratios
mcols(sce)$bulk_ratio <- rowSums(assay(sce, "allele")) / mcols(sce)$total_count
myhist <- function(x, brks, min_count) {
  # idx could instead by the names of housekeeping genes
  idx <- mcols(sce)$total_count >= min_count
  with(mcols(sce)[idx,],
       hist(bulk_ratio, breaks=brks, col="grey50", border="white"))
}
myhist(sce, 100, 0)
myhist(sce, 100, 10)
myhist(sce, 100, 20)

library(ggplot2)
dat <- as.data.frame(mcols(sce))
ggplot(dat, aes(total_count, bulk_ratio)) +
  geom_point(alpha=0.3) +
  geom_hline(yintercept=0.5,col="red") +
  scale_x_log10()

library(dplyr)
dat %>% filter(total_count > 1e4, abs(bulk_ratio - 0.5) < .05)

plotReducedDim(sce, dimred="UMAP", colour_by="chinmo")

# but want to find a gene with expression in most clusters
# compute cluster-level total counts and allelic counts
for (lvl in levels(sce$label_less)) {
  idx <- sce$label_less %in% lvl
  clst_total <- rowSums(assay(sce)[,idx])
  clst_allele <- rowSums(assay(sce, "allele")[,idx])
  mcols(sce)[[paste0("clst_total_",lvl)]] <- clst_total
  mcols(sce)[[paste0("clst_ratio_",lvl)]] <- clst_allele / clst_total
}

# do some logic on the cluster total counts
library(tibble)
dat <- as.data.frame(mcols(sce)) %>%
  rownames_to_column("gene") %>%
  tibble()
# genes where each cluster has a count of 1000 or more...
dat %>% filter_at(vars(starts_with("clst_total")), all_vars(. >= 1e3))

plotReducedDim(sce, dimred="UMAP", colour_by="label_less")
plotReducedDim(sce, dimred="UMAP", colour_by="hth")
plotReducedDim(sce, dimred="UMAP", colour_by="hth", by_exprs_values="ratio")

ggplot(dat, aes(clst_total_4, clst_ratio_4)) +
  geom_point(alpha=0.3) +
  geom_hline(yintercept=0.5,col="red") +
  scale_x_log10()
	library(SingleCellExperiment)
	sce <- readRDS("sce.rds")
	labels <- readRDS("cellLabels.rds")

	sce <- sce[,names(labels)]
	colLabels(sce) <- labels

	table(labels)
	assayNames(sce)
	total_count <- rowSums(assay(sce))
	hist(log10(total_count+1))
	hist(round(total_count[total_count < 10]))
	mcols(sce)$total_count <- total_count # save for later in mcols = metadata columns
	sce <- sce[mcols(sce)$total_count > 2,] # arbitrary - drop genes with super low counts

	# some basic steps for plotting in Bioc
	library(scran)
	set.seed(1)
	clust0 <- quickCluster(sce)
	sce <- computeSumFactors(sce, cluster=clust0)
	sce <- logNormCounts(sce)
	save(sce, file="log_normed_sce.rda")

	# PCA
	vardf <- modelGeneVar(sce)
	top <- getTopHVGs(vardf, n=2000)
	set.seed(1)
	sce <- fixedPCA(sce, subset.row=top)
	library(scater)
	# make less labels just for plotting...
	sce$label_less <- factor(ifelse(as.numeric(colLabels(sce)) > 8, "else",
	as.character(colLabels(sce))))
	table(sce$label_less)
	plotReducedDim(sce, dimred="PCA", colour_by="label_less")

	# UMAP
	set.seed(1)
	sce <- runUMAP(sce, dimred="PCA")
	plotReducedDim(sce, dimred="UMAP", colour_by="label_less")

	# look at simple plots
	assay(sce, "allele") <- assay(sce, "ratio") * assay(sce, "counts")

	# look at bulk ratios
	mcols(sce)$bulk_ratio <- rowSums(assay(sce, "allele")) / mcols(sce)$total_count
	myhist <- function(x, brks, min_count) {
	# idx could instead by the names of housekeeping genes
	idx <- mcols(sce)$total_count >= min_count
	with(mcols(sce)[idx,],
	hist(bulk_ratio, breaks=brks, col="grey50", border="white"))
	}
	myhist(sce, 100, 0)
	myhist(sce, 100, 10)
	myhist(sce, 100, 20)

	library(ggplot2)
	dat <- as.data.frame(mcols(sce))
	ggplot(dat, aes(total_count, bulk_ratio)) +
	geom_point(alpha=0.3) +
	geom_hline(yintercept=0.5,col="red") +
	scale_x_log10()

	library(dplyr)
	dat %>% filter(total_count > 1e4, abs(bulk_ratio - 0.5) < .05)

	plotReducedDim(sce, dimred="UMAP", colour_by="chinmo")

	# but want to find a gene with expression in most clusters
	# compute cluster-level total counts and allelic counts
	for (lvl in levels(sce$label_less)) {
	idx <- sce$label_less %in% lvl
	clst_total <- rowSums(assay(sce)[,idx])
	clst_allele <- rowSums(assay(sce, "allele")[,idx])
	mcols(sce)[[paste0("clst_total_",lvl)]] <- clst_total
	mcols(sce)[[paste0("clst_ratio_",lvl)]] <- clst_allele / clst_total
	}

	# do some logic on the cluster total counts
	library(tibble)
	dat <- as.data.frame(mcols(sce)) %>%
	rownames_to_column("gene") %>%
	tibble()
	# genes where each cluster has a count of 1000 or more...
	dat %>% filter_at(vars(starts_with("clst_total")), all_vars(. >= 1e3))

	plotReducedDim(sce, dimred="UMAP", colour_by="label_less")
	plotReducedDim(sce, dimred="UMAP", colour_by="hth")
	plotReducedDim(sce, dimred="UMAP", colour_by="hth", by_exprs_values="ratio")

	ggplot(dat, aes(clst_total_4, clst_ratio_4)) +
	geom_point(alpha=0.3) +
	geom_hline(yintercept=0.5,col="red") +
	scale_x_log10()