tomsing1/aggregate_normalize_center_counts.R

## aggregate_normalize_center_counts.R
library(edgeR)
library(Seurat)

# download example Seurat object (2000 genes, 10000 cells)
object <- local({
  temp_file <- tempfile(".rds")
  download.file("https://bioinf.wehi.edu.au/edgeR/UserGuideData/SeuratObj.rds",
                temp_file)
  readRDS(temp_file)
})

# There are 13 samples
table(object@meta.data$group)
# with cells assigned to 7 clusters
table(object@meta.data$seurat_clusters)

# let's define three arbitrary samples as the control group
ctrl <- c("N_0019_total", "N_0021_total", "N_0064_total")

##------ Using edgeR

# aggreate into pseudo-bulk counts by group and cluster
y <- Seurat2PB(object, sample="group", cluster="seurat_clusters")

# Seurat's group info is now in the  `sample` column
table(y$samples$sample)
y$samples$group <- factor(
  ifelse(y$samples$sample %in% ctrl, "Ctrl", "Treatment")
)

# TMM normalization
y <- normLibSizes(y)

# normalized counts
cpms <- cpm(y, normalized.lib.sizes = TRUE, log = TRUE)

# subtract the mean of the control samples (within each cluster) from each gene
centred_cpms <- lapply(unique(y$samples$cluster), \(n) {
  ctrl_group <- y$samples$group == "Ctrl"
  this_cluster <- y$samples$cluster == n
  cpms_ctrl <- rowMeans(cpms[, which(ctrl_group & this_cluster)])
  sweep(cpms[, which(this_cluster)], MARGIN = 1, STATS = cpms_ctrl, FUN = "-")
})
# Concatenate the matrices (one for each cluster) into a single matrix
centred_cpms <- do.call(cbind, centred_cpms)
# Restore the original column ordering
centred_cpms <- centred_cpms[, colnames(y)]
dim(centred_cpms)  # 13527 genes, 85 pseudo-bulk samples
	library(edgeR)
	library(Seurat)

	# download example Seurat object (2000 genes, 10000 cells)
	object <- local({
	temp_file <- tempfile(".rds")
	download.file("https://bioinf.wehi.edu.au/edgeR/UserGuideData/SeuratObj.rds",
	temp_file)
	readRDS(temp_file)
	})

	# There are 13 samples
	table(object@meta.data$group)
	# with cells assigned to 7 clusters
	table(object@meta.data$seurat_clusters)

	# let's define three arbitrary samples as the control group
	ctrl <- c("N_0019_total", "N_0021_total", "N_0064_total")

	##------ Using edgeR

	# aggreate into pseudo-bulk counts by group and cluster
	y <- Seurat2PB(object, sample="group", cluster="seurat_clusters")

	# Seurat's group info is now in the `sample` column
	table(y$samples$sample)
	y$samples$group <- factor(
	ifelse(y$samples$sample %in% ctrl, "Ctrl", "Treatment")
	)

	# TMM normalization
	y <- normLibSizes(y)

	# normalized counts
	cpms <- cpm(y, normalized.lib.sizes = TRUE, log = TRUE)

	# subtract the mean of the control samples (within each cluster) from each gene
	centred_cpms <- lapply(unique(y$samples$cluster), \(n) {
	ctrl_group <- y$samples$group == "Ctrl"
	this_cluster <- y$samples$cluster == n
	cpms_ctrl <- rowMeans(cpms[, which(ctrl_group & this_cluster)])
	sweep(cpms[, which(this_cluster)], MARGIN = 1, STATS = cpms_ctrl, FUN = "-")
	})
	# Concatenate the matrices (one for each cluster) into a single matrix
	centred_cpms <- do.call(cbind, centred_cpms)
	# Restore the original column ordering
	centred_cpms <- centred_cpms[, colnames(y)]
	dim(centred_cpms) # 13527 genes, 85 pseudo-bulk samples