kevinrue/Seurat_PBMC3k_tutorial.R

## Seurat_PBMC3k_tutorial.R

# Adapted from https://satijalab.org/seurat/pbmc3k_tutorial.html on 2018-11-24

timestamp()
message("Started")

library(BiocFileCache)
library(Seurat)
library(dplyr)

# Download the data programmatically ----
bfc <- BiocFileCache()
pbmc3k_rname <- "pbmc3k_filtered_gene_bc_matrices.tar.gz"
if (!pbmc3k_rname %in% bfcinfo(bfc)[, "rname", drop=TRUE]) {
    rpath <- bfcadd(
        x = bfc,
        rname = "pbmc3k_filtered_gene_bc_matrices.tar.gz",
        fpath = "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz")
} else {
    rpath <- subset(bfcinfo(bfc), pbmc3k_rname == rname, "rpath", drop=TRUE)
}

tempDir <- tempdir()
untar(rpath, exdir = tempDir)

# Summarized Seurat tutorial ----

# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = file.path(tempDir, "filtered_gene_bc_matrices", "hg19"))

# Initialize the Seurat object with the raw (non-normalized data).  Keep all
# genes expressed in >= 3 cells (~0.1% of the data). Keep all cells with at
# least 200 detected genes
pbmc <- CreateSeuratObject(raw.data = pbmc.data, min.cells = 3, min.genes = 200, project = "10X_PBMC")

# The number of genes and UMIs (nGene and nUMI) are automatically calculated
# for every object by Seurat.  For non-UMI data, nUMI represents the sum of
# the non-normalized values within a cell We calculate the percentage of
# mitochondrial genes here and store it in percent.mito using AddMetaData.
# We use object@raw.data since this represents non-transformed and
# non-log-normalized counts The % of UMI mapping to MT-genes is a common
# scRNA-seq QC metric.
mito.genes <- grep(pattern = "^MT-", x = rownames(x = pbmc@data), value = TRUE)
percent.mito <- Matrix::colSums(pbmc@raw.data[mito.genes, ])/Matrix::colSums(pbmc@raw.data)

# AddMetaData adds columns to object@meta.data, and is a great place to
# stash QC stats
pbmc <- AddMetaData(object = pbmc, metadata = percent.mito, col.name = "percent.mito")

# We filter out cells that have unique gene counts over 2,500 or less than
# 200 Note that low.thresholds and high.thresholds are used to define a
# 'gate'.  -Inf and Inf should be used if you don't want a lower or upper
# threshold.
pbmc <- FilterCells(object = pbmc, subset.names = c("nGene", "percent.mito"),
                    low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))

pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize",
                      scale.factor = 10000)

pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR,
                          x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)

pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito"))

pbmc <- RunPCA(object = pbmc, pc.genes = pbmc@var.genes, do.print = TRUE, pcs.print = 1:5,
               genes.print = 5)

# save.SNN = T saves the SNN so that the clustering algorithm can be rerun
# using the same graph but with a different resolution value (see docs for
# full details)
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10,
                     resolution = 0.6, print.output = 0, save.SNN = TRUE)

pbmc <- RunTSNE(object = pbmc, dims.use = 1:10, do.fast = TRUE)

# Save the preprocessed Seurat object to BiocFileCache ----

tempFile <- tempfile(fileext = ".rds")
saveRDS(pbmc, tempFile)

seurat_pbmc3k_rname <- "seurat_pbmc3k"
if (!seurat_pbmc3k_rname %in% bfcinfo(bfc)[, "rname", drop=TRUE]) {
    rpath <- bfcadd(
        x = bfc,
        rname = seurat_pbmc3k_rname,
        fpath = tempFile)
} else {
    rpath <- subset(bfcinfo(bfc), seurat_pbmc3k_rname == rname, "rpath", drop=TRUE)
}

# Save the SingleCellExperiment object to BiocFileCache ----

sce <- as.SingleCellExperiment(pbmc)

tempFile <- tempfile(fileext = ".rds")
saveRDS(sce, tempFile)

sce_pbmc3k_rname <- "sce_pbmc3k"
if (!sce_pbmc3k_rname %in% bfcinfo(bfc)[, "rname", drop=TRUE]) {
    rpath <- bfcadd(
        x = bfc,
        rname = sce_pbmc3k_rname,
        fpath = tempFile)
} else {
    rpath <- subset(bfcinfo(bfc), sce_pbmc3k_rname == rname, "rpath", drop=TRUE)
}

message("Completed")
timestamp()

## Markers and cell type annotations by cluster
# Cluster ID	Markers	        Cell Type
# 0             IL7R	        CD4 T cells
# 1             CD14, LYZ	    CD14+ Monocytes
# 2	            MS4A1           B cells
# 3	            CD8A	        CD8 T cells
# 4	            FCGR3A, MS4A7	FCGR3A+ Monocytes
# 5	            GNLY, NKG7	    NK cells
# 6	            FCER1A, CST3	Dendritic Cells
# 7	            PPBP	        Megakaryocytes

	# Adapted from https://satijalab.org/seurat/pbmc3k_tutorial.html on 2018-11-24

	timestamp()
	message("Started")

	library(BiocFileCache)
	library(Seurat)
	library(dplyr)

	# Download the data programmatically ----
	bfc <- BiocFileCache()
	pbmc3k_rname <- "pbmc3k_filtered_gene_bc_matrices.tar.gz"
	if (!pbmc3k_rname %in% bfcinfo(bfc)[, "rname", drop=TRUE]) {
	rpath <- bfcadd(
	x = bfc,
	rname = "pbmc3k_filtered_gene_bc_matrices.tar.gz",
	fpath = "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz")
	} else {
	rpath <- subset(bfcinfo(bfc), pbmc3k_rname == rname, "rpath", drop=TRUE)
	}

	tempDir <- tempdir()
	untar(rpath, exdir = tempDir)

	# Summarized Seurat tutorial ----

	# Load the PBMC dataset
	pbmc.data <- Read10X(data.dir = file.path(tempDir, "filtered_gene_bc_matrices", "hg19"))

	# Initialize the Seurat object with the raw (non-normalized data). Keep all
	# genes expressed in >= 3 cells (~0.1% of the data). Keep all cells with at
	# least 200 detected genes
	pbmc <- CreateSeuratObject(raw.data = pbmc.data, min.cells = 3, min.genes = 200, project = "10X_PBMC")

	# The number of genes and UMIs (nGene and nUMI) are automatically calculated
	# for every object by Seurat. For non-UMI data, nUMI represents the sum of
	# the non-normalized values within a cell We calculate the percentage of
	# mitochondrial genes here and store it in percent.mito using AddMetaData.
	# We use object@raw.data since this represents non-transformed and
	# non-log-normalized counts The % of UMI mapping to MT-genes is a common
	# scRNA-seq QC metric.
	mito.genes <- grep(pattern = "^MT-", x = rownames(x = pbmc@data), value = TRUE)
	percent.mito <- Matrix::colSums(pbmc@raw.data[mito.genes, ])/Matrix::colSums(pbmc@raw.data)

	# AddMetaData adds columns to object@meta.data, and is a great place to
	# stash QC stats
	pbmc <- AddMetaData(object = pbmc, metadata = percent.mito, col.name = "percent.mito")

	# We filter out cells that have unique gene counts over 2,500 or less than
	# 200 Note that low.thresholds and high.thresholds are used to define a
	# 'gate'. -Inf and Inf should be used if you don't want a lower or upper
	# threshold.
	pbmc <- FilterCells(object = pbmc, subset.names = c("nGene", "percent.mito"),
	low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))

	pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize",
	scale.factor = 10000)

	pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR,
	x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)

	pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito"))

	pbmc <- RunPCA(object = pbmc, pc.genes = pbmc@var.genes, do.print = TRUE, pcs.print = 1:5,
	genes.print = 5)

	# save.SNN = T saves the SNN so that the clustering algorithm can be rerun
	# using the same graph but with a different resolution value (see docs for
	# full details)
	pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10,
	resolution = 0.6, print.output = 0, save.SNN = TRUE)

	pbmc <- RunTSNE(object = pbmc, dims.use = 1:10, do.fast = TRUE)

	# Save the preprocessed Seurat object to BiocFileCache ----

	tempFile <- tempfile(fileext = ".rds")
	saveRDS(pbmc, tempFile)

	seurat_pbmc3k_rname <- "seurat_pbmc3k"
	if (!seurat_pbmc3k_rname %in% bfcinfo(bfc)[, "rname", drop=TRUE]) {
	rpath <- bfcadd(
	x = bfc,
	rname = seurat_pbmc3k_rname,
	fpath = tempFile)
	} else {
	rpath <- subset(bfcinfo(bfc), seurat_pbmc3k_rname == rname, "rpath", drop=TRUE)
	}

	# Save the SingleCellExperiment object to BiocFileCache ----

	sce <- as.SingleCellExperiment(pbmc)

	tempFile <- tempfile(fileext = ".rds")
	saveRDS(sce, tempFile)

	sce_pbmc3k_rname <- "sce_pbmc3k"
	if (!sce_pbmc3k_rname %in% bfcinfo(bfc)[, "rname", drop=TRUE]) {
	rpath <- bfcadd(
	x = bfc,
	rname = sce_pbmc3k_rname,
	fpath = tempFile)
	} else {
	rpath <- subset(bfcinfo(bfc), sce_pbmc3k_rname == rname, "rpath", drop=TRUE)
	}

	message("Completed")
	timestamp()

	## Markers and cell type annotations by cluster
	# Cluster ID Markers Cell Type
	# 0 IL7R CD4 T cells
	# 1 CD14, LYZ CD14+ Monocytes
	# 2 MS4A1 B cells
	# 3 CD8A CD8 T cells
	# 4 FCGR3A, MS4A7 FCGR3A+ Monocytes
	# 5 GNLY, NKG7 NK cells
	# 6 FCER1A, CST3 Dendritic Cells
	# 7 PPBP Megakaryocytes