Skip to content

Instantly share code, notes, and snippets.

@kevinrue
Created November 25, 2018 18:21
Show Gist options
  • Save kevinrue/f0f26e4585b075caf6420f5453976d01 to your computer and use it in GitHub Desktop.
Save kevinrue/f0f26e4585b075caf6420f5453976d01 to your computer and use it in GitHub Desktop.
Seurat - Guided Clustering Tutorial
# Adapted from https://satijalab.org/seurat/pbmc3k_tutorial.html on 2018-11-24
timestamp()
message("Started")
library(BiocFileCache)
library(Seurat)
library(dplyr)
# Download the data programmatically ----
bfc <- BiocFileCache()
pbmc3k_rname <- "pbmc3k_filtered_gene_bc_matrices.tar.gz"
if (!pbmc3k_rname %in% bfcinfo(bfc)[, "rname", drop=TRUE]) {
rpath <- bfcadd(
x = bfc,
rname = "pbmc3k_filtered_gene_bc_matrices.tar.gz",
fpath = "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz")
} else {
rpath <- subset(bfcinfo(bfc), pbmc3k_rname == rname, "rpath", drop=TRUE)
}
tempDir <- tempdir()
untar(rpath, exdir = tempDir)
# Summarized Seurat tutorial ----
# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = file.path(tempDir, "filtered_gene_bc_matrices", "hg19"))
# Initialize the Seurat object with the raw (non-normalized data). Keep all
# genes expressed in >= 3 cells (~0.1% of the data). Keep all cells with at
# least 200 detected genes
pbmc <- CreateSeuratObject(raw.data = pbmc.data, min.cells = 3, min.genes = 200, project = "10X_PBMC")
# The number of genes and UMIs (nGene and nUMI) are automatically calculated
# for every object by Seurat. For non-UMI data, nUMI represents the sum of
# the non-normalized values within a cell We calculate the percentage of
# mitochondrial genes here and store it in percent.mito using AddMetaData.
# We use object@raw.data since this represents non-transformed and
# non-log-normalized counts The % of UMI mapping to MT-genes is a common
# scRNA-seq QC metric.
mito.genes <- grep(pattern = "^MT-", x = rownames(x = pbmc@data), value = TRUE)
percent.mito <- Matrix::colSums(pbmc@raw.data[mito.genes, ])/Matrix::colSums(pbmc@raw.data)
# AddMetaData adds columns to object@meta.data, and is a great place to
# stash QC stats
pbmc <- AddMetaData(object = pbmc, metadata = percent.mito, col.name = "percent.mito")
# We filter out cells that have unique gene counts over 2,500 or less than
# 200 Note that low.thresholds and high.thresholds are used to define a
# 'gate'. -Inf and Inf should be used if you don't want a lower or upper
# threshold.
pbmc <- FilterCells(object = pbmc, subset.names = c("nGene", "percent.mito"),
low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))
pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize",
scale.factor = 10000)
pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR,
x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)
pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito"))
pbmc <- RunPCA(object = pbmc, pc.genes = pbmc@var.genes, do.print = TRUE, pcs.print = 1:5,
genes.print = 5)
# save.SNN = T saves the SNN so that the clustering algorithm can be rerun
# using the same graph but with a different resolution value (see docs for
# full details)
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10,
resolution = 0.6, print.output = 0, save.SNN = TRUE)
pbmc <- RunTSNE(object = pbmc, dims.use = 1:10, do.fast = TRUE)
# Save the preprocessed Seurat object to BiocFileCache ----
tempFile <- tempfile(fileext = ".rds")
saveRDS(pbmc, tempFile)
seurat_pbmc3k_rname <- "seurat_pbmc3k"
if (!seurat_pbmc3k_rname %in% bfcinfo(bfc)[, "rname", drop=TRUE]) {
rpath <- bfcadd(
x = bfc,
rname = seurat_pbmc3k_rname,
fpath = tempFile)
} else {
rpath <- subset(bfcinfo(bfc), seurat_pbmc3k_rname == rname, "rpath", drop=TRUE)
}
# Save the SingleCellExperiment object to BiocFileCache ----
sce <- as.SingleCellExperiment(pbmc)
tempFile <- tempfile(fileext = ".rds")
saveRDS(sce, tempFile)
sce_pbmc3k_rname <- "sce_pbmc3k"
if (!sce_pbmc3k_rname %in% bfcinfo(bfc)[, "rname", drop=TRUE]) {
rpath <- bfcadd(
x = bfc,
rname = sce_pbmc3k_rname,
fpath = tempFile)
} else {
rpath <- subset(bfcinfo(bfc), sce_pbmc3k_rname == rname, "rpath", drop=TRUE)
}
message("Completed")
timestamp()
## Markers and cell type annotations by cluster
# Cluster ID Markers Cell Type
# 0 IL7R CD4 T cells
# 1 CD14, LYZ CD14+ Monocytes
# 2 MS4A1 B cells
# 3 CD8A CD8 T cells
# 4 FCGR3A, MS4A7 FCGR3A+ Monocytes
# 5 GNLY, NKG7 NK cells
# 6 FCER1A, CST3 Dendritic Cells
# 7 PPBP Megakaryocytes
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment