# Download the 10x Genomics 5k PBMC (v3) filtered feature-barcode matrix into
# data/pbmc5k and unpack it there.
# `mkdir -p` creates data/pbmc5k (and data/ if needed) and does not fail when
# the directories already exist, so this step can be re-run safely.
# The commands are chained with `&&` so that a failure stops the sequence:
# nothing is downloaded into the wrong directory, and the tarball is only
# removed after it has been extracted successfully.
mkdir -p data/pbmc5k &&
  cd data/pbmc5k &&
  wget http://cf.10xgenomics.com/samples/cell-exp/3.0.2/5k_pbmc_v3/5k_pbmc_v3_filtered_feature_bc_matrix.tar.gz &&
  tar xvzf 5k_pbmc_v3_filtered_feature_bc_matrix.tar.gz &&
  rm 5k_pbmc_v3_filtered_feature_bc_matrix.tar.gz
Now, switch back to R and install the packages we are going to use in this workshop.
# Install the packages used in this workshop.
# Only packages that cannot be found are installed, so re-running this step
# does not re-download and reinstall everything each time.
workshop_pkgs <- c("tidyverse", "rmarkdown", "Seurat")
missing_pkgs <- workshop_pkgs[
  !vapply(workshop_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
Load the libraries.
# Attach the packages used in the rest of the analysis.
library(tidyverse)
library(Seurat)

# Read the 10x feature-barcode matrix from the extracted directory.
pbmc.data <- Read10X(
  data.dir = "data/pbmc5k/filtered_feature_bc_matrix/"
)

# Build the Seurat object from the raw (non-normalized) counts.
# Per the Seurat documentation, min.cells keeps features detected in at least
# that many cells, and min.features keeps cells with at least that many
# detected features.
pbmc <- CreateSeuratObject(
  counts = pbmc.data,
  project = "pbmc5k",
  min.cells = 3,
  min.features = 200
)

# Print a summary of the object.
pbmc
# Getting help: open the documentation page for a function
help("CreateSeuratObject")
If you want to know more about the Seurat object,
see the wiki at https://github.com/satijalab/seurat/wiki
# Let's look at three genes across the first thirty cells of the count matrix
pbmc.data[
  c("CD3D", "TCL1A", "MS4A1"),
  seq_len(30)
]
The `.` values in the matrix represent 0s (no molecules detected). Since most values in an scRNA-seq matrix are 0, Seurat uses a sparse-matrix representation whenever possible. This results in significant memory and speed savings for Drop-seq/inDrop/10x data.
# Look at the first rows of the object's metadata
pbmc@meta.data %>% head()

# Assigning with [[ adds a column to the object metadata, which is a handy
# place to keep QC statistics. Here we store, as "percent.mt", the result of
# PercentageFeatureSet() for features whose names match "^MT-".
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")

# The metadata now includes the new percent.mt column
head(pbmc@meta.data)
# Draw one violin plot per QC metric, laid out in three columns
VlnPlot(
  pbmc,
  features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
  ncol = 3
)

# Keep only cells with nFeature_RNA strictly between 200 and 5000 and
# percent.mt below 25
pbmc <- subset(
  pbmc,
  subset = nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mt < 25
)
and even this step
# Size in memory of the count matrix once converted to a regular dense matrix
dense.size <- pbmc.data %>%
  as.matrix() %>%
  object.size()
print(dense.size)