dosorio/SC-8-1.R

## SC-8-1.R
setwd("../Downloads/")
# BiocManager::install(c('TFBSTools', 'ggbio', 'motifmatchr',
#                        'BSgenome','BSgenome.Hsapiens.UCSC.hg19',
#                        'EnsDb.Hsapiens.v75','hdf5r'))
# devtools::install_github("timoast/signac")
library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(EnsDb.Hsapiens.v75)
library(ggplot2)
set.seed(1234)

# Files downloaded from https://support.10xgenomics.com/single-cell-atac/datasets/1.1.0/atac_v1_pbmc_10k
download.file("http://cf.10xgenomics.com/samples/cell-atac/1.1.0/atac_v1_pbmc_10k/atac_v1_pbmc_10k_filtered_peak_bc_matrix.h5", "atacCounts.h5", method = 'curl')
download.file("http://cf.10xgenomics.com/samples/cell-atac/1.1.0/atac_v1_pbmc_10k/atac_v1_pbmc_10k_singlecell.csv", "atacMetadata.csv", method = 'curl')
download.file("http://cf.10xgenomics.com/samples/cell-atac/1.1.0/atac_v1_pbmc_10k/atac_v1_pbmc_10k_fragments.tsv.gz", "atacFragments.tsv.gz", method = 'curl')
download.file("http://cf.10xgenomics.com/samples/cell-atac/1.1.0/atac_v1_pbmc_10k/atac_v1_pbmc_10k_fragments.tsv.gz.tbi", "atacFragments.tsv.gz.tbi", method = 'curl')

# Loading data
counts <- Read10X_h5("atacCounts.h5")
metadata <- read.csv("atacMetadata.csv", header = TRUE, row.names = 1)

# Converting into Seurat Objects
PBMC <- CreateSeuratObject(counts = counts,
                           assay = "peaks",
                           project = "ATAC",
                           meta.data = metadata)
PBMC <- SetFragments(object = PBMC,file = "atacFragments.tsv.gz")

# QC
PBMC <- NucleosomeSignal(PBMC)
PBMC$pctReadsPeaks <- PBMC$peak_region_fragments/PBMC$total
PBMC$blackListRatio <- PBMC$blacklist_region_fragments/PBMC$peak_region_fragments
PBMC$nucleosome_signal[is.infinite(PBMC$nucleosome_signal)] <- max(PBMC$nucleosome_signal[is.finite(PBMC$nucleosome_signal)])
P1 <- VlnPlot(object = PBMC, features = c("pctReadsPeaks", "blackListRatio", "nucleosome_signal"), pt.size = 0.1)
P2A <- VlnPlot(object = PBMC,features = 'peak_region_fragments',pt.size = 0.1, log = TRUE) + NoLegend()
P2B <- FeatureScatter(PBMC,"peak_region_fragments",'nucleosome_signal', pt.size = 0.1) + NoLegend()
P2C <- FeatureScatter(PBMC,"peak_region_fragments",'blackListRatio', pt.size = 0.1) + NoLegend()
P2 <- CombinePlots(plots = list(P2A, P2B, P2C), ncol = 3)
CombinePlots(list(P1,P2),ncol = 1)

PBMC$nucGroup <- ifelse(PBMC$nucleosome_signal > 5, "NS > 5", "NS < 5")
PeriodPlot(PBMC, group.by = "nucGroup")

# Filtering
PBMC <- subset(x = PBMC, subset= peak_region_fragments > 1000 &
         peak_region_fragments < 20000 &
         pctReadsPeaks > 0.2 &
         blackListRatio < 0.05 &
         nucleosome_signal < 10)

# Normalization
PBMC <- RunTFIDF(PBMC)
PBMC <- FindTopFeatures(PBMC, min.cutoff = "q0")

# Dimentional Reduction
PBMC <- RunSVD(object = PBMC, assay = "peaks",
               reduction.key = "LSI_",
               reduction.name = "lsi")

PBMC <- RunUMAP(PBMC, reduction = "lsi", dims = 1:30)
PBMC <- FindNeighbors(PBMC, reduction = "lsi", dims = 1:30)
PBMC <- FindClusters(PBMC, resolution = 0.1)
UMAPPlot(PBMC, label = TRUE) + NoLegend()


# Extract gene coordinates from Ensembl, and ensure name formatting is consistent with Seurat object
gene.coords <- genes(EnsDb.Hsapiens.v75, filter = ~ gene_biotype == "protein_coding")
seqlevelsStyle(gene.coords) <- 'UCSC'
genebody.coords <- keepStandardChromosomes(gene.coords, pruning.mode = 'coarse')
genebodyandpromoter.coords <- Extend(x = gene.coords, upstream = 2000, downstream = 0)

# Create a gene by cell matrix
gene.activities <- FeatureMatrix(
  fragments = "atacFragments.tsv.gz",
  features = genebodyandpromoter.coords,
  cells = colnames(PBMC),
  chunk = 10
)

# convert rownames from chromsomal coordinates into gene names
gene.key <- genebodyandpromoter.coords$gene_name
names(gene.key) <- GRangesToString(grange = genebodyandpromoter.coords)
rownames(gene.activities) <- gene.key[rownames(gene.activities)]

# add the gene activity matrix to the Seurat object as a new assay, and normalize it
PBMC[['RNA']] <- CreateAssayObject(counts = gene.activities)
PBMC <- NormalizeData(
  object = PBMC,
  assay = 'RNA',
  normalization.method = 'LogNormalize',
  scale.factor = median(PBMC$nCount_RNA)
)

# Setting the default
DefaultAssay(PBMC) <- 'RNA'

# Plotting marker genes
FeaturePlot(
  object = PBMC,
  features = c('MS4A1', 'CD3D', 'LEF1', 'NKG7', 'TREM1', 'LYZ'),
  pt.size = 0.1,
  max.cutoff = 'q95',
  ncol = 3
)

# RNA-seq
# Download the file from https://www.dropbox.com/s/zn6khirjafoyyxl/pbmc_10k_v3.rds?dl=0
PBMC_rna <- readRDS("pbmc_10k_v3.rds")


# Applying CCA
transfer.anchors <- FindTransferAnchors(
  reference = PBMC_rna,
  query = PBMC,
  reduction = 'cca'
)

# Transfering labels
predicted.labels <- TransferData(
  anchorset = transfer.anchors,
  refdata = PBMC_rna$celltype,
  weight.reduction = PBMC[['lsi']]
)
PBMC <- AddMetaData(object = PBMC, metadata = predicted.labels)

#Plotting
P1 <- DimPlot(
  object = PBMC_rna,
  group.by = 'celltype',
  label = TRUE,
  repel = TRUE) + NoLegend() + ggtitle('scRNA-seq')

P2 <- DimPlot(
  object = PBMC,
  group.by = 'predicted.id',
  label = TRUE,
  repel = TRUE) + NoLegend() + ggtitle('scATAC-seq')
CombinePlots(list(P1,P2), ncol = 2)


# Differences in peaks
DefaultAssay(PBMC) <- 'peaks'
Idents(PBMC) <- PBMC$predicted.id
dePeaks <- FindMarkers(
  object = PBMC,
  ident.1 = "CD14+ Monocytes",
  ident.2 = "CD16+ Monocytes",
  min.pct = 0.2,
  test.use = 'LR',
  latent.vars = 'peak_region_fragments'
)

# Plotting the differences
plot1 <- VlnPlot(
  object = PBMC,
  features = rownames(dePeaks)[1],
  ncol = 3,
  pt.size = 0.1
) + NoLegend()

plot2 <- FeaturePlot(
  object = PBMC,
  features = rownames(dePeaks)[1],
  ncol = 3,
  pt.size = 0.1
)
CombinePlots(list(plot1,plot2))

# Plotting the peaks
CoveragePlot(
  object = PBMC,
  region = rownames(dePeaks)[1],
  sep = c(":", "-"),
  annotation = EnsDb.Hsapiens.v75,
  extend.upstream = 20000,
  extend.downstream = 20000,
  ncol = 1
)
	setwd("../Downloads/")
	# BiocManager::install(c('TFBSTools', 'ggbio', 'motifmatchr',
	# 'BSgenome','BSgenome.Hsapiens.UCSC.hg19',
	# 'EnsDb.Hsapiens.v75','hdf5r'))
	# devtools::install_github("timoast/signac")
	library(Signac)
	library(Seurat)
	library(GenomeInfoDb)
	library(EnsDb.Hsapiens.v75)
	library(ggplot2)
	set.seed(1234)

	# Files downloaded from https://support.10xgenomics.com/single-cell-atac/datasets/1.1.0/atac_v1_pbmc_10k
	download.file("http://cf.10xgenomics.com/samples/cell-atac/1.1.0/atac_v1_pbmc_10k/atac_v1_pbmc_10k_filtered_peak_bc_matrix.h5", "atacCounts.h5", method = 'curl')
	download.file("http://cf.10xgenomics.com/samples/cell-atac/1.1.0/atac_v1_pbmc_10k/atac_v1_pbmc_10k_singlecell.csv", "atacMetadata.csv", method = 'curl')
	download.file("http://cf.10xgenomics.com/samples/cell-atac/1.1.0/atac_v1_pbmc_10k/atac_v1_pbmc_10k_fragments.tsv.gz", "atacFragments.tsv.gz", method = 'curl')
	download.file("http://cf.10xgenomics.com/samples/cell-atac/1.1.0/atac_v1_pbmc_10k/atac_v1_pbmc_10k_fragments.tsv.gz.tbi", "atacFragments.tsv.gz.tbi", method = 'curl')

	# Loading data
	counts <- Read10X_h5("atacCounts.h5")
	metadata <- read.csv("atacMetadata.csv", header = TRUE, row.names = 1)

	# Converting into Seurat Objects
	PBMC <- CreateSeuratObject(counts = counts,
	assay = "peaks",
	project = "ATAC",
	meta.data = metadata)
	PBMC <- SetFragments(object = PBMC,file = "atacFragments.tsv.gz")

	# QC
	PBMC <- NucleosomeSignal(PBMC)
	PBMC$pctReadsPeaks <- PBMC$peak_region_fragments/PBMC$total
	PBMC$blackListRatio <- PBMC$blacklist_region_fragments/PBMC$peak_region_fragments
	PBMC$nucleosome_signal[is.infinite(PBMC$nucleosome_signal)] <- max(PBMC$nucleosome_signal[is.finite(PBMC$nucleosome_signal)])
	P1 <- VlnPlot(object = PBMC, features = c("pctReadsPeaks", "blackListRatio", "nucleosome_signal"), pt.size = 0.1)
	P2A <- VlnPlot(object = PBMC,features = 'peak_region_fragments',pt.size = 0.1, log = TRUE) + NoLegend()
	P2B <- FeatureScatter(PBMC,"peak_region_fragments",'nucleosome_signal', pt.size = 0.1) + NoLegend()
	P2C <- FeatureScatter(PBMC,"peak_region_fragments",'blackListRatio', pt.size = 0.1) + NoLegend()
	P2 <- CombinePlots(plots = list(P2A, P2B, P2C), ncol = 3)
	CombinePlots(list(P1,P2),ncol = 1)

	PBMC$nucGroup <- ifelse(PBMC$nucleosome_signal > 5, "NS > 5", "NS < 5")
	PeriodPlot(PBMC, group.by = "nucGroup")

	# Filtering
	PBMC <- subset(x = PBMC, subset= peak_region_fragments > 1000 &
	peak_region_fragments < 20000 &
	pctReadsPeaks > 0.2 &
	blackListRatio < 0.05 &
	nucleosome_signal < 10)

	# Normalization
	PBMC <- RunTFIDF(PBMC)
	PBMC <- FindTopFeatures(PBMC, min.cutoff = "q0")

	# Dimentional Reduction
	PBMC <- RunSVD(object = PBMC, assay = "peaks",
	reduction.key = "LSI_",
	reduction.name = "lsi")

	PBMC <- RunUMAP(PBMC, reduction = "lsi", dims = 1:30)
	PBMC <- FindNeighbors(PBMC, reduction = "lsi", dims = 1:30)
	PBMC <- FindClusters(PBMC, resolution = 0.1)
	UMAPPlot(PBMC, label = TRUE) + NoLegend()


	# Extract gene coordinates from Ensembl, and ensure name formatting is consistent with Seurat object
	gene.coords <- genes(EnsDb.Hsapiens.v75, filter = ~ gene_biotype == "protein_coding")
	seqlevelsStyle(gene.coords) <- 'UCSC'
	genebody.coords <- keepStandardChromosomes(gene.coords, pruning.mode = 'coarse')
	genebodyandpromoter.coords <- Extend(x = gene.coords, upstream = 2000, downstream = 0)

	# Create a gene by cell matrix
	gene.activities <- FeatureMatrix(
	fragments = "atacFragments.tsv.gz",
	features = genebodyandpromoter.coords,
	cells = colnames(PBMC),
	chunk = 10
	)

	# convert rownames from chromsomal coordinates into gene names
	gene.key <- genebodyandpromoter.coords$gene_name
	names(gene.key) <- GRangesToString(grange = genebodyandpromoter.coords)
	rownames(gene.activities) <- gene.key[rownames(gene.activities)]

	# add the gene activity matrix to the Seurat object as a new assay, and normalize it
	PBMC[['RNA']] <- CreateAssayObject(counts = gene.activities)
	PBMC <- NormalizeData(
	object = PBMC,
	assay = 'RNA',
	normalization.method = 'LogNormalize',
	scale.factor = median(PBMC$nCount_RNA)
	)

	# Setting the default
	DefaultAssay(PBMC) <- 'RNA'

	# Plotting marker genes
	FeaturePlot(
	object = PBMC,
	features = c('MS4A1', 'CD3D', 'LEF1', 'NKG7', 'TREM1', 'LYZ'),
	pt.size = 0.1,
	max.cutoff = 'q95',
	ncol = 3
	)

	# RNA-seq
	# Download the file from https://www.dropbox.com/s/zn6khirjafoyyxl/pbmc_10k_v3.rds?dl=0
	PBMC_rna <- readRDS("pbmc_10k_v3.rds")


	# Applying CCA
	transfer.anchors <- FindTransferAnchors(
	reference = PBMC_rna,
	query = PBMC,
	reduction = 'cca'
	)

	# Transfering labels
	predicted.labels <- TransferData(
	anchorset = transfer.anchors,
	refdata = PBMC_rna$celltype,
	weight.reduction = PBMC[['lsi']]
	)
	PBMC <- AddMetaData(object = PBMC, metadata = predicted.labels)

	#Plotting
	P1 <- DimPlot(
	object = PBMC_rna,
	group.by = 'celltype',
	label = TRUE,
	repel = TRUE) + NoLegend() + ggtitle('scRNA-seq')

	P2 <- DimPlot(
	object = PBMC,
	group.by = 'predicted.id',
	label = TRUE,
	repel = TRUE) + NoLegend() + ggtitle('scATAC-seq')
	CombinePlots(list(P1,P2), ncol = 2)


	# Differences in peaks
	DefaultAssay(PBMC) <- 'peaks'
	Idents(PBMC) <- PBMC$predicted.id
	dePeaks <- FindMarkers(
	object = PBMC,
	ident.1 = "CD14+ Monocytes",
	ident.2 = "CD16+ Monocytes",
	min.pct = 0.2,
	test.use = 'LR',
	latent.vars = 'peak_region_fragments'
	)

	# Plotting the differences
	plot1 <- VlnPlot(
	object = PBMC,
	features = rownames(dePeaks)[1],
	ncol = 3,
	pt.size = 0.1
	) + NoLegend()

	plot2 <- FeaturePlot(
	object = PBMC,
	features = rownames(dePeaks)[1],
	ncol = 3,
	pt.size = 0.1
	)
	CombinePlots(list(plot1,plot2))

	# Plotting the peaks
	CoveragePlot(
	object = PBMC,
	region = rownames(dePeaks)[1],
	sep = c(":", "-"),
	annotation = EnsDb.Hsapiens.v75,
	extend.upstream = 20000,
	extend.downstream = 20000,
	ncol = 1
	)