moldach/import_QC_Koh

## import_QC_Koh
---
title: "Import and QC of Koh data set (SRP073808)"
output: html_document
editor_options:
  chunk_output_type: console
---

```{r load-packages}
suppressPackageStartupMessages({
  library(MultiAssayExperiment)
  library(SingleCellExperiment)
  library(scater)
  library(scran)
  library(dplyr)
  library(ggplot2)
})
```

## Load MultiAssayExperiment object

```{r load-dataset}
if (file.exists("GSE60749-GPL13112.rds")){
        maex <- readRDS("GSE60749-GPL13112.rds")
} else {
        download.file("http://imlspenticton.uzh.ch/robinson_lab/conquer/data-mae/GSE60749-GPL13112.rds", "GSE60749-GPL13112.rds")
        maex <- readRDS("GSE60749-GPL13112.rds")
}

## Extract the gene-level length-scaled TPMs
cts <- assays(experiments(maex)[["gene"]])[["count_lstpm"]]
## Extract the phenotype data.
phn <- colData(maex)
phn$phenoid <- as.character(interaction(as.data.frame(phn[, c("source_name_ch1",
                                                              "characteristics_ch1.1")])))
## Simplify labels
phn$phenoid <-  plyr::revalue(
  phn$phenoid,
  c("Dgcr8 knockout mouse embryonic stem cells.culture conditions: serum+LIF" = "Dgcr8 knockout mouse serum+LIF",
    "v6.5 mouse embryonic stem cells.culture conditions: 2i+LIF" = "v6.5 mouse 2i+LIF",
    "v6.5 mouse embryonic stem cells.culture conditions: serum+LIF" = "v6.5 mouse serum+LIF")
)
table(phn$phenoid)
```

## Create a SingleCellExperiment object

```{r create-sce}
stopifnot(all(colnames(cts) == rownames(phn)))
sce <- SingleCellExperiment(
  assays = list(counts = cts),
  colData = phn
)

## below doesn't work....
# sce <- normalise(sce, exprs_values = "counts", return_log = TRUE, return_norm_as_exprs = TRUE) ## generates logcounts(sce)
sce <- normalize(sce, exprs_values = "counts", return_log = TRUE) # is this correct? Get warning message here that its using lib sizes as size factors
```

Exclude features that are not expressed

```{r reduce-expression-matrix}
keep_features <- rowSums(counts(sce) > 0) > 0
table(keep_features)
sce <- sce[keep_features, ]
dim(sce)
```

## Identify the remaining ERCC spike-ins.

```{r ercc-mt}
is.spike <- grepl("^ERCC", rownames(sce))
table(is.spike)
summary(colSums(counts(sce[is.spike, ])))
isSpike(sce, "ERCC") <- is.spike
```

## Calculate QC metrics

```{r QC}
sce <- calculateQCMetrics(sce, feature_controls = list(ERCC = grepl("^ERCC", rownames(sce))))
```

## Quality control using PCA on column data

We create a PCA plot based the quality metrics for each cell, e.g., the total
number of reads, the total number of features and the proportion of spike-in
reads.

```{r qc-pca}
# get an error from below code:
# Error in .local(x, ...) : unused argument (pca_data_input = "coldata")
# sce <- scater::runPCA(sce, pca_data_input = "coldata")
sce <- scater::runPCA(sce)  # is this acceptable??
scater::plotPCA(sce, colour_by = "phenoid")
```

## Filter cells

We remove cells with log-library sizes (or total features) that are more than 3
median absolute deviations (MADs) below the median log-library size (or total
features).

```{r histogram}
colData(sce)$libsize.drop <- isOutlier(sce$total_counts, nmads = 3, type = "lower", log = TRUE)
ggplot(as.data.frame(colData(sce)), aes(x = total_counts)) +
  geom_histogram(bins = 20, fill = "grey80") + xlab("Total count") +
  ylab("Number of cells") +
  geom_vline(xintercept = min(sce$total_counts[!sce$libsize.drop]),
             color = "red", linetype = "dashed") +
  theme_bw()

# below throws an error:
# Error in log10(metric) : non-numeric argument to mathematical function
colData(sce)$feature.drop <- isOutlier(sce$total_features, nmads = 3, type = "lower", log = TRUE)

# get an error here too:
# Error in !sce$feature.drop : invalid argument type
ggplot(as.data.frame(colData(sce)), aes(x = total_features)) +
  geom_histogram(bins = 20, fill = "grey80") + xlab("Number of detected features") +
  ylab("Number of cells") +
  geom_vline(xintercept = min(sce$total_features[!sce$feature.drop]),
             color = "red", linetype = "dashed") +
  theme_bw()
# get an error here also:
# Error in base::table(...) : all arguments must have the same length
table(libsize = sce$libsize.drop, feature = sce$feature.drop)
```

We also filter out cells with a large fraction of ERCC reads.

```{r filter-ercc}
colData(sce)$spike.drop <- isOutlier(sce$pct_counts_ERCC, nmads = 3, type = "higher")
ggplot(as.data.frame(colData(sce)), aes(x = pct_counts_ERCC)) +
  geom_histogram(bins = 20, fill = "grey80") + xlab("ERCC proportion (%)") +
  ylab("Number of cells") +
  geom_vline(xintercept = max(sce$pct_counts_ERCC[!sce$spike.drop]),
             color = "red", linetype = "dashed") +
  theme_bw()
table(sce$spike.drop)
sce <- sce[, !(sce$libsize.drop | sce$feature.drop | sce$spike.drop)]
dim(sce)
```

## Quality control using highest expressed genes

```{r qc-filt}
plotQC(sce, type = "highest-expression", n = 50)
```

## Data normalization

```{r sizefactors}
# get ann error from computeSumFactors
# Error in .computeSumFactors(assay(x, i = assay.type), subset.row = subset.row,  : zero cells in one of the clusters
sce <- computeSumFactors(sce, sizes = pmin(ncol(sce), seq(20, 120, 20)), min.mean = 0.1)
summary(sizeFactors(sce))
sce <- computeSpikeFactors(sce, general.use = FALSE)
```

```{r normalization}
# same thing here with normalise/normalize... get an error and the "return_norm_as_exprs" param doesn't exist
sce <- normalise(sce, exprs_values = "counts", return_log = TRUE,
                 return_norm_as_exprs = TRUE)
sce <- normalise(sce, exprs_values = "counts", return_log = FALSE,
                 return_norm_as_exprs = FALSE)
```

## Plot the proportion of explained variances

```{r explained-variance, warning = FALSE}
expl_vars <- c("phenoid", "log10_total_counts", "log10_total_features", "pct_dropout",
               "pct_counts_top_200_features", "log10_counts_feature_controls",
               "pct_counts_feature_controls")
plotQC(sce, type = "explanatory-variables", variables = expl_vars)
```

## Plot t-SNE representations

```{r tSNE}
set.seed(1234)
sce <- runTSNE(sce, exprs_values = "logcounts", perplexity = 10)
plotTSNE(sce, colour_by = "phenoid")
plotTSNE(sce, colour_by = "total_features", size_by = "total_counts")
```

## Save the normalized and cell filtered dataset

```{r save-data}
sce <- sce[!isSpike(sce, "ERCC"), ]
dim(sce)
table(sce$phenoid)
saveRDS(sce, file = "../../data/sce_full/sce_full_Kumar.rds")
```

## Session info

```{r}
date()
sessionInfo()
```

I am attesting that this GitHub handle moldach is linked to the Tezos account tz1epnSBiaR334dL3wuX2TN9S52EJMGeSkXn for tzprofiles

sig:edsigtmnQtArWu3h1TuFqrh6g6bRJSM45w2Pxd4CXyExqx6Yaf4fLVqmnBSTa3YGkKiaWt7rBxUhVmz4vJphpzcCueq8ptmfPAk
	---
	title: "Import and QC of Koh data set (SRP073808)"
	output: html_document
	editor_options:
	chunk_output_type: console
	---

	```{r load-packages}
	suppressPackageStartupMessages({
	library(MultiAssayExperiment)
	library(SingleCellExperiment)
	library(scater)
	library(scran)
	library(dplyr)
	library(ggplot2)
	})
	```

	## Load MultiAssayExperiment object

	```{r load-dataset}
	if (file.exists("GSE60749-GPL13112.rds")){
	maex <- readRDS("GSE60749-GPL13112.rds")
	} else {
	download.file("http://imlspenticton.uzh.ch/robinson_lab/conquer/data-mae/GSE60749-GPL13112.rds", "GSE60749-GPL13112.rds")
	maex <- readRDS("GSE60749-GPL13112.rds")
	}

	## Extract the gene-level length-scaled TPMs
	cts <- assays(experiments(maex)[["gene"]])[["count_lstpm"]]
	## Extract the phenotype data.
	phn <- colData(maex)
	phn$phenoid <- as.character(interaction(as.data.frame(phn[, c("source_name_ch1",
	"characteristics_ch1.1")])))
	## Simplify labels
	phn$phenoid <- plyr::revalue(
	phn$phenoid,
	c("Dgcr8 knockout mouse embryonic stem cells.culture conditions: serum+LIF" = "Dgcr8 knockout mouse serum+LIF",
	"v6.5 mouse embryonic stem cells.culture conditions: 2i+LIF" = "v6.5 mouse 2i+LIF",
	"v6.5 mouse embryonic stem cells.culture conditions: serum+LIF" = "v6.5 mouse serum+LIF")
	)
	table(phn$phenoid)
	```

	## Create a SingleCellExperiment object

	```{r create-sce}
	stopifnot(all(colnames(cts) == rownames(phn)))
	sce <- SingleCellExperiment(
	assays = list(counts = cts),
	colData = phn
	)

	## below doesn't work....
	# sce <- normalise(sce, exprs_values = "counts", return_log = TRUE, return_norm_as_exprs = TRUE) ## generates logcounts(sce)
	sce <- normalize(sce, exprs_values = "counts", return_log = TRUE) # is this correct? Get warning message here that its using lib sizes as size factors
	```

	Exclude features that are not expressed

	```{r reduce-expression-matrix}
	keep_features <- rowSums(counts(sce) > 0) > 0
	table(keep_features)
	sce <- sce[keep_features, ]
	dim(sce)
	```

	## Identify the remaining ERCC spike-ins.

	```{r ercc-mt}
	is.spike <- grepl("^ERCC", rownames(sce))
	table(is.spike)
	summary(colSums(counts(sce[is.spike, ])))
	isSpike(sce, "ERCC") <- is.spike
	```

	## Calculate QC metrics

	```{r QC}
	sce <- calculateQCMetrics(sce, feature_controls = list(ERCC = grepl("^ERCC", rownames(sce))))
	```

	## Quality control using PCA on column data

	We create a PCA plot based the quality metrics for each cell, e.g., the total
	number of reads, the total number of features and the proportion of spike-in
	reads.

	```{r qc-pca}
	# get an error from below code:
	# Error in .local(x, ...) : unused argument (pca_data_input = "coldata")
	# sce <- scater::runPCA(sce, pca_data_input = "coldata")
	sce <- scater::runPCA(sce) # is this acceptable??
	scater::plotPCA(sce, colour_by = "phenoid")
	```

	## Filter cells

	We remove cells with log-library sizes (or total features) that are more than 3
	median absolute deviations (MADs) below the median log-library size (or total
	features).

	```{r histogram}
	colData(sce)$libsize.drop <- isOutlier(sce$total_counts, nmads = 3, type = "lower", log = TRUE)
	ggplot(as.data.frame(colData(sce)), aes(x = total_counts)) +
	geom_histogram(bins = 20, fill = "grey80") + xlab("Total count") +
	ylab("Number of cells") +
	geom_vline(xintercept = min(sce$total_counts[!sce$libsize.drop]),
	color = "red", linetype = "dashed") +
	theme_bw()

	# below throws an error:
	# Error in log10(metric) : non-numeric argument to mathematical function
	colData(sce)$feature.drop <- isOutlier(sce$total_features, nmads = 3, type = "lower", log = TRUE)

	# get an error here too:
	# Error in !sce$feature.drop : invalid argument type
	ggplot(as.data.frame(colData(sce)), aes(x = total_features)) +
	geom_histogram(bins = 20, fill = "grey80") + xlab("Number of detected features") +
	ylab("Number of cells") +
	geom_vline(xintercept = min(sce$total_features[!sce$feature.drop]),
	color = "red", linetype = "dashed") +
	theme_bw()
	# get an error here also:
	# Error in base::table(...) : all arguments must have the same length
	table(libsize = sce$libsize.drop, feature = sce$feature.drop)
	```

	We also filter out cells with a large fraction of ERCC reads.

	```{r filter-ercc}
	colData(sce)$spike.drop <- isOutlier(sce$pct_counts_ERCC, nmads = 3, type = "higher")
	ggplot(as.data.frame(colData(sce)), aes(x = pct_counts_ERCC)) +
	geom_histogram(bins = 20, fill = "grey80") + xlab("ERCC proportion (%)") +
	ylab("Number of cells") +
	geom_vline(xintercept = max(sce$pct_counts_ERCC[!sce$spike.drop]),
	color = "red", linetype = "dashed") +
	theme_bw()
	table(sce$spike.drop)
	sce <- sce[, !(sce$libsize.drop \| sce$feature.drop \| sce$spike.drop)]
	dim(sce)
	```

	## Quality control using highest expressed genes

	```{r qc-filt}
	plotQC(sce, type = "highest-expression", n = 50)
	```

	## Data normalization

	```{r sizefactors}
	# get ann error from computeSumFactors
	# Error in .computeSumFactors(assay(x, i = assay.type), subset.row = subset.row, : zero cells in one of the clusters
	sce <- computeSumFactors(sce, sizes = pmin(ncol(sce), seq(20, 120, 20)), min.mean = 0.1)
	summary(sizeFactors(sce))
	sce <- computeSpikeFactors(sce, general.use = FALSE)
	```

	```{r normalization}
	# same thing here with normalise/normalize... get an error and the "return_norm_as_exprs" param doesn't exist
	sce <- normalise(sce, exprs_values = "counts", return_log = TRUE,
	return_norm_as_exprs = TRUE)
	sce <- normalise(sce, exprs_values = "counts", return_log = FALSE,
	return_norm_as_exprs = FALSE)
	```

	## Plot the proportion of explained variances

	```{r explained-variance, warning = FALSE}
	expl_vars <- c("phenoid", "log10_total_counts", "log10_total_features", "pct_dropout",
	"pct_counts_top_200_features", "log10_counts_feature_controls",
	"pct_counts_feature_controls")
	plotQC(sce, type = "explanatory-variables", variables = expl_vars)
	```

	## Plot t-SNE representations

	```{r tSNE}
	set.seed(1234)
	sce <- runTSNE(sce, exprs_values = "logcounts", perplexity = 10)
	plotTSNE(sce, colour_by = "phenoid")
	plotTSNE(sce, colour_by = "total_features", size_by = "total_counts")
	```

	## Save the normalized and cell filtered dataset

	```{r save-data}
	sce <- sce[!isSpike(sce, "ERCC"), ]
	dim(sce)
	table(sce$phenoid)
	saveRDS(sce, file = "../../data/sce_full/sce_full_Kumar.rds")
	```

	## Session info

	```{r}
	date()
	sessionInfo()
	```

	I am attesting that this GitHub handle moldach is linked to the Tezos account tz1epnSBiaR334dL3wuX2TN9S52EJMGeSkXn for tzprofiles

	sig:edsigtmnQtArWu3h1TuFqrh6g6bRJSM45w2Pxd4CXyExqx6Yaf4fLVqmnBSTa3YGkKiaWt7rBxUhVmz4vJphpzcCueq8ptmfPAk