Skip to content

Instantly share code, notes, and snippets.

@SaskiaFreytag
Created April 17, 2019 09:09
Show Gist options
  • Save SaskiaFreytag/f0f6904ec1f6e754a655d9fb1d17cab2 to your computer and use it in GitHub Desktop.
Save SaskiaFreytag/f0f6904ec1f6e754a655d9fb1d17cab2 to your computer and use it in GitHub Desktop.
Code Review
# outline the packages needed for this script
# pak::pkg_install("hadley/requirements")
# requirements::req_file("SNP_saturation.R")
# Input 1-2 sentences that describe what this does, when it was done, and why
if (!requireNamespace("parallel", quietly = TRUE)) {
install.packages("parallel")
}
if (!requireNamespace("vcfR", quietly = TRUE)) {
install.packages("vcfR")
}
if (!requireNamespace("dplyr", quietly = TRUE)) {
install.packages("dplyr")
}
# State the libraries up front
library(parallel)
library(dplyr)
library(cvfR)
# Do not output scientific notation
options(scipen = 999)
#' Why are you running gc at the start?
# Initial garbage collection
gc()
# Set command line arguments
# (why? The output is character(0)? or is this not supposed to be run interactively?)
args <- commandArgs(TRUE)
bam <- args[1]
#' tools::file_ext("wat.bam") I think does the same thing?
sample_bam <- gsub(".bam", "", bam)
extr_seq <- as.numeric(args[2])
outFile <- args[3]
#' Jenny Bryan will set your computer on fire!
genome <-
"/dd_userdata/usrdat02/userdata/sfreytag/indexes/hg19.premrna/fasta/genome.fa"
# Prepare files
# First subset to only chromosome X
#system(paste0("samtools view -h ", bam, " X > ", sample_bam, "_chrX.bam"))
#' why are you using `system` on paste?
# Remove additional tags on barcodes
barcodes <-
read.table(paste0(sample_bam , "_barcodes.tsv"),
header = F,
sep = "\t")
# find and replace -1 with 0?
barcodes$V1 <- gsub("-1", "", barcodes$V1)
# is "aa" a good variable name?
aa <- length(barcodes$V1)
write.table(
barcodes,
file = paste0(sample_bam, "_barcodes_corr.tsv"),
sep = "\t",
quote = F,
col.names = F,
row.names = F
)
# Make vcf of SNPs
#system(paste0("source activate samtools
# samtools mpileup -f ", genome , " -g ", sample_bam,
# "_chrX.bam | bcftools call -m > ", sample_bam, "_chrX.vcf
# source deactivate"))
# Subset to heterozygous SNPs
#system(paste0('grep "0/1" ', sample_bam, '_chrX.vcf > ', sample_bam, '_chrX.het.vcf'))
# Reattach header
#system(paste0('grep "#" ', sample_bam, '_chrX.vcf | cat - ', sample_bam, '_chrX.het.vcf > ', sample_bam, '_chrX.het.corr.vcf'))
## Function for one run
one_run <- function(i, per_i) {
library(vcfR) # put library calls outside of the function
library(dplyr) # ^^
message(paste0("Iteration: ", i))
# Prevent all parallel runs from having the same starting seed
Sys.sleep(i)
# Subsample the bam file at percentage and then index
system(
paste0(
"source activate samtools
samtools view -s ",
per_i + i ,
" -b ",
sample_bam,
"_chrX.bam > sub_sampled_tmp_",
i,
".bam
source deactivate"
)
)
system(paste0("samtools index sub_sampled_tmp_", i, ".bam"))
# Apply CellSNP to subsampled file
system(
paste0(
"source activate CellSNP
cellSNP -s sub_sampled_tmp_",
i,
".bam -b ",
sample_bam,
"_barcodes_corr.tsv",
" -o sub_sampled_tmp_",
i,
".snp -R ",
sample_bam,
"_chrX.het.corr.vcf -p 20
source deactivate"
)
)
# Count the number of SNPs and cells with SNPs
vcf <- read.vcfR(paste0("sub_sampled_tmp_", i, ".snp.gz"),
verbose = FALSE)
number_snps <- dim(vcf@fix)[1]
vcf_tibble <- vcfR2tidy(vcf,
format_fields = c("DP", "AD", "OTH", "ALL"))
vcf_tibble_narm <- vcf_tibble$gt %>%
mutate(present = gt_DP >= 0) %>%
group_by(Indiv) %>%
summarize(n_alleles = sum(present, na.rm = TRUE))
number_cells_1_snps <- dim(vcf_tibble_narm %>%
ungroup() %>%
filter(n_alleles >= 1))[1]
number_cells_2_snps <- dim(vcf_tibble_narm %>%
ungroup() %>%
filter(n_alleles >= 2))[1]
# Remove temporary files
system(paste0("rm *_tmp_", i, "*"))
return(
list(
number_snps = number_snps,
number_cells_1_snps = number_cells_1_snps,
number_cells_2_snps = number_cells_2_snps
)
)
} # end of function
#' This function should be in a separate script that can be "source"'d.
## All percentages to be investigated
per <- seq(0.05, 0.95, 0.05)
## Initialize objects to save
#' this is a dangerous pattern - should try and specify the size of the list
res <- list()
for (i in 1:length(per)) {
per_tmp <- per[i]
cl <- makeCluster(5)
clusterExport(cl, c("per_tmp", "sample_bam", "one_run"))
res[[i]] <- parLapply(cl, 1:5,
function(x){
one_run(x, per_tmp)
}
) # make your function clearer
stopCluster(cl)
}
saveRDS(res, file = paste0("Results_", outFile, ".rds"))
# Make a dataframe from results averaged over 5 runs
res1 <-
data.frame(
number_snps = sapply(res,
function(x) mean(sapply(x,
function(y) y[[1]]))),
number_cells_1_snp = sapply(res,
function(x) mean(sapply(x,
function(y) y[[2]]))),
number_cells_2_snps = sapply(res,
function(x) mean(sapply(x,
function(y) y[[3]]))),
per = per
)
# Make a dataframe from results
res2 <-
data.frame(
number_snps = c(sapply(res, function(x) sapply(x,
function(y) y[[1]]))),
number_cells_1_snp = c(sapply(res,
function(x) sapply(x,
function(y) y[[2]]))),
number_cells_2_snps = c(sapply(res,
function(x) sapply(x,
function(y) y[[3]]))),
per = rep(per, each = 5)
)
# Polynomial function
f <- function(x, a, b, n) {
(a * x ^ n) / (x ^ n + b)
}
# Fit polynomial function for number of cells with at least one SNP
inital_value_n <-
lm(log(number_cells_1_snp + 1) ~ log(per),
res2)$coefficients[2]
fit <-
nls(
number_cells_1_snp ~ f(per, a, b, n),
data = res2,
start = list(a = 4634,
b = 1,
n = inital_value_n)
)
# Fit polynomial function for number of cells with at least two SNPs
inital_value_n_1 <-
lm(log(number_cells_1_snp + 1) ~ log(per),
res2)$coefficients[2]
fit1 <-
nls(
number_cells_2_snps ~ f(per, a, b, n),
data = res2,
start = list(a = 4634,
b = 1,
n = inital_value_n_1)
)
# Fit polynomial function for number of SNPs
inital_value_n_0 <-
lm(log(number_cells_1_snp + 1) ~ log(per), res2)$coefficients[2]
#' woh - why are are you using try here?
try(fit0 <-
nls(
number_snps ~ f(per, a, b, n),
data = res2,
start = list(a = res2[dim(res2)[1], 2], b = 1, n = inital_value_n_0)
))
# Find range for extrapolation
extr_end <- 1 + extr_seq + 0.5
per_extr <- seq(0.05, extr_end, 0.01)
# Make dataset for extrapolation
res_extr <- data.frame(
per = per_extr,
number_cells_1_snp = sapply(per_extr,
function(x){ f(x,
coef(fit)[1],
coef(fit)[2],
coef(fit)[3])
}
),
number_cells_2_snps = sapply(per_extr,
function(x){
f(x,
coef(fit1)[1],
coef(fit1)[2],
coef(fit1)[3])
}
)
)
try(res_extr$number_snps <-
sapply(per_extr,
function(x){
f(x,
coef(fit0)[1],
coef(fit0)[2],
coef(fit0)[3])
}
)
)
# Plot results and output them
png(paste0(outFile, "_cells_1_snp.png"))
plot(res_extr$per,
res_extr$number_cells_1_snp,
xlab = "Percentage*100",
ylab = "Number of cells with 1 SNP")
points(res1$per,
res1$number_cells_1_snp,
col = "red",
pch = 19)
abline(v = extr_seq + 1)
txt.x <- extr_end / 2
txt.y <- max(res_extr$number_cells_1_snp) / 2
text(txt.x,
txt.y,
labels = paste(
"With",
round(extr_seq, 2),
" more reads: \n",
round(f(
1 + extr_seq, coef(fit)[1], coef(fit)[2], coef(fit)[3]
), 0) ,
"cells detected"
))
dev.off()
png(paste0(outFile, "_cells_2_snps.png"))
plot(res_extr$per,
res_extr$number_cells_2_snps,
xlab = "Percentage*100",
ylab = "Number of cells with 2 SNPs")
points(res1$per,
res1$number_cells_2_snps,
col = "red",
pch = 19)
abline(v = extr_seq + 1)
txt.x <- extr_end / 2
txt.y <- max(res_extr$number_cells_2_snps) / 2
text(txt.x,
txt.y,
labels = paste(
"With",
round(extr_seq, 2),
" more reads: \n",
round(f(
1 + extr_seq, coef(fit1)[1], coef(fit1)[2], coef(fit1)[3]
), 0) ,
"cells detected"
))
dev.off()
try({
png(paste0(outFile, "_snps.png"))
plot(res_extr$per,
res_extr$number_snps,
xlab = "Percentage*100",
ylab = "Number ofSNPs")
points(res1$per, res1$number_snps, col = "red", pch = 19)
abline(v = extr_seq + 1)
txt.x <- extr_end / 2
txt.y <- 100
text(txt.x,
txt.y,
labels = paste(
"With",
round(extr_seq, 2),
" more reads: \n",
round(f(
1 + extr_seq, coef(fit0)[1], coef(fit0)[2], coef(fit0)[3]
), 0) ,
"SNPs detected"
))
dev.off()
}
)
library(parallel)
# Do not output scientific notation
options(scipen=999)
# Initial garbage collection
gc()
# Set command line arguments
args <- commandArgs(TRUE)
bam <- args[1]
sample_bam <- gsub(".bam", "", bam)
extr_seq <- as.numeric(args[2])
outFile <- args[3]
genome = "/dd_userdata/usrdat02/userdata/sfreytag/indexes/hg19.premrna/fasta/genome.fa"
# Prepare files
# First subset to only chromosome X
#system(paste0("samtools view -h ", bam, " X > ", sample_bam, "_chrX.bam"))
# Remove additional tags on barcodes
barcodes <- read.table(paste0(sample_bam ,"_barcodes.tsv"), header=F, sep="\t")
barcodes$V1 <- gsub("-1", "", barcodes$V1)
aa <- length(barcodes$V1)
write.table(barcodes, file=paste0(sample_bam, "_barcodes_corr.tsv"), sep="\t",
quote=F, col.names=F, row.names=F)
# Make vcf of SNPs
#system(paste0("source activate samtools
# samtools mpileup -f ", genome , " -g ", sample_bam,
# "_chrX.bam | bcftools call -m > ", sample_bam, "_chrX.vcf
# source deactivate"))
# Subset to heterozygous SNPs
#system(paste0('grep "0/1" ', sample_bam, '_chrX.vcf > ', sample_bam, '_chrX.het.vcf'))
# Reattach header
#system(paste0('grep "#" ', sample_bam, '_chrX.vcf | cat - ', sample_bam, '_chrX.het.vcf > ', sample_bam, '_chrX.het.corr.vcf'))
## Function for one run
one_run <- function(i, per_i) {
library(vcfR)
library(dplyr)
print(paste0("Iteration: ", i))
# Prevent all parallel runs from having the same starting seed
Sys.sleep(i)
# Subsample the bam file at percentage and then index
system(paste0("source activate samtools
samtools view -s ", per_i+i , " -b ", sample_bam, "_chrX.bam > sub_sampled_tmp_", i, ".bam
source deactivate"))
system(paste0("samtools index sub_sampled_tmp_", i, ".bam"))
# Apply CellSNP to subsampled file
system(paste0("source activate CellSNP
cellSNP -s sub_sampled_tmp_", i, ".bam -b ",
sample_bam, "_barcodes_corr.tsv",
" -o sub_sampled_tmp_", i, ".snp -R ", sample_bam, "_chrX.het.corr.vcf -p 20
source deactivate"))
# Count the number of SNPs and cells with SNPs
vcf <- read.vcfR(paste0("sub_sampled_tmp_", i, ".snp.gz"), verbose = FALSE)
number_snps <- dim(vcf@fix) [1]
vcf_tibble <- vcfR2tidy(vcf, format_fields = c("DP", "AD", "OTH", "ALL"))
vcf_tibble_narm <- vcf_tibble$gt %>%
mutate(present=gt_DP>=0) %>% group_by(Indiv) %>%
summarize(n_alleles=sum(present, na.rm=TRUE))
number_cells_1_snps <- dim(vcf_tibble_narm %>%
ungroup() %>%
filter(n_alleles>=1))[1]
number_cells_2_snps <- dim(vcf_tibble_narm %>%
ungroup() %>%
filter(n_alleles>=2))[1]
# Remove temporary files
system(paste0("rm *_tmp_", i, "*"))
return(list(number_snps=number_snps, number_cells_1_snps=number_cells_1_snps,
number_cells_2_snps=number_cells_2_snps))
}
## All percentages to be investigated
per <- seq(0.05,0.95,0.05)
## Initialize objects to save
res <- list()
for (i in 1:length(per)){
per_tmp <- per[i]
cl <- makeCluster(5)
clusterExport(cl, c("per_tmp", "sample_bam", "one_run"))
res[[i]] <- parLapply(cl, 1:5, function(x) one_run(x, per_tmp))
stopCluster(cl)
}
saveRDS(res, file=paste0("Results_", outFile, ".rds"))
# Make a dataframe from results averaged over 5 runs
res1 <- data.frame(number_snps = sapply(res, function(x) mean(sapply(x, function(y) y[[1]]))),
number_cells_1_snp = sapply(res, function(x) mean(sapply(x, function(y) y[[2]]))),
number_cells_2_snps = sapply(res, function(x) mean(sapply(x, function(y) y[[3]]))),
per = per)
# Make a dataframe from results
res2 <- data.frame(number_snps = c(sapply(res, function(x) sapply(x, function(y) y[[1]]))),
number_cells_1_snp = c(sapply(res, function(x) sapply(x, function(y) y[[2]]))),
number_cells_2_snps = c(sapply(res, function(x) sapply(x, function(y) y[[3]]))),
per = rep(per, each=5))
# Polynomial function
f <- function(x, a, b, n) {
(a*x^n)/ (x^n+b)
}
# Fit polynomial function for number of cells with at least one SNP
inital_value_n <- lm(log(number_cells_1_snp+1)~log(per), res2)$coefficients[2]
fit <- nls(number_cells_1_snp~f(per,a,b,n), data=res2, start=list(a=4634, b=1, n=inital_value_n))
# Fit polynomial function for number of cells with at least two SNPs
inital_value_n_1 <- lm(log(number_cells_1_snp+1)~log(per), res2)$coefficients[2]
fit1 <- nls(number_cells_2_snps~f(per,a,b,n), data=res2, start=list(a=4634, b=1, n=inital_value_n_1))
# Fit polynomial function for number of SNPs
inital_value_n_0 <- lm(log(number_cells_1_snp+1)~log(per), res2)$coefficients[2]
try(fit0 <- nls(number_snps~f(per,a,b,n), data=res2, start=list(a=res2[dim(res2)[1],2], b=1, n=inital_value_n_0)))
# Find range for extrapolation
extr_end <- 1 + extr_seq + 0.5
per_extr <- seq(0.05, extr_end, 0.01)
# Make dataset for extrapolation
res_extr <- data.frame(per=per_extr,
number_cells_1_snp=sapply(per_extr, function(x) f(x, coef(fit)[1], coef(fit)[2], coef(fit)[3])),
number_cells_2_snps=sapply(per_extr, function(x) f(x, coef(fit1)[1], coef(fit1)[2], coef(fit1)[3])))
try(res_extr$number_snps <- sapply(per_extr, function(x) f(x, coef(fit0)[1], coef(fit0)[2], coef(fit0)[3])))
# Plot results and output them
png(paste0(outFile, "_cells_1_snp.png"))
plot(res_extr$per, res_extr$number_cells_1_snp, xlab="Percentage*100", ylab="Number of cells with 1 SNP")
points(res1$per, res1$number_cells_1_snp, col="red", pch=19)
abline(v=extr_seq+1)
txt.x <- extr_end/2
txt.y <- max(res_extr$number_cells_1_snp)/2
text(txt.x, txt.y,
labels=paste("With", round(extr_seq,2), " more reads: \n",
round(f(1+extr_seq, coef(fit)[1], coef(fit)[2], coef(fit)[3]),0) ,"cells detected"))
dev.off()
png(paste0(outFile, "_cells_2_snps.png"))
plot(res_extr$per, res_extr$number_cells_2_snps, xlab="Percentage*100", ylab="Number of cells with 2 SNPs")
points(res1$per, res1$number_cells_2_snps, col="red", pch=19)
abline(v=extr_seq+1)
txt.x <- extr_end/2
txt.y <- max(res_extr$number_cells_2_snps)/2
text(txt.x, txt.y,
labels=paste("With", round(extr_seq,2), " more reads: \n",
round(f(1+extr_seq, coef(fit1)[1], coef(fit1)[2], coef(fit1)[3]),0) ,"cells detected"))
dev.off()
try({png(paste0(outFile, "_snps.png"))
plot(res_extr$per, res_extr$number_snps, xlab="Percentage*100", ylab="Number ofSNPs")
points(res1$per, res1$number_snps, col="red", pch=19)
abline(v=extr_seq+1)
txt.x <- extr_end/2
txt.y <- 100
text(txt.x, txt.y,
labels=paste("With", round(extr_seq,2), " more reads: \n",
round(f(1+extr_seq, coef(fit0)[1], coef(fit0)[2], coef(fit0)[3]),0) ,"SNPs detected"))
dev.off()})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment