SaskiaFreytag/after_code_review

## after_code_review
# outline the packages needed for this script
# pak::pkg_install("hadley/requirements")
# requirements::req_file("SNP_saturation.R")

# Input 1-2 sentences that describe what this does, when it was done, and why

if (!requireNamespace("parallel", quietly = TRUE)) {
  install.packages("parallel")
}

if (!requireNamespace("vcfR", quietly = TRUE)) {
  install.packages("vcfR")
}

if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}

# State the libraries up front
library(parallel)
library(dplyr)
library(cvfR)

# Do not output scientific notation
options(scipen = 999)

#' Why are you running gc at the start?
# Initial garbage collection
gc()

# Set command line arguments
# (why? The output is character(0)? or is this not supposed to be run interactively?)
args <- commandArgs(TRUE)

bam <- args[1]
#' tools::file_ext("wat.bam") I think does the same thing?
sample_bam <- gsub(".bam", "", bam)
extr_seq <- as.numeric(args[2])
outFile <- args[3]

#' Jenny Bryan will set your computer on fire!
genome  <-
  "/dd_userdata/usrdat02/userdata/sfreytag/indexes/hg19.premrna/fasta/genome.fa"

# Prepare files
# First subset to only chromosome X
#system(paste0("samtools view -h ", bam, " X > ", sample_bam, "_chrX.bam"))
#' why are you using `system` on paste?

# Remove additional tags on barcodes
barcodes <-
  read.table(paste0(sample_bam , "_barcodes.tsv"),
             header = F,
             sep = "\t")

# find and replace -1 with 0?
barcodes$V1 <- gsub("-1", "", barcodes$V1)

# is "aa" a good variable name?
aa <- length(barcodes$V1)
write.table(
  barcodes,
  file = paste0(sample_bam, "_barcodes_corr.tsv"),
  sep = "\t",
  quote = F,
  col.names = F,
  row.names = F
)

# Make vcf of SNPs
#system(paste0("source activate samtools
#              samtools mpileup -f ", genome , " -g ", sample_bam,
#              "_chrX.bam | bcftools call -m > ", sample_bam, "_chrX.vcf
#              source deactivate"))

# Subset to heterozygous SNPs
#system(paste0('grep "0/1" ', sample_bam, '_chrX.vcf > ', sample_bam, '_chrX.het.vcf'))
# Reattach header
#system(paste0('grep "#" ', sample_bam, '_chrX.vcf | cat - ', sample_bam, '_chrX.het.vcf > ', sample_bam, '_chrX.het.corr.vcf'))

## Function for one run

one_run <- function(i, per_i) {
  library(vcfR) # put library calls outside of the function
  library(dplyr) # ^^

  message(paste0("Iteration: ", i))

  # Prevent all parallel runs from having the same starting seed
  Sys.sleep(i)

  # Subsample the bam file at percentage and then index
  system(
    paste0(
      "source activate samtools
                samtools view -s ",
      per_i + i ,
      " -b ",
      sample_bam,
      "_chrX.bam > sub_sampled_tmp_",
      i,
      ".bam
                source deactivate"
    )
  )
  system(paste0("samtools index sub_sampled_tmp_", i, ".bam"))

  # Apply CellSNP to subsampled file
  system(
    paste0(
      "source activate CellSNP
              cellSNP -s sub_sampled_tmp_",
      i,
      ".bam -b ",
      sample_bam,
      "_barcodes_corr.tsv",
      " -o sub_sampled_tmp_",
      i,
      ".snp -R ",
      sample_bam,
      "_chrX.het.corr.vcf -p 20
              source deactivate"
    )
  )

  # Count the number of SNPs and cells with SNPs
  vcf <- read.vcfR(paste0("sub_sampled_tmp_", i, ".snp.gz"),
                   verbose = FALSE)

  number_snps <- dim(vcf@fix)[1]

  vcf_tibble <- vcfR2tidy(vcf,
                          format_fields = c("DP", "AD", "OTH", "ALL"))

  vcf_tibble_narm <- vcf_tibble$gt %>%
    mutate(present = gt_DP >= 0) %>%
    group_by(Indiv) %>%
    summarize(n_alleles = sum(present, na.rm = TRUE))

  number_cells_1_snps <- dim(vcf_tibble_narm %>%
                               ungroup() %>%
                               filter(n_alleles >= 1))[1]

  number_cells_2_snps <- dim(vcf_tibble_narm %>%
                               ungroup() %>%
                               filter(n_alleles >= 2))[1]

  # Remove temporary files
  system(paste0("rm *_tmp_", i, "*"))

  return(
    list(
      number_snps = number_snps,
      number_cells_1_snps = number_cells_1_snps,
      number_cells_2_snps = number_cells_2_snps
    )
  )

} # end of function
#' This function should be in a separate script that can be "source"'d.

## All percentages to be investigated
per <- seq(0.05, 0.95, 0.05)

## Initialize objects to save
#' this is a dangerous pattern - should try and specify the size of the list
res <- list()

for (i in 1:length(per)) {
  per_tmp <- per[i]

  cl <- makeCluster(5)
  clusterExport(cl, c("per_tmp", "sample_bam", "one_run"))
  res[[i]] <- parLapply(cl, 1:5,
                        function(x){
                          one_run(x, per_tmp)
                        }
  ) # make your function clearer

  stopCluster(cl)

}

saveRDS(res, file = paste0("Results_", outFile, ".rds"))

# Make a dataframe from results averaged over 5 runs
res1 <-
  data.frame(
    number_snps = sapply(res,
                         function(x) mean(sapply(x,
                                                 function(y) y[[1]]))),
    number_cells_1_snp = sapply(res,
                                function(x) mean(sapply(x,
                                                        function(y) y[[2]]))),
    number_cells_2_snps = sapply(res,
                                 function(x) mean(sapply(x,
                                                         function(y) y[[3]]))),
    per = per
  )

# Make a dataframe from results
res2 <-
  data.frame(
    number_snps = c(sapply(res, function(x) sapply(x,
                                                   function(y) y[[1]]))),
    number_cells_1_snp = c(sapply(res,
                                  function(x) sapply(x,
                                                     function(y) y[[2]]))),
    number_cells_2_snps = c(sapply(res,
                                   function(x) sapply(x,
                                                      function(y) y[[3]]))),
    per = rep(per, each = 5)
  )

# Polynomial function
f <- function(x, a, b, n) {
  (a * x ^ n) / (x ^ n + b)
}

# Fit polynomial function for number of cells with at least one SNP
inital_value_n <-
  lm(log(number_cells_1_snp + 1) ~ log(per),
     res2)$coefficients[2]

fit <-
  nls(
    number_cells_1_snp ~ f(per, a, b, n),
    data = res2,
    start = list(a = 4634,
                 b = 1,
                 n = inital_value_n)
  )

# Fit polynomial function for number of cells with at least two SNPs
inital_value_n_1 <-
  lm(log(number_cells_1_snp + 1) ~ log(per),
     res2)$coefficients[2]

fit1 <-
  nls(
    number_cells_2_snps ~ f(per, a, b, n),
    data = res2,
    start = list(a = 4634,
                 b = 1,
                 n = inital_value_n_1)
  )

# Fit polynomial function for number of SNPs
inital_value_n_0 <-
  lm(log(number_cells_1_snp + 1) ~ log(per), res2)$coefficients[2]

#' woh - why are are you using try here?
try(fit0 <-
      nls(
        number_snps ~ f(per, a, b, n),
        data = res2,
        start = list(a = res2[dim(res2)[1], 2], b = 1, n = inital_value_n_0)
      ))

# Find range for extrapolation
extr_end <- 1 + extr_seq + 0.5
per_extr <- seq(0.05, extr_end, 0.01)

# Make dataset for extrapolation
res_extr <- data.frame(
  per = per_extr,
  number_cells_1_snp = sapply(per_extr,
                              function(x){ f(x,
                                            coef(fit)[1],
                                            coef(fit)[2],
                                            coef(fit)[3])
                                }
                              ),
  number_cells_2_snps = sapply(per_extr,
                               function(x){
                                 f(x,
                                   coef(fit1)[1],
                                   coef(fit1)[2],
                                   coef(fit1)[3])
                                 }
                               )
)

try(res_extr$number_snps <-
      sapply(per_extr,
             function(x){
               f(x,
                 coef(fit0)[1],
                 coef(fit0)[2],
                 coef(fit0)[3])
               }
             )
    )

# Plot results and output them
png(paste0(outFile, "_cells_1_snp.png"))

plot(res_extr$per,
     res_extr$number_cells_1_snp,
     xlab = "Percentage*100",
     ylab = "Number of cells with 1 SNP")
points(res1$per,
       res1$number_cells_1_snp,
       col = "red",
       pch = 19)
abline(v = extr_seq + 1)
txt.x <- extr_end / 2
txt.y <- max(res_extr$number_cells_1_snp) / 2
text(txt.x,
     txt.y,
     labels = paste(
       "With",
       round(extr_seq, 2),
       " more reads: \n",
       round(f(
         1 + extr_seq, coef(fit)[1], coef(fit)[2], coef(fit)[3]
       ), 0) ,
       "cells detected"
     ))
dev.off()

png(paste0(outFile, "_cells_2_snps.png"))

plot(res_extr$per,
     res_extr$number_cells_2_snps,
     xlab = "Percentage*100",
     ylab = "Number of cells with 2 SNPs")
points(res1$per,
       res1$number_cells_2_snps,
       col = "red",
       pch = 19)
abline(v = extr_seq + 1)
txt.x <- extr_end / 2
txt.y <- max(res_extr$number_cells_2_snps) / 2
text(txt.x,
     txt.y,
     labels = paste(
       "With",
       round(extr_seq, 2),
       " more reads: \n",
       round(f(
         1 + extr_seq, coef(fit1)[1], coef(fit1)[2], coef(fit1)[3]
       ), 0) ,
       "cells detected"
     ))
dev.off()

try({
  png(paste0(outFile, "_snps.png"))
  plot(res_extr$per,
       res_extr$number_snps,
       xlab = "Percentage*100",
       ylab = "Number ofSNPs")
  points(res1$per, res1$number_snps, col = "red", pch = 19)
  abline(v = extr_seq + 1)
  txt.x <- extr_end / 2
  txt.y <- 100
  text(txt.x,
       txt.y,
       labels = paste(
         "With",
         round(extr_seq, 2),
         " more reads: \n",
         round(f(
           1 + extr_seq, coef(fit0)[1], coef(fit0)[2], coef(fit0)[3]
         ), 0) ,
         "SNPs detected"
       ))
  dev.off()
}
)

## before_code_review
library(parallel)

# Do not output scientific notation
options(scipen=999)

# Initial garbage collection
gc()

# Set command line arguments
args <- commandArgs(TRUE)

bam <- args[1]
sample_bam <- gsub(".bam", "", bam)
extr_seq <- as.numeric(args[2])
outFile <- args[3]
genome  = "/dd_userdata/usrdat02/userdata/sfreytag/indexes/hg19.premrna/fasta/genome.fa"

# Prepare files
# First subset to only chromosome X
#system(paste0("samtools view -h ", bam, " X > ", sample_bam, "_chrX.bam"))

# Remove additional tags on barcodes
barcodes <- read.table(paste0(sample_bam ,"_barcodes.tsv"), header=F, sep="\t")
barcodes$V1 <- gsub("-1", "", barcodes$V1)
aa <- length(barcodes$V1)
write.table(barcodes, file=paste0(sample_bam, "_barcodes_corr.tsv"), sep="\t",
            quote=F, col.names=F, row.names=F)

# Make vcf of SNPs
#system(paste0("source activate samtools
#              samtools mpileup -f ", genome , " -g ", sample_bam,
#              "_chrX.bam | bcftools call -m > ", sample_bam, "_chrX.vcf
#              source deactivate"))

# Subset to heterozygous SNPs
#system(paste0('grep "0/1" ', sample_bam, '_chrX.vcf > ', sample_bam, '_chrX.het.vcf'))
# Reattach header
#system(paste0('grep "#" ', sample_bam, '_chrX.vcf | cat - ', sample_bam, '_chrX.het.vcf > ', sample_bam, '_chrX.het.corr.vcf'))

## Function for one run

one_run <- function(i, per_i) {

  library(vcfR)
  library(dplyr)

  print(paste0("Iteration: ", i))

  # Prevent all parallel runs from having the same starting seed
  Sys.sleep(i)

  # Subsample the bam file at percentage and then index
  system(paste0("source activate samtools
                samtools view -s ",  per_i+i , " -b ", sample_bam, "_chrX.bam > sub_sampled_tmp_", i, ".bam
                source deactivate"))
  system(paste0("samtools index sub_sampled_tmp_", i, ".bam"))

  # Apply CellSNP to subsampled file
  system(paste0("source activate CellSNP
              cellSNP -s sub_sampled_tmp_", i, ".bam -b ",
              sample_bam, "_barcodes_corr.tsv",
              " -o sub_sampled_tmp_", i, ".snp -R ", sample_bam, "_chrX.het.corr.vcf -p 20
              source deactivate"))

  # Count the number of SNPs and cells with SNPs
  vcf <- read.vcfR(paste0("sub_sampled_tmp_", i, ".snp.gz"), verbose = FALSE)
  number_snps <- dim(vcf@fix) [1]
  vcf_tibble <- vcfR2tidy(vcf, format_fields = c("DP", "AD", "OTH", "ALL"))
  vcf_tibble_narm <- vcf_tibble$gt %>%
                     mutate(present=gt_DP>=0) %>% group_by(Indiv) %>%
                     summarize(n_alleles=sum(present, na.rm=TRUE))
  number_cells_1_snps <- dim(vcf_tibble_narm %>%
                         ungroup() %>%
                         filter(n_alleles>=1))[1]
  number_cells_2_snps <- dim(vcf_tibble_narm %>%
                         ungroup() %>%
                         filter(n_alleles>=2))[1]

  # Remove temporary files
  system(paste0("rm *_tmp_", i, "*"))

  return(list(number_snps=number_snps, number_cells_1_snps=number_cells_1_snps,
              number_cells_2_snps=number_cells_2_snps))

}

## All percentages to be investigated
per <- seq(0.05,0.95,0.05)

## Initialize objects to save
res <- list()

for (i in 1:length(per)){

  per_tmp <- per[i]

  cl <- makeCluster(5)
  clusterExport(cl, c("per_tmp", "sample_bam", "one_run"))
  res[[i]] <- parLapply(cl, 1:5, function(x) one_run(x, per_tmp))
  stopCluster(cl)

}

saveRDS(res, file=paste0("Results_", outFile, ".rds"))

# Make a dataframe from results averaged over 5 runs
res1 <- data.frame(number_snps = sapply(res, function(x) mean(sapply(x, function(y) y[[1]]))),
                   number_cells_1_snp = sapply(res, function(x) mean(sapply(x, function(y) y[[2]]))),
                   number_cells_2_snps = sapply(res, function(x) mean(sapply(x, function(y) y[[3]]))),
                   per = per)

# Make a dataframe from results
res2 <- data.frame(number_snps = c(sapply(res, function(x) sapply(x, function(y) y[[1]]))),
                   number_cells_1_snp = c(sapply(res, function(x) sapply(x, function(y) y[[2]]))),
                   number_cells_2_snps = c(sapply(res, function(x) sapply(x, function(y) y[[3]]))),
                   per = rep(per, each=5))

# Polynomial function
f <- function(x, a, b, n) {
  (a*x^n)/ (x^n+b)
}

# Fit polynomial function for number of cells with at least one SNP
inital_value_n <- lm(log(number_cells_1_snp+1)~log(per), res2)$coefficients[2]
fit <- nls(number_cells_1_snp~f(per,a,b,n), data=res2, start=list(a=4634, b=1, n=inital_value_n))

# Fit polynomial function for number of cells with at least two SNPs
inital_value_n_1 <- lm(log(number_cells_1_snp+1)~log(per), res2)$coefficients[2]
fit1 <- nls(number_cells_2_snps~f(per,a,b,n), data=res2, start=list(a=4634, b=1, n=inital_value_n_1))

# Fit polynomial function for number of SNPs
inital_value_n_0 <- lm(log(number_cells_1_snp+1)~log(per), res2)$coefficients[2]
try(fit0 <- nls(number_snps~f(per,a,b,n), data=res2, start=list(a=res2[dim(res2)[1],2], b=1, n=inital_value_n_0)))

# Find range for extrapolation
extr_end <- 1 + extr_seq + 0.5
per_extr <- seq(0.05, extr_end, 0.01)

# Make dataset for extrapolation
res_extr <- data.frame(per=per_extr,
                       number_cells_1_snp=sapply(per_extr, function(x) f(x, coef(fit)[1], coef(fit)[2], coef(fit)[3])),
                       number_cells_2_snps=sapply(per_extr, function(x) f(x, coef(fit1)[1], coef(fit1)[2], coef(fit1)[3])))

try(res_extr$number_snps <- sapply(per_extr, function(x) f(x, coef(fit0)[1], coef(fit0)[2], coef(fit0)[3])))

# Plot results and output them
png(paste0(outFile, "_cells_1_snp.png"))
  plot(res_extr$per, res_extr$number_cells_1_snp, xlab="Percentage*100", ylab="Number of cells with 1 SNP")
  points(res1$per, res1$number_cells_1_snp, col="red", pch=19)
  abline(v=extr_seq+1)
  txt.x <- extr_end/2
  txt.y <- max(res_extr$number_cells_1_snp)/2
  text(txt.x, txt.y,
       labels=paste("With", round(extr_seq,2), " more reads: \n",
                  round(f(1+extr_seq, coef(fit)[1], coef(fit)[2], coef(fit)[3]),0) ,"cells detected"))
dev.off()

png(paste0(outFile, "_cells_2_snps.png"))
  plot(res_extr$per, res_extr$number_cells_2_snps, xlab="Percentage*100", ylab="Number of cells with 2 SNPs")
  points(res1$per, res1$number_cells_2_snps, col="red", pch=19)
  abline(v=extr_seq+1)
  txt.x <- extr_end/2
  txt.y <- max(res_extr$number_cells_2_snps)/2
  text(txt.x, txt.y,
      labels=paste("With", round(extr_seq,2), " more reads: \n",
                  round(f(1+extr_seq, coef(fit1)[1], coef(fit1)[2], coef(fit1)[3]),0) ,"cells detected"))
dev.off()

try({png(paste0(outFile, "_snps.png"))
  plot(res_extr$per, res_extr$number_snps, xlab="Percentage*100", ylab="Number ofSNPs")
  points(res1$per, res1$number_snps, col="red", pch=19)
  abline(v=extr_seq+1)
  txt.x <- extr_end/2
  txt.y <- 100
  text(txt.x, txt.y,
       labels=paste("With", round(extr_seq,2), " more reads: \n",
                  round(f(1+extr_seq, coef(fit0)[1], coef(fit0)[2], coef(fit0)[3]),0) ,"SNPs detected"))
dev.off()})
	# outline the packages needed for this script
	# pak::pkg_install("hadley/requirements")
	# requirements::req_file("SNP_saturation.R")

	# Input 1-2 sentences that describe what this does, when it was done, and why

	if (!requireNamespace("parallel", quietly = TRUE)) {
	install.packages("parallel")
	}

	if (!requireNamespace("vcfR", quietly = TRUE)) {
	install.packages("vcfR")
	}

	if (!requireNamespace("dplyr", quietly = TRUE)) {
	install.packages("dplyr")
	}

	# State the libraries up front
	library(parallel)
	library(dplyr)
	library(cvfR)

	# Do not output scientific notation
	options(scipen = 999)

	#' Why are you running gc at the start?
	# Initial garbage collection
	gc()

	# Set command line arguments
	# (why? The output is character(0)? or is this not supposed to be run interactively?)
	args <- commandArgs(TRUE)

	bam <- args[1]
	#' tools::file_ext("wat.bam") I think does the same thing?
	sample_bam <- gsub(".bam", "", bam)
	extr_seq <- as.numeric(args[2])
	outFile <- args[3]

	#' Jenny Bryan will set your computer on fire!
	genome <-
	"/dd_userdata/usrdat02/userdata/sfreytag/indexes/hg19.premrna/fasta/genome.fa"

	# Prepare files
	# First subset to only chromosome X
	#system(paste0("samtools view -h ", bam, " X > ", sample_bam, "_chrX.bam"))
	#' why are you using `system` on paste?

	# Remove additional tags on barcodes
	barcodes <-
	read.table(paste0(sample_bam , "_barcodes.tsv"),
	header = F,
	sep = "\t")

	# find and replace -1 with 0?
	barcodes$V1 <- gsub("-1", "", barcodes$V1)

	# is "aa" a good variable name?
	aa <- length(barcodes$V1)
	write.table(
	barcodes,
	file = paste0(sample_bam, "_barcodes_corr.tsv"),
	sep = "\t",
	quote = F,
	col.names = F,
	row.names = F
	)

	# Make vcf of SNPs
	#system(paste0("source activate samtools
	# samtools mpileup -f ", genome , " -g ", sample_bam,
	# "_chrX.bam \| bcftools call -m > ", sample_bam, "_chrX.vcf
	# source deactivate"))

	# Subset to heterozygous SNPs
	#system(paste0('grep "0/1" ', sample_bam, '_chrX.vcf > ', sample_bam, '_chrX.het.vcf'))
	# Reattach header
	#system(paste0('grep "#" ', sample_bam, '_chrX.vcf \| cat - ', sample_bam, '_chrX.het.vcf > ', sample_bam, '_chrX.het.corr.vcf'))

	## Function for one run

	one_run <- function(i, per_i) {
	library(vcfR) # put library calls outside of the function
	library(dplyr) # ^^

	message(paste0("Iteration: ", i))

	# Prevent all parallel runs from having the same starting seed
	Sys.sleep(i)

	# Subsample the bam file at percentage and then index
	system(
	paste0(
	"source activate samtools
	samtools view -s ",
	per_i + i ,
	" -b ",
	sample_bam,
	"_chrX.bam > sub_sampled_tmp_",
	i,
	".bam
	source deactivate"
	)
	)
	system(paste0("samtools index sub_sampled_tmp_", i, ".bam"))

	# Apply CellSNP to subsampled file
	system(
	paste0(
	"source activate CellSNP
	cellSNP -s sub_sampled_tmp_",
	i,
	".bam -b ",
	sample_bam,
	"_barcodes_corr.tsv",
	" -o sub_sampled_tmp_",
	i,
	".snp -R ",
	sample_bam,
	"_chrX.het.corr.vcf -p 20
	source deactivate"
	)
	)

	# Count the number of SNPs and cells with SNPs
	vcf <- read.vcfR(paste0("sub_sampled_tmp_", i, ".snp.gz"),
	verbose = FALSE)

	number_snps <- dim(vcf@fix)[1]

	vcf_tibble <- vcfR2tidy(vcf,
	format_fields = c("DP", "AD", "OTH", "ALL"))

	vcf_tibble_narm <- vcf_tibble$gt %>%
	mutate(present = gt_DP >= 0) %>%
	group_by(Indiv) %>%
	summarize(n_alleles = sum(present, na.rm = TRUE))

	number_cells_1_snps <- dim(vcf_tibble_narm %>%
	ungroup() %>%
	filter(n_alleles >= 1))[1]

	number_cells_2_snps <- dim(vcf_tibble_narm %>%
	ungroup() %>%
	filter(n_alleles >= 2))[1]

	# Remove temporary files
	system(paste0("rm _tmp_", i, ""))

	return(
	list(
	number_snps = number_snps,
	number_cells_1_snps = number_cells_1_snps,
	number_cells_2_snps = number_cells_2_snps
	)
	)

	} # end of function
	#' This function should be in a separate script that can be "source"'d.

	## All percentages to be investigated
	per <- seq(0.05, 0.95, 0.05)

	## Initialize objects to save
	#' this is a dangerous pattern - should try and specify the size of the list
	res <- list()

	for (i in 1:length(per)) {
	per_tmp <- per[i]

	cl <- makeCluster(5)
	clusterExport(cl, c("per_tmp", "sample_bam", "one_run"))
	res[[i]] <- parLapply(cl, 1:5,
	function(x){
	one_run(x, per_tmp)
	}
	) # make your function clearer

	stopCluster(cl)

	}

	saveRDS(res, file = paste0("Results_", outFile, ".rds"))

	# Make a dataframe from results averaged over 5 runs
	res1 <-
	data.frame(
	number_snps = sapply(res,
	function(x) mean(sapply(x,
	function(y) y[[1]]))),
	number_cells_1_snp = sapply(res,
	function(x) mean(sapply(x,
	function(y) y[[2]]))),
	number_cells_2_snps = sapply(res,
	function(x) mean(sapply(x,
	function(y) y[[3]]))),
	per = per
	)

	# Make a dataframe from results
	res2 <-
	data.frame(
	number_snps = c(sapply(res, function(x) sapply(x,
	function(y) y[[1]]))),
	number_cells_1_snp = c(sapply(res,
	function(x) sapply(x,
	function(y) y[[2]]))),
	number_cells_2_snps = c(sapply(res,
	function(x) sapply(x,
	function(y) y[[3]]))),
	per = rep(per, each = 5)
	)

	# Polynomial function
	f <- function(x, a, b, n) {
	(a * x ^ n) / (x ^ n + b)
	}

	# Fit polynomial function for number of cells with at least one SNP
	inital_value_n <-
	lm(log(number_cells_1_snp + 1) ~ log(per),
	res2)$coefficients[2]

	fit <-
	nls(
	number_cells_1_snp ~ f(per, a, b, n),
	data = res2,
	start = list(a = 4634,
	b = 1,
	n = inital_value_n)
	)

	# Fit polynomial function for number of cells with at least two SNPs
	inital_value_n_1 <-
	lm(log(number_cells_1_snp + 1) ~ log(per),
	res2)$coefficients[2]

	fit1 <-
	nls(
	number_cells_2_snps ~ f(per, a, b, n),
	data = res2,
	start = list(a = 4634,
	b = 1,
	n = inital_value_n_1)
	)

	# Fit polynomial function for number of SNPs
	inital_value_n_0 <-
	lm(log(number_cells_1_snp + 1) ~ log(per), res2)$coefficients[2]

	#' woh - why are are you using try here?
	try(fit0 <-
	nls(
	number_snps ~ f(per, a, b, n),
	data = res2,
	start = list(a = res2[dim(res2)[1], 2], b = 1, n = inital_value_n_0)
	))

	# Find range for extrapolation
	extr_end <- 1 + extr_seq + 0.5
	per_extr <- seq(0.05, extr_end, 0.01)

	# Make dataset for extrapolation
	res_extr <- data.frame(
	per = per_extr,
	number_cells_1_snp = sapply(per_extr,
	function(x){ f(x,
	coef(fit)[1],
	coef(fit)[2],
	coef(fit)[3])
	}
	),
	number_cells_2_snps = sapply(per_extr,
	function(x){
	f(x,
	coef(fit1)[1],
	coef(fit1)[2],
	coef(fit1)[3])
	}
	)
	)

	try(res_extr$number_snps <-
	sapply(per_extr,
	function(x){
	f(x,
	coef(fit0)[1],
	coef(fit0)[2],
	coef(fit0)[3])
	}
	)
	)

	# Plot results and output them
	png(paste0(outFile, "_cells_1_snp.png"))

	plot(res_extr$per,
	res_extr$number_cells_1_snp,
	xlab = "Percentage*100",
	ylab = "Number of cells with 1 SNP")
	points(res1$per,
	res1$number_cells_1_snp,
	col = "red",
	pch = 19)
	abline(v = extr_seq + 1)
	txt.x <- extr_end / 2
	txt.y <- max(res_extr$number_cells_1_snp) / 2
	text(txt.x,
	txt.y,
	labels = paste(
	"With",
	round(extr_seq, 2),
	" more reads: \n",
	round(f(
	1 + extr_seq, coef(fit)[1], coef(fit)[2], coef(fit)[3]
	), 0) ,
	"cells detected"
	))
	dev.off()

	png(paste0(outFile, "_cells_2_snps.png"))

	plot(res_extr$per,
	res_extr$number_cells_2_snps,
	xlab = "Percentage*100",
	ylab = "Number of cells with 2 SNPs")
	points(res1$per,
	res1$number_cells_2_snps,
	col = "red",
	pch = 19)
	abline(v = extr_seq + 1)
	txt.x <- extr_end / 2
	txt.y <- max(res_extr$number_cells_2_snps) / 2
	text(txt.x,
	txt.y,
	labels = paste(
	"With",
	round(extr_seq, 2),
	" more reads: \n",
	round(f(
	1 + extr_seq, coef(fit1)[1], coef(fit1)[2], coef(fit1)[3]
	), 0) ,
	"cells detected"
	))
	dev.off()

	try({
	png(paste0(outFile, "_snps.png"))
	plot(res_extr$per,
	res_extr$number_snps,
	xlab = "Percentage*100",
	ylab = "Number ofSNPs")
	points(res1$per, res1$number_snps, col = "red", pch = 19)
	abline(v = extr_seq + 1)
	txt.x <- extr_end / 2
	txt.y <- 100
	text(txt.x,
	txt.y,
	labels = paste(
	"With",
	round(extr_seq, 2),
	" more reads: \n",
	round(f(
	1 + extr_seq, coef(fit0)[1], coef(fit0)[2], coef(fit0)[3]
	), 0) ,
	"SNPs detected"
	))
	dev.off()
	}
	)
	library(parallel)

	# Do not output scientific notation
	options(scipen=999)

	# Initial garbage collection
	gc()

	# Set command line arguments
	args <- commandArgs(TRUE)

	bam <- args[1]
	sample_bam <- gsub(".bam", "", bam)
	extr_seq <- as.numeric(args[2])
	outFile <- args[3]
	genome = "/dd_userdata/usrdat02/userdata/sfreytag/indexes/hg19.premrna/fasta/genome.fa"

	# Prepare files
	# First subset to only chromosome X
	#system(paste0("samtools view -h ", bam, " X > ", sample_bam, "_chrX.bam"))

	# Remove additional tags on barcodes
	barcodes <- read.table(paste0(sample_bam ,"_barcodes.tsv"), header=F, sep="\t")
	barcodes$V1 <- gsub("-1", "", barcodes$V1)
	aa <- length(barcodes$V1)
	write.table(barcodes, file=paste0(sample_bam, "_barcodes_corr.tsv"), sep="\t",
	quote=F, col.names=F, row.names=F)

	# Make vcf of SNPs
	#system(paste0("source activate samtools
	# samtools mpileup -f ", genome , " -g ", sample_bam,
	# "_chrX.bam \| bcftools call -m > ", sample_bam, "_chrX.vcf
	# source deactivate"))

	# Subset to heterozygous SNPs
	#system(paste0('grep "0/1" ', sample_bam, '_chrX.vcf > ', sample_bam, '_chrX.het.vcf'))
	# Reattach header
	#system(paste0('grep "#" ', sample_bam, '_chrX.vcf \| cat - ', sample_bam, '_chrX.het.vcf > ', sample_bam, '_chrX.het.corr.vcf'))

	## Function for one run

	one_run <- function(i, per_i) {

	library(vcfR)
	library(dplyr)

	print(paste0("Iteration: ", i))

	# Prevent all parallel runs from having the same starting seed
	Sys.sleep(i)

	# Subsample the bam file at percentage and then index
	system(paste0("source activate samtools
	samtools view -s ", per_i+i , " -b ", sample_bam, "_chrX.bam > sub_sampled_tmp_", i, ".bam
	source deactivate"))
	system(paste0("samtools index sub_sampled_tmp_", i, ".bam"))

	# Apply CellSNP to subsampled file
	system(paste0("source activate CellSNP
	cellSNP -s sub_sampled_tmp_", i, ".bam -b ",
	sample_bam, "_barcodes_corr.tsv",
	" -o sub_sampled_tmp_", i, ".snp -R ", sample_bam, "_chrX.het.corr.vcf -p 20
	source deactivate"))

	# Count the number of SNPs and cells with SNPs
	vcf <- read.vcfR(paste0("sub_sampled_tmp_", i, ".snp.gz"), verbose = FALSE)
	number_snps <- dim(vcf@fix) [1]
	vcf_tibble <- vcfR2tidy(vcf, format_fields = c("DP", "AD", "OTH", "ALL"))
	vcf_tibble_narm <- vcf_tibble$gt %>%
	mutate(present=gt_DP>=0) %>% group_by(Indiv) %>%
	summarize(n_alleles=sum(present, na.rm=TRUE))
	number_cells_1_snps <- dim(vcf_tibble_narm %>%
	ungroup() %>%
	filter(n_alleles>=1))[1]
	number_cells_2_snps <- dim(vcf_tibble_narm %>%
	ungroup() %>%
	filter(n_alleles>=2))[1]

	# Remove temporary files
	system(paste0("rm _tmp_", i, ""))

	return(list(number_snps=number_snps, number_cells_1_snps=number_cells_1_snps,
	number_cells_2_snps=number_cells_2_snps))

	}

	## All percentages to be investigated
	per <- seq(0.05,0.95,0.05)

	## Initialize objects to save
	res <- list()

	for (i in 1:length(per)){

	per_tmp <- per[i]

	cl <- makeCluster(5)
	clusterExport(cl, c("per_tmp", "sample_bam", "one_run"))
	res[[i]] <- parLapply(cl, 1:5, function(x) one_run(x, per_tmp))
	stopCluster(cl)

	}

	saveRDS(res, file=paste0("Results_", outFile, ".rds"))

	# Make a dataframe from results averaged over 5 runs
	res1 <- data.frame(number_snps = sapply(res, function(x) mean(sapply(x, function(y) y[[1]]))),
	number_cells_1_snp = sapply(res, function(x) mean(sapply(x, function(y) y[[2]]))),
	number_cells_2_snps = sapply(res, function(x) mean(sapply(x, function(y) y[[3]]))),
	per = per)

	# Make a dataframe from results
	res2 <- data.frame(number_snps = c(sapply(res, function(x) sapply(x, function(y) y[[1]]))),
	number_cells_1_snp = c(sapply(res, function(x) sapply(x, function(y) y[[2]]))),
	number_cells_2_snps = c(sapply(res, function(x) sapply(x, function(y) y[[3]]))),
	per = rep(per, each=5))

	# Polynomial function
	f <- function(x, a, b, n) {
	(a*x^n)/ (x^n+b)
	}

	# Fit polynomial function for number of cells with at least one SNP
	inital_value_n <- lm(log(number_cells_1_snp+1)~log(per), res2)$coefficients[2]
	fit <- nls(number_cells_1_snp~f(per,a,b,n), data=res2, start=list(a=4634, b=1, n=inital_value_n))

	# Fit polynomial function for number of cells with at least two SNPs
	inital_value_n_1 <- lm(log(number_cells_1_snp+1)~log(per), res2)$coefficients[2]
	fit1 <- nls(number_cells_2_snps~f(per,a,b,n), data=res2, start=list(a=4634, b=1, n=inital_value_n_1))

	# Fit polynomial function for number of SNPs
	inital_value_n_0 <- lm(log(number_cells_1_snp+1)~log(per), res2)$coefficients[2]
	try(fit0 <- nls(number_snps~f(per,a,b,n), data=res2, start=list(a=res2[dim(res2)[1],2], b=1, n=inital_value_n_0)))

	# Find range for extrapolation
	extr_end <- 1 + extr_seq + 0.5
	per_extr <- seq(0.05, extr_end, 0.01)

	# Make dataset for extrapolation
	res_extr <- data.frame(per=per_extr,
	number_cells_1_snp=sapply(per_extr, function(x) f(x, coef(fit)[1], coef(fit)[2], coef(fit)[3])),
	number_cells_2_snps=sapply(per_extr, function(x) f(x, coef(fit1)[1], coef(fit1)[2], coef(fit1)[3])))

	try(res_extr$number_snps <- sapply(per_extr, function(x) f(x, coef(fit0)[1], coef(fit0)[2], coef(fit0)[3])))

	# Plot results and output them
	png(paste0(outFile, "_cells_1_snp.png"))
	plot(res_extr$per, res_extr$number_cells_1_snp, xlab="Percentage*100", ylab="Number of cells with 1 SNP")
	points(res1$per, res1$number_cells_1_snp, col="red", pch=19)
	abline(v=extr_seq+1)
	txt.x <- extr_end/2
	txt.y <- max(res_extr$number_cells_1_snp)/2
	text(txt.x, txt.y,
	labels=paste("With", round(extr_seq,2), " more reads: \n",
	round(f(1+extr_seq, coef(fit)[1], coef(fit)[2], coef(fit)[3]),0) ,"cells detected"))
	dev.off()

	png(paste0(outFile, "_cells_2_snps.png"))
	plot(res_extr$per, res_extr$number_cells_2_snps, xlab="Percentage*100", ylab="Number of cells with 2 SNPs")
	points(res1$per, res1$number_cells_2_snps, col="red", pch=19)
	abline(v=extr_seq+1)
	txt.x <- extr_end/2
	txt.y <- max(res_extr$number_cells_2_snps)/2
	text(txt.x, txt.y,
	labels=paste("With", round(extr_seq,2), " more reads: \n",
	round(f(1+extr_seq, coef(fit1)[1], coef(fit1)[2], coef(fit1)[3]),0) ,"cells detected"))
	dev.off()

	try({png(paste0(outFile, "_snps.png"))
	plot(res_extr$per, res_extr$number_snps, xlab="Percentage*100", ylab="Number ofSNPs")
	points(res1$per, res1$number_snps, col="red", pch=19)
	abline(v=extr_seq+1)
	txt.x <- extr_end/2
	txt.y <- 100
	text(txt.x, txt.y,
	labels=paste("With", round(extr_seq,2), " more reads: \n",
	round(f(1+extr_seq, coef(fit0)[1], coef(fit0)[2], coef(fit0)[3]),0) ,"SNPs detected"))
	dev.off()})