@DrYak
Last active October 4, 2021 21:00
Remove human reads from raw reads
#!/bin/bash
#BSUB -L /bin/bash
#BSUB -J COVID-dehumanize
#BSUB -M 6144
#BSUB -n 16
#BSUB -R rusage[mem=6144]
#BSUB -R span[hosts=1]
#BSUB -W 1420
# 4096 ; 235
# directories
basedir=/cluster/project/pangolin/working
#scriptdir="$(dirname $(which $0))"
#scriptdir="$(pwd)"
scriptdir=/cluster/project/pangolin/work-meta
# get this job's target sample
listfile=ww-article-data.tsv
if (( $LSB_JOBINDEX )); then
    echo "JobIdx ${LSB_JOBINDEX}"
    #ETHZID=$(sed -n "${LSB_JOBINDEX}p;${LSB_JOBINDEX}q" basel_samples.txt)
    tsvline=( $(sed -n "${LSB_JOBINDEX}p;${LSB_JOBINDEX}q" "${listfile}") )
    ETHZID="${tsvline[0]}"
    batch="${tsvline[1]}"
    # check that the sample name is valid
    if [[ -z "${ETHZID}" ]]; then
        echo "wrong job index ${LSB_JOBINDEX}"
        echo "Run with:"
        echo ' - bsub -J "COVID-dehumanize[1-$(wc -l<'"${listfile}"')]" < dehuman.bsub'
        exit 1
    fi
else
    #ETHZID=240014_785_C07 # 100790_109_A5 #
    tsvline=( $(head -n 1 "${listfile}") )
    ETHZID="${tsvline[0]}"
    batch="${tsvline[1]}"
    echo "Testing with ${ETHZID}"
    echo "To run with all samples, use:"
    echo ' - bsub -J "COVID-dehumanize[1-$(wc -l<'"${listfile}"')]" < dehuman.bsub'
fi
hg=/cluster/project/igenomes/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa
# paths
fileid="$ETHZID${batch:+_${batch}}"
workdir="${TMPDIR:-${SCRATCH:-/tmp}}/${fileid}"
outdir="${scriptdir}/sra/"
mkdir -m 0770 -p "${outdir}"
# get sample batch directory (lookup from samples.tsv kept for reference)
#sampledir=$(gawk -vETHZID=${ETHZID} '$1==ETHZID{printf("%s/%s\n",$1,$2)}' /cluster/project/pangolin/working/samples.tsv)
sampledir="${ETHZID}/${batch}"
if [[ -z "${batch}" ]]; then
    echo "cannot find batch for ${ETHZID}"
    exit 2
fi
# environment
mkdir -p "${workdir}"
cd "${workdir}" || exit 3
echo "working in ${workdir}"
echo "output to ${outdir}"
. /cluster/project/pangolin/miniconda3/bin/activate 'dehuman'
title() {
    printf '\e[34;1m==============================================================\n%s\n==============================================================\e[0m\n\n' "$1"
}
# step 1: align against SARS-CoV-2
title 'Filtering out SARS-CoV-2 reads (NC_045512.2)'
bwa mem -t 16 -o ref_aln.sam ${basedir}/references/NC_045512.2.fasta ${basedir}/samples/${sampledir}/preprocessed_data/R{1,2}.fastq.gz
# step 2: keep the rejects (reads NOT aligned in a proper pair, i.e. flag 0x2 unset)
samtools bam2fq -@ 16 -F 2 -1 reject_R1.fastq.gz -2 reject_R2.fastq.gz ref_aln.sam
# step 3: align against Homo Sapiens
title 'Checking rejects against Homo Sapiens (hg38)'
bwa mem -t 16 -o h38_aln.sam $hg reject_R{1,2}.fastq.gz
# count reads aligned in a proper pair against hg38
count=$(samtools view -@ 16 -c -f 2 h38_aln.sam | tee "${outdir}/${fileid}.cnt")
# get the files
if (( count > 0 )); then
    echo "Needs special care: ${count} potential human reads found"
    title 'Removing identified human reads from raw reads'
    # get the list of read names to filter out
    samtools view -@ 16 -f 2 h38_aln.sam | cut -f 1 > "${outdir}/${fileid}.filter"
    # prepare the filtered raw reads
    for i in 1 2; do
        # *_${ETHZID}_*
        zcat ${basedir}/samples/${sampledir}/raw_data/*_R${i}.fastq.gz | "${scriptdir}/remove_reads_list.pl" "${outdir}/${fileid}.filter" | gzip > ./filtered_R${i}.fastq.gz &
    done
    wait
    # keep the rejects for further analysis
    title 'Keeping Human-aligned SARS-CoV-2 rejects'
    # (compress without reference: the reference is larger than the contaminant reads)
    samtools sort -@ 16 -M --output-fmt cram,no_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 -o h38_aln.cram h38_aln.sam
    cp -v h38_aln.cram "${outdir}/${fileid}-hg.cram"
    rm h38_aln.sam
    samtools index -@ 16 h38_aln.cram
    samtools stats -@ 16 h38_aln.cram > "${outdir}/${fileid}.stats"
    title 'Compressing human-depleted raw reads'
else
    echo "No potential human reads found"
    title 'Compressing raw reads file'
    for i in 1 2; do
        #ln -vs ${basedir}/samples/${sampledir}/raw_data/*_R${i}.fastq.gz ./filtered_R${i}.fastq.gz
        cat ${basedir}/samples/${sampledir}/raw_data/*_R${i}.fastq.gz > ./filtered_R${i}.fastq.gz &
    done
    wait
fi
# Compress 1: align against the reference dictionary
bwa mem -t 16 -C -o cram.sam ${basedir}/references/NC_045512.2.fasta filtered_R{1,2}.fastq.gz
# Compress 2: into CRAM 3 (first turn the Illumina index comment into a proper BC:Z: SAM tag)
[[ -e "${outdir}/${fileid}.cram" ]] && rm "${outdir}/${fileid}.cram"* || true
perl -p -e 's{(?<=\t)([[:digit:]]:[[:upper:]]:[[:digit:]]:[ATCGN]+(\+[ATCGN]+))$}{BC:Z:\1}' cram.sam \
    | samtools sort -@ 16 -M --reference ${basedir}/references/NC_045512.2.fasta \
        --output-fmt cram,embed_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 \
        -o "${outdir}/${fileid}.cram" \
    && title 'Success !!!'
exit 0
#!/usr/bin/env perl
use strict;
use warnings;

my %h;    # hash with 'headers' to filter out as keys
{
    my $listfile = shift @ARGV;
    open my $fh, '<', $listfile
        or die "cannot open ${listfile}";
    # build a hash out of all (unique) seq IDs
    %h = map { chomp; ('@' . $_) => 1 } <$fh>;
    close $fh;
}
print STDERR "filtering out: ", scalar keys %h, " seq IDs\n";
while (<>) {    # loop over headers...
    my @s = map { scalar <> } 1 .. 3;    # ...fetch the remaining 3 lines of the read: bp sequence, separator, phred quality scores
    print $_, @s
        unless exists $h{ (split)[0] };    # match on the header's first part ('@' + seq ID); ignore the second half ("comments", e.g. Illumina indexes)
}