Last active
October 4, 2021 21:00
-
-
Save DrYak/056e5938176dbc8ba28db1acc63a03da to your computer and use it in GitHub Desktop.
Remove human reads from raw reads
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#BSUB -L /bin/bash
#BSUB -J COVID-dehumanize
#BSUB -M 6144
#BSUB -n 16
#BSUB -R rusage[mem=6144]
#BSUB -R span[hosts=1]
#BSUB -W 1420
# 4096 ; 235

# directories
basedir=/cluster/project/pangolin/working
#scriptdir="$(dirname $(which $0))"
#scriptdir="$(pwd)"
scriptdir=/cluster/project/pangolin/work-meta

# get this job's target sample
# ${listfile} is a TSV: column 1 = sample id (ETHZID), column 2 = batch.
listfile=ww-article-data.tsv
if (( LSB_JOBINDEX )); then
	# running as a bsub job array: pick line LSB_JOBINDEX of the list
	echo "JobIdx ${LSB_JOBINDEX}"
	#ETHZID=$(sed -n "${LSB_JOBINDEX}p;${LSB_JOBINDEX}q" basel_samples.txt)
	# print line N then quit so sed does not scan the rest of the file
	tsvline=( $(sed -n "${LSB_JOBINDEX}p;${LSB_JOBINDEX}q" "${listfile}") )
	ETHZID="${tsvline[0]}"
	batch="${tsvline[1]}"
	# check if samplename is ok
	if [[ -z "${ETHZID}" ]]; then
		echo "wrong job index ${LSB_JOBINDEX}"
		echo "Run with:"
		echo ' - bsub -J "COVID-dehumanize[1-$(wc -l<'${listfile}')]" < dehuman.bsub'
		exit 1
	fi
else
	# no array index: smoke-test mode with the first sample of the list
	#ETHZID=240014_785_C07 # 100790_109_A5 #
	tsvline=( $(head -n 1 "${listfile}") )
	ETHZID="${tsvline[0]}"
	batch="${tsvline[1]}"
	echo "Testing with ${ETHZID}"
	echo "to run with all, use:"
	echo ' - bsub -J "COVID-dehumanize[1-$(wc -l<'${listfile}')]" < dehuman.bsub'
fi

# human reference (hg38) BWA index used later to screen for human reads
hg=/cluster/project/igenomes/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa

# paths
fileid="$ETHZID${batch:+_${batch}}"
workdir="${TMPDIR:-${SCRATCH:-/tmp}}/${fileid}"
outdir="${scriptdir}/sra/"
mkdir -m 0770 -p "${outdir}"

# get sample batch
#sampledir=$(gawk -vETHZID=${ETHZID} '$1==ETHZID{printf("%s/%s\n",$1,$2)}' /cluster/project/pangolin/working/samples.tsv)
# BUGFIX: the old guard tested ${sampledir} AFTER building it as "$ETHZID/${batch}",
# so it always contained a '/' and could never be empty; validate the fields instead.
if [[ -z "${ETHZID}" || -z "${batch}" ]]; then
	echo "cannot find $ETHZID"
	exit 2
fi
sampledir="$ETHZID/${batch}"

# environment
mkdir -p "${workdir}"
# abort rather than run the whole pipeline from the wrong directory
cd "${workdir}" || exit 3
echo "working in ${workdir}"
echo "output to ${outdir}"
. /cluster/project/pangolin/miniconda3/bin/activate 'dehuman'
# Print the given message as a bold blue banner: a rule of '=' above and
# below the text, colours reset afterwards, followed by a blank line.
title() {
	local rule='=============================================================='
	printf '\e[34;1m%s\n%s\n%s\e[0m\n\n' "${rule}" "$1" "${rule}"
}
# step 1: align against SARS-CoV-2
# (preprocessed paired-end reads vs. the NC_045512.2 reference)
title 'Filtering out SARS-CoV-2 reads (NC_045512.2)'
bwa mem -t 16 -o ref_aln.sam ${basedir}/references/NC_045512.2.fasta ${basedir}/samples/${sampledir}/preprocessed_data/R{1,2}.fastq.gz
# step 2: keep reject
# -F 2 excludes "properly paired" alignments, i.e. keeps only the pairs
# that did NOT map cleanly onto SARS-CoV-2 -- those are screened for
# human contamination below.
samtools bam2fq -@ 16 -F 2 -1 reject_R1.fastq.gz -2 reject_R2.fastq.gz ref_aln.sam
# step 3: align against Homo Sapiens
title 'Checking rejects against Homo Sapiens (hg38)'
bwa mem -t 16 -o h38_aln.sam $hg reject_R{1,2}.fastq.gz
# count aligned reads
# (-f 2: properly paired on hg38 = likely human; tee keeps a copy in <fileid>.cnt)
count=$(samtools view -@ 16 -c -f 2 h38_aln.sam|tee ${outdir}/${fileid}.cnt)
# get the files
if (( count > 0 )); then
echo "Needs special care: ${count} potential human reads found"
title 'Removing identified human reads from raw reads'
# get list
# read names of every properly-paired hg38 hit -> filter list for the perl helper
samtools view -@ 16 -f 2 h38_aln.sam | cut -f 1 > ${outdir}/${fileid}.filter
# prepare filtered
# strip the listed reads from the RAW fastq files, one background job per mate
for i in 1 2; do
# *_${ETHZID}_*
zcat ${basedir}/samples/${sampledir}/raw_data/*_R${i}.fastq.gz | ${scriptdir}/remove_reads_list.pl ${outdir}/${fileid}.filter | gzip > ./filtered_R${i}.fastq.gz &
done
wait
# keep the rejects for further analysis
title 'Keeping Human-aligned SARS-CoV-2 rejects'
# (we compress reference-less, because the reference size is larger than the contaminant reads)
samtools sort -@ 16 -M --output-fmt cram,no_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 -o h38_aln.cram h38_aln.sam
cp -v h38_aln.cram ${outdir}/${fileid}-hg.cram
rm h38_aln.sam
samtools index -@ 16 h38_aln.cram
samtools stats -@ 16 h38_aln.cram > ${outdir}/${fileid}.stats
title 'Compressing human-depleted raw reads'
else
echo "No potential human reads found"
title 'Compressing raw reads file'
# nothing to filter: concatenate the raw reads unchanged (same filenames
# as the filtered branch so the compression steps below work either way)
for i in 1 2; do
#ln -vs ${basedir}/samples/${sampledir}/raw_data/*_R${i}.fastq.gz ./filtered_R${i}.fastq.gz
cat ${basedir}/samples/${sampledir}/raw_data/*_R${i}.fastq.gz > ./filtered_R${i}.fastq.gz &
done
wait
fi
# Compress 1: against reference dictionary
# -C appends the FASTQ comment (e.g. Illumina "1:N:0:INDEX+INDEX") to the SAM records
bwa mem -t 16 -C -o cram.sam ${basedir}/references/NC_045512.2.fasta filtered_R{1,2}.fastq.gz
# Compress 2: into CRAM3
# drop any stale output (cram + index) from a previous run
[[ -e ${outdir}/${fileid}.cram ]] && rm ${outdir}/${fileid}.cram* || true
# rewrite the trailing Illumina comment field into a proper BC:Z: barcode tag
# before sorting into an embedded-reference CRAM.
# NOTE(review): the regex requires a dual index ("+" part is not optional),
# so single-index comments would pass through untagged -- confirm intended.
perl -p -e 's{(?<=\t)([[:digit:]]:[[:upper:]]:[[:digit:]]:[ATCGN]+(\+[ATCGN]+))$}{BC:Z:\1}' cram.sam | samtools sort -@ 16 -M --reference ${basedir}/references/NC_045512.2.fasta --output-fmt cram,embed_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 -o ${outdir}/${fileid}.cram && title 'Success !!!'
exit 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl
# remove_reads_list.pl -- FASTQ filter.
#
# Usage: remove_reads_list.pl <filterlist> [fastq ...]   (or FASTQ on STDIN)
#
# <filterlist> contains one sequence ID per line (as produced by
# `samtools view | cut -f 1`, i.e. without the leading '@'); every 4-line
# FASTQ record whose header matches one of those IDs is dropped, all other
# records are passed through unchanged.
use strict;
use warnings;

my %h; # hash with 'headers' to filter-out as keys ('@' . seq ID)
{
    my $listfile = shift @ARGV;
    open my $fh, '<', $listfile
        or die "cannot open ${listfile}: $!";    # include errno for diagnosis
    # build a hash out of all (unique) seq IDs, keyed exactly as they
    # appear in FASTQ headers (streaming, no intermediate list)
    while (my $id = <$fh>) {
        chomp $id;
        $h{ '@' . $id } = 1;
    }
    close $fh;
}
print STDERR "filtering out: ", scalar keys %h, " seq id\n";

# FASTQ records are exactly 4 lines: header, sequence, separator, quality.
while (my $header = <>) {
    # ...fetch the remaining 3 lines of the read: bp sequence, separator,
    # phred quality scores; stop cleanly on a truncated trailing record
    my @rest = map { scalar <> } 1 .. 3;
    last if !defined $rest[2];
    # consider only the header's first whitespace-separated field
    # ('@' + seq ID); ignore the second half ("comments", e.g.: Illumina indexes)
    print $header, @rest
        unless exists $h{ (split ' ', $header)[0] };
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment