Skip to content

Instantly share code, notes, and snippets.

@danielecook
Last active March 18, 2022 07:46
Show Gist options
  • Save danielecook/6958bd4687c045a14e49 to your computer and use it in GitHub Desktop.
Save danielecook/6958bd4687c045a14e49 to your computer and use it in GitHub Desktop.
parallel bcftools
# ~/.bashrc: executed by bash(1) for non-login shells.
# see /usr/share/doc/bash/examples/startup-files (in the package bash-doc)
# for examples
function bam_chromosomes() {
# Fetch chromosomes from a bam file
samtools view -H $1 | \
grep -Po 'SN:(.*)\t' | \
cut -c 4-1000
}
function vcf_chromosomes() {
# Fetch contigs from a vcf file.
bcftools view -h $1 | \
grep 'contig' | \
egrep -o "ID=([^,]+)" | \
sed 's/ID=//g' | \
tr '\n' ' '
}
PARALLEL_CORES=6
function parallel_bcftools_merge() {
file_set=`echo $@ | egrep -o '(\-l|\-\-file-list)(=|[ ]+)[^ ]+' | tr '=' ' ' | cut -f 2 -d ' '`
if [ -n "${file_set}" ]
then
find_vcf=`cat ${file_set} | head -n 1`
else
find_vcf=`echo $@ | tr '\t' '\n' | egrep -o '[^ ]+.vcf.gz' | awk 'NR == 1 { print }' - `
fi
contigs=`vcf_chromosomes ${find_vcf}`
current_dir=$(dirname ${find_vcf})
hash_merge=`echo "$@" | md5sum | cut -c 1-5`
output_prefix="${current_dir}/parallel_merge.${hash_merge}"
parallel --gnu --workdir ${current_dir} \
--env args -j ${PARALLEL_CORES} \
'bcftools merge -r {1} -O u ' $@ ' > ' ${output_prefix}'.{1}.bcf' ::: ${contigs}
order=`echo $contigs | tr ' ' '\n' | awk -v "prefix=${output_prefix}" '{ print prefix "." $0 ".bcf" }'`
bcftools concat -O v ${order} | grep -v 'parallel_merge' | sed 's/##bcftools_mergeCommand=merge -r I -O u /##bcftools_mergeCommand=merge /g' | bcftools view -O u
rm ${order}
}
PARALLEL_CORES=6
function parallel_bcftools_gtcheck() {
find_vcf=`echo $@ | tr '\t' '\n' | egrep -o '[^ ]+.vcf.gz' | awk 'NR == 1 { print }' - `
contigs=`vcf_chromosomes ${find_vcf}`
current_dir=$(dirname ${find_vcf})
hash_merge=`echo "$@" | md5sum | cut -c 1-5`
output_prefix="${current_dir}/parallel_concordance.${hash_merge}"
gtcheck_options=`echo $@ | awk -v vcf=${find_vcf} '{ gsub(vcf,"",$0); print $0; }'`
parallel --gnu -j ${PARALLEL_CORES} --workdir ${current_dir} 'bcftools view ' ${find_vcf} ' {} | \
bcftools gtcheck ' ${gtcheck_options} ' - | \
awk -v chrom={} "/^CN/ { print \$0 \"\t\" chrom } \$0 !~ /.*CN.*/ { print } \$0 ~ /^# \[1\]CN/ { print \$0 \"\tchrom\"}" - > ' ${output_prefix}'.{}.tsv' ::: ${contigs}
order=`echo $contigs | tr ' ' '\n' | awk -v "prefix=${output_prefix}" '{ print prefix "." $0 ".tsv" }'`
cat ${order} | \
grep 'CN' | \
awk 'NR == 1 && /Discordance/ { print } NR > 1 && $0 !~ /Discordance/ { print }' | \
awk '{ gsub("(# |\\[[0-9]+\\])","", $0); gsub(" ","_", $0); print }' | \
cut -f 2-7 | datamash --header-in --sort --group=Sample_i,Sample_j sum Discordance sum Number_of_sites | \
cat <(echo -e "sample_i\tsample_j\tdiscordance\tnumber_of_sites") - | \
awk 'NR == 1 { print $0 "\tconcordance" } NR > 1 && $4 == 0 { print $0 "\t" } NR > 1 && $4 > 0 { print $0 "\t" ($4-$3)/$4 }'
rm ${order}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment