Last active
March 18, 2022 07:46
-
-
Save danielecook/6958bd4687c045a14e49 to your computer and use it in GitHub Desktop.
parallel bcftools
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ~/.bashrc: executed by bash(1) for non-login shells. | |
# see /usr/share/doc/bash/examples/startup-files (in the package bash-doc) | |
# for examples | |
function bam_chromosomes() { | |
# Fetch chromosomes from a bam file | |
samtools view -H $1 | \ | |
grep -Po 'SN:(.*)\t' | \ | |
cut -c 4-1000 | |
} | |
function vcf_chromosomes() { | |
# Fetch contigs from a vcf file. | |
bcftools view -h $1 | \ | |
grep 'contig' | \ | |
egrep -o "ID=([^,]+)" | \ | |
sed 's/ID=//g' | \ | |
tr '\n' ' ' | |
} | |
PARALLEL_CORES=6 | |
function parallel_bcftools_merge() { | |
file_set=`echo $@ | egrep -o '(\-l|\-\-file-list)(=|[ ]+)[^ ]+' | tr '=' ' ' | cut -f 2 -d ' '` | |
if [ -n "${file_set}" ] | |
then | |
find_vcf=`cat ${file_set} | head -n 1` | |
else | |
find_vcf=`echo $@ | tr '\t' '\n' | egrep -o '[^ ]+.vcf.gz' | awk 'NR == 1 { print }' - ` | |
fi | |
contigs=`vcf_chromosomes ${find_vcf}` | |
current_dir=$(dirname ${find_vcf}) | |
hash_merge=`echo "$@" | md5sum | cut -c 1-5` | |
output_prefix="${current_dir}/parallel_merge.${hash_merge}" | |
parallel --gnu --workdir ${current_dir} \ | |
--env args -j ${PARALLEL_CORES} \ | |
'bcftools merge -r {1} -O u ' $@ ' > ' ${output_prefix}'.{1}.bcf' ::: ${contigs} | |
order=`echo $contigs | tr ' ' '\n' | awk -v "prefix=${output_prefix}" '{ print prefix "." $0 ".bcf" }'` | |
bcftools concat -O v ${order} | grep -v 'parallel_merge' | sed 's/##bcftools_mergeCommand=merge -r I -O u /##bcftools_mergeCommand=merge /g' | bcftools view -O u | |
rm ${order} | |
} | |
PARALLEL_CORES=6 | |
function parallel_bcftools_gtcheck() { | |
find_vcf=`echo $@ | tr '\t' '\n' | egrep -o '[^ ]+.vcf.gz' | awk 'NR == 1 { print }' - ` | |
contigs=`vcf_chromosomes ${find_vcf}` | |
current_dir=$(dirname ${find_vcf}) | |
hash_merge=`echo "$@" | md5sum | cut -c 1-5` | |
output_prefix="${current_dir}/parallel_concordance.${hash_merge}" | |
gtcheck_options=`echo $@ | awk -v vcf=${find_vcf} '{ gsub(vcf,"",$0); print $0; }'` | |
parallel --gnu -j ${PARALLEL_CORES} --workdir ${current_dir} 'bcftools view ' ${find_vcf} ' {} | \ | |
bcftools gtcheck ' ${gtcheck_options} ' - | \ | |
awk -v chrom={} "/^CN/ { print \$0 \"\t\" chrom } \$0 !~ /.*CN.*/ { print } \$0 ~ /^# \[1\]CN/ { print \$0 \"\tchrom\"}" - > ' ${output_prefix}'.{}.tsv' ::: ${contigs} | |
order=`echo $contigs | tr ' ' '\n' | awk -v "prefix=${output_prefix}" '{ print prefix "." $0 ".tsv" }'` | |
cat ${order} | \ | |
grep 'CN' | \ | |
awk 'NR == 1 && /Discordance/ { print } NR > 1 && $0 !~ /Discordance/ { print }' | \ | |
awk '{ gsub("(# |\\[[0-9]+\\])","", $0); gsub(" ","_", $0); print }' | \ | |
cut -f 2-7 | datamash --header-in --sort --group=Sample_i,Sample_j sum Discordance sum Number_of_sites | \ | |
cat <(echo -e "sample_i\tsample_j\tdiscordance\tnumber_of_sites") - | \ | |
awk 'NR == 1 { print $0 "\tconcordance" } NR > 1 && $4 == 0 { print $0 "\t" } NR > 1 && $4 > 0 { print $0 "\t" ($4-$3)/$4 }' | |
rm ${order} | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment