Created
June 10, 2016 18:23
-
-
Save jakeenk/14faf11bf8c9ab2505a66e499d22cca4 to your computer and use it in GitHub Desktop.
SAM file read collapser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# ******REQUIRES SAMTOOLS****** | |
#This is a sam assembly collapser using coordinates and strand as the "unique insert" identifiers, keeping the read with the highest mapping quality | |
#Written/assembled by Jake Enk and Alison Devault (and named by Nathalie Mouttham) June 3, 2013 | |
# Usage: $ aweSAM_collapser [input samfile] [output collapsed samfile] | |
# PROCESS: | |
#1. Start with sam file as first variable, output file as second variable | |
inputsam=$1 | |
collsam=$2 | |
#2. Reduce the samfile to mapped reads only | |
awk '$3 != "*"' $inputsam | | |
#3. Convert mapped sam to bed using sam2bed from BEDOPS script package (by Eric Haugen and Alex Reynolds, version 2.2 (http://code.google.com/p/bedops/)) | |
samtools view -S -\ | |
| gawk -F "\t" '{ \ | |
if ( ! and(4, $2) ) { \ | |
sval = $6; \ | |
gsub("[0-9]+[ISHP]", "", sval ); \ | |
gsub("[MDN=X]", "|", sval); \ | |
num = split(sval, a, "|"); \ | |
lng = 0; \ | |
for ( i = 1; i <= num; ++i ) { \ | |
lng += int(a[i]); \ | |
} \ | |
printf "%s\t%s\t%d\t%s\t%s", $3, int($4) - 1, int($4) - 1 + lng, $1, $2; \ | |
if ( and(16, $2) ) { printf "\t-"; } else { printf "\t+"; } \ | |
for ( i = 5; i <= NF; ++i ) { \ | |
printf "\t%s", $(i); \ | |
} \ | |
printf "\n"; \ | |
} \ | |
}' | | |
#4. Sort so that the -u unique collapse function will keep the read with the highest mapping quality | |
# NOTE: -k2,2n sorts by 5' position, -k3,3n by 3' position, and -k6,6 by strand (+ or -); include or exclude these as desired | |
sort -k 7,7nr - | sort -k1,1 -k2,2n -k3,3n -k6,6 -u | | |
#5. Cut out names of kept sequences; find and extract these from the original sam file, writing to a temporary file | |
cut -f 4 - | fgrep -f - $inputsam > temporary.file; | |
#6. Append original sam header to make a functioning collapsed sam | |
grep "^@" $inputsam | cat - temporary.file > $collsam | |
#7. Cleanup | |
rm temporary.file; | |
#8. Calculate collapse rate (updated 2016-June-10, Jake Enk) | |
original=$(awk '$3 != "*" {print $1}' $inputsam | sort | uniq | wc -l); collapsed=$(awk '$3 != "*" {print $1}' $collsam | sort | uniq | wc -l); duplicates=$(echo $original-$collapsed | bc); dupprop=$(echo "scale=4; $duplicates/$original" | bc); duprate=$(echo "scale=2; $dupprop*100/1" | bc) | |
#9. Report | |
echo $1" collapsed to "$2"; "$duprate"% were duplicates" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment