Created
October 16, 2012 22:37
-
-
Save nathanhaigh/3902512 to your computer and use it in GitHub Desktop.
Add read groups to SAM file using AMOS-style mates file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library Illimina_pe 200 500 ^(AGRF).*$ | |
# Note sure if the following single-ended "library" is valid for AMOS: | |
library 454_se 0 0 ^(GKUZBF401|GKUZBF402|GK7W4KZ01|GK7W4KZ02|GK538RQ01|GK538RQ02|GK0HZXL01|GK0HZXL02|F196STF01|F196STF02|F4S7VAM01|F4S7VAM02|F5ZSAHT01|F5ZSAHT02|F6J8SXW01|F6J8SXW02).*$ | |
library 454_3kb 1000 5000 ^(GK33D5G01|GK33D5G02|GK9RFDF01|GK9RFDF02|GLBLI5G01|GLBLI5G02|GLDJ2JW01|GLKV9FY01).*$ | |
library 454_20kb 8000 32000 ^(F5XZBDV01|F5XZBDV02).*$ | |
pair ^(.+?)_left.*$ ^(.+?)_right.*$ | |
pair ^(.+?)/1.*$ ^(.+?)/2.*$ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
# | |
# Usage: sam_add_rg_using_mates.pl my.mates < my.sam > with_rg.sam | |
# | |
# Replaces ALL read group tags, associated with alignments, and @RG header lines | |
# using library information stored in an AMOS-style mates file: | |
# https://sourceforge.net/apps/mediawiki/amos/index.php?title=Bambus_Manual#The_.mates_file | |
# This script expects: | |
# A mates file containing "library" lines with regular expressions | |
# which, when matched against read names in the SAM file will associate | |
# them with the respective library/read group. i.e in the format: | |
# library <name> <min_size> <max_size> <regex> | |
# A SAM file on STDIN | |
# Writes the SAM file to STDOUT with the following changes: | |
# All original @RG header lines will be erased | |
# New @RG header lines will be output with the following information: | |
# @RG ID:<name> SM:unknown LB:<name> PI:((<min_size> + <max_size>)/2) | |
# All RG tags in the alignments will have their values replaced with the library | |
# <name> from the mates file according to which library <regex> the read name | |
# matches. The first <regex> from the mates file to match is used to assign | |
# the read to a particular library | |
use strict; | |
use warnings; | |
# Read | |
my $mates_file = shift(@ARGV); | |
open(MATES,'<',$mates_file) or die "Couldn't open mates file: $!\n"; | |
my (@rg, @regexs, @min, @max); | |
while(<MATES>){ | |
chomp; | |
# skip lines until the library lines containing 5 tab-delimited fields | |
next unless /^library\t(.+?)\t(.+?)\t(.+?)\t(.+?)$/; | |
my ($id,$min,$max,$regex) = ($1,$2,$3,$4); | |
# store info in order using arrays | |
push @regexs, $regex; | |
push @rg, $id; | |
push @min, $min; | |
push @max, $max; | |
} | |
close MATES; | |
sub print_new_rg { | |
# Print read group header lines using library information | |
# gleened from the mates file | |
for(my $i=0; $i<scalar@rg; $i++){ | |
printf "\@RG\tID:%s\tSM:unknown\tLB:%s\tPI:%d\n", | |
$rg[$i], | |
$rg[$i], | |
($min[$i]+$max[$i])/2; | |
} | |
} | |
# read SAM file on STDIN | |
my $in_header=1; | |
while(<>){ | |
if (eof() && $in_header) { | |
# got to end of file (eof) and only seen header lines | |
# possibly used: samtools view -H | |
print_new_rg(); | |
} | |
chomp; | |
my $line = $_; | |
if ($line =~ /^\@(\S+)/) { | |
# a header line | |
next if $1 eq 'RG'; # skip existing read group header lines | |
print "$line\n"; # print all other header lines unchanged | |
} else { | |
# non-header lines i.e. alignment lines | |
if ($in_header) { | |
# first non-header line | |
print_new_rg(); # print new read group header lines using | |
# info gleaned from the mates file | |
$in_header = 0; # toggle so we don't enter this block again | |
} | |
# Process each alignment by: | |
# Applying the library regex's from the mates file, in turn, to | |
# the read names and assigning the corresponding library name to | |
# the read group tag value. Stop applying regex's to a read name | |
# once the first match is found | |
my @line = split /\s+/, $line; | |
for(my $i=0; $i<scalar@regexs; $i++) { | |
my $regex = $regexs[$i]; | |
if($line[0] =~ $regex) { | |
if(! ($line =~ s/RG:Z:([^\t]+)/RG:Z:$rg[$i]/)) { | |
# the alignment didn't have a RG tag, lets append one | |
$line .= "\tRG:Z:$rg[$i]"; | |
} | |
print "$line\n"; | |
last; | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
NOTE: The
my.mates
file should be tab-delimited