Created
April 26, 2016 05:48
-
-
Save slavailn/30c977643017c33b48065423e2d9d1c9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/perl | |
# This script will take a standard bowtie alignment output file | |
# containing alignments of small RNAs to various datasets, like | |
# miRNA, piRNA, repeats, genes etc. Alignment was performed to retain | |
# multimatches in order to have information about various features this | |
# particular small RNA could be assigned to. | |
# Our goal is to turn alignment file into a table with the following columns: | |
# 1. sRNA id | |
# 2. sRNA sequence | |
# 3. miRNA | |
# 4. piRNA | |
# 5. tRNA | |
# 6. rRNA | |
# 7. promoter | |
# 8. transcript (unspliced, including UTRs) | |
# 9. repeat | |
################################################################################################################## | |
## MAIN BODY OF THE PROGRAM | |
use strict; use warnings; | |
use List::MoreUtils qw(uniq); | |
my $tag_alignment = shift or die "Please provide the file with alignments!"; # alignment file specified on command line | |
my $table_header = "srnaID\tsrnaSeq\tmiRNA\tpiRNA\ttRNA\trRNA\tpromoter\tgene\trepeat\n"; # table header | |
my %table_hash; # initialize hash that will contain table data | |
# where hash key is sRNA id and the rest of the data | |
# is stored as an array. | |
open( my $align_fh, "<", $tag_alignment ) or die "Cannot open alignment file: $!\n"; # open the file with alignments | |
my @alignments = <$align_fh>; | |
my @ids; # initialize arrays to store ids | |
# use for loop to get those ids | |
for ( @alignments ) | |
{ | |
chomp; | |
my @cols = split( "\t" ); | |
push( @ids, $cols[0] ); | |
} | |
# We only need unique ids for out table hash | |
my @unique_ids = uniq @ids; | |
# Now unique ids become keys in the table hash | |
for ( @unique_ids ) | |
{ | |
$table_hash{$_} = (); | |
} | |
# Specify hashes for intermediate annotation data storage, which will be filled | |
# with identify_category sub | |
my %mirna_hash; | |
my %pirna_hash; | |
my %trna_hash; | |
my %rrna_hash; | |
my %promoter_hash; | |
my %transcript_hash; | |
my %repeat_hash; | |
# Fill table hash with data | |
my @output_cols; # array to store output information | |
my $seq; # store reads sequence | |
for my $id ( keys %table_hash ) | |
{ | |
for my $alignment ( @alignments ) | |
{ | |
chomp $alignment; | |
my @cols = split( "\t", $alignment ); | |
if ( $id eq $cols[0] ) | |
{ | |
$seq = check_sequence( $cols[4], $cols[1] ); # check orientation, if "-", than reverse complement and return, else return sequence as is | |
$table_hash{$id} = [ $seq ]; | |
identify_category($cols[2], $id); # identify annotation category | |
} | |
} | |
} | |
# print table header | |
print $table_header; | |
# Build and output annotation table | |
build_annotation_table( \%table_hash, \%mirna_hash, \%pirna_hash, \%trna_hash, \%rrna_hash, \%promoter_hash, \%transcript_hash, \%repeat_hash ); | |
####################################################################################################################### | |
## SUBROUTINES | |
sub check_sequence | |
# Take sequence and strand, check if strand is "-" and reverse complement the sequence | |
# if strand is "+" then return the sequence as is | |
{ | |
my $seq = shift; | |
my $strand = shift; | |
if ( $strand eq '-' ) | |
{ | |
$seq = reverse_complement( $seq ); | |
return $seq; | |
} | |
else | |
{ | |
return $seq; | |
} | |
} | |
sub reverse_complement | |
# Reverse complement DNA strand | |
{ | |
my $seq = shift; | |
$seq =~ tr /atcgATCGnN/tagcTAGCnN/; | |
$seq = reverse( $seq ); | |
return $seq; | |
} | |
sub identify_category | |
# Identify annotation category based on the reference id and return the feature's name | |
{ | |
my $annot = shift; # Take annotation name | |
my $id = shift; # Take sequence id, note that the same id usually has several equally good annotations; | |
if ( $annot =~ /^(rno-mir-.+)\s+MI/ ) | |
{ | |
push ( @{ $mirna_hash{$id} }, $1 ); | |
} | |
elsif ( $annot =~ /^.+piRNA\s+(piR-\w+),\s+/ ) | |
{ | |
push ( @{ $pirna_hash{$id} }, $1 ); | |
} | |
elsif ( $annot =~ /^.+\.(trna\w+-\w+)\s+/ ) | |
{ | |
push ( @{ $trna_hash{$id} }, $1 ); | |
} | |
elsif ( $annot =~ /^(ENSRNOG\d+\|.+ribosomal RNA)/ ) | |
{ | |
push ( @{ $rrna_hash{$id} }, $1 ); | |
} | |
elsif ( $annot =~ /^(ENSRNOG\d+)\|.+\[/ ) | |
{ | |
push ( @{ $promoter_hash{$id} }, $1 ); | |
} | |
elsif ( $annot =~ /^rn5_ensGene_(ENSRNOT\d+)/ ) | |
{ | |
push ( @{ $transcript_hash{$id} }, $1 ); | |
} | |
elsif ( $annot =~ /(\w+)_\w+$/ ) | |
{ | |
push ( @{ $repeat_hash{$id} }, $1 ); | |
} | |
else | |
{ | |
warn "$id :: No legitimate annotation found!\n"; | |
} | |
} | |
sub build_annotation_table | |
# Fill annotation table with data contained in separate annotation hashes | |
# and print it to standard out | |
{ | |
my ( $table_hash_ref, $mirna_hash_ref, $pirna_hash_ref, $trna_hash_ref, | |
$rrna_hash_ref, $promoter_hash_ref, $transcript_hash_ref, $repeat_hash_ref ) = @_; | |
my %table_hash = %$table_hash_ref; | |
my %mirna_hash = %$mirna_hash_ref; | |
my %pirna_hash = %$pirna_hash_ref; | |
my %trna_hash = %$trna_hash_ref; | |
my %rrna_hash = %$rrna_hash_ref; | |
my %promoter_hash = %$promoter_hash_ref; | |
my %transcript_hash = %$transcript_hash_ref; | |
my %repeat_hash = %$repeat_hash_ref; | |
for my $id ( keys %table_hash ) | |
{ | |
# Fill in with miRNAs | |
if ( defined $mirna_hash{$id} ) | |
{ | |
push( @{$table_hash{$id}}, join( ',', @{$mirna_hash{$id}} ) ); | |
} | |
else | |
{ | |
push( @{ $table_hash{$id} }, "NA" ); | |
} | |
# Fill in with piRNAs | |
if ( defined $pirna_hash{$id} ) | |
{ | |
push( @{$table_hash{$id}}, join( ',', @{$pirna_hash{$id}} ) ); | |
} | |
else | |
{ | |
push( @{ $table_hash{$id} }, "NA" ); | |
} | |
# Fill in with tRNAs | |
if ( defined $trna_hash{$id} ) | |
{ | |
push( @{$table_hash{$id}}, join( ',', @{$trna_hash{$id}} ) ); | |
} | |
else | |
{ | |
push( @{ $table_hash{$id} }, "NA" ); | |
} | |
# Fill in with rRNAs | |
if ( defined $rrna_hash{$id} ) | |
{ | |
push( @{$table_hash{$id}}, join( ',', @{$rrna_hash{$id}} ) ); | |
} | |
else | |
{ | |
push( @{ $table_hash{$id} }, "NA" ); | |
} | |
# Fill in with promoters | |
if ( defined $promoter_hash{$id} ) | |
{ | |
push( @{$table_hash{$id}}, join( ',', @{$promoter_hash{$id}} ) ); | |
} | |
else | |
{ | |
push( @{ $table_hash{$id} }, "NA" ); | |
} | |
# Fill in with transcripts | |
if ( defined $transcript_hash{$id} ) | |
{ | |
push( @{$table_hash{$id}}, join( ',', @{$transcript_hash{$id}} ) ); | |
} | |
else | |
{ | |
push( @{ $table_hash{$id} }, "NA" ); | |
} | |
# Fill in with repeats | |
if ( defined $repeat_hash{$id} ) | |
{ | |
push( @{$table_hash{$id}}, join( ',', @{$repeat_hash{$id}} ) ); | |
} | |
else | |
{ | |
push( @{ $table_hash{$id} }, "NA" ); | |
} | |
print "$id\t", join( "\t", @{ $table_hash{$id} } ), "\n"; | |
} | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment