slavailn/bowtie2annotation.pl

## bowtie2annotation.pl
#! /usr/bin/perl
# This script will take a standard bowtie alignment output file
# containing alignments of small RNAs to various datasets, like
# miRNA, piRNA, repeats, genes etc. Alignment was performed to retain
# multimatches in order to have information about various features this
# particular small RNA could be assigned to.
# Our goal is to turn alignment file into a table with the following columns:
# 1. sRNA id
# 2. sRNA sequence
# 3. miRNA
# 4. piRNA
# 5. tRNA
# 6. rRNA
# 7. promoter
# 8. transcript (unspliced, including UTRs)
# 9. repeat

##################################################################################################################
## MAIN BODY OF THE PROGRAM
use strict; use warnings;
use List::MoreUtils qw(uniq);

my $tag_alignment = shift or die "Please provide the file with alignments!"; # alignment file specified on command line
my $table_header = "srnaID\tsrnaSeq\tmiRNA\tpiRNA\ttRNA\trRNA\tpromoter\tgene\trepeat\n"; # table header
my %table_hash; # initialize hash that will contain table data
                # where hash key is sRNA id and the rest of the data
                # is stored as an array.

open( my $align_fh, "<", $tag_alignment ) or die "Cannot open alignment file: $!\n"; # open the file with alignments
my @alignments = <$align_fh>;

my @ids; # initialize arrays to store ids
         # use for loop to get those ids

for ( @alignments )
{
    chomp;
    my @cols = split( "\t" );
    push( @ids, $cols[0] );
}

# We only need unique ids for out table hash
my @unique_ids = uniq @ids;

# Now unique ids become keys in the table hash
for ( @unique_ids )
{
    $table_hash{$_} = ();
}

# Specify hashes for intermediate annotation data storage, which will be filled
# with identify_category sub
my %mirna_hash;
my %pirna_hash;
my %trna_hash;
my %rrna_hash;
my %promoter_hash;
my %transcript_hash;
my %repeat_hash;

# Fill table hash with data
my @output_cols; # array to store output information
my $seq; # store reads sequence
for my $id ( keys %table_hash )
{
    for my $alignment ( @alignments )
    {
        chomp $alignment;
        my @cols = split( "\t", $alignment );
        if ( $id eq $cols[0] )
        {
            $seq = check_sequence( $cols[4], $cols[1] ); # check orientation, if "-", than reverse complement and return, else return sequence as is
            $table_hash{$id} = [ $seq ];
            identify_category($cols[2], $id); # identify annotation category

         }
    }
}

# print table header
print $table_header;
# Build and output annotation table
build_annotation_table( \%table_hash, \%mirna_hash, \%pirna_hash, \%trna_hash, \%rrna_hash, \%promoter_hash, \%transcript_hash, \%repeat_hash );

#######################################################################################################################
## SUBROUTINES
sub check_sequence
# Take sequence and strand, check if strand is "-" and reverse complement the sequence
# if strand is "+" then return the sequence as is
{
    my $seq = shift;
    my $strand = shift;
    if ( $strand eq '-' )
    {
        $seq = reverse_complement( $seq );
        return $seq;
    }
    else
    {
        return $seq;
    }
}

sub reverse_complement
# Reverse complement DNA strand
{
    my $seq = shift;
    $seq =~ tr /atcgATCGnN/tagcTAGCnN/;
    $seq = reverse( $seq );
    return $seq;
}

sub identify_category
# Identify annotation category based on the reference id and return the feature's name
{
    my $annot = shift; # Take annotation name
    my $id    = shift; # Take sequence id, note that the same id usually has several equally good annotations;


    if ( $annot =~ /^(rno-mir-.+)\s+MI/ )
    {
        push ( @{ $mirna_hash{$id} }, $1 );
    }
    elsif ( $annot =~ /^.+piRNA\s+(piR-\w+),\s+/ )
    {
        push ( @{ $pirna_hash{$id} }, $1 );
    }
    elsif ( $annot =~ /^.+\.(trna\w+-\w+)\s+/ )
    {
        push ( @{ $trna_hash{$id} }, $1 );
    }
    elsif ( $annot =~ /^(ENSRNOG\d+\|.+ribosomal RNA)/ )
    {
        push ( @{ $rrna_hash{$id} }, $1 );
    }
    elsif ( $annot =~ /^(ENSRNOG\d+)\|.+\[/ )
    {
        push ( @{ $promoter_hash{$id} }, $1 );
    }
    elsif ( $annot =~ /^rn5_ensGene_(ENSRNOT\d+)/ )
    {
        push ( @{ $transcript_hash{$id} }, $1 );
    }
    elsif ( $annot =~ /(\w+)_\w+$/ )
    {
        push ( @{ $repeat_hash{$id} }, $1 );
    }
    else
    {
        warn "$id :: No legitimate annotation found!\n";
    }
}

sub build_annotation_table
# Fill annotation table with data contained in separate annotation hashes
# and print it to standard out
{
    my ( $table_hash_ref, $mirna_hash_ref, $pirna_hash_ref, $trna_hash_ref,
        $rrna_hash_ref, $promoter_hash_ref, $transcript_hash_ref, $repeat_hash_ref ) = @_;
    my %table_hash      = %$table_hash_ref;
    my %mirna_hash      = %$mirna_hash_ref;
    my %pirna_hash      = %$pirna_hash_ref;
    my %trna_hash       = %$trna_hash_ref;
    my %rrna_hash       = %$rrna_hash_ref;
    my %promoter_hash   = %$promoter_hash_ref;
    my %transcript_hash = %$transcript_hash_ref;
    my %repeat_hash     = %$repeat_hash_ref;

    for my $id ( keys %table_hash )
    {
        # Fill in with miRNAs
        if ( defined $mirna_hash{$id} )
        {
            push( @{$table_hash{$id}}, join( ',', @{$mirna_hash{$id}} ) );
        }
        else
        {
           push( @{ $table_hash{$id} }, "NA" );
        }

        # Fill in with piRNAs
        if ( defined $pirna_hash{$id} )
        {
            push( @{$table_hash{$id}}, join( ',', @{$pirna_hash{$id}} ) );
        }
        else
        {
           push( @{ $table_hash{$id} }, "NA" );
        }

        # Fill in with tRNAs
        if ( defined $trna_hash{$id} )
        {
            push( @{$table_hash{$id}}, join( ',', @{$trna_hash{$id}} ) );
        }
        else
        {
           push( @{ $table_hash{$id} }, "NA" );
        }

        # Fill in with rRNAs
        if ( defined $rrna_hash{$id} )
        {
            push( @{$table_hash{$id}}, join( ',', @{$rrna_hash{$id}} ) );
        }
        else
        {
           push( @{ $table_hash{$id} }, "NA" );
        }

        # Fill in with promoters
        if ( defined $promoter_hash{$id} )
        {
            push( @{$table_hash{$id}}, join( ',', @{$promoter_hash{$id}} ) );
        }
        else
        {
           push( @{ $table_hash{$id} }, "NA" );
        }

        # Fill in with transcripts
        if ( defined $transcript_hash{$id} )
        {
            push( @{$table_hash{$id}}, join( ',', @{$transcript_hash{$id}} ) );
        }
        else
        {
           push( @{ $table_hash{$id} }, "NA" );
        }

        # Fill in with repeats
        if ( defined $repeat_hash{$id} )
        {
            push( @{$table_hash{$id}}, join( ',', @{$repeat_hash{$id}} ) );
        }
        else
        {
           push( @{ $table_hash{$id} }, "NA" );
        }


        print "$id\t", join( "\t", @{ $table_hash{$id} } ), "\n";

    }
}
	#! /usr/bin/perl
	# This script will take a standard bowtie alignment output file
	# containing alignments of small RNAs to various datasets, like
	# miRNA, piRNA, repeats, genes etc. Alignment was performed to retain
	# multimatches in order to have information about various features this
	# particular small RNA could be assigned to.
	# Our goal is to turn alignment file into a table with the following columns:
	# 1. sRNA id
	# 2. sRNA sequence
	# 3. miRNA
	# 4. piRNA
	# 5. tRNA
	# 6. rRNA
	# 7. promoter
	# 8. transcript (unspliced, including UTRs)
	# 9. repeat

	##################################################################################################################
	## MAIN BODY OF THE PROGRAM
	use strict; use warnings;
	use List::MoreUtils qw(uniq);

	my $tag_alignment = shift or die "Please provide the file with alignments!"; # alignment file specified on command line
	my $table_header = "srnaID\tsrnaSeq\tmiRNA\tpiRNA\ttRNA\trRNA\tpromoter\tgene\trepeat\n"; # table header
	my %table_hash; # initialize hash that will contain table data
	# where hash key is sRNA id and the rest of the data
	# is stored as an array.

	open( my $align_fh, "<", $tag_alignment ) or die "Cannot open alignment file: $!\n"; # open the file with alignments
	my @alignments = <$align_fh>;

	my @ids; # initialize arrays to store ids
	# use for loop to get those ids

	for ( @alignments )
	{
	chomp;
	my @cols = split( "\t" );
	push( @ids, $cols[0] );
	}

	# We only need unique ids for out table hash
	my @unique_ids = uniq @ids;

	# Now unique ids become keys in the table hash
	for ( @unique_ids )
	{
	$table_hash{$_} = ();
	}

	# Specify hashes for intermediate annotation data storage, which will be filled
	# with identify_category sub
	my %mirna_hash;
	my %pirna_hash;
	my %trna_hash;
	my %rrna_hash;
	my %promoter_hash;
	my %transcript_hash;
	my %repeat_hash;

	# Fill table hash with data
	my @output_cols; # array to store output information
	my $seq; # store reads sequence
	for my $id ( keys %table_hash )
	{
	for my $alignment ( @alignments )
	{
	chomp $alignment;
	my @cols = split( "\t", $alignment );
	if ( $id eq $cols[0] )
	{
	$seq = check_sequence( $cols[4], $cols[1] ); # check orientation, if "-", than reverse complement and return, else return sequence as is
	$table_hash{$id} = [ $seq ];
	identify_category($cols[2], $id); # identify annotation category

	}
	}
	}

	# print table header
	print $table_header;
	# Build and output annotation table
	build_annotation_table( \%table_hash, \%mirna_hash, \%pirna_hash, \%trna_hash, \%rrna_hash, \%promoter_hash, \%transcript_hash, \%repeat_hash );

	#######################################################################################################################
	## SUBROUTINES
	sub check_sequence
	# Take sequence and strand, check if strand is "-" and reverse complement the sequence
	# if strand is "+" then return the sequence as is
	{
	my $seq = shift;
	my $strand = shift;
	if ( $strand eq '-' )
	{
	$seq = reverse_complement( $seq );
	return $seq;
	}
	else
	{
	return $seq;
	}
	}

	sub reverse_complement
	# Reverse complement DNA strand
	{
	my $seq = shift;
	$seq =~ tr /atcgATCGnN/tagcTAGCnN/;
	$seq = reverse( $seq );
	return $seq;
	}

	sub identify_category
	# Identify annotation category based on the reference id and return the feature's name
	{
	my $annot = shift; # Take annotation name
	my $id = shift; # Take sequence id, note that the same id usually has several equally good annotations;


	if ( $annot =~ /^(rno-mir-.+)\s+MI/ )
	{
	push ( @{ $mirna_hash{$id} }, $1 );
	}
	elsif ( $annot =~ /^.+piRNA\s+(piR-\w+),\s+/ )
	{
	push ( @{ $pirna_hash{$id} }, $1 );
	}
	elsif ( $annot =~ /^.+\.(trna\w+-\w+)\s+/ )
	{
	push ( @{ $trna_hash{$id} }, $1 );
	}
	elsif ( $annot =~ /^(ENSRNOG\d+\\|.+ribosomal RNA)/ )
	{
	push ( @{ $rrna_hash{$id} }, $1 );
	}
	elsif ( $annot =~ /^(ENSRNOG\d+)\\|.+\[/ )
	{
	push ( @{ $promoter_hash{$id} }, $1 );
	}
	elsif ( $annot =~ /^rn5_ensGene_(ENSRNOT\d+)/ )
	{
	push ( @{ $transcript_hash{$id} }, $1 );
	}
	elsif ( $annot =~ /(\w+)_\w+$/ )
	{
	push ( @{ $repeat_hash{$id} }, $1 );
	}
	else
	{
	warn "$id :: No legitimate annotation found!\n";
	}
	}

	sub build_annotation_table
	# Fill annotation table with data contained in separate annotation hashes
	# and print it to standard out
	{
	my ( $table_hash_ref, $mirna_hash_ref, $pirna_hash_ref, $trna_hash_ref,
	$rrna_hash_ref, $promoter_hash_ref, $transcript_hash_ref, $repeat_hash_ref ) = @_;
	my %table_hash = %$table_hash_ref;
	my %mirna_hash = %$mirna_hash_ref;
	my %pirna_hash = %$pirna_hash_ref;
	my %trna_hash = %$trna_hash_ref;
	my %rrna_hash = %$rrna_hash_ref;
	my %promoter_hash = %$promoter_hash_ref;
	my %transcript_hash = %$transcript_hash_ref;
	my %repeat_hash = %$repeat_hash_ref;

	for my $id ( keys %table_hash )
	{
	# Fill in with miRNAs
	if ( defined $mirna_hash{$id} )
	{
	push( @{$table_hash{$id}}, join( ',', @{$mirna_hash{$id}} ) );
	}
	else
	{
	push( @{ $table_hash{$id} }, "NA" );
	}

	# Fill in with piRNAs
	if ( defined $pirna_hash{$id} )
	{
	push( @{$table_hash{$id}}, join( ',', @{$pirna_hash{$id}} ) );
	}
	else
	{
	push( @{ $table_hash{$id} }, "NA" );
	}

	# Fill in with tRNAs
	if ( defined $trna_hash{$id} )
	{
	push( @{$table_hash{$id}}, join( ',', @{$trna_hash{$id}} ) );
	}
	else
	{
	push( @{ $table_hash{$id} }, "NA" );
	}

	# Fill in with rRNAs
	if ( defined $rrna_hash{$id} )
	{
	push( @{$table_hash{$id}}, join( ',', @{$rrna_hash{$id}} ) );
	}
	else
	{
	push( @{ $table_hash{$id} }, "NA" );
	}

	# Fill in with promoters
	if ( defined $promoter_hash{$id} )
	{
	push( @{$table_hash{$id}}, join( ',', @{$promoter_hash{$id}} ) );
	}
	else
	{
	push( @{ $table_hash{$id} }, "NA" );
	}

	# Fill in with transcripts
	if ( defined $transcript_hash{$id} )
	{
	push( @{$table_hash{$id}}, join( ',', @{$transcript_hash{$id}} ) );
	}
	else
	{
	push( @{ $table_hash{$id} }, "NA" );
	}

	# Fill in with repeats
	if ( defined $repeat_hash{$id} )
	{
	push( @{$table_hash{$id}}, join( ',', @{$repeat_hash{$id}} ) );
	}
	else
	{
	push( @{ $table_hash{$id} }, "NA" );
	}


	print "$id\t", join( "\t", @{ $table_hash{$id} } ), "\n";

	}
	}