Skip to content

Instantly share code, notes, and snippets.

@slavailn
Created April 26, 2016 05:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save slavailn/30c977643017c33b48065423e2d9d1c9 to your computer and use it in GitHub Desktop.
Save slavailn/30c977643017c33b48065423e2d9d1c9 to your computer and use it in GitHub Desktop.
#! /usr/bin/perl
# This script will take a standard bowtie alignment output file
# containing alignments of small RNAs to various datasets, like
# miRNA, piRNA, repeats, genes etc. Alignment was performed to retain
# multimatches in order to have information about various features this
# particular small RNA could be assigned to.
# Our goal is to turn alignment file into a table with the following columns:
# 1. sRNA id
# 2. sRNA sequence
# 3. miRNA
# 4. piRNA
# 5. tRNA
# 6. rRNA
# 7. promoter
# 8. transcript (unspliced, including UTRs)
# 9. repeat
##################################################################################################################
## MAIN BODY OF THE PROGRAM
use strict; use warnings;
use List::MoreUtils qw(uniq);
my $tag_alignment = shift or die "Please provide the file with alignments!"; # alignment file specified on command line
my $table_header = "srnaID\tsrnaSeq\tmiRNA\tpiRNA\ttRNA\trRNA\tpromoter\tgene\trepeat\n"; # table header
my %table_hash; # initialize hash that will contain table data
# where hash key is sRNA id and the rest of the data
# is stored as an array.
open( my $align_fh, "<", $tag_alignment ) or die "Cannot open alignment file: $!\n"; # open the file with alignments
my @alignments = <$align_fh>;
my @ids; # initialize arrays to store ids
# use for loop to get those ids
for ( @alignments )
{
chomp;
my @cols = split( "\t" );
push( @ids, $cols[0] );
}
# We only need unique ids for out table hash
my @unique_ids = uniq @ids;
# Now unique ids become keys in the table hash
for ( @unique_ids )
{
$table_hash{$_} = ();
}
# Specify hashes for intermediate annotation data storage, which will be filled
# with identify_category sub
my %mirna_hash;
my %pirna_hash;
my %trna_hash;
my %rrna_hash;
my %promoter_hash;
my %transcript_hash;
my %repeat_hash;
# Fill table hash with data
my @output_cols; # array to store output information
my $seq; # store reads sequence
for my $id ( keys %table_hash )
{
for my $alignment ( @alignments )
{
chomp $alignment;
my @cols = split( "\t", $alignment );
if ( $id eq $cols[0] )
{
$seq = check_sequence( $cols[4], $cols[1] ); # check orientation, if "-", than reverse complement and return, else return sequence as is
$table_hash{$id} = [ $seq ];
identify_category($cols[2], $id); # identify annotation category
}
}
}
# print table header
print $table_header;
# Build and output annotation table
build_annotation_table( \%table_hash, \%mirna_hash, \%pirna_hash, \%trna_hash, \%rrna_hash, \%promoter_hash, \%transcript_hash, \%repeat_hash );
#######################################################################################################################
## SUBROUTINES
sub check_sequence
# Take sequence and strand, check if strand is "-" and reverse complement the sequence
# if strand is "+" then return the sequence as is
{
my $seq = shift;
my $strand = shift;
if ( $strand eq '-' )
{
$seq = reverse_complement( $seq );
return $seq;
}
else
{
return $seq;
}
}
sub reverse_complement
# Reverse complement DNA strand
{
my $seq = shift;
$seq =~ tr /atcgATCGnN/tagcTAGCnN/;
$seq = reverse( $seq );
return $seq;
}
sub identify_category
# Identify annotation category based on the reference id and return the feature's name
{
my $annot = shift; # Take annotation name
my $id = shift; # Take sequence id, note that the same id usually has several equally good annotations;
if ( $annot =~ /^(rno-mir-.+)\s+MI/ )
{
push ( @{ $mirna_hash{$id} }, $1 );
}
elsif ( $annot =~ /^.+piRNA\s+(piR-\w+),\s+/ )
{
push ( @{ $pirna_hash{$id} }, $1 );
}
elsif ( $annot =~ /^.+\.(trna\w+-\w+)\s+/ )
{
push ( @{ $trna_hash{$id} }, $1 );
}
elsif ( $annot =~ /^(ENSRNOG\d+\|.+ribosomal RNA)/ )
{
push ( @{ $rrna_hash{$id} }, $1 );
}
elsif ( $annot =~ /^(ENSRNOG\d+)\|.+\[/ )
{
push ( @{ $promoter_hash{$id} }, $1 );
}
elsif ( $annot =~ /^rn5_ensGene_(ENSRNOT\d+)/ )
{
push ( @{ $transcript_hash{$id} }, $1 );
}
elsif ( $annot =~ /(\w+)_\w+$/ )
{
push ( @{ $repeat_hash{$id} }, $1 );
}
else
{
warn "$id :: No legitimate annotation found!\n";
}
}
sub build_annotation_table
# Fill annotation table with data contained in separate annotation hashes
# and print it to standard out
{
my ( $table_hash_ref, $mirna_hash_ref, $pirna_hash_ref, $trna_hash_ref,
$rrna_hash_ref, $promoter_hash_ref, $transcript_hash_ref, $repeat_hash_ref ) = @_;
my %table_hash = %$table_hash_ref;
my %mirna_hash = %$mirna_hash_ref;
my %pirna_hash = %$pirna_hash_ref;
my %trna_hash = %$trna_hash_ref;
my %rrna_hash = %$rrna_hash_ref;
my %promoter_hash = %$promoter_hash_ref;
my %transcript_hash = %$transcript_hash_ref;
my %repeat_hash = %$repeat_hash_ref;
for my $id ( keys %table_hash )
{
# Fill in with miRNAs
if ( defined $mirna_hash{$id} )
{
push( @{$table_hash{$id}}, join( ',', @{$mirna_hash{$id}} ) );
}
else
{
push( @{ $table_hash{$id} }, "NA" );
}
# Fill in with piRNAs
if ( defined $pirna_hash{$id} )
{
push( @{$table_hash{$id}}, join( ',', @{$pirna_hash{$id}} ) );
}
else
{
push( @{ $table_hash{$id} }, "NA" );
}
# Fill in with tRNAs
if ( defined $trna_hash{$id} )
{
push( @{$table_hash{$id}}, join( ',', @{$trna_hash{$id}} ) );
}
else
{
push( @{ $table_hash{$id} }, "NA" );
}
# Fill in with rRNAs
if ( defined $rrna_hash{$id} )
{
push( @{$table_hash{$id}}, join( ',', @{$rrna_hash{$id}} ) );
}
else
{
push( @{ $table_hash{$id} }, "NA" );
}
# Fill in with promoters
if ( defined $promoter_hash{$id} )
{
push( @{$table_hash{$id}}, join( ',', @{$promoter_hash{$id}} ) );
}
else
{
push( @{ $table_hash{$id} }, "NA" );
}
# Fill in with transcripts
if ( defined $transcript_hash{$id} )
{
push( @{$table_hash{$id}}, join( ',', @{$transcript_hash{$id}} ) );
}
else
{
push( @{ $table_hash{$id} }, "NA" );
}
# Fill in with repeats
if ( defined $repeat_hash{$id} )
{
push( @{$table_hash{$id}}, join( ',', @{$repeat_hash{$id}} ) );
}
else
{
push( @{ $table_hash{$id} }, "NA" );
}
print "$id\t", join( "\t", @{ $table_hash{$id} } ), "\n";
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment