Created
March 3, 2014 10:17
-
-
Save szabadkai/9322088 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
# made by: Levente Szabadkai 2013 | |
use 5.014; | |
#use warnings; | |
#use diagnostics; | |
our($opt_h, $opt_o, $opt_t, $opt_g); | |
use Getopt::Std; | |
getopt('hotg'); | |
if ($opt_h){ | |
die"for help look in the source code!\n"; | |
} | |
my $transcripts_filename = shift @ARGV; | |
my $reference_cds_filename = shift @ARGV; | |
my %transcripts; | |
if($opt_o){ | |
open STDOUT, '>', "$opt_o.txt"; | |
} | |
############################################################################## | |
# This is a script to extract cds, 3'UTR, and 5'UTR | |
# from a set of transcrits in a single fasta file | |
# | |
# You have to call this script with 3 arguments like this: | |
# | |
# dump_utr <transcripts.fa> <reference_cds.fa> | |
# | |
# DATA structure: | |
# Each transcript has: | |
# | |
# {0} - PAC number provided by tophat-cufflinks | |
# [1] - accession (phytozom.org) | |
# [2] - reference CDS sequence | |
# [3] - transcript sequence from tophat-cufflinks out | |
# [4] - 3'UTR sequence (extracted, from [3]) | |
# [5] - 5'UTR | |
# [6] - 3'UTR length (bp) | |
# [7] - 5'UTR length (bp) | |
# [8] - 3'UTR intron (if there is <1> if not <0>) | |
# [9] - 5'UTR ORF? | |
# | |
# Switches: | |
# -o out_base name | |
# -t type of the output you can specify the output, like if you want to print | |
# the accession numebers and the corresponding 3'utr length you have to | |
# use -t <16> | |
# -g gene - if you want to list ony a few genes you can use the -g switch | |
# and specify the Name of the gene after the switch in a coma separated list, | |
# although it is not an efficient function of the script. | |
# | |
############################################################################## | |
# | |
# Block to fill the hash vith the reference CDS data | |
# | |
{ | |
if ($reference_cds_filename){ | |
open my $REF_FH, "$reference_cds_filename" || die "Cannot open reference! $reference_cds_filename\n"; | |
my @ref = <$REF_FH>; | |
close $REF_FH; | |
for (@ref) { | |
chomp; | |
if (/^>([a-z0-9A-Z.]+)\|.*PACid:([0-9]{8})/ ) { | |
$transcripts{$2}[1] = $1; | |
$transcripts{$2}[0] = $2; | |
} | |
else { | |
$transcripts{$2}[2] .= $_; | |
} | |
} | |
} | |
} | |
{ | |
if ($transcripts_filename){ | |
open my $TRANSCRIPTS_FH, "$transcripts_filename" || die "Cannot open file containing transcripts! $transcripts_filename\n"; | |
my @tra = <$TRANSCRIPTS_FH>; | |
close $TRANSCRIPTS_FH; | |
for (@tra) { | |
chomp; | |
if (/^>PAC:([0-9]{8})/) {} | |
else { | |
$transcripts{$1}[3] .= $_; | |
} | |
} | |
} | |
} | |
{ | |
for my $key (sort keys %transcripts){ | |
if (defined $transcripts{$key}[3] && $transcripts{$key}[3] =~ m/^(\S+)$transcripts{$key}[2](\S+)$/){ | |
$transcripts{$key}[4] = $2; | |
$transcripts{$key}[5] = $1; | |
$transcripts{$key}[6] = length $transcripts{$key}[4]; | |
$transcripts{$key}[7] = length $transcripts{$key}[5]; | |
} | |
} | |
} | |
# | |
# So far, so good. :) | |
# Now let do the output! | |
# | |
# the print_feature function awaits for a key and an array of numbers, to specify what to print, | |
# | |
sub print_feature{ | |
my %labels = ( | |
'0' => 'PAC number', | |
'1' => 'Accession(phytozome.org)', | |
'2' => 'Rreference CDS', | |
'3' => 'Transcript', | |
'4' => '3\'UTR', | |
'5' => '5\'UTR', | |
'6' => '3\'UTR length', | |
'7' => '5\'UTR length', | |
'8' => '3\'UTR intron', | |
'9' => '5\'UTR ORF?', | |
); | |
my ($key, $options) = @_; | |
my @options = split '', $options; | |
print "\n"; | |
for (@options){ | |
if ($transcripts{$key}[$_]){ | |
print ">$labels{1}:$transcripts{$key}[1]|$labels{0}:$transcripts{$key}[0]|$labels{$_}:\n$transcripts{$key}[$_]\n"; | |
} | |
} | |
} | |
if ($opt_g) { | |
unless($opt_t){ | |
&print_feature($opt_g,234567); | |
} | |
if($opt_t){ | |
&print_feature($opt_g,$opt_t); | |
} | |
} | |
else{ | |
for(sort keys %transcripts){ | |
unless ($opt_t){ | |
&print_feature($_,234567); | |
} | |
if ($opt_t) { | |
&print_feature($_,$opt_t) | |
} | |
} | |
} | |
__END__ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment