Skip to content

Instantly share code, notes, and snippets.

@szabadkai
Created March 3, 2014 10:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save szabadkai/9322088 to your computer and use it in GitHub Desktop.
Save szabadkai/9322088 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
# made by: Levente Szabadkai 2013
use 5.014;
#use warnings;
#use diagnostics;
our($opt_h, $opt_o, $opt_t, $opt_g);
use Getopt::Std;
getopt('hotg');
if ($opt_h){
die"for help look in the source code!\n";
}
my $transcripts_filename = shift @ARGV;
my $reference_cds_filename = shift @ARGV;
my %transcripts;
if($opt_o){
open STDOUT, '>', "$opt_o.txt";
}
##############################################################################
# This is a script to extract cds, 3'UTR, and 5'UTR
# from a set of transcrits in a single fasta file
#
# You have to call this script with 3 arguments like this:
#
# dump_utr <transcripts.fa> <reference_cds.fa>
#
# DATA structure:
# Each transcript has:
#
# {0} - PAC number provided by tophat-cufflinks
# [1] - accession (phytozom.org)
# [2] - reference CDS sequence
# [3] - transcript sequence from tophat-cufflinks out
# [4] - 3'UTR sequence (extracted, from [3])
# [5] - 5'UTR
# [6] - 3'UTR length (bp)
# [7] - 5'UTR length (bp)
# [8] - 3'UTR intron (if there is <1> if not <0>)
# [9] - 5'UTR ORF?
#
# Switches:
# -o out_base name
# -t type of the output you can specify the output, like if you want to print
# the accession numebers and the corresponding 3'utr length you have to
# use -t <16>
# -g gene - if you want to list ony a few genes you can use the -g switch
# and specify the Name of the gene after the switch in a coma separated list,
# although it is not an efficient function of the script.
#
##############################################################################
#
# Block to fill the hash vith the reference CDS data
#
{
if ($reference_cds_filename){
open my $REF_FH, "$reference_cds_filename" || die "Cannot open reference! $reference_cds_filename\n";
my @ref = <$REF_FH>;
close $REF_FH;
for (@ref) {
chomp;
if (/^>([a-z0-9A-Z.]+)\|.*PACid:([0-9]{8})/ ) {
$transcripts{$2}[1] = $1;
$transcripts{$2}[0] = $2;
}
else {
$transcripts{$2}[2] .= $_;
}
}
}
}
{
if ($transcripts_filename){
open my $TRANSCRIPTS_FH, "$transcripts_filename" || die "Cannot open file containing transcripts! $transcripts_filename\n";
my @tra = <$TRANSCRIPTS_FH>;
close $TRANSCRIPTS_FH;
for (@tra) {
chomp;
if (/^>PAC:([0-9]{8})/) {}
else {
$transcripts{$1}[3] .= $_;
}
}
}
}
{
for my $key (sort keys %transcripts){
if (defined $transcripts{$key}[3] && $transcripts{$key}[3] =~ m/^(\S+)$transcripts{$key}[2](\S+)$/){
$transcripts{$key}[4] = $2;
$transcripts{$key}[5] = $1;
$transcripts{$key}[6] = length $transcripts{$key}[4];
$transcripts{$key}[7] = length $transcripts{$key}[5];
}
}
}
#
# So far, so good. :)
# Now let do the output!
#
# the print_feature function awaits for a key and an array of numbers, to specify what to print,
#
sub print_feature{
my %labels = (
'0' => 'PAC number',
'1' => 'Accession(phytozome.org)',
'2' => 'Rreference CDS',
'3' => 'Transcript',
'4' => '3\'UTR',
'5' => '5\'UTR',
'6' => '3\'UTR length',
'7' => '5\'UTR length',
'8' => '3\'UTR intron',
'9' => '5\'UTR ORF?',
);
my ($key, $options) = @_;
my @options = split '', $options;
print "\n";
for (@options){
if ($transcripts{$key}[$_]){
print ">$labels{1}:$transcripts{$key}[1]|$labels{0}:$transcripts{$key}[0]|$labels{$_}:\n$transcripts{$key}[$_]\n";
}
}
}
if ($opt_g) {
unless($opt_t){
&print_feature($opt_g,234567);
}
if($opt_t){
&print_feature($opt_g,$opt_t);
}
}
else{
for(sort keys %transcripts){
unless ($opt_t){
&print_feature($_,234567);
}
if ($opt_t) {
&print_feature($_,$opt_t)
}
}
}
__END__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment