szabadkai/dump_utr.pl

## dump_utr.pl
#!/usr/bin/perl
# made by: Levente Szabadkai 2013
use 5.014;
#use warnings;
#use diagnostics;

our($opt_h, $opt_o, $opt_t, $opt_g);
use Getopt::Std;

getopt('hotg');

if ($opt_h){
die"for help look in the source code!\n";
}


my $transcripts_filename = shift @ARGV;
my $reference_cds_filename = shift @ARGV;
my %transcripts;

if($opt_o){
open STDOUT, '>', "$opt_o.txt";
}
##############################################################################
# This is a script to extract cds, 3'UTR, and 5'UTR
# from a set of transcrits in a single fasta file
#
# You have to call this script with 3 arguments like this:
#
# dump_utr <transcripts.fa> <reference_cds.fa>
#
# DATA structure:
# Each transcript has:
#
# {0} - PAC number provided by tophat-cufflinks
# [1] - accession (phytozom.org)
# [2] - reference CDS sequence
# [3] - transcript sequence from tophat-cufflinks out
# [4] - 3'UTR sequence (extracted, from [3])
# [5] - 5'UTR
# [6] - 3'UTR length (bp)
# [7] - 5'UTR length (bp)
# [8] - 3'UTR intron (if there is <1> if not <0>)
# [9] - 5'UTR ORF?
#
# Switches:
# -o 	out_base name
# -t	type of the output you can specify the output, like if you want to print
# 		the accession numebers and the corresponding 3'utr length you have to
#		use -t <16>
# -g	gene - if you want to list ony a few genes you can use the -g switch
#		and specify the Name of the gene after the switch in a coma separated list,
#		although it is not an efficient function of the script.
#
##############################################################################


#
# Block to fill the hash vith the reference CDS data
#

{
if ($reference_cds_filename){
	open my $REF_FH, "$reference_cds_filename" || die "Cannot open reference! $reference_cds_filename\n";
	my @ref = <$REF_FH>;
	close $REF_FH;

    	for (@ref) {
    		chomp;
    	    if (/^>([a-z0-9A-Z.]+)\|.*PACid:([0-9]{8})/  ) {
    	        $transcripts{$2}[1] = $1;
    	        $transcripts{$2}[0] = $2;
    	    }
    	    else {
    	    	$transcripts{$2}[2] .= $_;
    	    }
    	}
	}
}


{
if ($transcripts_filename){
	open my $TRANSCRIPTS_FH, "$transcripts_filename" || die "Cannot open file containing transcripts! $transcripts_filename\n";
	my @tra = <$TRANSCRIPTS_FH>;
	close $TRANSCRIPTS_FH;

    	for (@tra) {
    	chomp;
    	    if (/^>PAC:([0-9]{8})/) {}
    	    else {
    	        $transcripts{$1}[3] .= $_;
    	    }
    	}
	}
}

{
	for my $key (sort keys %transcripts){
		if (defined $transcripts{$key}[3] && $transcripts{$key}[3] =~ m/^(\S+)$transcripts{$key}[2](\S+)$/){
			$transcripts{$key}[4] = $2;
			$transcripts{$key}[5] = $1;
			$transcripts{$key}[6] = length $transcripts{$key}[4];
			$transcripts{$key}[7] = length $transcripts{$key}[5];
		}
	}
}

#
# So far, so good. :)
# Now let do the output!
#
# the print_feature function awaits for a key and an array of numbers, to specify what to print,
#

sub print_feature{

	my %labels = (
		'0' => 'PAC number',
		'1' => 'Accession(phytozome.org)',
		'2' => 'Rreference CDS',
		'3' => 'Transcript',
		'4' => '3\'UTR',
		'5' => '5\'UTR',
		'6' => '3\'UTR length',
		'7' => '5\'UTR length',
		'8' => '3\'UTR intron',
		'9' => '5\'UTR ORF?',
	);

	my ($key, $options) = @_;
	my @options = split '', $options;
	print "\n";
		for (@options){
			if ($transcripts{$key}[$_]){
				print ">$labels{1}:$transcripts{$key}[1]|$labels{0}:$transcripts{$key}[0]|$labels{$_}:\n$transcripts{$key}[$_]\n";
			}
		}
}

if ($opt_g) {
	unless($opt_t){
		&print_feature($opt_g,234567);
	}
	if($opt_t){
		&print_feature($opt_g,$opt_t);
	}
}

else{
	for(sort keys %transcripts){
		unless ($opt_t){
			&print_feature($_,234567);
		}
		if ($opt_t) {
			&print_feature($_,$opt_t)
		}
	}
}


__END__
	#!/usr/bin/perl
	# made by: Levente Szabadkai 2013
	use 5.014;
	#use warnings;
	#use diagnostics;

	our($opt_h, $opt_o, $opt_t, $opt_g);
	use Getopt::Std;

	getopt('hotg');

	if ($opt_h){
	die"for help look in the source code!\n";
	}


	my $transcripts_filename = shift @ARGV;
	my $reference_cds_filename = shift @ARGV;
	my %transcripts;

	if($opt_o){
	open STDOUT, '>', "$opt_o.txt";
	}
	##############################################################################
	# This is a script to extract cds, 3'UTR, and 5'UTR
	# from a set of transcrits in a single fasta file
	#
	# You have to call this script with 3 arguments like this:
	#
	# dump_utr <transcripts.fa> <reference_cds.fa>
	#
	# DATA structure:
	# Each transcript has:
	#
	# {0} - PAC number provided by tophat-cufflinks
	# [1] - accession (phytozom.org)
	# [2] - reference CDS sequence
	# [3] - transcript sequence from tophat-cufflinks out
	# [4] - 3'UTR sequence (extracted, from [3])
	# [5] - 5'UTR
	# [6] - 3'UTR length (bp)
	# [7] - 5'UTR length (bp)
	# [8] - 3'UTR intron (if there is <1> if not <0>)
	# [9] - 5'UTR ORF?
	#
	# Switches:
	# -o out_base name
	# -t type of the output you can specify the output, like if you want to print
	# the accession numebers and the corresponding 3'utr length you have to
	# use -t <16>
	# -g gene - if you want to list ony a few genes you can use the -g switch
	# and specify the Name of the gene after the switch in a coma separated list,
	# although it is not an efficient function of the script.
	#
	##############################################################################



	#
	# Block to fill the hash vith the reference CDS data
	#

	{
	if ($reference_cds_filename){
	open my $REF_FH, "$reference_cds_filename" \|\| die "Cannot open reference! $reference_cds_filename\n";
	my @ref = <$REF_FH>;
	close $REF_FH;

	for (@ref) {
	chomp;
	if (/^>([a-z0-9A-Z.]+)\\|.*PACid:([0-9]{8})/ ) {
	$transcripts{$2}[1] = $1;
	$transcripts{$2}[0] = $2;
	}
	else {
	$transcripts{$2}[2] .= $_;
	}
	}
	}
	}


	{
	if ($transcripts_filename){
	open my $TRANSCRIPTS_FH, "$transcripts_filename" \|\| die "Cannot open file containing transcripts! $transcripts_filename\n";
	my @tra = <$TRANSCRIPTS_FH>;
	close $TRANSCRIPTS_FH;

	for (@tra) {
	chomp;
	if (/^>PAC:([0-9]{8})/) {}
	else {
	$transcripts{$1}[3] .= $_;
	}
	}
	}
	}

	{
	for my $key (sort keys %transcripts){
	if (defined $transcripts{$key}[3] && $transcripts{$key}[3] =~ m/^(\S+)$transcripts{$key}[2](\S+)$/){
	$transcripts{$key}[4] = $2;
	$transcripts{$key}[5] = $1;
	$transcripts{$key}[6] = length $transcripts{$key}[4];
	$transcripts{$key}[7] = length $transcripts{$key}[5];
	}
	}
	}

	#
	# So far, so good. :)
	# Now let do the output!
	#
	# the print_feature function awaits for a key and an array of numbers, to specify what to print,
	#

	sub print_feature{

	my %labels = (
	'0' => 'PAC number',
	'1' => 'Accession(phytozome.org)',
	'2' => 'Rreference CDS',
	'3' => 'Transcript',
	'4' => '3\'UTR',
	'5' => '5\'UTR',
	'6' => '3\'UTR length',
	'7' => '5\'UTR length',
	'8' => '3\'UTR intron',
	'9' => '5\'UTR ORF?',
	);

	my ($key, $options) = @_;
	my @options = split '', $options;
	print "\n";
	for (@options){
	if ($transcripts{$key}[$_]){
	print ">$labels{1}:$transcripts{$key}[1]\|$labels{0}:$transcripts{$key}[0]\|$labels{$_}:\n$transcripts{$key}[$_]\n";
	}
	}
	}

	if ($opt_g) {
	unless($opt_t){
	&print_feature($opt_g,234567);
	}
	if($opt_t){
	&print_feature($opt_g,$opt_t);
	}
	}

	else{
	for(sort keys %transcripts){
	unless ($opt_t){
	&print_feature($_,234567);
	}
	if ($opt_t) {
	&print_feature($_,$opt_t)
	}
	}
	}



	__END__