Perl script that, given an input file of DNA sequences of transcripts, infers their translations.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
=head1 NAME | |
translate_spliced_dna.pl | |
=head1 SYNOPSIS | |
translate_spliced_dna.pl spliced_dna translated_dna outputdir | |
where spliced_dna is the input fasta file of spliced DNA sequences for transcripts, | |
translated_dna is the output fasta file of amino acid translations of transcripts, | |
outputdir is the output directory for writing output files. | |
=head1 DESCRIPTION | |
This script reads in a fasta file of transcript DNA sequences (<spliced_dna>) and translates | |
each transcript. It prints out a fasta file of amino acid sequences for transcripts (<translated_dna>). | |
Note: this expects different genes to have unique names: two genes on different scaffolds cannot have the | |
same name. | |
=head1 VERSION | |
Perl script last edited 2-Apr-2013. | |
=head1 CONTACT | |
alc@sanger.ac.uk (Avril Coghlan) | |
=cut | |
# | |
# Perl script translate_spliced_dna.pl | |
# Written by Avril Coghlan (alc@sanger.ac.uk) | |
# 2-Apr-13. | |
# Last edited 2-Apr-2013. | |
# SCRIPT SYNOPSIS: translate_spliced_dna.pl: given an input file of DNA sequences of transcripts, infers their translations. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 3) | |
{ | |
print "Usage of translate_spliced_dna.pl\n\n"; | |
print "perl translate_spliced_dna_.pl <spliced_dna> <translated_dna> <outputdir>\n"; | |
print "where <spliced_dna> is the input fasta file of spliced DNA sequences for transcripts,\n"; | |
print " <translated_dna> is the output fasta file of amino acid translations of transcripts,\n"; | |
print " <outputdir> is the output directory for writing output files.\n"; | |
print "For example, >perl -w translate_spliced_dna.pl spliced_dna translated_dna /lustre/scratch108/parasites/alc/50HGI_maker_translations\n"; | |
exit; | |
} | |
# FIND THE NAME OF THE INPUT FASTA FILE: | |
$spliced_dna = $ARGV[0]; | |
# FIND THE NAME OF THE OUTPUT FASTA FILE: | |
$translated_dna = $ARGV[1]; | |
# FIND THE NAME OF THE OUTPUT DIRECTORY: | |
$outputdir = $ARGV[2]; | |
# TEST SUBROUTINES: | |
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING. | |
&test_print_to_output($outputdir); | |
&test_print_error; | |
&test_read_assembly($outputdir); | |
&test_write_out_translations($outputdir); | |
&test_translate; | |
&test_run_main_program($outputdir); | |
print STDERR "Finished tests: now running main code...\n"; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
($errorcode,$errormsg) = &run_main_program($outputdir,$spliced_dna,$translated_dna); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# TEST &run_main_program | |
sub test_run_main_program | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO | |
my $input; # INPUT FILE OF SPLICED DNA SEQUENCES | |
my $output; # OUTPUT FILE OF TRANSLATIONS | |
my $expected_output; # FILE CONTAINING EXPECTED CONTENT OF $output | |
my $differences; # DIFFERENCES BETWEEN $output AND $expected_output | |
my $length_differences; # LENGTH OF $differences | |
my $line; # | |
my @temp; # | |
($input,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT,">$input") || die "ERROR: test_run_main_program: cannot open $input\n"; | |
print INPUT ">seq1\n"; | |
print INPUT "ATGCAGTAA\n"; | |
print INPUT ">seq2\n"; | |
print INPUT "ATGGGTGCAACATTGTCTTCAACTTCTTCTTCACCATCTTCTATTGATGGAAGAGAAAATTTGATAAGAAGAAATAACACATCAACATACAATCATTTATTTGTCACACCTGGTAATACAAAAAATAAGAATAATAATTTTCTATTTGATTCAAAAAATCGGCACTCATCTCTTTCTTCATCTTCTTCTATGCTTGGTATTGCATGGAACTTTGCCAAAAAAACAACTACGGGAATTCCTAATAAAGAAAAAAGAAATATAAATAGATTATCATCATCTAGTTCAGTATCATCTGCTTTATCAAAATCTCATGATTCTGCAACTTCATTAACTTTTAATTCAAGTTTTTCTATTAATTCATCAAGTGACCAATTATCAGCATACTCTTCTAGTGCATCAAATTCTTCATCAGGTGATAGTGGTATTGGAATATCTAGAAAAATAACAATATCTAATCAATGTTATCATGAAATTGATGGCAAAAATAACAATAATAATATTGTAAATAATGTACCTACTTTAAATACTAATGAAAAATTTATTTGTATTAGAAAAAAAGAATCAATTTTATCAAGTAAAATAATGTCCGTACCATTAAATAATATAAAAAAAGGAAGTTCTAAAGTTCGTGATCATATATTAGCAATGACAAGATCAAAATCAACAAATATGCGTACAGAAAAAAATGGAAACATAATTATAAATGGTAATAATTATAATATTGAAAATAAATTTAATGATAATTTAAAATTATCATCAACAACTTATGATTATGAAAAAAAATTAAATCAAAATATAGAAAATAATATATCTAAATCATTGAAAAATGTTACTTTAACAAATCGAAATAATAATTCAAATTTATATAATACTGCAATATTTAATTTCGGCATTAAAAATGATAGTAATGTTATAGATAAAACAAGAAAAAAAACAATTATACAAGCATCAACAACTGAATTATTAAAAGGTGTTTCATTATTAATTACAACTAAATGTTCCCATAAAGTACCAGATTTTTTTGCTAATCAATTAACAATGTGGTTAAGAAGTGTTGATAGATCTTTAATTGTTCAGGGTTGGCAAGATATCGCATTTTTAAATCCAGCTAACATGGTATTTTTTTTTATGCTTTTAAGATCAATGCTTAATGATGAAGAAACATTTCCTGTTAATAATTTGGAAGACCTTCAAATGATTGTTTTTACTTGTTTATTTATTAGTTACAGTTACATGGGTAATGAAATTAGTTATCCGTTAAAACCATTTATTTCACAACAGGATCGCTCAAAATTTTGGGATATGTGTGTTCAAATAGTTAATAAATATTCTGGGGATATGTTACAATTAAATACATCAGCTTCATTTTTTACACAAGTTTTTAGTGATCTCAAAAATTTTACAACTATTGCTAACAATTAA\n"; | |
close(INPUT); | |
# WRITE OUT THE OUTPUT FILE OF TRANSLATED SEQUENCES: | |
($output,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
@temp = split(/\//,$output); | |
$output = $temp[$#temp]; | |
($errorcode,$errormsg) = &run_main_program($outputdir,$input,$output); | |
if ($errorcode != 0) { print STDERR "ERROR: test_run_main_program: failed test1\n"; exit;} | |
($expected_output,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(EXPECTED,">$expected_output") || die "ERROR: test_run_main_program: cannot open $expected_output\n"; | |
print EXPECTED ">seq1\n"; | |
print EXPECTED "MQ*\n"; | |
print EXPECTED ">seq2\n"; | |
print EXPECTED "MGATLSSTSSSPSSIDGRENLIRRNNTSTYNHLFVTPGNTKNKNNNFLFDSKNRHSSLSS\n"; | |
print EXPECTED "SSSMLGIAWNFAKKTTTGIPNKEKRNINRLSSSSSVSSALSKSHDSATSLTFNSSFSINS\n"; | |
print EXPECTED "SSDQLSAYSSSASNSSSGDSGIGISRKITISNQCYHEIDGKNNNNNIVNNVPTLNTNEKF\n"; | |
print EXPECTED "ICIRKKESILSSKIMSVPLNNIKKGSSKVRDHILAMTRSKSTNMRTEKNGNIIINGNNYN\n"; | |
print EXPECTED "IENKFNDNLKLSSTTYDYEKKLNQNIENNISKSLKNVTLTNRNNNSNLYNTAIFNFGIKN\n"; | |
print EXPECTED "DSNVIDKTRKKTIIQASTTELLKGVSLLITTKCSHKVPDFFANQLTMWLRSVDRSLIVQG\n"; | |
print EXPECTED "WQDIAFLNPANMVFFFMLLRSMLNDEETFPVNNLEDLQMIVFTCLFISYSYMGNEISYPL\n"; | |
print EXPECTED "KPFISQQDRSKFWDMCVQIVNKYSGDMLQLNTSASFFTQVFSDLKNFTTIANN*\n"; | |
close(EXPECTED); | |
$differences = ""; | |
open(TEMP,"diff $output $expected_output |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_run_main_program: failed test1 (output $output expected_output $expected_output)\n"; exit;} | |
system "rm -f $output"; | |
system "rm -f $expected_output"; | |
system "rm -f $input"; | |
} | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
sub run_main_program | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN. | |
my $spliced_dna = $_[1]; # FASTA FILE OF SPLICED DNA SEQUENCES | |
my $translated_dna = $_[2]; # OUTPUT FASTA FILE OF TRANSLATIONS | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR. | |
my $errormsg = 'none';# RETURNED AS 'none' IF THERE IS NO ERROR. | |
my $SEQ; # HASH TABLE OF SPLICED DNA SEQUENCES | |
# READ IN THE FASTA FILE OF SPLICED DNA SEQUENCES: | |
($SEQ,$errorcode,$errormsg) = &read_assembly($spliced_dna); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# WRITE OUT THE OUTPUT FILE OF TRANSLATED SEQUENCES: | |
($errorcode,$errormsg) = &write_out_translations($SEQ,$translated_dna,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
return($errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &write_out_translations | |
sub test_write_out_translations | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO | |
my %SEQ = (); # HASH TABLE OF SPLICED DNA SEQUENCES | |
my $output; # OUTPUT FILE OF TRANSLATIONS | |
my $expected_output; # FILE CONTAINING EXPECTED CONTENT OF $output | |
my $differences; # DIFFERENCES BETWEEN $output AND $expected_output | |
my $length_differences; # LENGTH OF $differences | |
my $line; # | |
my @temp; # | |
$SEQ{"seq1"} = "ATGCAGTAA"; | |
$SEQ{"seq2"} = "ATGGGTGCAACATTGTCTTCAACTTCTTCTTCACCATCTTCTATTGATGGAAGAGAAAATTTGATAAGAAGAAATAACACATCAACATACAATCATTTATTTGTCACACCTGGTAATACAAAAAATAAGAATAATAATTTTCTATTTGATTCAAAAAATCGGCACTCATCTCTTTCTTCATCTTCTTCTATGCTTGGTATTGCATGGAACTTTGCCAAAAAAACAACTACGGGAATTCCTAATAAAGAAAAAAGAAATATAAATAGATTATCATCATCTAGTTCAGTATCATCTGCTTTATCAAAATCTCATGATTCTGCAACTTCATTAACTTTTAATTCAAGTTTTTCTATTAATTCATCAAGTGACCAATTATCAGCATACTCTTCTAGTGCATCAAATTCTTCATCAGGTGATAGTGGTATTGGAATATCTAGAAAAATAACAATATCTAATCAATGTTATCATGAAATTGATGGCAAAAATAACAATAATAATATTGTAAATAATGTACCTACTTTAAATACTAATGAAAAATTTATTTGTATTAGAAAAAAAGAATCAATTTTATCAAGTAAAATAATGTCCGTACCATTAAATAATATAAAAAAAGGAAGTTCTAAAGTTCGTGATCATATATTAGCAATGACAAGATCAAAATCAACAAATATGCGTACAGAAAAAAATGGAAACATAATTATAAATGGTAATAATTATAATATTGAAAATAAATTTAATGATAATTTAAAATTATCATCAACAACTTATGATTATGAAAAAAAATTAAATCAAAATATAGAAAATAATATATCTAAATCATTGAAAAATGTTACTTTAACAAATCGAAATAATAATTCAAATTTATATAATACTGCAATATTTAATTTCGGCATTAAAAATGATAGTAATGTTATAGATAAAACAAGAAAAAAAACAATTATACAAGCATCAACAACTGAATTATTAAAAGGTGTTTCATTATTAATTACAACTAAATGTTCCCATAAAGTACCAGATTTTTTTGCTAATCAATTAACAATGTGGTTAAGAAGTGTTGATAGATCTTTAATTGTTCAGGGTTGGCAAGATATCGCATTTTTAAATCCAGCTAACATGGTATTTTTTTTTATGCTTTTAAGATCAATGCTTAATGATGAAGAAACATTTCCTGTTAATAATTTGGAAGACCTTCAAATGATTGTTTTTACTTGTTTATTTATTAGTTACAGTTACATGGGTAATGAAATTAGTTATCCGTTAAAACCATTTATTTCACAACAGGATCGCTCAAAATTTTGGGATATGTGTGTTCAAATAGTTAATAAATATTCTGGGGATATGTTACAATTAAATACATCAGCTTCATTTTTTACACAAGTTTTTAGTGATCTCAAAAATTTTACAACTATTGCTAACAATTAA"; | |
# WRITE OUT THE OUTPUT FILE OF TRANSLATED SEQUENCES: | |
($output,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
@temp = split(/\//,$output); | |
$output = $temp[$#temp]; | |
($errorcode,$errormsg) = &write_out_translations(\%SEQ,$output,$outputdir); | |
if ($errorcode != 0) { print STDERR "ERROR: test_write_out_translation: failed test1\n"; exit;} | |
($expected_output,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(EXPECTED,">$expected_output") || die "ERROR: test_write_out_translations: cannot open $expected_output\n"; | |
print EXPECTED ">seq1\n"; | |
print EXPECTED "MQ*\n"; | |
print EXPECTED ">seq2\n"; | |
print EXPECTED "MGATLSSTSSSPSSIDGRENLIRRNNTSTYNHLFVTPGNTKNKNNNFLFDSKNRHSSLSS\n"; | |
print EXPECTED "SSSMLGIAWNFAKKTTTGIPNKEKRNINRLSSSSSVSSALSKSHDSATSLTFNSSFSINS\n"; | |
print EXPECTED "SSDQLSAYSSSASNSSSGDSGIGISRKITISNQCYHEIDGKNNNNNIVNNVPTLNTNEKF\n"; | |
print EXPECTED "ICIRKKESILSSKIMSVPLNNIKKGSSKVRDHILAMTRSKSTNMRTEKNGNIIINGNNYN\n"; | |
print EXPECTED "IENKFNDNLKLSSTTYDYEKKLNQNIENNISKSLKNVTLTNRNNNSNLYNTAIFNFGIKN\n"; | |
print EXPECTED "DSNVIDKTRKKTIIQASTTELLKGVSLLITTKCSHKVPDFFANQLTMWLRSVDRSLIVQG\n"; | |
print EXPECTED "WQDIAFLNPANMVFFFMLLRSMLNDEETFPVNNLEDLQMIVFTCLFISYSYMGNEISYPL\n"; | |
print EXPECTED "KPFISQQDRSKFWDMCVQIVNKYSGDMLQLNTSASFFTQVFSDLKNFTTIANN*\n"; | |
close(EXPECTED); | |
$differences = ""; | |
open(TEMP,"diff $output $expected_output |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_write_out_translation: failed test1 (output $output expected_output $expected_output)\n"; exit;} | |
system "rm -f $output"; | |
system "rm -f $expected_output"; | |
} | |
#------------------------------------------------------------------# | |
# WRITE OUT THE OUTPUT FILE OF TRANSLATED SEQUENCES: | |
sub write_out_translations | |
{ | |
my $SEQ = $_[0]; # HASH TABLE WITH SPLICED DNA SEQUENCES FOR TRANSCRIPTS | |
my $translated_dna = $_[1]; # OUTPUT FILE | |
my $outputdir = $_[2]; # DIRECTORY TO PUT OUTPUT FILES INTO. | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR. | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR. | |
my $seqname; # NAME OF A TRANSCRIPT | |
my $seq; # DNA SEQUENCE OF A TRANSCRIPT | |
# OPEN THE OUTPUT FILE: | |
$translated_dna = $outputdir."/".$translated_dna; | |
open(OUTPUT,">$translated_dna") || die "ERROR: write_out_translations: cannot open $translated_dna\n"; | |
close(OUTPUT); | |
# LOOP THROUGH ALL THE DNA SEQUENCES: | |
foreach $seqname (keys %{$SEQ}) | |
{ | |
$seq = $SEQ->{$seqname}; | |
# TRANSLATE THIS SEQUENCE: | |
($seq,$errorcode,$errormsg) = &translate($seq); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# WRITE OUT THE SEQUENCE IN THE OUTPUT FASTA FILE: | |
($errorcode,$errormsg) = &print_to_output($translated_dna,$seq,$seqname); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
} | |
return($errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &translate | |
sub test_translate | |
{ | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR | |
my $seq; # TRANSLATED SEQUENCE | |
($seq,$errorcode,$errormsg) = &translate("ATGCAGTAA"); | |
if ($errorcode != 0 || $seq ne "MQ*") { print STDERR "ERROR: test_translate: failed test1 (seq $seq errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
($seq,$errorcode,$errormsg) = &translate("ATGGGTGCAACATTGTCTTCAACTTCTTCTTCACCATCTTCTATTGATGGAAGAGAAAATTTGATAAGAAGAAATAACACATCAACATACAATCATTTATTTGTCACACCTGGTAATACAAAAAATAAGAATAATAATTTTCTATTTGATTCAAAAAATCGGCACTCATCTCTTTCTTCATCTTCTTCTATGCTTGGTATTGCATGGAACTTTGCCAAAAAAACAACTACGGGAATTCCTAATAAAGAAAAAAGAAATATAAATAGATTATCATCATCTAGTTCAGTATCATCTGCTTTATCAAAATCTCATGATTCTGCAACTTCATTAACTTTTAATTCAAGTTTTTCTATTAATTCATCAAGTGACCAATTATCAGCATACTCTTCTAGTGCATCAAATTCTTCATCAGGTGATAGTGGTATTGGAATATCTAGAAAAATAACAATATCTAATCAATGTTATCATGAAATTGATGGCAAAAATAACAATAATAATATTGTAAATAATGTACCTACTTTAAATACTAATGAAAAATTTATTTGTATTAGAAAAAAAGAATCAATTTTATCAAGTAAAATAATGTCCGTACCATTAAATAATATAAAAAAAGGAAGTTCTAAAGTTCGTGATCATATATTAGCAATGACAAGATCAAAATCAACAAATATGCGTACAGAAAAAAATGGAAACATAATTATAAATGGTAATAATTATAATATTGAAAATAAATTTAATGATAATTTAAAATTATCATCAACAACTTATGATTATGAAAAAAAATTAAATCAAAATATAGAAAATAATATATCTAAATCATTGAAAAATGTTACTTTAACAAATCGAAATAATAATTCAAATTTATATAATACTGCAATATTTAATTTCGGCATTAAAAATGATAGTAATGTTATAGATAAAACAAGAAAAAAAACAATTATACAAGCATCAACAACTGAATTATTAAAAGGTGTTTCATTATTAATTACAACTAAATGTTCCCATAAAGTACCAGATTTTTTTGCTAATCAATTAACAATGTGGTTAAGAAGTGTTGATAGATCTTTAATTGTTCAGGGTTGGCAAGATATCGCATTTTTAAATCCAGCTAACATGGTATTTTTTTTTATGCTTTTAAGATCAATGCTTAATGATGAAGAAACATTTCCTGTTAATAATTTGGAAGACCTTCAAATGATTGTTTTTACTTGTTTATTTATTAGTTACAGTTACATGGGTAATGAAATTAGTTATCCGTTAAAACCATTTATTTCACAACAGGATCGCTCAAAATTTTGGGATATGTGTGTTCAAATAGTTAATAAATATTCTGGGGATATGTTACAATTAAATACATCAGCTTCATTTTTTACACAAGTTTTTAGTGATCTCAAAAATTTTACAACTATTGCTAACAATTAA"); | |
if ($errorcode != 0 || $seq ne "MGATLSSTSSSPSSIDGRENLIRRNNTSTYNHLFVTPGNTKNKNNNFLFDSKNRHSSLSSSSSMLGIAWNFAKKTTTGIPNKEKRNINRLSSSSSVSSALSKSHDSATSLTFNSSFSINSSSDQLSAYSSSASNSSSGDSGIGISRKITISNQCYHEIDGKNNNNNIVNNVPTLNTNEKFICIRKKESILSSKIMSVPLNNIKKGSSKVRDHILAMTRSKSTNMRTEKNGNIIINGNNYNIENKFNDNLKLSSTTYDYEKKLNQNIENNISKSLKNVTLTNRNNNSNLYNTAIFNFGIKNDSNVIDKTRKKTIIQASTTELLKGVSLLITTKCSHKVPDFFANQLTMWLRSVDRSLIVQGWQDIAFLNPANMVFFFMLLRSMLNDEETFPVNNLEDLQMIVFTCLFISYSYMGNEISYPLKPFISQQDRSKFWDMCVQIVNKYSGDMLQLNTSASFFTQVFSDLKNFTTIANN*") { print STDERR "ERROR: test_translate: failed test2\n"; exit;} | |
# TEST ERRORCODE=1 (NON-ACGTN BASES): | |
($seq,$errorcode,$errormsg) = &translate("ATGCAG-TAA"); | |
if ($errorcode !=1) { print STDERR "ERROR: test_translate: failed test3 (seq $seq errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE TO TRANSLATE A DNA SEQUENCE TO A PROTEIN SEQUENCE: | |
sub translate | |
{ | |
my $dna = $_[0]; # INPUT DNA SEQUENCE | |
my $protein = ""; # PROTEIN SEQUENCE | |
my $triplet; # A TRIPLET OF THE SEQUENCE | |
my $aa; # AN AMINO ACID | |
my $length; # LENGTH OF THE SEQUENCE | |
my $i; # | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $first_base; # FIRST BASE OF TRIPLET | |
my $second_base; # SECOND BASE OF TRIPLET | |
my $third_base; # THIRD BASE OF TRIPLET | |
my $protein_length_bp; # LENGTH OF THE PROTEIN SEQUENCE IN BASE-PAIRS | |
$dna =~ tr/a-z/A-Z/; | |
$length = length($dna); | |
$protein = ""; | |
for ($i = 0; $i <= $length-3; $i = $i+3) | |
{ | |
$triplet = substr($dna,$i,3); | |
$first_base = substr($triplet,0,1); | |
$second_base = substr($triplet,1,1); | |
$third_base = substr($triplet,2,1); | |
if (($first_base ne 'A' && $first_base ne 'C' && $first_base ne 'G' && $first_base ne 'T' && $first_base ne 'N') || | |
($second_base ne 'A' && $second_base ne 'C' && $second_base ne 'G' && $second_base ne 'T' && $second_base ne 'N') || | |
($third_base ne 'A' && $third_base ne 'C' && $third_base ne 'G' && $third_base ne 'T' && $third_base ne 'N')) | |
{ | |
$errormsg = "ERROR: translate: triplet $triplet contains non-ACGTN bases (first_base $first_base second_base $second_base third_base $third_base)\n"; | |
$errorcode = 1; # ERRORCODE=1 (TESTED FOR) | |
return($protein,$errorcode,$errormsg); | |
} | |
$aa = "none"; | |
if ($triplet eq 'TTT') { $aa = 'F';} | |
if ($triplet eq 'TTC') { $aa = 'F';} | |
if ($triplet eq 'TTA') { $aa = 'L';} | |
if ($triplet eq 'TTG') { $aa = 'L';} # 4 | |
if ($triplet eq 'TCT') { $aa = 'S';} | |
if ($triplet eq 'TCC') { $aa = 'S';} | |
if ($triplet eq 'TCA') { $aa = 'S';} | |
if ($triplet eq 'TCG') { $aa = 'S';} # 8 | |
if ($triplet eq 'TAT') { $aa = 'Y';} | |
if ($triplet eq 'TAC') { $aa = 'Y';} | |
if ($triplet eq 'TAA') { $aa = '*';} | |
if ($triplet eq 'TAG') { $aa = '*';} # 12 | |
if ($triplet eq 'TGT') { $aa = 'C';} | |
if ($triplet eq 'TGC') { $aa = 'C';} | |
if ($triplet eq 'TGA') { $aa = '*';} | |
if ($triplet eq 'TGG') { $aa = 'W';} # 16 | |
if ($triplet eq 'CTT') { $aa = 'L';} | |
if ($triplet eq 'CTC') { $aa = 'L';} | |
if ($triplet eq 'CTA') { $aa = 'L';} | |
if ($triplet eq 'CTG') { $aa = 'L';} # 20 | |
if ($triplet eq 'CCT') { $aa = 'P';} | |
if ($triplet eq 'CCC') { $aa = 'P';} | |
if ($triplet eq 'CCA') { $aa = 'P';} | |
if ($triplet eq 'CCG') { $aa = 'P';} # 24 | |
if ($triplet eq 'CAT') { $aa = 'H';} | |
if ($triplet eq 'CAC') { $aa = 'H';} | |
if ($triplet eq 'CAA') { $aa = 'Q';} | |
if ($triplet eq 'CAG') { $aa = 'Q';} # 28 | |
if ($triplet eq 'CGT') { $aa = 'R';} | |
if ($triplet eq 'CGC') { $aa = 'R';} | |
if ($triplet eq 'CGA') { $aa = 'R';} | |
if ($triplet eq 'CGG') { $aa = 'R';} # 32 | |
if ($triplet eq 'ATT') { $aa = 'I';} | |
if ($triplet eq 'ATC') { $aa = 'I';} | |
if ($triplet eq 'ATA') { $aa = 'I';} | |
if ($triplet eq 'ATG') { $aa = 'M';} # 36 | |
if ($triplet eq 'ACT') { $aa = 'T';} | |
if ($triplet eq 'ACC') { $aa = 'T';} | |
if ($triplet eq 'ACA') { $aa = 'T';} | |
if ($triplet eq 'ACG') { $aa = 'T';} # 40 | |
if ($triplet eq 'AAT') { $aa = 'N';} | |
if ($triplet eq 'AAC') { $aa = 'N';} | |
if ($triplet eq 'AAA') { $aa = 'K';} | |
if ($triplet eq 'AAG') { $aa = 'K';} # 44 | |
if ($triplet eq 'AGT') { $aa = 'S';} | |
if ($triplet eq 'AGC') { $aa = 'S';} | |
if ($triplet eq 'AGA') { $aa = 'R';} | |
if ($triplet eq 'AGG') { $aa = 'R';} # 48 | |
if ($triplet eq 'GTT') { $aa = 'V';} | |
if ($triplet eq 'GTC') { $aa = 'V';} | |
if ($triplet eq 'GTA') { $aa = 'V';} | |
if ($triplet eq 'GTG') { $aa = 'V';} # 52 | |
if ($triplet eq 'GCT') { $aa = 'A';} | |
if ($triplet eq 'GCC') { $aa = 'A';} | |
if ($triplet eq 'GCA') { $aa = 'A';} | |
if ($triplet eq 'GCG') { $aa = 'A';} # 56 | |
if ($triplet eq 'GAT') { $aa = 'D';} | |
if ($triplet eq 'GAC') { $aa = 'D';} | |
if ($triplet eq 'GAA') { $aa = 'E';} | |
if ($triplet eq 'GAG') { $aa = 'E';} # 60 | |
if ($triplet eq 'GGT') { $aa = 'G';} | |
if ($triplet eq 'GGC') { $aa = 'G';} | |
if ($triplet eq 'GGA') { $aa = 'G';} | |
if ($triplet eq 'GGG') { $aa = 'G';} # 64 | |
if ($aa eq "none") { $aa = 'X';} # FOR AN UNKNOWN AMINO ACIDS JUST PUT X. | |
$protein = $protein.$aa; | |
} | |
$protein_length_bp = length($protein) * 3; | |
# IF THERE WERE 1 OR 2 OVERLHANGING BASES AT THE END OF THE SPLICED GENE DNA: | |
if ($protein_length_bp < $length) | |
{ | |
$protein = $protein."X"; | |
} | |
return($protein,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# WRITE OUT THE SEQUENCE IN THE OUTPUT FASTA FILE (THIS APPENDS TO AN EXISTING OUTPUT FILE): | |
sub print_to_output | |
{ | |
my $output_fasta = $_[0]; # OUTPUT FASTA FILE | |
my $seq = $_[1]; # SEQUENCE | |
my $name = $_[2]; # NAME OF THE SEQUENCE | |
my $length; # LENGTH OF THE SEQUENCE | |
my $offset; # | |
my $a_line; # A LINE OF THE SEQUENCE | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
# MAKE AN OUTPUT FASTA FILE: | |
open(OUTPUT,">>$output_fasta") || die "ERROR: print_to_output: cannot open output_fasta $output_fasta\n"; | |
print OUTPUT ">$name\n"; | |
$length = length($seq); | |
$offset = 0; | |
while ($offset < $length) | |
{ | |
$a_line = substr($seq,$offset,60); | |
print OUTPUT "$a_line\n"; | |
$offset = $offset + 60; | |
} | |
# CLOSE THE OUTPUT FILE: | |
close(OUTPUT); | |
return($errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &print_to_output | |
sub test_print_to_output | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES IN | |
my $output_fasta; # OUTPUT FILE | |
my $seq; # SEQUENCE | |
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR | |
my $expected_output_fasta; # FILE CONTAINING EXPECTED CONTENTS OF $output_fasta | |
my $differences; # DIFFERENCES BETWEEN $output_fasta AND $expected_output_fasta | |
my $line; # | |
my $length_differences; # LENGTH OF $differences | |
($output_fasta,$errorcode,$errormsg) = &make_filename($outputdir); | |
open(OUTPUT,">$output_fasta") || die "ERROR: test_print_to_output: cannot open $output_fasta\n"; | |
close(OUTPUT); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
$seq = "AAAAATTTTTCCCCCGGGGGAAAAATTTTTCCCCCGGGGGAAAAATTTTTCCCCCGGGGGAAAAATTTTTCCCCCGGGGGAAAAATTTTTCCCCCGGGGGAAAAATTTTTCCCCCGGGGGAAAAATTTTTCCCCCGGGGG"; | |
($errorcode,$errormsg) = &print_to_output($output_fasta,$seq,"seq1"); | |
if ($errorcode != 0) { print STDERR "ERROR: test_print_to_output: failed test1\n"; exit;} | |
($expected_output_fasta,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(EXPECTED_OUTPUT,">$expected_output_fasta") || die "ERROR: test_print_to_output: cannot open $expected_output_fasta\n"; | |
print EXPECTED_OUTPUT ">seq1\n"; | |
print EXPECTED_OUTPUT "AAAAATTTTTCCCCCGGGGGAAAAATTTTTCCCCCGGGGGAAAAATTTTTCCCCCGGGGG\n"; | |
print EXPECTED_OUTPUT "AAAAATTTTTCCCCCGGGGGAAAAATTTTTCCCCCGGGGGAAAAATTTTTCCCCCGGGGG\n"; | |
print EXPECTED_OUTPUT "AAAAATTTTTCCCCCGGGGG\n"; | |
close(EXPECTED_OUTPUT); | |
$differences = ""; | |
open(TEMP,"diff $output_fasta $expected_output_fasta |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_print_to_output: failed test1 (output_fasta $output_fasta expected_output_fasta $expected_output_fasta)\n"; exit;} | |
system "rm -f $output_fasta"; | |
system "rm -f $expected_output_fasta"; | |
} | |
#------------------------------------------------------------------# | |
# TEST &read_assembly | |
sub test_read_assembly | |
{ | |
my $outputdir = $_[0]; # DIRECTORY WHERE WE CAN WRITE OUTPUT FILES | |
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES | |
my $assembly; # TEMPORARY ASSEMBLY FILE NAME | |
my $SEQ; # HASH TABLE WITH SEQUENCES OF SCAFFOLDS | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR | |
$random_number = rand(); | |
$assembly = $outputdir."/tmp".$random_number; | |
open(ASSEMBLY,">$assembly") || die "ERROR: test_read_assembly: cannot open $assembly\n"; | |
print ASSEMBLY ">seq1\n"; | |
print ASSEMBLY "AAAAA\n"; | |
print ASSEMBLY ">seq2\n"; | |
print ASSEMBLY "AAAAATTTTT\n"; | |
print ASSEMBLY "\n"; | |
print ASSEMBLY ">seq3\n"; | |
print ASSEMBLY "AAAAA\n"; | |
print ASSEMBLY "TTTTT\n"; | |
print ASSEMBLY ">seq4\n"; | |
print ASSEMBLY ">seq5\n"; | |
print ASSEMBLY " AAA AA \n"; | |
close(ASSEMBLY); | |
($SEQ,$errorcode,$errormsg) = &read_assembly($assembly); | |
if ($SEQ->{'seq1'} ne 'AAAAA' || $SEQ->{'seq2'} ne 'AAAAATTTTT' || $SEQ->{'seq3'} ne 'AAAAATTTTT' || defined($SEQ->{'seq4'}) || | |
$SEQ->{'seq5'} ne 'AAAAA' || $errorcode != 0) | |
{ | |
print STDERR "ERROR: test_read_assembly: failed test1\n"; | |
exit; | |
} | |
system "rm -f $assembly"; | |
$random_number = rand(); | |
$assembly = $outputdir."/tmp".$random_number; | |
open(ASSEMBLY,">$assembly") || die "ERROR: test_assembly: cannot open $assembly\n"; | |
print ASSEMBLY ">seq1\n"; | |
print ASSEMBLY "AAAAA\n"; | |
print ASSEMBLY ">seq2\n"; | |
print ASSEMBLY "AAAAATTTTT\n"; | |
print ASSEMBLY ">seq1\n"; | |
print ASSEMBLY "AAAAA\n"; | |
close(ASSEMBLY); | |
($SEQ,$errorcode,$errormsg) = &read_assembly($assembly); | |
if ($errorcode != 4) { print STDERR "ERROR: test_assembly: failed test2\n"; exit;} | |
system "rm -f $assembly"; | |
$random_number = rand(); | |
$assembly = $outputdir."/tmp".$random_number; | |
open(ASSEMBLY,">$assembly") || die "ERROR: test_assembly: cannot open $assembly\n"; | |
print ASSEMBLY "AAAAA\n"; | |
print ASSEMBLY ">seq2\n"; | |
print ASSEMBLY "AAAAATTTTT\n"; | |
print ASSEMBLY ">seq1\n"; | |
print ASSEMBLY "AAAAA\n"; | |
close(ASSEMBLY); | |
($SEQ,$errorcode,$errormsg) = &read_assembly($assembly); | |
if ($errorcode != 5) { print STDERR "ERROR: test_assembly: failed test2\n"; exit;} | |
system "rm -f $assembly"; | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE ASSEMBLY FILE: | |
# SUBROUTINE SYNOPSIS: read_assembly(): read sequences in a fasta file into a hash | |
sub read_assembly | |
{ | |
my $input_assembly = $_[0]; # THE INPUT ASSEMBLY FILE | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $line; # | |
my $scaffold = "none";# NAME OF SCAFFOLD | |
my @temp; # | |
my $seq; # SEQUENCE OF SCAFFOLD | |
my %SEQ = (); # HASH TABLE FOR STORING SCAFFOLD SEQUENCE | |
$seq = ""; | |
open(ASSEMBLY,"$input_assembly") || die "ERROR: read_assembly: cannot open $input_assembly\n"; | |
while(<ASSEMBLY>) | |
{ | |
$line = $_; | |
chomp $line; | |
if (substr($line,0,1) eq '>') | |
{ | |
if ($seq ne "") | |
{ | |
if ($scaffold eq 'none') | |
{ | |
$errormsg = "ERROR: read_assembly: do not know name of scaffold\n"; | |
$errorcode = 5; # ERRORCODE=5 (TESTED FOR) | |
return(\%SEQ,$errorcode,$errormsg); | |
} | |
if ($SEQ{$scaffold}) | |
{ | |
$errormsg = "ERROR: read_assembly: already know sequence for scaffold $scaffold\n"; | |
$errorcode = 4; # ERRORCODE=4 (TESTED FOR) | |
return(\%SEQ,$errorcode,$errormsg); | |
} | |
$SEQ{$scaffold}= $seq; | |
$seq = ""; | |
} | |
@temp = split(/\s+/,$line); | |
$scaffold = $temp[0]; | |
$scaffold = substr($scaffold,1,length($scaffold)-1); | |
} | |
else | |
{ | |
$line =~ s/\s+//g; # REMOVE SPACES | |
$seq = $seq.$line; | |
} | |
} | |
close(ASSEMBLY); | |
if ($seq ne "") | |
{ | |
if ($scaffold eq 'none') | |
{ | |
$errormsg = "ERROR: read_assembly: do not know name of scaffold\n"; | |
$errorcode = 5; # ERRORCODE=5 (TESTED FOR) | |
return(\%SEQ,$errorcode,$errormsg); | |
} | |
if ($SEQ{$scaffold}) | |
{ | |
$errormsg = "ERROR: read_assembly: already know sequence for scaffold $scaffold\n"; | |
$errorcode = 4; # ERRORCODE=4 (TESTED FOR) | |
return(\%SEQ,$errorcode,$errormsg); | |
} | |
$SEQ{$scaffold} = $seq; | |
$seq = ""; | |
} | |
return(\%SEQ,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE TO MAKE A FILE NAME FOR A TEMPORARY FILE: | |
sub make_filename | |
{ | |
my $outputdir = $_[0]; # OUTPUT DIRECTORY TO WRITE TEMPORARY FILE NAME TO | |
my $found_name = 0; # SAYS WHETHER WE HAVE FOUND A FILE NAME YET | |
my $filename = "none";# NEW FILE NAME TO USE | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $poss_filename; # POSSIBLE FILE NAME TO USE | |
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAME | |
while($found_name == 0) | |
{ | |
$random_number = rand(); | |
$poss_filename = $outputdir."/tmp".$random_number; | |
if (!(-e $poss_filename)) | |
{ | |
$filename = $poss_filename; | |
$found_name = 1; | |
} | |
} | |
if ($found_name == 0 || $filename eq 'none') | |
{ | |
$errormsg = "ERROR: make_filename: found_name $found_name filename $filename\n"; | |
$errorcode = 6; # ERRORCODE=6 | |
return($filename,$errorcode,$errormsg); | |
} | |
return($filename,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &print_error | |
sub test_print_error | |
{ | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR | |
($errormsg,$errorcode) = &print_error(45,45,1); | |
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message','My error message',1); | |
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;} | |
($errormsg,$errorcode) = &print_error('none',45,1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message', 0, 1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;} | |
} | |
#------------------------------------------------------------------# | |
# PRINT OUT AN ERROR MESSAGE AND EXIT. | |
sub print_error | |
{ | |
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED. | |
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED. | |
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT | |
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; | |
exit; | |
} | |
} | |
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/)) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; | |
exit; | |
} | |
} | |
if ($errormsg eq 'none' || $errorcode == 0) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; | |
exit; | |
} | |
} | |
else | |
{ | |
print STDERR "$errormsg"; | |
exit; | |
} | |
return($errormsg,$errorcode); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment