Last active
August 29, 2015 14:22
-
-
Save scottx611x/784c7a64f3729cff7032 to your computer and use it in GitHub Desktop.
Finds "Fuzzy" representations of genes from .fna files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use strict; | |
use warnings; | |
use List::Util 'shuffle'; | |
use Bio::Tools::CodonTable; | |
use Text::Wrap; | |
print "Fuzzy Finder\n"; | |
my $filename = "humanXM_000000.fna"; | |
open(my $fh, '<:encoding(UTF-8)', $filename) | |
or die "Could not open file '$filename' $!"; | |
my $firstline = <$fh>; | |
print "HEADER LINE:\n"; | |
print "$firstline\n"; | |
print "********************************************************************************\n"; | |
print "Sucessfully opened the file $filename\n"; | |
my $NUC_WIDTH = 70; # 70 nucleotides per line | |
my $AA_WIDTH = 40; # 40 amino acids per line | |
my @DNA_Seq; | |
my @Rand_DNA_Seq; | |
my @newAA_Seq; | |
my @newAA_Seq_Rand; | |
my $checker = 0; | |
$Text::Wrap::columns = $NUC_WIDTH; | |
@DNA_Seq = &SequenceFromFile; | |
my @newDNA_Seq = grep(s/\s*//g, @DNA_Seq); | |
@newDNA_Seq = grep(s/[0-9]//g, @DNA_Seq); | |
my $big_Sequence_String = $newDNA_Seq[0]; | |
my $length = (length $big_Sequence_String); | |
#print "$length\n"; | |
#print "big_Sequence_String: $big_Sequence_String\n"; | |
print "\n\n********************************************************************************\n"; | |
print "*********************************DNA Sequence***********************************\n"; | |
print "********************************************************************************\n\n"; | |
print wrap('', '', @DNA_Seq); | |
&to_Amino_Acid(@newDNA_Seq); | |
&get_Hydrophobicity(@newAA_Seq); | |
print "\n\n********************************************************************************\n"; | |
print "***************************Randomized DNA Sequence:*****************************\n"; | |
print "********************************************************************************\n"; | |
$Text::Wrap::columns = $NUC_WIDTH; | |
@Rand_DNA_Seq = &useRandomSequence($big_Sequence_String); | |
my @new_Rand_DNA_Seq = grep(s/\s*//g, @Rand_DNA_Seq); | |
print wrap('', '', @Rand_DNA_Seq); | |
&to_Amino_Acid(@new_Rand_DNA_Seq); | |
&get_Hydrophobicity(@newAA_Seq_Rand); | |
sub to_Amino_Acid | |
{ | |
# Amino acid hash table | |
my $CodonTable = Bio::Tools::CodonTable->new(); | |
if ($checker == 0) | |
{ | |
$Text::Wrap::columns = $AA_WIDTH; | |
my @AA_Seq; | |
my $DNA_Seq_String; | |
foreach my $nucleotide (@_){ $DNA_Seq_String = $DNA_Seq_String . $nucleotide;} | |
my $index = 0; | |
my $length = 3; | |
my $codon; | |
my $result; | |
while ($index != length $DNA_Seq_String) | |
{ | |
$codon = substr $DNA_Seq_String, $index, $length; | |
#print "\n$codon\n"; working | |
$result = $CodonTable->translate($codon); | |
$codon = ''; | |
#print $result; | |
push @AA_Seq, $result; | |
$index = $index + 3; | |
} | |
@newAA_Seq = grep(s/\s*//g, @AA_Seq); | |
print "\n\nAmino Acid Sequence:\n"; | |
print wrap('', '', @newAA_Seq); | |
$checker++; | |
} | |
else{ | |
$Text::Wrap::columns = $AA_WIDTH; | |
my @AA_Seq; | |
my $DNA_Seq_String; | |
foreach my $nucleotide (@_){ $DNA_Seq_String = $DNA_Seq_String . $nucleotide;} | |
my $index = 0; | |
my $length = 3; | |
my $codon; | |
my $result; | |
print "\n"; | |
while ($index != length $DNA_Seq_String) | |
{ | |
$codon = substr $DNA_Seq_String, $index, $length; | |
#print "$codon "; | |
$result = $CodonTable->translate($codon); | |
$codon = ''; | |
#print $result; | |
push @AA_Seq, $result; | |
$index = $index + 3; | |
} | |
@newAA_Seq_Rand = grep(s/\s*//g, @AA_Seq); | |
print "\n\nAmino Acid Sequence:\n"; | |
print wrap('', '', @newAA_Seq_Rand); | |
$checker++; | |
} | |
} | |
sub get_Hydrophobicity | |
{ | |
my $AA_Seq_String; | |
foreach my $amino_acid (@_){ $AA_Seq_String = $AA_Seq_String . $amino_acid;} | |
my @matches; | |
my @match_start; | |
my @match_end; | |
my $match_length; | |
my $match; | |
push (@matches,$&) while($AA_Seq_String =~ m/[WAVLIMFY]{5,}/g); | |
push (@match_start,@-) while($AA_Seq_String =~ m/[WAVLIMFY]{5,}/g); | |
push (@match_end,@+) while($AA_Seq_String =~ m/[WAVLIMFY]{5,}/g); | |
print "\n\nRegex used to find hydrophobic matches: m/[WAVLIMFY]{5,}/g\n"; | |
print "This regex finds regions where 5 or more hydrophobic amino acids are found adjacent to each other.\n\n"; | |
print "Hydrophobic Matches:\n"; | |
foreach (@matches) { | |
print "$_\n"; | |
} | |
print "\n"; | |
my $i = 0; | |
my $arrSize = @matches; | |
if ($arrSize == 0) | |
{ | |
print "No hydrophobic regions found!\n\n" | |
} | |
else | |
{ | |
foreach $match (@matches) | |
{ | |
$match_length = length $match; | |
print "I found hydrophobic region: $match of size : $match_length from location: $match_start[$i] to $match_end[$i].\n"; | |
$i = $i + 1; | |
} | |
} | |
} | |
sub useRandomSequence | |
{ | |
my $NUC_STR; | |
foreach my $nuc (@_){ $NUC_STR = $NUC_STR . $nuc;} | |
#print "$NUC_STR\n"; | |
my $countA = ($NUC_STR =~ tr/A/A/); | |
my $countT = ($NUC_STR =~ tr/T/T/); | |
my $countC = ($NUC_STR =~ tr/C/C/); | |
my $countG = ($NUC_STR =~ tr/G/G/); | |
#print "$length\n"; | |
#print "$countA $countC $countT $countG\n"; | |
#print $countA + $countC + $countT + $countG; | |
#print "\n"; | |
#Calling subroutine with Sequence length and nucleotide percentages | |
&generateRandomSequence($length,$countA,$countC,$countT,$countG); | |
} | |
sub generateRandomSequence | |
{ | |
#assigning arguments to variables | |
my ($sequenceLength) = @_; | |
#getting the percentage of each nucleotide | |
my $numAs = $_[1]; | |
my $numCs = $_[2]; | |
my $numTs = $_[3]; | |
my $numGs = $_[4]; | |
print "\nNucleotide Percentages: \n"; | |
my $perA = $numAs / $length * 100; | |
my $perC = $numCs / $length * 100; | |
my $perT = $numTs / $length * 100; | |
my $perG = $numGs / $length * 100; | |
$perA = sprintf "%.2f" ,$perA; | |
$perC = sprintf "%.2f" ,$perC; | |
$perT = sprintf "%.2f" ,$perT; | |
$perG = sprintf "%.2f" ,$perG; | |
print "%As -> $perA\n"; | |
print "%Cs -> $perC\n"; | |
print "%Ts -> $perT\n"; | |
print "%Gs -> $perG\n"; | |
print "$numAs A's, $numCs C's, $numTs T's, and $numGs G's \n\n"; | |
#Filling 4 arrays with their respective amount of nucleotides | |
my @Nucleotides = (); | |
@Nucleotides = (@Nucleotides, ('A') x $numAs); | |
@Nucleotides = (@Nucleotides, ('C') x $numCs); | |
@Nucleotides = (@Nucleotides, ('T') x $numTs); | |
@Nucleotides = (@Nucleotides, ('G') x $numGs); | |
#Shuffle array order | |
my $randomSequence = ""; | |
#While array isnt empty, pop a letter and concatenate that to the new DNA sequence | |
while (@Nucleotides) | |
{ | |
@Nucleotides = shuffle(@Nucleotides); | |
my $nucleotide = pop @Nucleotides; | |
$randomSequence = $randomSequence . $nucleotide; | |
} | |
#print "Randomized DNA string is: \n"; | |
#print "$randomSequence\n"; | |
$randomSequence =~ s/TAA|TAG|TGA/TGG/g; | |
$randomSequence = $randomSequence . "TAA"; | |
#print "$randomSequence\n"; | |
my @DNA_Seq; | |
@DNA_Seq = split /\s+/, $randomSequence; | |
return @DNA_Seq; | |
} | |
sub SequenceFromFile | |
{ | |
open(my $fh, '<:encoding(UTF-8)', $filename) | |
or die "Could not open file '$filename' $!"; | |
my $firstline = <$fh>; | |
#print "$firstline\n"; | |
my @DNA_Seq; | |
while (my $row = <$fh>) { | |
chomp $row; | |
push @DNA_Seq, uc $row; | |
} | |
return @DNA_Seq ; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment