Created
October 11, 2011 04:50
-
-
Save danaj/1277319 to your computer and use it in GitHub Desktop.
A faster fasta
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The Computer Language Benchmarks game | |
# http://shootout.alioth.debian.org/ | |
# | |
# contributed by David Pyke | |
# tweaked by Danny Sauer | |
# optimized by Steffen Mueller | |
# tweaked by Kuang-che Wu | |
# optimized by Rodrigo de Oliveira | |
# modified by Dana Jacobsen | |
use strict; | |
use warnings; | |
use constant IM => 139968; | |
use constant IA => 3877; | |
use constant IC => 29573; | |
use constant LINELENGTH => 60; | |
use constant TSIZE => 512; | |
my $iub = [ | |
[ 'a', 0.27 ], | |
[ 'c', 0.12 ], | |
[ 'g', 0.12 ], | |
[ 't', 0.27 ], | |
[ 'B', 0.02 ], | |
[ 'D', 0.02 ], | |
[ 'H', 0.02 ], | |
[ 'K', 0.02 ], | |
[ 'M', 0.02 ], | |
[ 'N', 0.02 ], | |
[ 'R', 0.02 ], | |
[ 'S', 0.02 ], | |
[ 'V', 0.02 ], | |
[ 'W', 0.02 ], | |
[ 'Y', 0.02 ] | |
]; | |
my $homosapiens = [ | |
[ 'a', 0.3029549426680 ], | |
[ 'c', 0.1979883004921 ], | |
[ 'g', 0.1975473066391 ], | |
[ 't', 0.3015094502008 ] | |
]; | |
my $alu = | |
'GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG' . | |
'GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA' . | |
'CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT' . | |
'ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA' . | |
'GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG' . | |
'AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC' . | |
'AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA'; | |
my $LAST = 42; | |
sub gen_random { | |
return map { ($LAST = ($LAST * IA + IC) % IM) / IM} (1 .. $_[0]); | |
} | |
sub makeCumulative { | |
my $genelist = shift; | |
my $cp = 0.0; | |
$_->[1] = $cp += $_->[1] foreach @$genelist; | |
} | |
sub makeRandomFasta { | |
my ($id, $desc, $n, $genelist) = @_; | |
print ">", $id, " ", $desc, "\n"; | |
my @chars = map { $_->[0] } @$genelist; | |
my @probs = map { $_->[1] } @$genelist; | |
# $tab[$x] is defined if all FP in the interval [$x/TSIZE, ($x+1)/TSIZE] | |
# map to a single gene. | |
my @tab; | |
foreach my $i (0 .. TSIZE-1) { | |
my $ldiv = 0.0 + $i / TSIZE; | |
my $udiv = 0.0 + ($i+1) / TSIZE; | |
my $lidx = scalar grep { $_ < $ldiv } @probs; | |
my $uidx = scalar grep { $_ < $udiv } @probs; | |
$tab[$i] = $chars[$lidx] if $lidx == $uidx; | |
} | |
# print whole lines | |
foreach (1 .. int($n / LINELENGTH) ) { | |
my @r = gen_random(LINELENGTH); | |
my $s = ''; | |
foreach my $r (@r) { | |
$s .= $tab[int($r*TSIZE)] || $chars[ scalar grep { $_ < $r } @probs ]; | |
} | |
print $s, "\n"; | |
} | |
# print remaining line (if required) | |
if ($n % LINELENGTH) { | |
my @r = gen_random($n % LINELENGTH); | |
my $s = ''; | |
foreach my $r (@r) { | |
$s .= $tab[int($r*TSIZE)] || $chars[ scalar grep { $_ < $r } @probs ]; | |
} | |
print $s, "\n"; | |
} | |
} | |
sub makeRepeatFasta { | |
my ($id, $desc, $s, $n) = @_; | |
print ">", $id, " ", $desc, "\n"; | |
my $r = length $s; | |
my $ss = $s . $s . substr($s, 0, $n % $r); | |
for my $j(0..int($n / LINELENGTH)-1) { | |
my $i = $j*LINELENGTH % $r; | |
print substr($ss, $i, LINELENGTH), "\n"; | |
} | |
if ($n % LINELENGTH) { | |
print substr($ss, -($n % LINELENGTH)), "\n"; | |
} | |
} | |
###################################################################### | |
#main | |
my $n = $ARGV[0] || 1000; | |
makeCumulative($iub); | |
makeCumulative($homosapiens); | |
makeRepeatFasta ('ONE', 'Homo sapiens alu', $alu, $n*2); | |
makeRandomFasta ('TWO', 'IUB ambiguity codes', $n*3, $iub); | |
makeRandomFasta ('THREE', 'Homo sapiens frequency', $n*6, $homosapiens); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Note that Rodrigo's version is much faster, but relies on the particular random number generator having a very short cycle. This version does not cache any random numbers and does not rely on the particular generator. This code is in fact, much more like perl #4 than Rodrigo's, but because his code gave me ideas, I chose to view his as the direct ancestor.