Created
July 18, 2011 20:02
-
-
Save grassa/1090505 to your computer and use it in GitHub Desktop.
equations(1) from Hohenlohe, et. al. 2010 http://www.plosgenetics.org/article/info:doi/10.1371/journal.pgen.1000862
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use warnings; | |
use strict; | |
my $readsA = '5'; | |
my $readsT = '4'; | |
my $readsG = '0'; | |
my $readsC = '0'; | |
my $error_rate = '0.001'; | |
my @counts = ($readsA,$readsT,$readsG,$readsC,$error_rate); | |
my $call = genotype(@counts); | |
print "$call\n"; | |
# prints: | |
# at | |
sub genotype { | |
# this subroutine is passed an array of read counts for each base, | |
# plus the mean error rate at the site as the last element | |
my @counts = @_; | |
my %n; # | |
$n{a} = sprintf("%09d", $counts[0]); # | |
$n{t} = sprintf("%09d", $counts[1]); ### hash of 'BASE' => 'N_READS' | |
$n{g} = sprintf("%09d", $counts[2]); # | |
$n{c} = sprintf("%09d", $counts[3]); # | |
my $E = $counts[4]; # the mean error rate | |
my $nS = $n{a} + $n{t} + $n{g} + $n{c}; # total reads | |
# sort the hash decreasing by values, return the keys | |
my @sort_bases = reverse sort { $n{$a} cmp $n{$b} } keys %n; | |
my $n1base = $sort_bases[0]; # | |
my $n2base = $sort_bases[1]; ### the bases, so that we can call the genotype later | |
my $n3base = $sort_bases[2]; # | |
my $n4base = $sort_bases[3]; # | |
my $n1 = int($n{$n1base}); ### n1 == number of reads for most frequent base | |
my $n2 = int($n{$n2base}); ### n2 == number of reads for the second most frequent base | |
my $n3 = int($n{$n3base}); # | |
my $n4 = int($n{$n4base}); # | |
# the factorials are expensive, so we consolidate them as one variable | |
my $sampling = fact($nS) / ( fact($n1) * fact($n2) * fact($n3) * fact($n4) ); | |
# likelihood of a homozygote: eq (1a) | |
my $L11 = $sampling * ( ( 1 - ((3*$E)/4) )**$n1 ) * ( ($E/4)**($n2 + $n3 + $n4) ); | |
# likelihood of a heterozygote: eq (1b) | |
my $L12 = $sampling * ( ( 0.5 - ($E/4) )**($n1 + $n2) ) * ( ($E/4)**($n3 + $n4) ); | |
# the test statistic is the likelihood ratio of our two most likely hypotheses | |
my $D = abs(2 * log($L12/$L11) ); | |
# for a = 0.05, this number comes from a chi-squared distribution with 1 degree of freedom | |
my $sig = '3.841'; | |
# no call is the default | |
my $call = 'NN'; | |
# if the test statistic is significant | |
if($D>$sig){ | |
# if a heterozygote was more likely, then that was our alternative hypothesis | |
if($L12>$L11){ | |
$call = $n1base.$n2base; | |
} | |
# else, our hypothesis was homozygosity | |
else{ | |
$call = $n1base.$n1base; | |
} | |
} | |
return $call; | |
} | |
# factorial subroutine | |
sub fact { | |
no warnings 'recursion'; # THIS COULD BE DANGEROUS!!! | |
my $n = shift; | |
$n == 0 ? 1 : $n*fact($n-1); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment