Skip to content

Instantly share code, notes, and snippets.

@grassa
Created July 18, 2011 20:02
Show Gist options
  • Save grassa/1090505 to your computer and use it in GitHub Desktop.
Save grassa/1090505 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
use warnings;
use strict;
my $readsA = '5';
my $readsT = '4';
my $readsG = '0';
my $readsC = '0';
my $error_rate = '0.001';
my @counts = ($readsA,$readsT,$readsG,$readsC,$error_rate);
my $call = genotype(@counts);
print "$call\n";
# prints:
# at
sub genotype {
# this subroutine is passed an array of read counts for each base,
# plus the mean error rate at the site as the last element
my @counts = @_;
my %n; #
$n{a} = sprintf("%09d", $counts[0]); #
$n{t} = sprintf("%09d", $counts[1]); ### hash of 'BASE' => 'N_READS'
$n{g} = sprintf("%09d", $counts[2]); #
$n{c} = sprintf("%09d", $counts[3]); #
my $E = $counts[4]; # the mean error rate
my $nS = $n{a} + $n{t} + $n{g} + $n{c}; # total reads
# sort the hash decreasing by values, return the keys
my @sort_bases = reverse sort { $n{$a} cmp $n{$b} } keys %n;
my $n1base = $sort_bases[0]; #
my $n2base = $sort_bases[1]; ### the bases, so that we can call the genotype later
my $n3base = $sort_bases[2]; #
my $n4base = $sort_bases[3]; #
my $n1 = int($n{$n1base}); ### n1 == number of reads for most frequent base
my $n2 = int($n{$n2base}); ### n2 == number of reads for the second most frequent base
my $n3 = int($n{$n3base}); #
my $n4 = int($n{$n4base}); #
# the factorials are expensive, so we consolidate them as one variable
my $sampling = fact($nS) / ( fact($n1) * fact($n2) * fact($n3) * fact($n4) );
# likelihood of a homozygote: eq (1a)
my $L11 = $sampling * ( ( 1 - ((3*$E)/4) )**$n1 ) * ( ($E/4)**($n2 + $n3 + $n4) );
# likelihood of a heterozygote: eq (1b)
my $L12 = $sampling * ( ( 0.5 - ($E/4) )**($n1 + $n2) ) * ( ($E/4)**($n3 + $n4) );
# the test statistic is the likelihood ratio of our two most likely hypotheses
my $D = abs(2 * log($L12/$L11) );
# for a = 0.05, this number comes from a chi-squared distribution with 1 degree of freedom
my $sig = '3.841';
# no call is the default
my $call = 'NN';
# if the test statistic is significant
if($D>$sig){
# if a heterozygote was more likely, then that was our alternative hypothesis
if($L12>$L11){
$call = $n1base.$n2base;
}
# else, our hypothesis was homozygosity
else{
$call = $n1base.$n1base;
}
}
return $call;
}
# factorial subroutine
sub fact {
no warnings 'recursion'; # THIS COULD BE DANGEROUS!!!
my $n = shift;
$n == 0 ? 1 : $n*fact($n-1);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment