grassa/genotypeML.pl

## genotypeML.pl
#!/usr/bin/perl
use warnings;
use strict;

my $readsA = '5';
my $readsT = '4';
my $readsG = '0';
my $readsC = '0';
my $error_rate = '0.001';

my @counts = ($readsA,$readsT,$readsG,$readsC,$error_rate);

my $call = genotype(@counts);


print "$call\n";
# prints:
# at

sub genotype {
	# this subroutine is passed an array of read counts for each base,
	# plus the mean error rate at the site as the last element
	my @counts = @_;
	my %n;									#
	$n{a} = sprintf("%09d", $counts[0]);   	#
	$n{t} = sprintf("%09d", $counts[1]);   	### hash of 'BASE' => 'N_READS'
	$n{g} = sprintf("%09d", $counts[2]);   	#
	$n{c} = sprintf("%09d", $counts[3]);   	#

	my $E = $counts[4];					   	# the mean error rate
	my $nS = $n{a} + $n{t} + $n{g} + $n{c};	# total reads
	# sort the hash decreasing by values, return the keys
	my @sort_bases = reverse sort { $n{$a} cmp $n{$b} } keys %n;
	my $n1base = $sort_bases[0];			#
	my $n2base = $sort_bases[1];			### the bases, so that we can call the genotype later
	my $n3base = $sort_bases[2];			#
	my $n4base = $sort_bases[3];			#

	my $n1 = int($n{$n1base});				### n1 == number of reads for most frequent base
	my $n2 = int($n{$n2base});				### n2 == number of reads for the second most frequent base
	my $n3 = int($n{$n3base});				#
	my $n4 = int($n{$n4base});				#

	# the factorials are expensive, so we consolidate them as one variable
	my $sampling = fact($nS) / ( fact($n1) * fact($n2) * fact($n3) * fact($n4) );
	# likelihood of a homozygote: eq (1a)
	my $L11 = $sampling * ( ( 1 - ((3*$E)/4) )**$n1 ) * ( ($E/4)**($n2 + $n3 + $n4)  );
	# likelihood of a heterozygote: eq (1b)
	my $L12 = $sampling * ( ( 0.5 - ($E/4) )**($n1 + $n2) ) * ( ($E/4)**($n3 + $n4)  );
	# the test statistic is the likelihood ratio of our two most likely hypotheses
	my $D = abs(2 * log($L12/$L11) );
	# for a = 0.05, this number comes from a chi-squared distribution with 1 degree of freedom
	my $sig = '3.841';
	# no call is the default
	my $call = 'NN';
	# if the test statistic is significant
	if($D>$sig){
		# if a heterozygote was more likely, then that was our alternative hypothesis
		if($L12>$L11){
			$call = $n1base.$n2base;
		}
		# else, our hypothesis was homozygosity
		else{
			$call = $n1base.$n1base;
		}
	}
	return $call;
}
# factorial subroutine
sub fact {
	no warnings 'recursion'; # THIS COULD BE DANGEROUS!!!
	my $n = shift;
	$n == 0 ? 1 : $n*fact($n-1);
}
	#!/usr/bin/perl
	use warnings;
	use strict;

	my $readsA = '5';
	my $readsT = '4';
	my $readsG = '0';
	my $readsC = '0';
	my $error_rate = '0.001';

	my @counts = ($readsA,$readsT,$readsG,$readsC,$error_rate);

	my $call = genotype(@counts);


	print "$call\n";
	# prints:
	# at

	sub genotype {
	# this subroutine is passed an array of read counts for each base,
	# plus the mean error rate at the site as the last element
	my @counts = @_;
	my %n; #
	$n{a} = sprintf("%09d", $counts[0]); #
	$n{t} = sprintf("%09d", $counts[1]); ### hash of 'BASE' => 'N_READS'
	$n{g} = sprintf("%09d", $counts[2]); #
	$n{c} = sprintf("%09d", $counts[3]); #

	my $E = $counts[4]; # the mean error rate
	my $nS = $n{a} + $n{t} + $n{g} + $n{c}; # total reads
	# sort the hash decreasing by values, return the keys
	my @sort_bases = reverse sort { $n{$a} cmp $n{$b} } keys %n;
	my $n1base = $sort_bases[0]; #
	my $n2base = $sort_bases[1]; ### the bases, so that we can call the genotype later
	my $n3base = $sort_bases[2]; #
	my $n4base = $sort_bases[3]; #

	my $n1 = int($n{$n1base}); ### n1 == number of reads for most frequent base
	my $n2 = int($n{$n2base}); ### n2 == number of reads for the second most frequent base
	my $n3 = int($n{$n3base}); #
	my $n4 = int($n{$n4base}); #

	# the factorials are expensive, so we consolidate them as one variable
	my $sampling = fact($nS) / ( fact($n1) * fact($n2) * fact($n3) * fact($n4) );
	# likelihood of a homozygote: eq (1a)
	my $L11 = $sampling * ( ( 1 - ((3$E)/4) )$n1 ) ( ($E/4)**($n2 + $n3 + $n4) );
	# likelihood of a heterozygote: eq (1b)
	my $L12 = $sampling * ( ( 0.5 - ($E/4) )*($n1 + $n2) ) ( ($E/4)**($n3 + $n4) );
	# the test statistic is the likelihood ratio of our two most likely hypotheses
	my $D = abs(2 * log($L12/$L11) );
	# for a = 0.05, this number comes from a chi-squared distribution with 1 degree of freedom
	my $sig = '3.841';
	# no call is the default
	my $call = 'NN';
	# if the test statistic is significant
	if($D>$sig){
	# if a heterozygote was more likely, then that was our alternative hypothesis
	if($L12>$L11){
	$call = $n1base.$n2base;
	}
	# else, our hypothesis was homozygosity
	else{
	$call = $n1base.$n1base;
	}
	}
	return $call;
	}
	# factorial subroutine
	sub fact {
	no warnings 'recursion'; # THIS COULD BE DANGEROUS!!!
	my $n = shift;
	$n == 0 ? 1 : $n*fact($n-1);
	}