Skip to content

Instantly share code, notes, and snippets.

@xeoncross
Created October 16, 2012 16:38
Show Gist options
  • Save xeoncross/3900424 to your computer and use it in GitHub Desktop.
Save xeoncross/3900424 to your computer and use it in GitHub Desktop.
Al-Kashi - statistics classes

This class can compute statistics on a set of values.

It takes an array of values and can computes the mean, median, mode, variance, standard deviation, coefficient of variation, skewness, and Kurtosis, covariance, correlation, and regression analysis, paired and unpaired t-Test including calculate the probability of null hypothesis, normal distribution, student t-distribution one and two tailed, F distribution, chi-square test or contingency tables (A/B testing), analysis of variance (ANOVA), Shannon and Simpson diversity index.

http://www.phpclasses.org/package/7401-PHP-Compute-statistics-on-a-set-of-values.html

GPL License

model mpg cyl! disp hp drat wt qsec vs am! gear! carb!
Mazda RX4 21 6 160 110 3.9 2.62 16.46 0 1 4 4
Mazda RX4 Wag 21 6 160 110 3.9 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.32 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.44 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.46 20.22 1 0 3 1
Duster 360 14.3 8 360 245 3.21 3.57 15.84 0 0 3 4
Merc 240D 24.4 4 146.7 62 3.69 3.19 20 1 0 4 2
Merc 230 22.8 4 140.8 95 3.92 3.15 22.9 1 0 4 2
Merc 280 19.2 6 167.6 123 3.92 3.44 18.3 1 0 4 4
Merc 280C 17.8 6 167.6 123 3.92 3.44 18.9 1 0 4 4
Merc 450SE 16.4 8 275.8 180 3.07 4.07 17.4 0 0 3 3
Merc 450SL 17.3 8 275.8 180 3.07 3.73 17.6 0 0 3 3
Merc 450SLC 15.2 8 275.8 180 3.07 3.78 18 0 0 3 3
Cadillac Fleetwood 10.4 8 472 205 2.93 5.25 17.98 0 0 3 4
Lincoln Continental 10.4 8 460 215 3 5.424 17.82 0 0 3 4
Chrysler Imperial 14.7 8 440 230 3.23 5.345 17.42 0 0 3 4
Fiat 128 32.4 4 78.7 66 4.08 2.2 19.47 1 1 4 1
Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.9 1 1 4 1
Toyota Corona 21.5 4 120.1 97 3.7 2.465 20.01 1 0 3 1
Dodge Challenger 15.5 8 318 150 2.76 3.52 16.87 0 0 3 2
AMC Javelin 15.2 8 304 150 3.15 3.435 17.3 0 0 3 2
Camaro Z28 13.3 8 350 245 3.73 3.84 15.41 0 0 3 4
Pontiac Firebird 19.2 8 400 175 3.08 3.845 17.05 0 0 3 2
Fiat X1-9 27.3 4 79 66 4.08 1.935 18.9 1 1 4 1
Porsche 914-2 26 4 120.3 91 4.43 2.14 16.7 0 1 5 2
Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.9 1 1 5 2
Ford Pantera L 15.8 8 351 264 4.22 3.17 14.5 0 1 5 4
Ferrari Dino 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6
Maserati Bora 15 8 301 335 3.54 3.57 14.6 0 1 5 8
Volvo 142E 21.4 4 121 109 4.11 2.78 18.6 1 1 4 2
model mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21 6 160 110 3.9 2.62 16.46 0 1 4 4
Mazda RX4 Wag 21 6 160 110 3.9 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.32 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.44 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.46 20.22 1 0 3 1
Duster 360 14.3 8 360 245 3.21 3.57 15.84 0 0 3 4
Merc 240D 24.4 4 146.7 62 3.69 3.19 20 1 0 4 2
Merc 230 22.8 4 140.8 95 3.92 3.15 22.9 1 0 4 2
Merc 280 19.2 6 167.6 123 3.92 3.44 18.3 1 0 4 4
Merc 280C 17.8 6 167.6 123 3.92 3.44 18.9 1 0 4 4
Merc 450SE 16.4 8 275.8 180 3.07 4.07 17.4 0 0 3 3
Merc 450SL 17.3 8 275.8 180 3.07 3.73 17.6 0 0 3 3
Merc 450SLC 15.2 8 275.8 180 3.07 3.78 18 0 0 3 3
Cadillac Fleetwood 10.4 8 472 205 2.93 5.25 17.98 0 0 3 4
Lincoln Continental 10.4 8 460 215 3 5.424 17.82 0 0 3 4
Chrysler Imperial 14.7 8 440 230 3.23 5.345 17.42 0 0 3 4
Fiat 128 32.4 4 78.7 66 4.08 2.2 19.47 1 1 4 1
Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.9 1 1 4 1
Toyota Corona 21.5 4 120.1 97 3.7 2.465 20.01 1 0 3 1
Dodge Challenger 15.5 8 318 150 2.76 3.52 16.87 0 0 3 2
AMC Javelin 15.2 8 304 150 3.15 3.435 17.3 0 0 3 2
Camaro Z28 13.3 8 350 245 3.73 3.84 15.41 0 0 3 4
Pontiac Firebird 19.2 8 400 175 3.08 3.845 17.05 0 0 3 2
Fiat X1-9 27.3 4 79 66 4.08 1.935 18.9 1 1 4 1
Porsche 914-2 26 4 120.3 91 4.43 2.14 16.7 0 1 5 2
Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.9 1 1 5 2
Ford Pantera L 15.8 8 351 264 4.22 3.17 14.5 0 1 5 4
Ferrari Dino 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6
Maserati Bora 15 8 301 335 3.54 3.57 14.6 0 1 5 8
Volvo 142E 21.4 4 121 109 4.11 2.78 18.6 1 1 4 2
<?php
$sep = "\t"; $nl = "\n";
$content = file_get_contents('data.txt');
$records = explode($nl, $content);
$header = explode($sep, trim(array_shift($records)));
$data = array_fill_keys($header, array());
foreach ($records as $id=>$record) {
$record = trim($record);
$fields = explode($sep, $record);
$titles = $header;
foreach ($fields as $field) {
$title = array_shift($titles);
$data[$title][] = $field;
}
}
$x = $data['wt'];
$y = $data['mpg'];
require('kashi.php');
// $kashi = new Kashi($dbname, $dbuser, $dbpass, $dbhost);
$kashi = new Kashi('test', 'root', '', 'localhost');
/**
* Summary Statistics:
*/
// $x is an array of values
echo 'Mean: ' . $kashi->mean($x) . '<br />';
// It will use previous data set if you don't provide one as an argument to the method
echo 'Mode: ' . print_r($kashi->mode()) . '<br />';
echo 'Median: ' . $kashi->median() . '<br />';
echo 'Variance: ' . $kashi->variance() . '<br />';
echo 'SD: ' . $kashi->sd() . '<br />';
echo '%CV: ' . $kashi->cv() . '<br />';
echo 'Skewness: ' . $kashi->skew() . '<br />';
echo 'Kurtosis: ' . $kashi->kurt() . '<br />';
/**
* Correlation, Regression, and t-Test:
*/
echo 'Covariance: ' . $kashi->cov($x, $y) . '<br />';
echo 'Correlation: ' . $kashi->cor($x, $y) . '<br />';
$r = $kashi->cor($x, $y);
$n = count($x);
echo 'Significant of Correlation: ' . $kashi->corTest($r, $n) . '<br />';
echo 'Regression: ' . print_r($kashi->lm($x, $y), true) . '<br />';
echo 't-Test unpaired: ' . $kashi->tTest($x, $y, false) . '<br />';
echo 'Test: ' . $kashi->tDist($kashi->tTest($x, $y, false), (count($x)-1)*(count($y)-1)) . '<br />';
echo 't-Test paired: ' . $kashi->tTest($x, $y, true) . '<br />';
echo 'Test: ' . $kashi->tDist($kashi->tTest($x, $y, true), count($x)-1) . '<br />';
/**
* Distributions:
*/
echo 'Normal distribution (x=0.5, mean=0, sd=1): ' . $kashi->norm(0.5, 0, 1) . '<br />';
echo 'Probability for the Student t-distribution (t=3, n=10) one-tailed: ';
echo $kashi->tDist(3, 10, 1) . '<br />';
echo 'Probability for the Student t-distribution (t=3, n=10) two-tailed: ';
echo $kashi->tDist(3, 10, 2) . '<br />';
echo 'F probability distribution (f=2, df1=12, df2=15): ' . $kashi->fDist(2, 12, 15) . '<br />';
echo 'Inverse of the standard normal cumulative distribution (p=0.95): ';
echo $kashi->inverseNormCDF(0.95) . '<br />';
echo 't-value of the Student\'s t-distribution (p=0.05, n=29): ';
echo $kashi->inverseTCDF(0.05, 29) . '<br />';
/**
* Chi-square test or Contingency tables (A/B testing):
*/
$table['Automatic'] = array('4 Cylinders' => 3, '6 Cylinders' => 4, '8 Cylinders' => 12);
$table['Manual'] = array('4 Cylinders' => 8, '6 Cylinders' => 3, '8 Cylinders' => 2);
$result = $kashi->chiTest($table);
$probability = $kashi->chiDist($result['chi'], $result['df']);
echo 'Chi-square test probability: ' . $probability . '<br />';
/**
* Diversity index:
*/
$gear = array('3' => 15, '4' => 12, '5' => 5);
$cyl = array('4' => 11, '6' => 7, '8' => 14);
echo 'Shannon index for gear: ' . $kashi->diversity($gear) . '<br />';
echo 'Shannon index for cyl: ' . $kashi->diversity($cyl) . '<br />';
/**
* ANOVA:
*/
require('kashi_anova.php');
// $obj = new KashiANOVA($dbname, $dbuser, $dbpass, $dbhost);
$obj = new KashiANOVA('test', 'root', '', 'localhost');
$str = file_get_contents('anova_data.txt');
$obj->loadString($str);
// mpg ~ cyl
$result = $obj->anova('cyl', 'mpg');
echo '<hr />Analysis of variance (ANOVA): mpg ~ cyl<pre>';
print_r($result);
echo '</pre>';
<?php
/**
* ----------------------------------------------------------------------
*
* Copyright (c) 2012 Khaled Al-Shamaa.
*
* http://www.ar-php.org
*
* PHP Version 5
*
* ----------------------------------------------------------------------
* @desc We aim in Al-Kashi project to provide a rich package full of statistical
* functions useful for online business intelligent and data mining, possible
* applications may include an online log file analysis, Ad's and Campaign
* statistics, or survey/voting results on-fly analysis.
*
* @category Math
* @package Kashi
* @author Khaled Al-Shamaa <khaled.alshamaa@gmail.com>
* @copyright 2012 Khaled Al-Shamaa
*
* @license GPL <http://www.gnu.org/licenses/gpl.txt>
* @version 1.0.0 released in March 5, 2012
* @link http://www.ar-php.org/stats/al-kashi
*/
class Kashi
{
private $values = array();
private $dataset = array();
public function __construct()
{
}
public function __destruct()
{
}
/**
* Compute the arithmetic mean, and is calculated by adding a group of numbers
* and then dividing by the count of those numbers.
* For example, the mean of 2, 3, 3, 5, 7, and 10 is 30 divided by 6, which is 5.
*
* @param array List of float values for which you want to calculate the mean.
*
* @return float Arithmetic mean
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function mean()
{
if (func_num_args() > 0) {
$this->values = func_get_arg(0);
}
$total = 0;
foreach ($this->values as $value) {
$total += $value;
}
return $total/count($this->values);
}
/**
* Compute the sample median which is the middle number of a group of numbers; that is,
* half the numbers have values that are greater than the median, and half the numbers
* have values that are less than the median.
* For example, the median of 2, 3, 3, 5, 7, and 10 is 4.
*
* @param array List of float values for which you want to calculate the median.
*
* @return float Sample median
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function median()
{
if (func_num_args() > 0) {
$this->values = func_get_arg(0);
}
$list = $this->values;
sort($list);
$count = count($list);
if ($count % 2 == 0) {
$median = ($list[($count / 2) - 1] + $list[$count / 2]) / 2;
} else {
$median = $list[floor($count / 2)];
}
return $median;
}
/**
* Compute the mode which is the most frequently occurring number in a group of numbers.
* For example, the mode of 2, 3, 3, 5, 7, and 10 is 3.
*
* @param array List of float values for which you want to calculate the mode.
*
* @return float Returns the most frequently occurring or repetitive value
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function mode()
{
if (func_num_args() > 0) {
$this->values = func_get_arg(0);
}
$counter = array();
foreach ($this->values as $value) {
if (isset($counter[$value])) {
$counter[$value]++;
} else {
$counter[$value] = 1;
}
}
return array_keys($counter, max($counter));
}
/**
* Estimates variance based on a sample
*
* @param array List of float values corresponding to a sample of a population.
*
* @return float Returns the sample variance
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function variance()
{
if (func_num_args() > 0) {
$this->values = func_get_arg(0);
}
$mean = $this->mean();
$var = 0;
foreach ($this->values as $value) {
$var += ($value - $mean) * ($value - $mean);
}
$var = $var / (count($this->values) - 1);
return $var;
}
/**
* Compute the standard deviation based on a sample. The standard deviation is a measure of
* how widely values are dispersed from the average value (the mean).
*
* @param array List of float values for which you want to calculate the standard deviation.
*
* @return float Returns the standard deviation
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function sd()
{
if (func_num_args() > 0) {
$this->values = func_get_arg(0);
}
return sqrt($this->variance());
}
/**
* Compute the skewness of a distribution. Skewness characterizes the degree of asymmetry of
* a distribution around its mean. Positive skewness indicates a distribution with an asymmetric
* tail extending toward more positive values. Negative skewness indicates a distribution with
* an asymmetric tail extending toward more negative values.
*
* @param array List of float values for which you want to calculate the skewness.
*
* @return float Returns the skewness of a distribution
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function skew()
{
if (func_num_args() > 0) {
$this->values = func_get_arg(0);
}
$mean = $this->mean();
$sd = $this->sd();
$n = count($this->values);
$skew = 0;
foreach ($this->values as $value) {
$skew += pow(($value - $mean) / $sd, 3);
}
$skew = ($skew * $n) / (($n - 1) * ($n - 2));
// skew standard error = sqrt(6 / $n)
// skew is significant if its value exceed 2 * sqrt(6 / $n)
return $skew;
}
/**
* Compute the kurtosis of a distribution. Kurtosis characterizes the relative peakedness or
* flatness of a distribution compared with the normal distribution. Positive kurtosis indicates
* a relatively peaked distribution. Negative kurtosis indicates a relatively flat distribution.
*
* @param array List of float values for which you want to calculate the kurtosis.
*
* @return float Returns the kurtosis of a distribution
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function kurt()
{
if (func_num_args() > 0) {
$this->values = func_get_arg(0);
}
$mean = $this->mean();
$sd = $this->sd();
$n = count($this->values);
$kurt = 0;
foreach ($this->values as $value) {
$kurt += pow(($value - $mean) / $sd, 4);
}
$kurt = ($kurt * $n * ($n + 1)) / (($n - 1) * ($n - 2) * ($n - 3));
$kurt = $kurt - ((3 * ($n - 1) * ($n - 1)) / (($n - 2) * ($n - 3)));
// kurt standard error = sqrt(24 / $n)
// kurt is significant if its value exceed 2 * sqrt(24 / $n)
return $kurt;
}
/**
* Compute the coefficients of variation
*
* @param array List of float values for which you want to calculate the coefficients of variation.
*
* @return float Returns the coefficients of variation
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function cv()
{
if (func_num_args() > 0) {
$this->values = func_get_arg(0);
}
$mean = $this->mean();
$sd = $this->sd();
$cv = ($sd / $mean) * 100;
return $cv;
}
/**
* Compute the covariance, the average of the products of deviations for each data point pair.
* Use covariance to determine the relationship between two data sets. For example, you can
* examine whether greater income accompanies greater levels of education.
*
* @param array $x First list of float values
* @param array $y Second list of float values
*
* @return float Returns the covariance
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function cov($x, $y)
{
$meanX = $this->mean($x);
$meanY = $this->mean($y);
$count = count($x);
$total = 0;
for ($i=0; $i<$count; $i++) {
$total += ($x[$i] - $meanX) * ($y[$i] - $meanY);
}
return (1 / ($count - 1)) * $total;
}
/**
* Compute the correlation coefficient. Use the correlation coefficient to determine the
* relationship between two properties. It uses different measures of association, all
* in the range [-1, 1] with 0 indicating no association.
*
* @param array $x First list of float values
* @param array $y Second list of float values
*
* @return float Returns the correlation coefficient
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function cor($x, $y)
{
$cov = $this->cov($x, $y);
$sdX = $this->sd($x);
$sdY = $this->sd($y);
return $cov / ($sdX * $sdY);
}
/**
* Test of the null hypothesis that true correlation is equal to 0
*
* @param array $r Correlation value
* @param array $n Number of observations
*
* @return float Returns null hypothesis probability: true correlation is equal to 0
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function corTest($r, $n)
{
$t = $r / sqrt((1 - ($r * $r)) / ($n - 2));
$result = $this->tDist($t, $n - 2);
return $result;
}
/**
* Compute the simple linear regression fits a linear model to represent
* the relationship between a response (or y-) variate, and an explanatory
* (or x-) variate.
*
* @param array $x Specifies the name of the explanatory (or x-) variate.
* @param array $y Specifies the name of the response (or y-) variate.
*
* @return array Returns [intercept], [sploe], and [r-square] as float values
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function lm($x, $y)
{
$meanX = $this->mean($x);
$meanY = $this->mean($y);
$n = count($x);
$nominator = 0;
$denominator = 0;
for ($i=0; $i<$n; $i++) {
$nominator += ($x[$i] - $meanX) * ($y[$i] - $meanY);
$denominator += ($x[$i] - $meanX) * ($x[$i] - $meanX);
}
$b = $nominator / $denominator;
$a = $meanY - $b * $meanX;
// Total sum of squares (ss) and Residual sum of squares (rss)
$ss = 0;
$rss = 0;
for ($i=0; $i<$n; $i++) {
$ss += pow($y[$i] - $meanY, 2);
$est = $a + ($b * $x[$i]);
$rss += pow($y[$i] - $est, 2);
}
// R-square value
$r2 = 1 - ($rss / $ss);
return array('intercept'=>$a, 'slope'=>$b, 'r-square'=>$r2);
}
/**
* Compute the Student's t-Test value to determine whether two samples are likely
* to have come from the same two underlying populations that have the same mean
*
* @param array $a First list of float values
* @param array $b Second list of float values
* @param boolean $paired Logical indicating whether you want a paired t-test
*
* @return float Returns the associated with a Student's t-Test
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function tTest ($a, $b, $paired=false)
{
if ($paired == true) {
$count = count($a);
$diff = array();
for ($i=0; $i<$count; $i++) {
$diff[$i] = $a[$i] - $b[$i];
}
$mean = $this->mean($diff);
$var = $this->variance($diff);
$t = $mean / sqrt($var / $count);
} else {
$meanA = $this->mean($a);
$meanB = $this->mean($b);
$varA = $this->variance($a);
$varB = $this->variance($b);
$countA = count($a);
$countB = count($b);
$t = ($meanA - $meanB) / sqrt(($varA / $countA) + ($varB / $countB));
}
return $t;
}
/**
* Returns the standard normal cumulative distribution function. The distribution
* has a mean of 0 (zero) and a standard deviation of one. Use this function in
* place of a table of standard normal curve areas.
*
* @param float $x Is the value for which you want the distribution.
* @param float $mean The distribution mean (default is zero).
* @param float $sd The distribution standard deviation (default is one).
*
* @return float the standard normal cumulative distribution function.
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function norm ($x, $mean=0, $sd=1)
{
$y = (1 / sqrt(2 * pi())) * exp(-0.5 * pow($x, 2));
return $y;
}
private function _zip ($q, $i, $j, $b)
{
$zz = 1;
$z = $zz;
$k = $i;
while ($k <= $j) {
$zz *= $q * $k / ($k - $b);
$z += $zz;
$k += 2;
}
return $z;
}
/**
* Returns the Percentage Points (probability) for the Student t-distribution
* where a numeric value (t) is a calculated value of t for which the Percentage
* Points are to be computed.
*
* @param float $t Is the numeric value at which to evaluate the distribution.
* @param integer $n Is an integer indicating the number of degrees of freedom.
* @param integer $tail Specifies the number of distribution tails to return.
* If tail = 1, TDIST returns the one-tailed distribution.
* If tail = 2, TDIST returns the two-tailed distribution.
*
* @return float the Percentage Points (probability) for the Student t-distribution
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function tDist ($t, $n, $tail=1)
{
$pj2 = pi() / 2;
$t = abs($t);
$rt = $t / sqrt($n);
$fk = atan($rt);
if ($n == 1) {
$result = 1 - $fk / $pj2;
} else {
$ek = sin($fk);
$dk = cos($fk);
if (($n % 2) == 1) {
$result = 1 - ($fk + $ek * $dk * $this->_zip($dk * $dk, 2, $n-3, -1)) / $pj2;
} else {
$result = 1 - $ek * $this->_zip($dk * $dk, 1, $n-3, -1);
}
}
return $result / $tail;
}
/**
* Returns the F probability distribution. You can use this function to determine
* whether two data sets have different degrees of diversity.
*
* @param float $f Is the value at which to evaluate the function.
* @param integer $df1 Is the numerator degrees of freedom.
* @param integer $df2 Is the denominator degrees of freedom.
*
* @return float the F probability distribution
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function fDist ($f, $df1, $df2)
{
$pj2 = pi() / 2;
$x = $df2 / ($df1 * $f + $df2);
if (($df1 % 2) == 0) {
return $this->_zip(1 - $x, $df2, $df1 + $df2 - 4, $df2 - 2) * pow($x, $df2 / 2);
}
if (($df2 % 2) == 0) {
return 1 - $this->_zip($x, $df1, $df1 + $df2 - 4, $df1 - 2) * pow(1 - $x, $df1 / 2);
}
$tan = atan(sqrt($df1 * $f / $df2));
$a = $tan / $pj2;
$sat = sin($tan);
$cot = cos($tan);
if ($df2 > 1) {
$a = $a + $sat * $cot * $this->_zip($cot * $cot, 2, $df2 - 3, -1) / $pj2;
}
if ($df1 == 1) {
return 1 - $a;
}
$c = 4 * $this->_zip($sat * $sat, $df2 + 1, $df1 + $df2 - 4, $df2 - 2) * $sat * pow($cot, $df2) / pi();
if ($df2 == 1) {
return 1 - $a + $c / 2;
}
$k = 2;
while ($k <= ($df2 - 1) / 2) {
$c *= $k / ($k - 0.5);
$k++;
}
return 1 - $a + $c;
}
/**
* Return the probability of normal z value
* Adapted from a polynomial approximation in:
* Ibbetson D, Algorithm 209
* Collected Algorithms of the CACM 1963 p. 616
*
* @param float $z Is the value at which you want to evaluate the probability.
*
* @return float the probability of normal z value
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function normCDF ($z)
{
$max = 6;
if ($z == 0) {
$x = 0;
} else {
$y = abs($z) / 2;
if ($y >= ($max / 2)) {
$x = 1;
} elseif ($y < 1) {
$w = $y * $y;
$x = ((((((((0.000124818987 * $w
- 0.001075204047) * $w + 0.005198775019) * $w
- 0.019198292004) * $w + 0.059054035642) * $w
- 0.151968751364) * $w + 0.319152932694) * $w
- 0.531923007300) * $w + 0.797884560593) * $y * 2;
} else {
$y -= 2;
$x = (((((((((((((-0.000045255659 * $y
+ 0.000152529290) * $y - 0.000019538132) * $y
- 0.000676904986) * $y + 0.001390604284) * $y
- 0.000794620820) * $y - 0.002034254874) * $y
+ 0.006549791214) * $y - 0.010557625006) * $y
+ 0.011630447319) * $y - 0.009279453341) * $y
+ 0.005353579108) * $y - 0.002141268741) * $y
+ 0.000535310849) * $y + 0.999936657524;
}
}
if ($z > 0) {
$result = ($x + 1) / 2;
} else {
$result = (1 - $x) / 2;
}
return $result;
}
/**
* Returns the one-tailed probability of the chi-squared distribution. The chi-squared
* distribution is associated with a chi-squared test. Use the chi-squared test to
* compare observed and expected values.
* Adapted from:
* Hill, I. D. and Pike, M. C. Algorithm 299
* Collected Algorithms for the CACM 1967 p. 243
* Updated for rounding errors based on remark in
* ACM TOMS June 1985, page 185
*
* @param float $x Is the value at which you want to evaluate the distribution.
* @param integer $df Is the number of degrees of freedom.
*
* @return float the probability of the chi-squared distribution
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function chiDist ($x, $df)
{
define(LOG_SQRT_PI, log(sqrt(pi())));
define(I_SQRT_PI, 1 / sqrt(pi()));
if ($x <= 0 || $df < 1) {
$result = 1;
}
$a = $x / 2;
if ($df % 2 == 0) {
$even = true;
} else {
$even = false;
}
if ($df > 1) {
$y = exp(-1 * $a);
}
if ($even) {
$s = $y;
} else {
$s = 2 * $this->normCDF(-1 * sqrt($x));
}
if ($df > 2) {
$x = ($df - 1) / 2;
if ($even) {
$z = 1;
} else {
$z = 0.5;
}
if ($a > 20) {
if ($even) {
$e = 0;
} else {
$e = LOG_SQRT_PI;
}
$c = log($a);
while ($z <= $x) {
$e += log($z);
$s += exp($c * $z - $a - $e);
$z += 1;
}
$result = $s;
} else {
if ($even) {
$e = 1;
} else {
$e = I_SQRT_PI / sqrt($a);
}
$c = 0;
while ($z <= $x) {
$e *= ($a / $z);
$c += $e;
$z += 1;
}
$result = $c * $y + $s;
}
} else {
$result = $s;
}
return $result;
}
/**
* Performs chi-squared contingency table tests and goodness-of-fit tests.
*
* Example:
* <code>
* $table['Automatic'] = array('4 Cylinders' => 3, '6 Cylinders' => 4, '8 Cylinders' => 12);
* $table['Manual'] = array('4 Cylinders' => 8, '6 Cylinders' => 3, '8 Cylinders' => 2);
*
* $results = $stats->chiTest($table);
* </code>
*
* @param array $table Specifies the two-way, n x m table containing the counts
*
* @return array [chi] => the value the chi-squared test statistic
* [df] => the degrees of freedom of the approximate chi-squared distribution of the test statistic.
* [expected] => the expected counts under the null hypothesis.
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function chiTest ($table)
{
$total = 0;
$chi = 0;
foreach ($table as $category => $row) {
foreach ($row as $sample => $cell) {
if (isset($rows[$category])) {
$rows[$category] += $cell;
} else {
$rows[$category] = $cell;
}
if (isset($cols[$sample])) {
$cols[$sample] += $cell;
} else {
$cols[$sample] = $cell;
}
$total += $cell;
}
}
$r = count($rows);
$c = count($cols);
$df = ($r - 1) * ($c - 1);
$expected = array();
foreach ($table as $category => $row) {
foreach ($row as $sample => $cell) {
// fo frequency of the observed value
// fe frequency of the expected value
$fo = $cell;
$fe = ($rows[$category] * $cols[$sample]) / $total;
$chi += pow($fo - $fe, 2) / $fe;
$expected[$category][$sample] = $fe;
}
}
return array('chi'=>$chi, 'df'=>$df, 'expected'=>$expected);
}
/**
* Returns the inverse of the standard normal cumulative distribution.
* The distribution has a mean of zero and a standard deviation of one.
* This is an implementation of the algorithm published at:
* http://home.online.no/~pjacklam/notes/invnorm/
*
* @param float $p Is a probability corresponding to the normal distribution.
*
* @return float Inverse of the standard normal cumulative distribution, with a probability of $p
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
function inverseNormCDF($p) {
/* coefficients for the rational approximants for the normal probit: */
$a1 = -3.969683028665376e+01;
$a2 = 2.209460984245205e+02;
$a3 = -2.759285104469687e+02;
$a4 = 1.383577518672690e+02;
$a5 = -3.066479806614716e+01;
$a6 = 2.506628277459239e+00;
$b1 = -5.447609879822406e+01;
$b2 = 1.615858368580409e+02;
$b3 = -1.556989798598866e+02;
$b4 = 6.680131188771972e+01;
$b5 = -1.328068155288572e+01;
$c1 = -7.784894002430293e-03;
$c2 = -3.223964580411365e-01;
$c3 = -2.400758277161838e+00;
$c4 = -2.549732539343734e+00;
$c5 = 4.374664141464968e+00;
$c6 = 2.938163982698783e+00;
$d1 = 7.784695709041462e-03;
$d2 = 3.224671290700398e-01;
$d3 = 2.445134137142996e+00;
$d4 = 3.754408661907416e+00;
$p_low = 0.02425;
$p_high = 1.0 - $p_low;
if(0 < $p && $p < $p_low) {
/* rational approximation for the lower region */
$q = sqrt(-2 * log($p));
$x = ((((($c1 * $q + $c2) * $q + $c3) * $q + $c4) * $q + $c5) * $q + $c6) / (((($d1 * $q + $d2) * $q + $d3) * $q + $d4) * $q + 1);
} elseif ($p_low <= $p && $p <= $p_high) {
/* rational approximation for the central region */
$q = $p - 0.5;
$r = $q * $q;
$x = ((((($a1 * $r + $a2) * $r + $a3) * $r + $a4) * $r + $a5) * $r + $a6) * $q / ((((($b1 * $r + $b2) * $r + $b3) * $r + $b4) * $r + $b5) * $r + 1);
} else {
/* rational approximation for the upper region */
$q = sqrt(-2 * log(1 - $p));
$x = -((((($c1 * $q + $c2) * $q + $c3) * $q + $c4) * $q + $c5) * $q + $c6) / (((($d1 * $q + $d2) * $q + $d3) * $q + $d4) * $q + 1);
}
/*
The relative error of the approximation has
absolute value less than 1.15 × 10-9. One iteration of
Halley's rational method (third order) gives full machine precision.
if(0 < $p && $p < 1) {
$e = 0.5 * erfc(-$x / sqrt(2)) - $p;
$u = $e * sqrt(2 * pi()) * exp($x * $x / 2);
$x = $x - $u / (1 + $x * $u / 2);
}
*/
return $x;
}
/**
* Returns the t-value of the Student's t-distribution as a function of
* the probability and the degrees of freedom. This is an implementation
* of the algorithm in:
* G. W. Hill. "Algorithm 396: Student's t-Quantiles." Communications
* of the ACM 13(10):619--620. ACM Press, October, 1970.
*
* @param float $p Is the probability associated with the two-tailed Student's t-distribution.
* @param integer $n Is the number of degrees of freedom with which to characterize the distribution.
*
* @return float t-value of the Student's t-distribution for the terms above (i.e. $p and $n).
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
function inverseTCDF($p, $n) {
if($n == 1) {
$p *= M_PI_2;
$result = cos($p) / sin($p);
} else {
$a = 1 / ($n - 0.5);
$b = 48 / ($a * $a);
$c = ((20700 * $a / $b - 98) * $a - 16) * $a + 96.36;
$d = ((94.5 / ($b + $c) - 3) / $b + 1) * sqrt($a * M_PI_2) * $n;
$y = pow(2 * $d * $p, 2 / $n);
if ($y > (0.05 + $a)) {
/* asymptotic inverse expansion about the normal */
$x = $this->inverseNormCDF($p * 0.5);
$y = $x * $x;
if ($n < 5) {
$c += 0.3 * ($n - 4.5) * ($x + 0.6);
}
$c = (((0.05 * $d * $x - 5) * $x - 7) * $x - 2) * $x + $b + $c;
$y = (((((0.4 * $y + 6.3) * $y + 36) * $y + 94.5) / $c - $y - 3) / $b + 1) * $x;
$y *= $a * $y;
if ($y > 0.002) {
$y = exp($y) - 1;
} else {
$y += 0.5 * $y * $y;
}
} else {
$y = ((1 / ((($n + 6) / ($n * $y) - 0.089 * $d - 0.822) * ($n + 2) * 3) + 0.5 / ($n + 4)) * $y - 1) * ($n + 1) / ($n + 2) + 1 / $y;
}
$result = sqrt($n * $y);
}
return $result;
}
/**
* Returns the Shannon or Simpson index value
* http://en.wikipedia.org/wiki/Diversity_index
*
* @param array $abundances Associated array where keys refer to the categories and values refer to the observation counts in each category.
* @param string $index Which index you would like to calculate ["shannon"|"simpson"] (default is "shannon")
* @param float $base Base of the logarithmic transformation used in the Shannon index (default is M_E)
*
* @return float The index (either Shannon or Simpson as selected in the $index parameter)
* @author Khaled Al-Sham'aa <khaled.alshamaa@gmail.com>
*/
public function diversity ($abundances, $index='shannon', $base=M_E)
{
$index = strtolower($index);
$N = array_sum($abundances);
// Calculate the proportion of characters belonging to each category
foreach ($abundances as $key=>$value) {
$P["$key"] = $value / $N;
}
$result = 0;
if ($index == 'shannon') {
foreach ($P as $key=>$value) {
$result += $value * log($value, $base);
}
$result = -1 * $result;
} elseif ($index == 'simpson') {
if ($N < 20) {
foreach ($abundances as $key=>$value) {
$result += $value * ($value - 1);
}
$result = $result / ($N * ($N - 1));
} else {
foreach ($P as $key=>$value) {
$result += $value * $value;
}
}
}
return $result;
}
}
<?php
require_once('kashi.php');
class KashiANOVA {
protected $dbh;
protected $preparedStmt;
protected $kashi;
public $persistent = false;
public function __construct($dbname='test', $dbuser='root', $dbpass='', $dbhost='localhost') {
$this->kashi = new Kashi();
// We save data in SQL to have a scalable data-analysis solution (ENGINE=MEMORY ?)
$this->dbh = new PDO("mysql:host=$dbhost;dbname=$dbname", $dbuser, $dbpass);
$tableA = 'CREATE TABLE IF NOT EXISTS kashi_factors (id INT NOT NULL, kashi_factor VARCHAR(50) NOT NULL, kashi_level VARCHAR(50) NOT NULL, PRIMARY KEY (id, kashi_factor))';
$tableB = 'CREATE TABLE IF NOT EXISTS kashi_variates (id INT NOT NULL, kashi_trait VARCHAR(50) NOT NULL, kashi_value DECIMAL(9,3) NOT NULL, PRIMARY KEY (id, kashi_trait))';
$this->dbh->exec($tableA);
$this->dbh->exec($tableB);
$this->preparedStmt['insertFactor'] = $this->dbh->prepare("INSERT INTO kashi_factors VALUES (:myRecord, :myName, :myValue)");
$this->preparedStmt['insertVariate'] = $this->dbh->prepare("INSERT INTO kashi_variates VALUES (:myRecord, :myName, :myValue)");
$this->preparedStmt['queryFactor'] = $this->dbh->prepare("SELECT id FROM kashi_factors WHERE kashi_factor=:myFactor AND kashi_level=:myLevel");
$this->preparedStmt['ssBetween'] = $this->dbh->prepare("SELECT kashi_level, AVG(kashi_value) AS kashi_mean, SUM(kashi_value) AS kashi_total, SUM(kashi_value*kashi_value) AS kashi_ss, COUNT(kashi_value) AS kashi_count FROM kashi_factors INNER JOIN kashi_variates USING(id) WHERE kashi_factor=:myTreat AND kashi_trait=:myTrait GROUP BY kashi_level");
}
public function __destruct() {
$tableA = 'DROP TABLE kashi_factors';
$tableB = 'DROP TABLE kashi_variates';
if (!$this->persistent) {
$this->dbh->exec($tableA);
$this->dbh->exec($tableB);
}
$this->dbh = null;
}
public function loadArray($data) {
$this->dbh->exec('TRUNCATE TABLE kashi_factors');
$this->dbh->exec('TRUNCATE TABLE kashi_variates');
foreach ($data as $name=>$values) {
if (substr($name, -1) == '!') {
$name = substr($name, 0, -1);
$table = 'kashi_factors';
$stmt = 'insertFactor';
} else {
$table = 'kashi_variates';
$stmt = 'insertVariate';
}
$this->preparedStmt[$stmt]->bindParam(':myRecord', $record, PDO::PARAM_INT);
$this->preparedStmt[$stmt]->bindParam(':myValue', $value, PDO::PARAM_STR);
$this->preparedStmt[$stmt]->bindParam(':myName', $name, PDO::PARAM_STR);
$record = 1;
foreach ($values as $value) {
$this->preparedStmt[$stmt]->execute();
$record++;
}
}
}
protected function readString($str) {
$data = array();
$str = str_replace("\r", '', $str);
$lines = explode("\n", $str);
$header = explode("\t", $lines[0]);
$maxCol = count($header);
$maxRow = count($lines);
for ($i=1; $i<$maxRow; $i++) {
if (trim($lines[$i]) == '') continue;
$fields = explode("\t", $lines[$i]);
for ($j=0; $j<$maxCol; $j++) {
$data[$header[$j]][] = $fields[$j];
}
}
return $data;
}
public function loadString($str) {
$this->loadArray($this->readString($str));
}
protected function query($factors, $trait, $return='summary') {
$this->preparedStmt['queryFactor']->bindParam(':myFactor', $factor, PDO::PARAM_STR);
$this->preparedStmt['queryFactor']->bindParam(':myLevel', $level, PDO::PARAM_STR);
foreach ($factors as $factor=>$level) {
$this->preparedStmt['queryFactor']->execute();
$result = $this->preparedStmt['queryFactor']->fetchAll();
$keys = array();
foreach ($result as $row) {
$keys[] = $row['id'];
}
if (!isset($activeKeys)) {
$activeKeys = $keys;
}
$activeKeys = array_intersect($activeKeys, $keys);
}
$data = array();
$idList = implode(",", $activeKeys);
if ($return == 'values') {
$sql = "SELECT id, kashi_value FROM kashi_variates WHERE kashi_trait='$trait' AND id in ($idList)";
foreach ($this->dbh->query($sql) as $row) {
$data[$row['id']] = $row['kashi_value'];
}
} else if ($return == 'summary') {
$sql = "SELECT SUM(kashi_value) AS a, AVG(kashi_value) AS b, COUNT(kashi_value) AS c FROM kashi_variates WHERE kashi_trait='$trait' AND id in ($idList)";
$result = $this->dbh->query($sql)->fetch(PDO::FETCH_ASSOC);
$data['sum'] = $result['a'];
$data['avg'] = $result['b'];
$data['count'] = $result['c'];
}
return $data;
}
public function anova ($treat, $trait) {
$sql = "SELECT COUNT(DISTINCT kashi_level) AS levels FROM kashi_factors WHERE kashi_factor='$treat'";
$result = $this->dbh->query($sql)->fetch(PDO::FETCH_ASSOC);
$r = $result['levels'];
$treatDF = $r - 1;
$sql = "SELECT COUNT(DISTINCT id) AS levels FROM kashi_variates WHERE kashi_trait='$trait'";
$result = $this->dbh->query($sql)->fetch(PDO::FETCH_ASSOC);
$t = $result['levels'];
$totalDF = $t - 1;
$errorDF = $totalDF - $treatDF;
$sql = "SELECT SUM(kashi_value*kashi_value) AS ss, SUM(kashi_value) AS grandTotal, COUNT(kashi_value) AS observations FROM kashi_variates WHERE kashi_trait='$trait'";
$result = $this->dbh->query($sql)->fetch(PDO::FETCH_ASSOC);
$ss = $result['ss'];
$grandTotal = $result['grandTotal'];
$t = $result['observations'];
$C = pow($grandTotal, 2) / $t;
$SSTot = $ss - $C;
$grandMean = $grandTotal / $t;
$sql = "SELECT DISTINCT kashi_level FROM kashi_factors WHERE kashi_factor='$treat'";
$levels = array();
$means = array();
$reps = array();
$variation = 0;
foreach ($this->dbh->query($sql) as $row) {
$levels[] = $row['kashi_level'];
}
foreach ($levels as $level) {
$result = $this->query(array($treat=>$level), $trait, 'summary');
$variation += pow($result['sum'], 2) / $result['count'];
$means[$level] = $result['avg'];
$reps[$level] = $result['count'];
}
ksort($means);
ksort($reps);
$SST = $variation - $C;
$SSE = $SSTot - $SST;
$MST = $SST / $treatDF;
$MSE = $SSE / $errorDF;
$VRT = $MST / $MSE;
$F = $this->kashi->fDist($VRT, $treatDF, $errorDF);
if (max($reps) == min($reps)) {
$SE = sqrt($MSE / max($reps));
$SED = sqrt(2) * $SE;
// qt(0.975, Residual df)
$LSD = $this->kashi->inverseTCDF(0.05, $errorDF) * $SED;
} else {
$SE = array();
$SED = array();
$LSD = array();
$SE['min'] = sqrt($MSE / min($reps));
$SED['min'] = sqrt(2) * $SE['min'];
$LSD['min'] = $this->kashi->inverseTCDF(0.05, $errorDF) * $SED['min'];
$SE['max'] = sqrt($MSE / max($reps));
$SED['max'] = sqrt(2) * $SE['max'];
$LSD['max'] = $this->kashi->inverseTCDF(0.05, $errorDF) * $SED['max'];
}
$CV = 100 * sqrt($MSE) / $grandMean;
$anova['TDF'] = $treatDF;
$anova['EDF'] = $errorDF;
$anova['TotDF'] = $totalDF;
$anova['SST'] = $SST;
$anova['SSE'] = $SSE;
$anova['SSTot'] = $SSTot;
$anova['MST'] = $MST;
$anova['MSE'] = $MSE;
$anova['VRT'] = $VRT;
$anova['F'] = $F;
$anova['Mean'] = $grandMean;
$anova['Means'] = $means;
$anova['Reps'] = $reps;
$anova['SE'] = $SE;
$anova['SED'] = $SED;
$anova['LSD'] = $LSD;
$anova['CV'] = $CV;
return $anova;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment