Skip to content

Instantly share code, notes, and snippets.

@Jiu9Shen
Last active October 21, 2019 20:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Jiu9Shen/1709484e7bf9564a27de6f2c221314b5 to your computer and use it in GitHub Desktop.
Save Jiu9Shen/1709484e7bf9564a27de6f2c221314b5 to your computer and use it in GitHub Desktop.
<?php
/*
**********************************************************************************************
*********vcf_summary.php - give statistical summary of a vcf file
**********************************************************************************************
***Usage: on terminal command:php vcf_summary.php <file_name> <screen/table>
***Functions:
*** build_table: output a html format table
*** cal_median: calculate median number
*** stat_summary: return an array of statistic summary
*** read-depth_analysis
*** minor_allele_frequency_analysis
*** site_missing_analysis
*** main: main function to carry out all works
*/
/**FUNCTION------------------------------------------------------------------
***build_table---------------------------------------------------------------
*** -input: $array_table
*** -must creat by keys with corralted valuse
*** -in this case: Five keys: "Min","Max","Average","Median","SD"
*/
function build_table($array){
// start table
$html = '<table>';
// header row
$html .= '<tr>';
foreach($array[0] as $key=>$value){
$html .= '<th>' . htmlspecialchars($key) . '</th>';
}
$html .= '</tr>';
// data rows
foreach( $array as $key=>$value){
$html .= '<tr>';
foreach($value as $key2=>$value2){
$html .= '<td>' . htmlspecialchars($value2) . '</td>';
}
$html .= '</tr>';
}
// finish table and return it
$html .= '</table>';
return $html;
}
/**FUNCTION------------------------------------------------------------------
***cal_median
*** INPUT PARAMETERS: $num, $num_count
*** -$arr: an array of sorted number
*** -$arr_count: times of each number appear
*/
function cal_median($arr,$arr_count){
$total_num = array_sum($arr_count); //total numbers in array
$middleval = floor($total_num/2); // find middle value of number count
//echo "middleval calculated is $middleval\n";
$less_half = 0;
$i = 0;
while ($less_half < $middleval){
$less_half += $arr_count[$i];
$i++;
//echo "number value: ", $arr[($i-1)],", less_half: $less_half, i: $i.\n";
}
if ($total_num %2 ){ //for odd number
if ($less_half == $middleval){return $arr[$i];}
else{return $arr[$i-1];}
}else{ //for even number
if ($less_half == $middleval){return ($arr[$i]+$arr[$i-1])/2;}
else {return $arr[$i-1];}
}
}
/**FUNCTION---------------------------------------------------------------------------
***stat_summary-----------------------------------------------------------------------
*** INPUT:$num, $num_count,$num_sum
*** OUTPUT: a summary array with keys: Min, Max, Median, Average, SD
*** SPECIFICS:
*** -$num is an array of values
*** -$num_count is an array of appear times of correlated values
*** -$num_sum is the sum of all valuse
*/
function stat_summary($num,$num_count,$num_sum){
$var_all = 0;
$mean_overall = $num_sum/array_sum($num_count);
//calculate the numerator of variance first
for ($i=0;$i<count($num);$i++){
$var_all += ($num[$i]-$mean_overall)*($num[$i]-$mean_overall)*$num_count[$i];
}
return array(
"Min"=> min($num),
"Max"=> max($num),
"Median" => cal_median($num, $num_count),
"Average"=> number_format($mean_overall,4),
"SD"=> number_format(sqrt($var_all/array_sum($num_count)),4),
);
}
/**FUNCTION-------------------------------------------------------------------
***read-depth_analysis--------------------------------------------------------
*** INPUT: file of read depth produced with vcftool(.gdepth)
*** OUTPUT: array: $depth_m
*** with keys(actual values) / element (times this value appear)
*** SPECIFICS:
*** -Header: CHROM POS (Germplasm name)...
*** -split each line, make sure this line isn't empty
*** -save read depth in an array (number value as key, times this # appear as element)
*/
function read_depth_analysis($filename){
$depth_m = array();
//read file by lines, and split each line by tabs
foreach (new SplFileObject($filename) as $line){
$out_line = explode("\t", $line);
//first two clumns are chrom and postion, we can know total number of samples
$number_sample = count($out_line) - 2;
//check if varialbes exist and second element is a number, making sure we are not dealing with empty array
if (empty($out_line[1])==false and is_numeric($out_line[1]) == true){
for ($i=2;$i<($number_sample+2); $i++){
//only keep non-negative numbers since read depth cant be negative
if ($out_line[$i] > 0){
$r_d = intval ($out_line[$i]);
//check if this key exist, if not, set this key and give element value 0
if (array_key_exists($r_d,$depth_m)==false){$depth_m[$r_d]=0;
}
$depth_m[$r_d]++; // save read depth as integers
}
}
} //end for if
} //end for foreach loop
//delete unnecessary file, sort array by value of keys and return array
unlink($filename);// delete the .gdepth file
ksort($depth_m);
return $depth_m;
}
/**FUNCTION----------------------------------------------------------------------
***minor_allele_frequency_analysis-----------------------------------------------
*** INPUT: file of allele frequency produced by vcftool (.frq)
*** OUTPUT: array: $freq_count_m
*** with keys(actual values*1000) / element (times this value appear)
*** array:$frq_Nucleotide
*** overall frequency of nucleotide; A T C G
*** SPECIFICS:
*** -Header: CHROM POS N_ALLELES N_CHR {ALLELE:FREQ}
*** -split each line, make sure this line isn't empty
*** -find frequency for all alleles (2,3 or 4)
*** -save minimun frquency in an array (number value as key, times this # appear as element)
*/
function minor_allele_frequency_analysis($filename){
$A_count=0.0; $T_count=0.0; $C_count=0.0; $G_count=0.0;
$freq_count_m = array();
foreach (new SplFileObject($filename) as $line){
$out_line = explode("\t", $line);
if (empty($out_line[2]) == false && is_numeric($out_line[1]) == true){
for ($i=0; $i< $out_line[2];$i++){
$allele = $out_line[$i+4];
//allele frequency is stored in pattern A(TCG):0.***, need separate by :
$eachAF=explode(":",$allele);
// check the frequency of A, T, C, G
if ($eachAF[0] == 'A') {$A_count += $eachAF[1];}
elseif ($eachAF[0] == 'T') {$T_count += $eachAF[1];}
elseif ($eachAF[0] == 'C') {$C_count += $eachAF[1];}
elseif ($eachAF[0] == 'G') {$G_count += $eachAF[1];}
else {echo "Something isn't right, we find a nt other than ATCG.\n";}
// save frequency four digits after decimal point
$freq[$i] = floatval($eachAF[1]);
}
}
if (empty($freq) == false){
//times 1000, so float numbers[0.001 1] are stored in interval [1 1000] now, don't forget to convert back
$freq_min = intval(number_format(min($freq),3)*1000);
if (array_key_exists($freq_min, $freq_count_m) == false){
$freq_count_m[$freq_min] = 0;
}
$freq_count_m[$freq_min]++;
}
}//end for foreach loop
$ATCG_sum = $A_count + $T_count + $C_count+ $G_count ;
$frq_Nucleotide = array(
"A"=> number_format($A_count/$ATCG_sum, 6),
"T"=> number_format($T_count/$ATCG_sum, 6),
"C"=> number_format($C_count/$ATCG_sum, 6),
"G"=> number_format($G_count/$ATCG_sum, 6),
);
//delete unnecessary file, sort array by value of keys and return array
unlink($filename);
ksort($freq_count_m);
return array($frq_Nucleotide, $freq_count_m);
}
/**FUNCTION-------------------------------------------------------------------
***site_missing_analysis------------------------------------------------------
*** INPUT: site missing file produced by vcftool(.lmiss)
*** OUTPUT: array: $missing_frq_m
*** with keys(actual values*1000) / element (times this value appear)
*** SPECIFICS:
*** -Header: CHR POS N_DATA N_GENOTYPE_FILTERED N_MISS F_MISS
*** -split each line, make sure this line isn't empty
*** -save missing frequency in an array (number value as key, times this # appear as element)
*/
function site_missing_analysis($filename){
$missing_freq_m = array();
foreach (new SplFileObject($filename) as $line){
$out_line = explode("\t", $line);
if (isset($out_line[1]) and is_numeric($out_line[1])){
$miss_val = intval(number_format(floatval($out_line[5]), 3)*1000);
//check if the key exist, if not, set value to 0
if (array_key_exists($miss_val, $missing_freq_m) == false){
$missing_freq_m[$miss_val] = 0;
}
$missing_freq_m[$miss_val]++;
} //end for if
} //end for foreach loop
//delete unnecessary file, sort array by value of keys and return array
unlink($filename);
ksort($missing_freq_m);
return $missing_freq_m;
}
/**FUNCTION----------------------------------------------------------------------
***main--------------------------------------------------------------------------
*** INPUT: $argv[1],$argv[2]
*** -$argv[1]: name of input file
*** -$argv[2]: string of 'table' or 'screen'
*** SPECIFICS:
*** 1. system call vcftool to obtain files: .gdepth;.frq; .lmiss
*** 2. read depth analysis
*** 3. minor allele frequency analysis
*** 4. missing frequency analysis
*** 5. display results
*/
function main($file_need_stat_summary, $user_table_screen){
//-1--------------------------------------VCFtool run on system command
//read ARGV[0], if it doesn't exit or its wrong, ask user to re-input, three times limit
$inputname = trim($file_need_stat_summary);
$inputname_repeat = 0;
while (file_exists($inputname) == false ){
echo "\nThe input file: $inputname does not exist, please input again. \n";
$handle = fopen ("php://stdin","r");
$inputname = trim(fgets($handle));
if (file_exists($inputname)){break 1;}
if ($inputname_repeat >=3 ){break 1;}
}
// outputname is named after inputname by remove .txt or other suffix with 3 or 4 letters
$outputname = preg_replace('/\\.[^.\\s]{3,4}$/', '', $inputname);
echo "\nName choose for vcftool output files: $outputname", "\n";
// check if files exist, use VCFtool to obtain these files if not
if (file_exists("$outputname.gdepth") && file_exists("$outputname.frq") && file_exists("$outputname.lmiss")){
echo "\nFiles of .gdepth .frq .lmiss already exist. \n";
}
else{
echo "\nVCF summary files don't exist, use VCFtool for getting these files: \n";
system ("vcftools --vcf $inputname --geno-depth --remove-indels --out $outputname"); //.gdepth
system ("vcftools --vcf $inputname --freq --remove-indels --out $outputname"); //.frq
system ("vcftools --vcf $inputname --missing-site --remove-indels --out $outputname"); //.lmiss
}
unlink("$outputname.log");
//-------counts for minor allele frequency is its needed: file save in .frq.count
//system ("vcftools --vcf contrast.vcf --counts --remove-indels --chr 1 --out $outputname");
//-2-------------read depth analysis------------------------results save in $gdepth
$depth_m = read_depth_analysis("$outputname.gdepth");
//convert our array(keys and values) into two arrays
$num_g=array();$num_count_g=array();$num_sum_g = 0;
foreach ($depth_m as $key => $value) {
array_push($num_g, intval($key));
array_push($num_count_g, intval($value));
$num_sum_g += intval($key)*intval($value);
}
$gdepth = stat_summary($num_g, $num_count_g,$num_sum_g);
//-3-----------minor allele frequency analysis------results save in $frq/$frq_Nucleotide
list( $frq_Nucleotide, $freq_count_m) = minor_allele_frequency_analysis("$outputname.frq");
$num_f=array();$num_count_f=array();$num_sum_f = 0;
foreach ($freq_count_m as $key => $value) {
array_push($num_f, floatval(intval($key)/1000));
array_push($num_count_f, intval($value));
$num_sum_f += floatval(intval($key)/1000)*intval($value);
}
$frq = stat_summary($num_f,$num_count_f,$num_sum_f);
//-4-------------------missing analysis-------------------results save in $lmiss
$missing_freq_m = site_missing_analysis("$outputname.lmiss");
$num_m=array();$num_count_m=array();$num_sum_m = 0;
foreach ($missing_freq_m as $key => $value) {
array_push($num_m, floatval(intval($key)/1000));
array_push($num_count_m, intval($value));
$num_sum_m += floatval(intval($key)/1000)*intval($value);
}
$lmiss = stat_summary($num_m,$num_count_m,$num_sum_m);
//-5------------------------------------------------------------------------------------
// ask user to choose a way of representing result if there's wrong/no para from command
// user can try 3 times--------------------------then display results
$summary_or_table = trim($user_table_screen);
$repeat_input = 0;
while ($summary_or_table != 'table' and $summary_or_table != 'screen'){
if ($repeat_input >=3){break 1;}
echo "Please make a choose only between 'screen' and 'table'! \n";
$handle = fopen ("php://stdin","r");
$summary_or_table = trim(fgets($handle));
$repeat_input++;
}
if (trim($summary_or_table) == 'screen'){
echo "The user choose print resutls on screen. \n";
//all results are stored in three arrays: $gdepth,$frq,$lmiss
//print all result on screen by tabs if user choose "screen"
$fea = array ("Min","Max","Average","Median","SD");
echo "\n";
echo "Summary Statistics ","\t",$fea[0],"\t",$fea[1],"\t",$fea[2],"\t",$fea[3],"\t",$fea[4],"\n";
echo "Depth ","\t",$gdepth["$fea[0]"],"\t",$gdepth["$fea[1]"],"\t",
$gdepth["$fea[2]"],"\t",$gdepth["$fea[3]"],"\t",$gdepth["$fea[4]"],"\n";
echo "Minor Allele Frequency","\t",$frq["$fea[0]"],"\t",$frq["$fea[1]"],"\t",
$frq["$fea[2]"],"\t",$frq["$fea[3]"],"\t",$frq["$fea[4]"],"\n";
echo "Missing Frequency ","\t",$lmiss["$fea[0]"],"\t",$lmiss["$fea[1]"],"\t",
$lmiss["$fea[2]"],"\t",$lmiss["$fea[3]"],"\t",$lmiss["$fea[4]"],"\n\n";
echo "Nucleotide","\t","Frequency","\n";
echo "A ","\t",$frq_Nucleotide["A"],"\n";
echo "T ","\t",$frq_Nucleotide["T"],"\n";
echo "C ","\t",$frq_Nucleotide["C"],"\n";
echo "G ","\t",$frq_Nucleotide["G"],"\n";
}
//print html format table if user choose "table":
elseif (trim($summary_or_table) == 'table'){
echo "You choose print result as HTML table. \n\n";
$array_table = array(
array("Statistical Summary"=>"Read Depth ") + $gdepth,
array("Statistical Summary"=>"Minor Allele Frequency") + $frq,
array("Statistical Summary"=>"Missing Frequency ") + $lmiss,
);
echo build_table($array_table),"\n\n";
}
else {
echo "still, you are not making a choice between table or screen. \n";
}
}
main($argv[1],$argv[2]);
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment