Last active
October 21, 2019 20:54
-
-
Save Jiu9Shen/1709484e7bf9564a27de6f2c221314b5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/* | |
********************************************************************************************** | |
*********vcf_summary.php - give statistical summary of a vcf file | |
********************************************************************************************** | |
***Usage: on terminal command:php vcf_summary.php <file_name> <screen/table> | |
***Functions: | |
*** build_table: output a html format table | |
*** cal_median: calculate median number | |
*** stat_summary: return an array of statistic summary | |
*** read-depth_analysis | |
*** minor_allele_frequency_analysis | |
*** site_missing_analysis | |
*** main: main function to carry out all works | |
*/ | |
/**FUNCTION------------------------------------------------------------------ | |
***build_table--------------------------------------------------------------- | |
*** -input: $array_table | |
*** -must creat by keys with corralted valuse | |
*** -in this case: Five keys: "Min","Max","Average","Median","SD" | |
*/ | |
function build_table($array){ | |
// start table | |
$html = '<table>'; | |
// header row | |
$html .= '<tr>'; | |
foreach($array[0] as $key=>$value){ | |
$html .= '<th>' . htmlspecialchars($key) . '</th>'; | |
} | |
$html .= '</tr>'; | |
// data rows | |
foreach( $array as $key=>$value){ | |
$html .= '<tr>'; | |
foreach($value as $key2=>$value2){ | |
$html .= '<td>' . htmlspecialchars($value2) . '</td>'; | |
} | |
$html .= '</tr>'; | |
} | |
// finish table and return it | |
$html .= '</table>'; | |
return $html; | |
} | |
/**FUNCTION------------------------------------------------------------------ | |
***cal_median | |
*** INPUT PARAMETERS: $num, $num_count | |
*** -$arr: an array of sorted number | |
*** -$arr_count: times of each number appear | |
*/ | |
function cal_median($arr,$arr_count){ | |
$total_num = array_sum($arr_count); //total numbers in array | |
$middleval = floor($total_num/2); // find middle value of number count | |
//echo "middleval calculated is $middleval\n"; | |
$less_half = 0; | |
$i = 0; | |
while ($less_half < $middleval){ | |
$less_half += $arr_count[$i]; | |
$i++; | |
//echo "number value: ", $arr[($i-1)],", less_half: $less_half, i: $i.\n"; | |
} | |
if ($total_num %2 ){ //for odd number | |
if ($less_half == $middleval){return $arr[$i];} | |
else{return $arr[$i-1];} | |
}else{ //for even number | |
if ($less_half == $middleval){return ($arr[$i]+$arr[$i-1])/2;} | |
else {return $arr[$i-1];} | |
} | |
} | |
/**FUNCTION--------------------------------------------------------------------------- | |
***stat_summary----------------------------------------------------------------------- | |
*** INPUT:$num, $num_count,$num_sum | |
*** OUTPUT: a summary array with keys: Min, Max, Median, Average, SD | |
*** SPECIFICS: | |
*** -$num is an array of values | |
*** -$num_count is an array of appear times of correlated values | |
*** -$num_sum is the sum of all valuse | |
*/ | |
function stat_summary($num,$num_count,$num_sum){ | |
$var_all = 0; | |
$mean_overall = $num_sum/array_sum($num_count); | |
//calculate the numerator of variance first | |
for ($i=0;$i<count($num);$i++){ | |
$var_all += ($num[$i]-$mean_overall)*($num[$i]-$mean_overall)*$num_count[$i]; | |
} | |
return array( | |
"Min"=> min($num), | |
"Max"=> max($num), | |
"Median" => cal_median($num, $num_count), | |
"Average"=> number_format($mean_overall,4), | |
"SD"=> number_format(sqrt($var_all/array_sum($num_count)),4), | |
); | |
} | |
/**FUNCTION------------------------------------------------------------------- | |
***read-depth_analysis-------------------------------------------------------- | |
*** INPUT: file of read depth produced with vcftool(.gdepth) | |
*** OUTPUT: array: $depth_m | |
*** with keys(actual values) / element (times this value appear) | |
*** SPECIFICS: | |
*** -Header: CHROM POS (Germplasm name)... | |
*** -split each line, make sure this line isn't empty | |
*** -save read depth in an array (number value as key, times this # appear as element) | |
*/ | |
function read_depth_analysis($filename){ | |
$depth_m = array(); | |
//read file by lines, and split each line by tabs | |
foreach (new SplFileObject($filename) as $line){ | |
$out_line = explode("\t", $line); | |
//first two clumns are chrom and postion, we can know total number of samples | |
$number_sample = count($out_line) - 2; | |
//check if varialbes exist and second element is a number, making sure we are not dealing with empty array | |
if (empty($out_line[1])==false and is_numeric($out_line[1]) == true){ | |
for ($i=2;$i<($number_sample+2); $i++){ | |
//only keep non-negative numbers since read depth cant be negative | |
if ($out_line[$i] > 0){ | |
$r_d = intval ($out_line[$i]); | |
//check if this key exist, if not, set this key and give element value 0 | |
if (array_key_exists($r_d,$depth_m)==false){$depth_m[$r_d]=0; | |
} | |
$depth_m[$r_d]++; // save read depth as integers | |
} | |
} | |
} //end for if | |
} //end for foreach loop | |
//delete unnecessary file, sort array by value of keys and return array | |
unlink($filename);// delete the .gdepth file | |
ksort($depth_m); | |
return $depth_m; | |
} | |
/**FUNCTION---------------------------------------------------------------------- | |
***minor_allele_frequency_analysis----------------------------------------------- | |
*** INPUT: file of allele frequency produced by vcftool (.frq) | |
*** OUTPUT: array: $freq_count_m | |
*** with keys(actual values*1000) / element (times this value appear) | |
*** array:$frq_Nucleotide | |
*** overall frequency of nucleotide; A T C G | |
*** SPECIFICS: | |
*** -Header: CHROM POS N_ALLELES N_CHR {ALLELE:FREQ} | |
*** -split each line, make sure this line isn't empty | |
*** -find frequency for all alleles (2,3 or 4) | |
*** -save minimun frquency in an array (number value as key, times this # appear as element) | |
*/ | |
function minor_allele_frequency_analysis($filename){ | |
$A_count=0.0; $T_count=0.0; $C_count=0.0; $G_count=0.0; | |
$freq_count_m = array(); | |
foreach (new SplFileObject($filename) as $line){ | |
$out_line = explode("\t", $line); | |
if (empty($out_line[2]) == false && is_numeric($out_line[1]) == true){ | |
for ($i=0; $i< $out_line[2];$i++){ | |
$allele = $out_line[$i+4]; | |
//allele frequency is stored in pattern A(TCG):0.***, need separate by : | |
$eachAF=explode(":",$allele); | |
// check the frequency of A, T, C, G | |
if ($eachAF[0] == 'A') {$A_count += $eachAF[1];} | |
elseif ($eachAF[0] == 'T') {$T_count += $eachAF[1];} | |
elseif ($eachAF[0] == 'C') {$C_count += $eachAF[1];} | |
elseif ($eachAF[0] == 'G') {$G_count += $eachAF[1];} | |
else {echo "Something isn't right, we find a nt other than ATCG.\n";} | |
// save frequency four digits after decimal point | |
$freq[$i] = floatval($eachAF[1]); | |
} | |
} | |
if (empty($freq) == false){ | |
//times 1000, so float numbers[0.001 1] are stored in interval [1 1000] now, don't forget to convert back | |
$freq_min = intval(number_format(min($freq),3)*1000); | |
if (array_key_exists($freq_min, $freq_count_m) == false){ | |
$freq_count_m[$freq_min] = 0; | |
} | |
$freq_count_m[$freq_min]++; | |
} | |
}//end for foreach loop | |
$ATCG_sum = $A_count + $T_count + $C_count+ $G_count ; | |
$frq_Nucleotide = array( | |
"A"=> number_format($A_count/$ATCG_sum, 6), | |
"T"=> number_format($T_count/$ATCG_sum, 6), | |
"C"=> number_format($C_count/$ATCG_sum, 6), | |
"G"=> number_format($G_count/$ATCG_sum, 6), | |
); | |
//delete unnecessary file, sort array by value of keys and return array | |
unlink($filename); | |
ksort($freq_count_m); | |
return array($frq_Nucleotide, $freq_count_m); | |
} | |
/**FUNCTION------------------------------------------------------------------- | |
***site_missing_analysis------------------------------------------------------ | |
*** INPUT: site missing file produced by vcftool(.lmiss) | |
*** OUTPUT: array: $missing_frq_m | |
*** with keys(actual values*1000) / element (times this value appear) | |
*** SPECIFICS: | |
*** -Header: CHR POS N_DATA N_GENOTYPE_FILTERED N_MISS F_MISS | |
*** -split each line, make sure this line isn't empty | |
*** -save missing frequency in an array (number value as key, times this # appear as element) | |
*/ | |
function site_missing_analysis($filename){ | |
$missing_freq_m = array(); | |
foreach (new SplFileObject($filename) as $line){ | |
$out_line = explode("\t", $line); | |
if (isset($out_line[1]) and is_numeric($out_line[1])){ | |
$miss_val = intval(number_format(floatval($out_line[5]), 3)*1000); | |
//check if the key exist, if not, set value to 0 | |
if (array_key_exists($miss_val, $missing_freq_m) == false){ | |
$missing_freq_m[$miss_val] = 0; | |
} | |
$missing_freq_m[$miss_val]++; | |
} //end for if | |
} //end for foreach loop | |
//delete unnecessary file, sort array by value of keys and return array | |
unlink($filename); | |
ksort($missing_freq_m); | |
return $missing_freq_m; | |
} | |
/**FUNCTION---------------------------------------------------------------------- | |
***main-------------------------------------------------------------------------- | |
*** INPUT: $argv[1],$argv[2] | |
*** -$argv[1]: name of input file | |
*** -$argv[2]: string of 'table' or 'screen' | |
*** SPECIFICS: | |
*** 1. system call vcftool to obtain files: .gdepth;.frq; .lmiss | |
*** 2. read depth analysis | |
*** 3. minor allele frequency analysis | |
*** 4. missing frequency analysis | |
*** 5. display results | |
*/ | |
function main($file_need_stat_summary, $user_table_screen){ | |
//-1--------------------------------------VCFtool run on system command | |
//read ARGV[0], if it doesn't exit or its wrong, ask user to re-input, three times limit | |
$inputname = trim($file_need_stat_summary); | |
$inputname_repeat = 0; | |
while (file_exists($inputname) == false ){ | |
echo "\nThe input file: $inputname does not exist, please input again. \n"; | |
$handle = fopen ("php://stdin","r"); | |
$inputname = trim(fgets($handle)); | |
if (file_exists($inputname)){break 1;} | |
if ($inputname_repeat >=3 ){break 1;} | |
} | |
// outputname is named after inputname by remove .txt or other suffix with 3 or 4 letters | |
$outputname = preg_replace('/\\.[^.\\s]{3,4}$/', '', $inputname); | |
echo "\nName choose for vcftool output files: $outputname", "\n"; | |
// check if files exist, use VCFtool to obtain these files if not | |
if (file_exists("$outputname.gdepth") && file_exists("$outputname.frq") && file_exists("$outputname.lmiss")){ | |
echo "\nFiles of .gdepth .frq .lmiss already exist. \n"; | |
} | |
else{ | |
echo "\nVCF summary files don't exist, use VCFtool for getting these files: \n"; | |
system ("vcftools --vcf $inputname --geno-depth --remove-indels --out $outputname"); //.gdepth | |
system ("vcftools --vcf $inputname --freq --remove-indels --out $outputname"); //.frq | |
system ("vcftools --vcf $inputname --missing-site --remove-indels --out $outputname"); //.lmiss | |
} | |
unlink("$outputname.log"); | |
//-------counts for minor allele frequency is its needed: file save in .frq.count | |
//system ("vcftools --vcf contrast.vcf --counts --remove-indels --chr 1 --out $outputname"); | |
//-2-------------read depth analysis------------------------results save in $gdepth | |
$depth_m = read_depth_analysis("$outputname.gdepth"); | |
//convert our array(keys and values) into two arrays | |
$num_g=array();$num_count_g=array();$num_sum_g = 0; | |
foreach ($depth_m as $key => $value) { | |
array_push($num_g, intval($key)); | |
array_push($num_count_g, intval($value)); | |
$num_sum_g += intval($key)*intval($value); | |
} | |
$gdepth = stat_summary($num_g, $num_count_g,$num_sum_g); | |
//-3-----------minor allele frequency analysis------results save in $frq/$frq_Nucleotide | |
list( $frq_Nucleotide, $freq_count_m) = minor_allele_frequency_analysis("$outputname.frq"); | |
$num_f=array();$num_count_f=array();$num_sum_f = 0; | |
foreach ($freq_count_m as $key => $value) { | |
array_push($num_f, floatval(intval($key)/1000)); | |
array_push($num_count_f, intval($value)); | |
$num_sum_f += floatval(intval($key)/1000)*intval($value); | |
} | |
$frq = stat_summary($num_f,$num_count_f,$num_sum_f); | |
//-4-------------------missing analysis-------------------results save in $lmiss | |
$missing_freq_m = site_missing_analysis("$outputname.lmiss"); | |
$num_m=array();$num_count_m=array();$num_sum_m = 0; | |
foreach ($missing_freq_m as $key => $value) { | |
array_push($num_m, floatval(intval($key)/1000)); | |
array_push($num_count_m, intval($value)); | |
$num_sum_m += floatval(intval($key)/1000)*intval($value); | |
} | |
$lmiss = stat_summary($num_m,$num_count_m,$num_sum_m); | |
//-5------------------------------------------------------------------------------------ | |
// ask user to choose a way of representing result if there's wrong/no para from command | |
// user can try 3 times--------------------------then display results | |
$summary_or_table = trim($user_table_screen); | |
$repeat_input = 0; | |
while ($summary_or_table != 'table' and $summary_or_table != 'screen'){ | |
if ($repeat_input >=3){break 1;} | |
echo "Please make a choose only between 'screen' and 'table'! \n"; | |
$handle = fopen ("php://stdin","r"); | |
$summary_or_table = trim(fgets($handle)); | |
$repeat_input++; | |
} | |
if (trim($summary_or_table) == 'screen'){ | |
echo "The user choose print resutls on screen. \n"; | |
//all results are stored in three arrays: $gdepth,$frq,$lmiss | |
//print all result on screen by tabs if user choose "screen" | |
$fea = array ("Min","Max","Average","Median","SD"); | |
echo "\n"; | |
echo "Summary Statistics ","\t",$fea[0],"\t",$fea[1],"\t",$fea[2],"\t",$fea[3],"\t",$fea[4],"\n"; | |
echo "Depth ","\t",$gdepth["$fea[0]"],"\t",$gdepth["$fea[1]"],"\t", | |
$gdepth["$fea[2]"],"\t",$gdepth["$fea[3]"],"\t",$gdepth["$fea[4]"],"\n"; | |
echo "Minor Allele Frequency","\t",$frq["$fea[0]"],"\t",$frq["$fea[1]"],"\t", | |
$frq["$fea[2]"],"\t",$frq["$fea[3]"],"\t",$frq["$fea[4]"],"\n"; | |
echo "Missing Frequency ","\t",$lmiss["$fea[0]"],"\t",$lmiss["$fea[1]"],"\t", | |
$lmiss["$fea[2]"],"\t",$lmiss["$fea[3]"],"\t",$lmiss["$fea[4]"],"\n\n"; | |
echo "Nucleotide","\t","Frequency","\n"; | |
echo "A ","\t",$frq_Nucleotide["A"],"\n"; | |
echo "T ","\t",$frq_Nucleotide["T"],"\n"; | |
echo "C ","\t",$frq_Nucleotide["C"],"\n"; | |
echo "G ","\t",$frq_Nucleotide["G"],"\n"; | |
} | |
//print html format table if user choose "table": | |
elseif (trim($summary_or_table) == 'table'){ | |
echo "You choose print result as HTML table. \n\n"; | |
$array_table = array( | |
array("Statistical Summary"=>"Read Depth ") + $gdepth, | |
array("Statistical Summary"=>"Minor Allele Frequency") + $frq, | |
array("Statistical Summary"=>"Missing Frequency ") + $lmiss, | |
); | |
echo build_table($array_table),"\n\n"; | |
} | |
else { | |
echo "still, you are not making a choice between table or screen. \n"; | |
} | |
} | |
main($argv[1],$argv[2]); | |
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment