Last active
August 29, 2015 14:03
-
-
Save livingstonef/fc7587f277f68c5b7db0 to your computer and use it in GitHub Desktop.
CpG Island calculator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* @author: Livingstone Fultang | |
* @license: GPL | |
**/ | |
//create a file name sequence in the same directory and paste your sequence | |
//in a variable like so $sequence = "AGCTGAGTCGATCGACTGACGTACGTACG"; return $sequence; | |
require 'sequence.php'; | |
$windowSize = 100; | |
$minLength = 200; | |
$minObservedExpected = 0.6; | |
$minPercentage = 50; | |
//We need a sequence to deal with; | |
if(empty($sequence)): | |
die('No sequence Provided'); | |
endif; | |
$seqLength = strlen(trim($sequence) ); | |
if($seqLength <= $windowSize || $seqLength < $minLength): | |
die('Sequence length is either shorter than the window size or min length'); | |
endif; | |
//split sequence into an array of window sizes; moving +1 | |
$windows = array(); | |
$maximum = $seqLength-$windowSize; | |
for($i=0; $i<$maximum; $i++): | |
$window = array( | |
"start"=>$i+1, | |
"stop"=>$i+$windowSize, | |
"observed" => null, | |
"expected" => null, | |
"percentage"=> null, | |
"sequence"=> substr($sequence, $i, $windowSize) | |
); | |
$G_count = intval(substr_count($window['sequence'], "G")); | |
$C_count = intval(substr_count($window['sequence'], "C")); | |
//$A_count = intval(substr_count($window['sequence'], "A")); | |
//$T_count = intval(substr_count($window['sequence'], "T")); | |
//Calculate the expected as (number of C's * number of G's)/windowSize | |
//Calculate the percentage GC as (numbers of C's + number of G's)/windowSize; | |
//substr_count is case sensitive so change haystack to uppercase; | |
$window['sequence'] = strtoupper($window['sequence']); | |
$window['expected'] = ( $C_count * $G_count) / $windowSize; | |
$window['percentage'] = ( ($C_count + $G_count) / $windowSize ) * 100; | |
//Calculate the observed as the number of C's directly followed by a G; | |
$window['observed'] = intval(substr_count($window['sequence'], "CG")); | |
$window['observed/expected'] = $window['observed']/$window['expected']; | |
$windows[] = $window; | |
endfor; | |
//var_dump($windows); | |
//Now walk through all windows and find 10 consecutive windows or a span of 200 nucleotides | |
//with min GC content of 50%, and minObservedExpected of 0.6 | |
print("start\tstop\tobserved\texptected\tpercentage\tobserved/expected\n"); | |
foreach($windows as $window): | |
$size = strlen($window['sequence']); | |
print("{$window['start']}\t{$window['stop']}\t{$window['observed']}\t{$window['expected']}\t{$window['percentage']}\t{$window['observed/expected']}\t{$size}\n"); | |
endforeach; | |
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment