Skip to content

Instantly share code, notes, and snippets.

@livingstonef
Last active August 29, 2015 14:03
Show Gist options
  • Save livingstonef/fc7587f277f68c5b7db0 to your computer and use it in GitHub Desktop.
Save livingstonef/fc7587f277f68c5b7db0 to your computer and use it in GitHub Desktop.
CpG Island calculator
<?php
/**
* @author: Livingstone Fultang
* @license: GPL
**/
//create a file name sequence in the same directory and paste your sequence
//in a variable like so $sequence = "AGCTGAGTCGATCGACTGACGTACGTACG"; return $sequence;
require 'sequence.php';
$windowSize = 100;
$minLength = 200;
$minObservedExpected = 0.6;
$minPercentage = 50;
//We need a sequence to deal with;
if(empty($sequence)):
die('No sequence Provided');
endif;
$seqLength = strlen(trim($sequence) );
if($seqLength <= $windowSize || $seqLength < $minLength):
die('Sequence length is either shorter than the window size or min length');
endif;
//split sequence into an array of window sizes; moving +1
$windows = array();
$maximum = $seqLength-$windowSize;
for($i=0; $i<$maximum; $i++):
$window = array(
"start"=>$i+1,
"stop"=>$i+$windowSize,
"observed" => null,
"expected" => null,
"percentage"=> null,
"sequence"=> substr($sequence, $i, $windowSize)
);
$G_count = intval(substr_count($window['sequence'], "G"));
$C_count = intval(substr_count($window['sequence'], "C"));
//$A_count = intval(substr_count($window['sequence'], "A"));
//$T_count = intval(substr_count($window['sequence'], "T"));
//Calculate the expected as (number of C's * number of G's)/windowSize
//Calculate the percentage GC as (numbers of C's + number of G's)/windowSize;
//substr_count is case sensitive so change haystack to uppercase;
$window['sequence'] = strtoupper($window['sequence']);
$window['expected'] = ( $C_count * $G_count) / $windowSize;
$window['percentage'] = ( ($C_count + $G_count) / $windowSize ) * 100;
//Calculate the observed as the number of C's directly followed by a G;
$window['observed'] = intval(substr_count($window['sequence'], "CG"));
$window['observed/expected'] = $window['observed']/$window['expected'];
$windows[] = $window;
endfor;
//var_dump($windows);
//Now walk through all windows and find 10 consecutive windows or a span of 200 nucleotides
//with min GC content of 50%, and minObservedExpected of 0.6
print("start\tstop\tobserved\texptected\tpercentage\tobserved/expected\n");
foreach($windows as $window):
$size = strlen($window['sequence']);
print("{$window['start']}\t{$window['stop']}\t{$window['observed']}\t{$window['expected']}\t{$window['percentage']}\t{$window['observed/expected']}\t{$size}\n");
endforeach;
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment