Created
November 27, 2012 14:18
-
-
Save clarle/4154435 to your computer and use it in GitHub Desktop.
Algorithm to find a target consensus motif in FASTA sequences
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import operator | |
from math import sqrt | |
from itertools import product | |
ALPHABET = ["A", "C", "G", "T", "[AC]", "[AG]", | |
"[AT]", "[CG]", "[CT]", "[GT]", "[ACGT]"] | |
NUC_PROBS = { | |
"A": 0.25, | |
"T": 0.25, | |
"C": 0.25, | |
"G": 0.25 | |
} | |
def z_score(x, mean, std): | |
""" | |
Shortcut for calculating the z-score. | |
Parameters: | |
x -- the determined raw score of the parameter | |
mean -- the mean of the parameter for the sample | |
std -- the standard deviation for the sample | |
""" | |
return (x - mean) / float(std) | |
def sequence_mean(sequence, length, n): | |
""" | |
Determine the expected value of obtaining a particular nucleotide motif, | |
over n sequences of equal length. | |
Parameters: | |
sequence -- A tuple of nucleotide characters (from generate_regexes) | |
length -- The length of each of the sequences | |
n -- The number of total promoter sequences | |
""" | |
prob = 1 | |
for char in sequence: | |
next_prob = 0 | |
if len(char) != 1: | |
# Strip brackets from regex | |
char = char[1:-1] | |
# Calculate probability sum | |
next_prob = sum(map(lambda x: NUC_PROBS[x], char)) | |
else: | |
# We only have one nucleotide | |
next_prob = NUC_PROBS[char] | |
prob *= next_prob | |
return (length - len(sequence) + 1) * n * prob | |
def sequence_approx_std(mean): | |
""" | |
Approximation of the standard deviation of the target motif. | |
Parameters: | |
mean -- Expected value of the target motif in randomized sequences | |
""" | |
return sqrt(mean) | |
def generate_regexes(length): | |
""" | |
Generates a dictionary of sequence pattern regexes of a desired length. | |
Parameters: | |
length -- the target length of the sequence | |
""" | |
# Dictionary of patterns to compiled regexes | |
regexes = {} | |
# Iterate through the entire product set of the alphabet to create regexes | |
for key in product(ALPHABET, repeat=length): | |
pattern = "".join(key) | |
# Allow for overlapping sequences | |
regexes[key] = re.compile("(?=(" + pattern + "))") | |
return regexes | |
def filter_sequences(data): | |
""" | |
Processes FASTA file data to strip out unnecessary gene information. | |
Parameters: | |
data -- Raw FASTA data as an array of strings | |
""" | |
data = filter(lambda s: not s.startswith(">"), data) | |
return map(lambda s: s.strip("\n"), data) | |
def sequence_matches(sequences, regex): | |
""" | |
Returns the number of sequence matches with the given regex. | |
Parameters: | |
sequences -- array of sequences from the FASTA data | |
regex -- regular expression for the target sequence | |
""" | |
matches = 0 | |
for sequence in sequences: | |
matches += len(regex.findall(sequence)) | |
return matches | |
def main(): | |
f_pos = open("promoterPositive.fa") | |
f_neg = open("promoterNegative.fa") | |
data_pos = f_pos.readlines() | |
data_neg = f_neg.readlines() | |
# Calculate positive promoter data | |
sequences_p = filter_sequences(data_pos) | |
sequence_number_p = len(sequences_p) | |
sequence_length_p = len(sequences_p[0]) | |
# Calculate negative promoter data | |
sequences_n = filter_sequences(data_neg) | |
sequence_number_n = len(sequences_n) | |
sequence_length_n = len(sequences_n[0]) | |
motif_patterns = generate_regexes(6) | |
results = {} | |
for pattern in motif_patterns: | |
# Calculate values for the positive promoters | |
mean_p = sequence_mean(pattern, sequence_length_p, sequence_number_p) | |
std_p = sequence_approx_std(mean_p) | |
matches_p = sequence_matches(sequences_p, motif_patterns[pattern]) | |
# Calculate values for the negative promoters | |
mean_n = sequence_mean(pattern, sequence_length_n, sequence_number_n) | |
std_n = sequence_approx_std(mean_n) | |
matches_n = sequence_matches(sequences_n, motif_patterns[pattern]) | |
# Calculate the z-score for both | |
z_p = z_score(matches_p, mean_p, std_p) | |
z_n = z_score(matches_n, mean_n, std_n) | |
# Take the difference of the z-scores to calculate final ranking | |
z = z_p - z_n | |
# Special filter to prevent large nucleotide repeats | |
if z > 4 and z_p > 0 and z_n < 0: | |
results[pattern] = z | |
print sorted(results.iteritems(), key=operator.itemgetter(1), reverse=True) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment