@clarle
Created November 27, 2012 14:18
Algorithm to find a target consensus motif in FASTA sequences
import re
import operator
from math import sqrt
from itertools import product

ALPHABET = ["A", "C", "G", "T", "[AC]", "[AG]",
            "[AT]", "[CG]", "[CT]", "[GT]", "[ACGT]"]

NUC_PROBS = {
    "A": 0.25,
    "T": 0.25,
    "C": 0.25,
    "G": 0.25
}


def z_score(x, mean, std):
    """
    Shortcut for calculating the z-score.

    Parameters:
    x    -- the observed raw count
    mean -- the mean of the count for the sample
    std  -- the standard deviation for the sample
    """
    return (x - mean) / float(std)
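
# Illustrative example: with 30 observed matches, an expected count of 20 and
# std = sqrt(20) ~= 4.47, z_score(30, 20.0, sqrt(20.0)) ~= 2.24.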
def sequence_mean(sequence, length, n):
    """
    Determine the expected number of occurrences of a particular nucleotide
    motif over n sequences of equal length.

    Parameters:
    sequence -- a tuple of alphabet symbols (single nucleotides or bracketed
                classes), i.e. a key from generate_regexes
    length   -- the length of each of the sequences
    n        -- the total number of promoter sequences
    """
    prob = 1
    for char in sequence:
        next_prob = 0
        if len(char) != 1:
            # Strip brackets from the character class, e.g. "[AG]" -> "AG"
            char = char[1:-1]
            # Probability of matching any nucleotide in the class
            next_prob = sum(map(lambda x: NUC_PROBS[x], char))
        else:
            # A single, fully specified nucleotide
            next_prob = NUC_PROBS[char]
        prob *= next_prob
    return (length - len(sequence) + 1) * n * prob
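
# Illustrative example: for the pattern ("A", "C", "G", "T", "[AG]", "[ACGT]")
# the per-position match probability is 0.25**4 * 0.5 * 1.0 = 0.001953125, so
# over 10 sequences of length 100 the expected count is
# (100 - 6 + 1) * 10 * 0.001953125 ~= 1.86.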
def sequence_approx_std(mean):
    """
    Approximation of the standard deviation of the target motif count.

    Parameters:
    mean -- expected count of the target motif in randomized sequences
    """
    return sqrt(mean)
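
# The sqrt(mean) shortcut amounts to treating the motif count as roughly
# Poisson-distributed under the uniform background model, where the variance
# equals the mean.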
def generate_regexes(length):
    """
    Generates a dictionary of sequence pattern regexes of a desired length.

    Parameters:
    length -- the target length of the sequence
    """
    # Dictionary of patterns to compiled regexes
    regexes = {}
    # Iterate through the entire product set of the alphabet to create regexes
    for key in product(ALPHABET, repeat=length):
        pattern = "".join(key)
        # Allow for overlapping sequences
        regexes[key] = re.compile("(?=(" + pattern + "))")
    return regexes
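
# With the 11-symbol alphabet and length 6 this builds 11**6 = 1,771,561
# compiled regexes; for example, the key ("A", "[AG]", "T", "[ACGT]", "C", "G")
# maps to the lookahead pattern "(?=(A[AG]T[ACGT]CG))".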
def filter_sequences(data):
    """
    Processes FASTA file data to strip out unnecessary gene information.

    Parameters:
    data -- raw FASTA data as an array of strings
    """
    data = filter(lambda s: not s.startswith(">"), data)
    return map(lambda s: s.strip("\n"), data)
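
# Illustrative example: for the lines [">gene1\n", "ACGTAC\n", ">gene2\n",
# "TTGACA\n"] this returns ["ACGTAC", "TTGACA"] (headers dropped, newlines
# stripped).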
def sequence_matches(sequences, regex):
    """
    Returns the number of sequence matches with the given regex.

    Parameters:
    sequences -- array of sequences from the FASTA data
    regex     -- regular expression for the target sequence
    """
    matches = 0
    for sequence in sequences:
        matches += len(regex.findall(sequence))
    return matches
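
# Because each regex is a zero-width lookahead around a capturing group,
# overlapping occurrences are counted: "(?=(ATAT))" applied to "ATATAT"
# yields 2 matches (at offsets 0 and 2).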
def main():
    f_pos = open("promoterPositive.fa")
    f_neg = open("promoterNegative.fa")
    data_pos = f_pos.readlines()
    data_neg = f_neg.readlines()
    f_pos.close()
    f_neg.close()

    # Calculate positive promoter data
    sequences_p = filter_sequences(data_pos)
    sequence_number_p = len(sequences_p)
    sequence_length_p = len(sequences_p[0])

    # Calculate negative promoter data
    sequences_n = filter_sequences(data_neg)
    sequence_number_n = len(sequences_n)
    sequence_length_n = len(sequences_n[0])

    motif_patterns = generate_regexes(6)
    results = {}

    for pattern in motif_patterns:
        # Calculate values for the positive promoters
        mean_p = sequence_mean(pattern, sequence_length_p, sequence_number_p)
        std_p = sequence_approx_std(mean_p)
        matches_p = sequence_matches(sequences_p, motif_patterns[pattern])

        # Calculate values for the negative promoters
        mean_n = sequence_mean(pattern, sequence_length_n, sequence_number_n)
        std_n = sequence_approx_std(mean_n)
        matches_n = sequence_matches(sequences_n, motif_patterns[pattern])

        # Calculate the z-score for both
        z_p = z_score(matches_p, mean_p, std_p)
        z_n = z_score(matches_n, mean_n, std_n)

        # Take the difference of the z-scores to calculate the final ranking
        z = z_p - z_n

        # Keep motifs enriched in positive promoters and depleted in negative
        # ones; this also filters out large nucleotide repeats that are
        # over-represented in both sets
        if z > 4 and z_p > 0 and z_n < 0:
            results[pattern] = z

    print sorted(results.iteritems(), key=operator.itemgetter(1), reverse=True)


if __name__ == '__main__':
    main()
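
# Usage sketch (the FASTA layout shown below is an assumption, not part of the
# gist): the script expects "promoterPositive.fa" and "promoterNegative.fa" in
# the working directory, with each sequence on a single line after its header,
# e.g.
#
#   >promoter_1
#   ACGTTGCAATTGACGT
#   >promoter_2
#   TTGACAGTACGTCCGT
#
# Running it under Python 2 (it uses the print statement and dict.iteritems)
# prints the degenerate 6-mers ranked by the difference between their
# positive- and negative-set z-scores.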