clarle/consensus.py

## consensus.py
import re
import operator
from math import sqrt
from itertools import product

ALPHABET = ["A", "C", "G", "T", "[AC]", "[AG]",
            "[AT]", "[CG]", "[CT]", "[GT]", "[ACGT]"]

NUC_PROBS = {
    "A": 0.25,
    "T": 0.25,
    "C": 0.25,
    "G": 0.25
}

def z_score(x, mean, std):
    """
    Shortcut for calculating the z-score.

    Parameters:
        x    -- the determined raw score of the parameter
        mean -- the mean of the parameter for the sample
        std  -- the standard deviation for the sample
    """

    return (x - mean) / float(std)

def sequence_mean(sequence, length, n):
    """
    Determine the expected value of obtaining a particular nucleotide motif,
    over n sequences of equal length.

    Parameters:
        sequence -- A tuple of nucleotide characters (from generate_regexes)
        length   -- The length of each of the sequences
        n        -- The number of total promoter sequences
    """

    prob = 1
    for char in sequence:
        next_prob = 0
        if len(char) != 1:
            # Strip brackets from regex
            char = char[1:-1]

            # Calculate probability sum
            next_prob = sum(map(lambda x: NUC_PROBS[x], char))
        else:
            # We only have one nucleotide
            next_prob = NUC_PROBS[char]

        prob *= next_prob

    return (length - len(sequence) + 1) * n * prob

def sequence_approx_std(mean):
    """
    Approximation of the standard deviation of the target motif.

    Parameters:
        mean -- Expected value of the target motif in randomized sequences
    """

    return sqrt(mean)

def generate_regexes(length):
    """
    Generates a dictionary of sequence pattern regexes of a desired length.

    Parameters:
        length -- the target length of the sequence
    """

    # Dictionary of patterns to compiled regexes
    regexes = {}

    # Iterate through the entire product set of the alphabet to create regexes
    for key in product(ALPHABET, repeat=length):
        pattern = "".join(key)

        # Allow for overlapping sequences
        regexes[key] = re.compile("(?=(" + pattern + "))")

    return regexes

def filter_sequences(data):
    """
    Processes FASTA file data to strip out unnecessary gene information.

    Parameters:
        data -- Raw FASTA data as an array of strings
    """

    data = filter(lambda s: not s.startswith(">"), data)
    return map(lambda s: s.strip("\n"), data)

def sequence_matches(sequences, regex):
    """
    Returns the number of sequence matches with the given regex.

    Parameters:
        sequences -- array of sequences from the FASTA data
        regex     -- regular expression for the target sequence
    """

    matches = 0
    for sequence in sequences:
        matches += len(regex.findall(sequence))

    return matches

def main():
    f_pos = open("promoterPositive.fa")
    f_neg = open("promoterNegative.fa")

    data_pos = f_pos.readlines()
    data_neg = f_neg.readlines()

    # Calculate positive promoter data
    sequences_p = filter_sequences(data_pos)
    sequence_number_p = len(sequences_p)
    sequence_length_p = len(sequences_p[0])

    # Calculate negative promoter data
    sequences_n = filter_sequences(data_neg)
    sequence_number_n = len(sequences_n)
    sequence_length_n = len(sequences_n[0])

    motif_patterns = generate_regexes(6)
    results = {}

    for pattern in motif_patterns:
	# Calculate values for the positive promoters
        mean_p = sequence_mean(pattern, sequence_length_p, sequence_number_p)
        std_p = sequence_approx_std(mean_p)
        matches_p = sequence_matches(sequences_p, motif_patterns[pattern])

	# Calculate values for the negative promoters
        mean_n = sequence_mean(pattern, sequence_length_n, sequence_number_n)
        std_n = sequence_approx_std(mean_n)
        matches_n = sequence_matches(sequences_n, motif_patterns[pattern])

	# Calculate the z-score for both
        z_p = z_score(matches_p, mean_p, std_p)
        z_n = z_score(matches_n, mean_n, std_n)

	# Take the difference of the z-scores to calculate final ranking
        z = z_p - z_n

	# Special filter to prevent large nucleotide repeats
        if z > 4 and z_p > 0 and z_n < 0:
            results[pattern] = z

    print sorted(results.iteritems(), key=operator.itemgetter(1), reverse=True)

if __name__ == '__main__':
    main()
	import re
	import operator
	from math import sqrt
	from itertools import product

	ALPHABET = ["A", "C", "G", "T", "[AC]", "[AG]",
	"[AT]", "[CG]", "[CT]", "[GT]", "[ACGT]"]

	NUC_PROBS = {
	"A": 0.25,
	"T": 0.25,
	"C": 0.25,
	"G": 0.25
	}

	def z_score(x, mean, std):
	"""
	Shortcut for calculating the z-score.

	Parameters:
	x -- the determined raw score of the parameter
	mean -- the mean of the parameter for the sample
	std -- the standard deviation for the sample
	"""

	return (x - mean) / float(std)

	def sequence_mean(sequence, length, n):
	"""
	Determine the expected value of obtaining a particular nucleotide motif,
	over n sequences of equal length.

	Parameters:
	sequence -- A tuple of nucleotide characters (from generate_regexes)
	length -- The length of each of the sequences
	n -- The number of total promoter sequences
	"""

	prob = 1
	for char in sequence:
	next_prob = 0
	if len(char) != 1:
	# Strip brackets from regex
	char = char[1:-1]

	# Calculate probability sum
	next_prob = sum(map(lambda x: NUC_PROBS[x], char))
	else:
	# We only have one nucleotide
	next_prob = NUC_PROBS[char]

	prob *= next_prob

	return (length - len(sequence) + 1) * n * prob

	def sequence_approx_std(mean):
	"""
	Approximation of the standard deviation of the target motif.

	Parameters:
	mean -- Expected value of the target motif in randomized sequences
	"""

	return sqrt(mean)

	def generate_regexes(length):
	"""
	Generates a dictionary of sequence pattern regexes of a desired length.

	Parameters:
	length -- the target length of the sequence
	"""

	# Dictionary of patterns to compiled regexes
	regexes = {}

	# Iterate through the entire product set of the alphabet to create regexes
	for key in product(ALPHABET, repeat=length):
	pattern = "".join(key)

	# Allow for overlapping sequences
	regexes[key] = re.compile("(?=(" + pattern + "))")

	return regexes

	def filter_sequences(data):
	"""
	Processes FASTA file data to strip out unnecessary gene information.

	Parameters:
	data -- Raw FASTA data as an array of strings
	"""

	data = filter(lambda s: not s.startswith(">"), data)
	return map(lambda s: s.strip("\n"), data)

	def sequence_matches(sequences, regex):
	"""
	Returns the number of sequence matches with the given regex.

	Parameters:
	sequences -- array of sequences from the FASTA data
	regex -- regular expression for the target sequence
	"""

	matches = 0
	for sequence in sequences:
	matches += len(regex.findall(sequence))

	return matches

	def main():
	f_pos = open("promoterPositive.fa")
	f_neg = open("promoterNegative.fa")

	data_pos = f_pos.readlines()
	data_neg = f_neg.readlines()

	# Calculate positive promoter data
	sequences_p = filter_sequences(data_pos)
	sequence_number_p = len(sequences_p)
	sequence_length_p = len(sequences_p[0])

	# Calculate negative promoter data
	sequences_n = filter_sequences(data_neg)
	sequence_number_n = len(sequences_n)
	sequence_length_n = len(sequences_n[0])

	motif_patterns = generate_regexes(6)
	results = {}

	for pattern in motif_patterns:
	# Calculate values for the positive promoters
	mean_p = sequence_mean(pattern, sequence_length_p, sequence_number_p)
	std_p = sequence_approx_std(mean_p)
	matches_p = sequence_matches(sequences_p, motif_patterns[pattern])

	# Calculate values for the negative promoters
	mean_n = sequence_mean(pattern, sequence_length_n, sequence_number_n)
	std_n = sequence_approx_std(mean_n)
	matches_n = sequence_matches(sequences_n, motif_patterns[pattern])

	# Calculate the z-score for both
	z_p = z_score(matches_p, mean_p, std_p)
	z_n = z_score(matches_n, mean_n, std_n)

	# Take the difference of the z-scores to calculate final ranking
	z = z_p - z_n

	# Special filter to prevent large nucleotide repeats
	if z > 4 and z_p > 0 and z_n < 0:
	results[pattern] = z

	print sorted(results.iteritems(), key=operator.itemgetter(1), reverse=True)

	if __name__ == '__main__':
	main()