Created
September 3, 2019 16:52
-
-
Save vestalisvirginis/44362d695c1d1d65ab91eeb733607a6a to your computer and use it in GitHub Desktop.
Randomized Motif Search
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from functional import seq | |
from collections import deque, defaultdict | |
import time | |
def RandomNumberGenerator(dna, k): | |
#return an array (0,t) of random integers, than can be used as starting point for selecting random k-mer along dna strings | |
return np.random.randint(0, DnaToMatrix(dna).shape[1]+1-k, size = (DnaToMatrix(dna).shape[0], 1)) | |
def DnaToMatrix(dna): | |
#return a matrix of the list of dna strand | |
dna_matrix = deque() | |
for i in dna: | |
dna_matrix.append(list(i)) | |
return np.array(dna_matrix) | |
def RandomMotif(dna, k): | |
#return a matrix of randomely selected k-mer per dna strings | |
a = deque() | |
for pos, l in zip( range(RandomNumberGenerator(dna, k).shape[0]), RandomNumberGenerator(dna, k)): | |
a.append(list(DnaToMatrix(dna)[pos,int(l):int(l)+k])) | |
return np.array(a) | |
def A_count(l, matrix_init): | |
matrix_init[0,l] +=1 | |
return matrix_init | |
def C_count(l, matrix_init): | |
matrix_init[1,l] +=1 | |
return matrix_init | |
def G_count(l, matrix_init): | |
matrix_init[2,l] +=1 | |
return matrix_init | |
def T_count(l, matrix_init): | |
matrix_init[3,l] +=1 | |
return matrix_init | |
def NTP_count(n, l, matrix_init): | |
switcher = { | |
'A': A_count, | |
'C': C_count, | |
'G': G_count, | |
'T': T_count, | |
} | |
# Get the function from switcher dictionary | |
func = switcher.get(n, lambda: "Invalid nucleotide") | |
# Execute the function | |
return func(l, matrix_init) | |
def count_matrix(dna, k): | |
# Matrix default setting --> row 0 : 'A', row '1' : 'C', row 2 : 'G', row '3' : 'T' | |
NTP = DnaToMatrix('ACGT') | |
matrix_count = np.zeros((4,k)) | |
dna_matrix = DnaToMatrix(dna) | |
for l in range(k): | |
for t in range(len(dna)): | |
matrix_count = NTP_count(dna_matrix[t,l], l, matrix_count) | |
return matrix_count | |
def profile_matrix(dna, k): | |
# Matrix default setting --> row 0 : 'A', row '1' : 'C', row 2 : 'G', row '3' : 'T' | |
return count_matrix(dna, k)/(len(dna)) | |
def KmerPerDnaStrings(dna, k): | |
#return the k-mers along a given dna string, for all strings in dna | |
kmers_dict = defaultdict(list) | |
for t in range(DnaToMatrix(dna).shape[0]): | |
c = deque() | |
for l in range(DnaToMatrix(dna).shape[1]+1-k): | |
c.append(DnaToMatrix(dna)[t,l:l+k]) | |
kmers_dict[t] = c | |
return kmers_dict | |
def StringToProbability(substring, k, profile): | |
dict_letter = defaultdict(np.ndarray) | |
dict_letter['A'] = np.array([1, 0, 0, 0]) | |
dict_letter['C'] = np.array([0, 1, 0, 0]) | |
dict_letter['G'] = np.array([0, 0, 1, 0]) | |
dict_letter['T'] = np.array([0, 0, 0, 1]) | |
sub_string_matrix = deque() | |
for i in substring: | |
sub_string_matrix.append(dict_letter[i]) | |
mask = np.logical_and(np.array([*sub_string_matrix]).transpose(), np.array([*sub_string_matrix]).transpose()) | |
prob = profile[mask] | |
return np.prod(prob) | |
def MostProbableKmerMotif(dna,k, profile): | |
#return dictionary containing {motif:probability} for the motif scoring the highest probability for each dna string | |
max_probability = defaultdict(float) | |
for i in KmerPerDnaStrings(dna, k).values(): | |
list_kmer = np.array(list(i)).tolist() | |
p_motif = defaultdict(float) | |
p_motif.default_factory = lambda : 1.0 | |
for j in list_kmer: | |
p_motif[''.join(j)] = StringToProbability(j, k, profile) | |
max_probability[max(p_motif,key=p_motif.get)] = max(p_motif.values()) | |
return max_probability | |
def score_motifs(dna, k, letter_count): | |
score = 0 | |
for i in range(k): | |
max_id_letter = max(letter_count[0][i], letter_count[1][i], letter_count[2][i], letter_count[3][i]) | |
score += (len(dna)-max_id_letter) | |
return score | |
def score_motifs(dna, k, letter_count): | |
score = 0 | |
for i in range(k): | |
max_id_letter = max(letter_count[0][i], letter_count[1][i], letter_count[2][i], letter_count[3][i]) | |
score += (len(dna)-max_id_letter) | |
return score | |
def output(result): | |
_j = lambda x: ''.join(list(x)) | |
return print(*seq(result.tolist()).map(_j)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment