Skip to content

Instantly share code, notes, and snippets.

@vestalisvirginis
Created September 3, 2019 16:52
Show Gist options
  • Save vestalisvirginis/44362d695c1d1d65ab91eeb733607a6a to your computer and use it in GitHub Desktop.
Save vestalisvirginis/44362d695c1d1d65ab91eeb733607a6a to your computer and use it in GitHub Desktop.
Randomized Motif Search
import numpy as np
from functional import seq
from collections import deque, defaultdict
import time
def RandomNumberGenerator(dna, k):
#return an array (0,t) of random integers, than can be used as starting point for selecting random k-mer along dna strings
return np.random.randint(0, DnaToMatrix(dna).shape[1]+1-k, size = (DnaToMatrix(dna).shape[0], 1))
def DnaToMatrix(dna):
#return a matrix of the list of dna strand
dna_matrix = deque()
for i in dna:
dna_matrix.append(list(i))
return np.array(dna_matrix)
def RandomMotif(dna, k):
#return a matrix of randomely selected k-mer per dna strings
a = deque()
for pos, l in zip( range(RandomNumberGenerator(dna, k).shape[0]), RandomNumberGenerator(dna, k)):
a.append(list(DnaToMatrix(dna)[pos,int(l):int(l)+k]))
return np.array(a)
def A_count(l, matrix_init):
matrix_init[0,l] +=1
return matrix_init
def C_count(l, matrix_init):
matrix_init[1,l] +=1
return matrix_init
def G_count(l, matrix_init):
matrix_init[2,l] +=1
return matrix_init
def T_count(l, matrix_init):
matrix_init[3,l] +=1
return matrix_init
def NTP_count(n, l, matrix_init):
switcher = {
'A': A_count,
'C': C_count,
'G': G_count,
'T': T_count,
}
# Get the function from switcher dictionary
func = switcher.get(n, lambda: "Invalid nucleotide")
# Execute the function
return func(l, matrix_init)
def count_matrix(dna, k):
# Matrix default setting --> row 0 : 'A', row '1' : 'C', row 2 : 'G', row '3' : 'T'
NTP = DnaToMatrix('ACGT')
matrix_count = np.zeros((4,k))
dna_matrix = DnaToMatrix(dna)
for l in range(k):
for t in range(len(dna)):
matrix_count = NTP_count(dna_matrix[t,l], l, matrix_count)
return matrix_count
def profile_matrix(dna, k):
# Matrix default setting --> row 0 : 'A', row '1' : 'C', row 2 : 'G', row '3' : 'T'
return count_matrix(dna, k)/(len(dna))
def KmerPerDnaStrings(dna, k):
#return the k-mers along a given dna string, for all strings in dna
kmers_dict = defaultdict(list)
for t in range(DnaToMatrix(dna).shape[0]):
c = deque()
for l in range(DnaToMatrix(dna).shape[1]+1-k):
c.append(DnaToMatrix(dna)[t,l:l+k])
kmers_dict[t] = c
return kmers_dict
def StringToProbability(substring, k, profile):
dict_letter = defaultdict(np.ndarray)
dict_letter['A'] = np.array([1, 0, 0, 0])
dict_letter['C'] = np.array([0, 1, 0, 0])
dict_letter['G'] = np.array([0, 0, 1, 0])
dict_letter['T'] = np.array([0, 0, 0, 1])
sub_string_matrix = deque()
for i in substring:
sub_string_matrix.append(dict_letter[i])
mask = np.logical_and(np.array([*sub_string_matrix]).transpose(), np.array([*sub_string_matrix]).transpose())
prob = profile[mask]
return np.prod(prob)
def MostProbableKmerMotif(dna,k, profile):
#return dictionary containing {motif:probability} for the motif scoring the highest probability for each dna string
max_probability = defaultdict(float)
for i in KmerPerDnaStrings(dna, k).values():
list_kmer = np.array(list(i)).tolist()
p_motif = defaultdict(float)
p_motif.default_factory = lambda : 1.0
for j in list_kmer:
p_motif[''.join(j)] = StringToProbability(j, k, profile)
max_probability[max(p_motif,key=p_motif.get)] = max(p_motif.values())
return max_probability
def score_motifs(dna, k, letter_count):
score = 0
for i in range(k):
max_id_letter = max(letter_count[0][i], letter_count[1][i], letter_count[2][i], letter_count[3][i])
score += (len(dna)-max_id_letter)
return score
def score_motifs(dna, k, letter_count):
score = 0
for i in range(k):
max_id_letter = max(letter_count[0][i], letter_count[1][i], letter_count[2][i], letter_count[3][i])
score += (len(dna)-max_id_letter)
return score
def output(result):
_j = lambda x: ''.join(list(x))
return print(*seq(result.tolist()).map(_j))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment