Last active
August 20, 2022 21:06
-
-
Save ACEnglish/1e7421c46ee10c71bee4c03982e5df6c to your computer and use it in GitHub Desktop.
Tandem Repeat Motif Roller
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
We have a tandem repeat motif M of length N. | |
This motif is repeated C times which creates a sequence S of length L = C * N | |
We can choose any subsequence B of S at position p:p+N between 0:L-N. | |
This B is a 'rolled' representation of M. | |
We can 'unroll' B such that uB == M with the operation | |
B = S[p:p+N] | |
f = p % N | |
uB = B[-f:] + B[:-f] | |
uB == M | |
In practice, motifs my have mismatches or indels. Therefore, it isn't | |
always the case that uB == M. However, we can look at the sequence similarity of uB vs M and | |
measure two motifs' similarity. | |
Note that we use -f for slicing since the position of B is 'after' the start of S. | |
If we were to discover a tandem repeat which starts before S, we would need to use f or switch B | |
and M to compare | |
""" | |
import edlib | |
import random | |
import truvari | |
def create_tandem_repeat(N=5, C=4): | |
""" | |
Makes a random motif and repeats it | |
""" | |
nucs = ["A", "T", "C", "G"] | |
motif = "".join(random.choices(nucs, k=N)) | |
sequence = motif * C | |
return motif, sequence | |
def makeB(S, M): | |
""" | |
Make a random subsequence B from S | |
returns B and the chosen position in S | |
""" | |
start = 0 | |
# p at the beginning of the motif M is boring | |
while start % len(M) == 0: | |
start = random.randint(1, len(S) - len(M)) | |
def unroll(B, M, p): | |
""" | |
Unroll B and compare to M | |
""" | |
f = p % len(M) | |
uB = B[-f:] + B[:-f] | |
return uB | |
def test(): | |
""" | |
Best case simulation | |
""" | |
print('random test') | |
M, S = create_tandem_repeat() | |
print('S:', S) | |
print('M:', M) | |
B, p = makeB(S, M) | |
print('B:', B) | |
print('p:', p) | |
uB = unroll(B, M, p) | |
print('uB:', uB) | |
print('uB == M:', uB == M) | |
def similarity_report(seqa, seqb): | |
""" | |
pretty printing | |
""" | |
print(f'lenA: {len(seqa)} - lenB: {len(seqb)}') | |
seqsim = truvari.seqsim(seqa, seqb) | |
print(f'sequence similarity: {seqsim:.3f}') | |
print('alignment:') | |
aln = edlib.align(seqa, seqb, task='path') | |
d = edlib.getNiceAlignment(aln, seqa, seqb) | |
for key in d: | |
print(key + '\t' + d[key]) | |
def real_sequence(): | |
""" | |
A real(ish) example showing this pretty well for unidentical motifs | |
""" | |
print('real example') | |
M = "TATATGTATGTATACAATACACACACATATAACA" | |
B = "TGTATGTATACAATACAACACATATAACTATA" | |
p = 4 # derived from genomic coordinates 10577401 (S start) and 10577405 (S' start) | |
uB = unroll(B, M, p) | |
print('-=- before unrolling -=-') | |
similarity_report(M, B) | |
print('-=- after unrolling -=-') | |
similarity_report(M, uB) | |
if __name__ == '__main__': | |
test() | |
print('-=-' * 10) | |
real_sequence() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example Output: