ACEnglish/tr_roller.py

## tr_roller.py
"""
We have a tandem repeat motif M of length N.
This motif is repeated C times which creates a sequence S of length L = C * N

We can choose any subsequence B of S at position p:p+N between 0:L-N.
This B is a 'rolled' representation of M.

We can 'unroll' B such that uB == M with the operation

B = S[p:p+N]
f = p % N
uB = B[-f:] + B[:-f]
uB == M

In practice, motifs my have mismatches or indels. Therefore, it isn't
always the case that uB == M. However, we can look at the sequence similarity of uB vs M and
measure two motifs' similarity.

Note that we use -f for slicing since the position of B is 'after' the start of S.
If we were to discover a tandem repeat which starts before S, we would need to use f or switch B
and M to compare
"""
import edlib
import random
import truvari

def create_tandem_repeat(N=5, C=4):
    """
    Makes a random motif and repeats it
    """
    nucs = ["A", "T", "C", "G"]
    motif = "".join(random.choices(nucs, k=N))
    sequence = motif * C
    return motif, sequence

def makeB(S, M):
    """
    Make a random subsequence B from S
    returns B and the chosen position in S
    """
    start = 0
    # p at the beginning of the motif M is boring
    while start % len(M) == 0:
        start = random.randint(1, len(S) - len(M))

def unroll(B, M, p):
    """
    Unroll B and compare to M
    """
    f = p % len(M)
    uB = B[-f:] + B[:-f]
    return uB

def test():
    """
    Best case simulation
    """
    print('random test')
    M, S = create_tandem_repeat()
    print('S:', S)
    print('M:', M)
    B, p = makeB(S, M)
    print('B:', B)
    print('p:', p)
    uB = unroll(B, M, p)
    print('uB:', uB)
    print('uB == M:', uB == M)

def similarity_report(seqa, seqb):
    """
    pretty printing
    """
    print(f'lenA: {len(seqa)}  -  lenB: {len(seqb)}')
    seqsim = truvari.seqsim(seqa, seqb)
    print(f'sequence similarity: {seqsim:.3f}')
    print('alignment:')
    aln = edlib.align(seqa, seqb, task='path')
    d = edlib.getNiceAlignment(aln, seqa, seqb)
    for key in d:
        print(key + '\t' + d[key])

def real_sequence():
    """
    A real(ish) example showing this pretty well for unidentical motifs
    """
    print('real example')
    M = "TATATGTATGTATACAATACACACACATATAACA"
    B = "TGTATGTATACAATACAACACATATAACTATA"
    p = 4 # derived from genomic coordinates 10577401 (S start) and 10577405 (S' start)
    uB = unroll(B, M, p)
    print('-=- before unrolling -=-')
    similarity_report(M, B)

    print('-=- after unrolling -=-')
    similarity_report(M, uB)

if __name__ == '__main__':
    test()
    print('-=-' * 10)
    real_sequence()
	"""
	We have a tandem repeat motif M of length N.
	This motif is repeated C times which creates a sequence S of length L = C * N

	We can choose any subsequence B of S at position p:p+N between 0:L-N.
	This B is a 'rolled' representation of M.

	We can 'unroll' B such that uB == M with the operation

	B = S[p:p+N]
	f = p % N
	uB = B[-f:] + B[:-f]
	uB == M

	In practice, motifs my have mismatches or indels. Therefore, it isn't
	always the case that uB == M. However, we can look at the sequence similarity of uB vs M and
	measure two motifs' similarity.

	Note that we use -f for slicing since the position of B is 'after' the start of S.
	If we were to discover a tandem repeat which starts before S, we would need to use f or switch B
	and M to compare
	"""
	import edlib
	import random
	import truvari

	def create_tandem_repeat(N=5, C=4):
	"""
	Makes a random motif and repeats it
	"""
	nucs = ["A", "T", "C", "G"]
	motif = "".join(random.choices(nucs, k=N))
	sequence = motif * C
	return motif, sequence

	def makeB(S, M):
	"""
	Make a random subsequence B from S
	returns B and the chosen position in S
	"""
	start = 0
	# p at the beginning of the motif M is boring
	while start % len(M) == 0:
	start = random.randint(1, len(S) - len(M))

	def unroll(B, M, p):
	"""
	Unroll B and compare to M
	"""
	f = p % len(M)
	uB = B[-f:] + B[:-f]
	return uB

	def test():
	"""
	Best case simulation
	"""
	print('random test')
	M, S = create_tandem_repeat()
	print('S:', S)
	print('M:', M)
	B, p = makeB(S, M)
	print('B:', B)
	print('p:', p)
	uB = unroll(B, M, p)
	print('uB:', uB)
	print('uB == M:', uB == M)

	def similarity_report(seqa, seqb):
	"""
	pretty printing
	"""
	print(f'lenA: {len(seqa)} - lenB: {len(seqb)}')
	seqsim = truvari.seqsim(seqa, seqb)
	print(f'sequence similarity: {seqsim:.3f}')
	print('alignment:')
	aln = edlib.align(seqa, seqb, task='path')
	d = edlib.getNiceAlignment(aln, seqa, seqb)
	for key in d:
	print(key + '\t' + d[key])

	def real_sequence():
	"""
	A real(ish) example showing this pretty well for unidentical motifs
	"""
	print('real example')
	M = "TATATGTATGTATACAATACACACACATATAACA"
	B = "TGTATGTATACAATACAACACATATAACTATA"
	p = 4 # derived from genomic coordinates 10577401 (S start) and 10577405 (S' start)
	uB = unroll(B, M, p)
	print('-=- before unrolling -=-')
	similarity_report(M, B)

	print('-=- after unrolling -=-')
	similarity_report(M, uB)

	if __name__ == '__main__':
	test()
	print('-=-' * 10)
	real_sequence()