Skip to content

Instantly share code, notes, and snippets.

@ggirelli
Last active October 14, 2021 14:52
Show Gist options
  • Save ggirelli/5bf8a9a1bf51d90eb07aa6596b6c6ecc to your computer and use it in GitHub Desktop.
Save ggirelli/5bf8a9a1bf51d90eb07aa6596b6c6ecc to your computer and use it in GitHub Desktop.
Align strings (brute force)
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from best_alignment import *\n",
"from typing import Tuple"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"A: GCGCGACCTAGCCCGTACTGT\n",
"B: GAGTTGACC\n",
"\n",
"Matches: 0\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: T slice(20, 21, 1)\n",
"M: \n",
"B: G slice(0, 1, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 1\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: GT slice(19, 21, 1)\n",
"M: | \n",
"B: GA slice(0, 2, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 0\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: TGT slice(18, 21, 1)\n",
"M: \n",
"B: GAG slice(0, 3, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 2\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: CTGT slice(17, 21, 1)\n",
"M: ||\n",
"B: GAGT slice(0, 4, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 1\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: ACTGT slice(16, 21, 1)\n",
"M: |\n",
"B: GAGTT slice(0, 5, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 2\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: TACTGT slice(15, 21, 1)\n",
"M: | | \n",
"B: GAGTTG slice(0, 6, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 3\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: GTACTGT slice(14, 21, 1)\n",
"M: | || \n",
"B: GAGTTGA slice(0, 7, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 0\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: CGTACTGT slice(13, 21, 1)\n",
"M: \n",
"B: GAGTTGAC slice(0, 8, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 2\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: CCGTACTGT slice(12, 21, 1)\n",
"M: || \n",
"B: GAGTTGACC slice(0, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 1\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: CCCGTACTG slice(11, 20, 1)\n",
"M: | \n",
"B: GAGTTGACC slice(0, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 3\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: GCCCGTACT slice(10, 19, 1)\n",
"M: | || \n",
"B: GAGTTGACC slice(0, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 2\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: AGCCCGTAC slice(9, 18, 1)\n",
"M: | |\n",
"B: GAGTTGACC slice(0, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 2\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: TAGCCCGTA slice(8, 17, 1)\n",
"M: || \n",
"B: GAGTTGACC slice(0, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 0\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: CTAGCCCGT slice(7, 16, 1)\n",
"M: \n",
"B: GAGTTGACC slice(0, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 1\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: CCTAGCCCG slice(6, 15, 1)\n",
"M: | \n",
"B: GAGTTGACC slice(0, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 4\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: ACCTAGCCC slice(5, 14, 1)\n",
"M: | | ||\n",
"B: GAGTTGACC slice(0, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 5\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: GACCTAGCC slice(4, 13, 1)\n",
"M: || | ||\n",
"B: GAGTTGACC slice(0, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 2\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: CGACCTAGC slice(3, 12, 1)\n",
"M: | |\n",
"B: GAGTTGACC slice(0, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 2\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: GCGACCTAG slice(2, 11, 1)\n",
"M: | | \n",
"B: GAGTTGACC slice(0, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 0\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: CGCGACCTA slice(1, 10, 1)\n",
"M: \n",
"B: GAGTTGACC slice(0, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 3\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: GCGCGACCT slice(0, 9, 1)\n",
"M: | | | \n",
"B: GAGTTGACC slice(0, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 4\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: GCGCGACC slice(0, 8, 1)\n",
"M: ||||\n",
"B: AGTTGACC slice(1, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 2\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: GCGCGAC slice(0, 7, 1)\n",
"M: | |\n",
"B: GTTGACC slice(2, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 1\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: GCGCGA slice(0, 6, 1)\n",
"M: | \n",
"B: TTGACC slice(3, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 1\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: GCGCG slice(0, 5, 1)\n",
"M: | \n",
"B: TGACC slice(4, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 2\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: GCGC slice(0, 4, 1)\n",
"M: | |\n",
"B: GACC slice(5, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 1\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: GCG slice(0, 3, 1)\n",
"M: | \n",
"B: ACC slice(6, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 1\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: GC slice(0, 2, 1)\n",
"M: |\n",
"B: CC slice(7, 9, 1)\n",
"2: GAGTTGACC\n",
"\n",
"Matches: 0\n",
"1: GCGCGACCTAGCCCGTACTGT\n",
"A: G slice(0, 1, 1)\n",
"M: \n",
"B: C slice(8, 9, 1)\n",
"2: GAGTTGACC\n",
"\n"
]
},
{
"data": {
"text/plain": [
"(5, slice(4, 13, 1), slice(0, 9, 1))"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"best_alignment_visualized('GCGCGACCTAGCCCGTACTGT', 'GAGTTGACC')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"A: GAGTTGACC\n",
"B: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 0\n",
"1: GAGTTGACC\n",
"A: C slice(8, 9, 1)\n",
"M: \n",
"B: G slice(0, 1, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 1\n",
"1: GAGTTGACC\n",
"A: CC slice(7, 9, 1)\n",
"M: |\n",
"B: GC slice(0, 2, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 1\n",
"1: GAGTTGACC\n",
"A: ACC slice(6, 9, 1)\n",
"M: | \n",
"B: GCG slice(0, 3, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 2\n",
"1: GAGTTGACC\n",
"A: GACC slice(5, 9, 1)\n",
"M: | |\n",
"B: GCGC slice(0, 4, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 1\n",
"1: GAGTTGACC\n",
"A: TGACC slice(4, 9, 1)\n",
"M: | \n",
"B: GCGCG slice(0, 5, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 1\n",
"1: GAGTTGACC\n",
"A: TTGACC slice(3, 9, 1)\n",
"M: | \n",
"B: GCGCGA slice(0, 6, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 2\n",
"1: GAGTTGACC\n",
"A: GTTGACC slice(2, 9, 1)\n",
"M: | |\n",
"B: GCGCGAC slice(0, 7, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 4\n",
"1: GAGTTGACC\n",
"A: AGTTGACC slice(1, 9, 1)\n",
"M: ||||\n",
"B: GCGCGACC slice(0, 8, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 3\n",
"1: GAGTTGACC\n",
"A: GAGTTGACC slice(0, 9, 1)\n",
"M: | | | \n",
"B: GCGCGACCT slice(0, 9, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 0\n",
"1: GAGTTGACC\n",
"A: GAGTTGACC slice(0, 9, 1)\n",
"M: \n",
"B: CGCGACCTA slice(1, 10, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 2\n",
"1: GAGTTGACC\n",
"A: GAGTTGACC slice(0, 9, 1)\n",
"M: | | \n",
"B: GCGACCTAG slice(2, 11, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 2\n",
"1: GAGTTGACC\n",
"A: GAGTTGACC slice(0, 9, 1)\n",
"M: | |\n",
"B: CGACCTAGC slice(3, 12, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 5\n",
"1: GAGTTGACC\n",
"A: GAGTTGACC slice(0, 9, 1)\n",
"M: || | ||\n",
"B: GACCTAGCC slice(4, 13, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 4\n",
"1: GAGTTGACC\n",
"A: GAGTTGACC slice(0, 9, 1)\n",
"M: | | ||\n",
"B: ACCTAGCCC slice(5, 14, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 1\n",
"1: GAGTTGACC\n",
"A: GAGTTGACC slice(0, 9, 1)\n",
"M: | \n",
"B: CCTAGCCCG slice(6, 15, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 0\n",
"1: GAGTTGACC\n",
"A: GAGTTGACC slice(0, 9, 1)\n",
"M: \n",
"B: CTAGCCCGT slice(7, 16, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 2\n",
"1: GAGTTGACC\n",
"A: GAGTTGACC slice(0, 9, 1)\n",
"M: || \n",
"B: TAGCCCGTA slice(8, 17, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 2\n",
"1: GAGTTGACC\n",
"A: GAGTTGACC slice(0, 9, 1)\n",
"M: | |\n",
"B: AGCCCGTAC slice(9, 18, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 3\n",
"1: GAGTTGACC\n",
"A: GAGTTGACC slice(0, 9, 1)\n",
"M: | || \n",
"B: GCCCGTACT slice(10, 19, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 1\n",
"1: GAGTTGACC\n",
"A: GAGTTGACC slice(0, 9, 1)\n",
"M: | \n",
"B: CCCGTACTG slice(11, 20, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 2\n",
"1: GAGTTGACC\n",
"A: GAGTTGACC slice(0, 9, 1)\n",
"M: || \n",
"B: CCGTACTGT slice(12, 21, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 0\n",
"1: GAGTTGACC\n",
"A: GAGTTGAC slice(0, 8, 1)\n",
"M: \n",
"B: CGTACTGT slice(13, 21, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 3\n",
"1: GAGTTGACC\n",
"A: GAGTTGA slice(0, 7, 1)\n",
"M: | || \n",
"B: GTACTGT slice(14, 21, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 2\n",
"1: GAGTTGACC\n",
"A: GAGTTG slice(0, 6, 1)\n",
"M: | | \n",
"B: TACTGT slice(15, 21, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 1\n",
"1: GAGTTGACC\n",
"A: GAGTT slice(0, 5, 1)\n",
"M: |\n",
"B: ACTGT slice(16, 21, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 2\n",
"1: GAGTTGACC\n",
"A: GAGT slice(0, 4, 1)\n",
"M: ||\n",
"B: CTGT slice(17, 21, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 0\n",
"1: GAGTTGACC\n",
"A: GAG slice(0, 3, 1)\n",
"M: \n",
"B: TGT slice(18, 21, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 1\n",
"1: GAGTTGACC\n",
"A: GA slice(0, 2, 1)\n",
"M: | \n",
"B: GT slice(19, 21, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n",
"Matches: 0\n",
"1: GAGTTGACC\n",
"A: G slice(0, 1, 1)\n",
"M: \n",
"B: T slice(20, 21, 1)\n",
"2: GCGCGACCTAGCCCGTACTGT\n",
"\n"
]
},
{
"data": {
"text/plain": [
"(5, slice(0, 9, 1), slice(4, 13, 1))"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"best_alignment_visualized('GAGTTGACC', 'GCGCGACCTAGCCCGTACTGT')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(5, slice(0, 9, 1), slice(4, 13, 1))"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"best_alignment('GAGTTGACC', 'GCGCGACCTAGCCCGTACTGT')"
]
}
],
"metadata": {
"interpreter": {
"hash": "767d51c1340bd893661ea55ea3124f6de3c7a262a8b4abca0554b478b1e2ff90"
},
"kernelspec": {
"display_name": "Python 3.8.10 64-bit",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
from typing import Tuple
BEST_ALIGNMENT = Tuple[int, slice, slice]
def best_alignment_visualized(seq1: str, seq2: str, verbose: bool = True) -> BEST_ALIGNMENT:
best_match: BEST_ALIGNMENT = (0, slice(0), slice(0))
if verbose:
print(f"A: {seq1}\nB: {seq2}\n")
for i in range(1, len(seq1)+1):
A = slice(len(seq1)-i, min(len(seq1), len(seq1)-i+len(seq2)), 1)
B = slice(0, min(len(seq2), i), 1)
match_string = ""
match_count = 0
for j in range(len(seq1[A])):
if seq1[A][j] == seq2[B][j]:
match_count += 1
match_string += "|"
else:
match_string += " "
if verbose:
spacer = ' '*(len(seq1)-i+len(seq2)-1)
print(f"Matches: {match_count}")
print(f"1: {' '*(len(seq2)-1)}{seq1}")
print(f"A: {spacer}{seq1[A]} {A}")
print(f"M: {spacer}{match_string}")
print(f"B: {spacer}{seq2[B]} {B}")
print(f"2: {spacer}{seq2}\n")
if match_count > best_match[0]:
best_match = (match_count, A, B)
for i in range(1, len(seq2))[::-1]:
A = slice(0, min(len(seq1), i), 1)
B = slice(len(seq2)-i, min(len(seq2), len(seq2)-i+len(seq1)), 1)
match_string = ""
match_count = 0
for j in range(len(seq1[A])):
if seq1[A][j] == seq2[B][j]:
match_count += 1
match_string += "|"
else:
match_string += " "
if verbose:
spacer = ' '*(len(seq2)-1)
print(f"Matches: {match_count}")
print(f"1: {' '*(len(seq2)-1)}{seq1}")
print(f"A: {spacer}{seq1[A]} {A}")
print(f"M: {spacer}{match_string}")
print(f"B: {spacer}{seq2[B]} {B}")
print(f"2: {' '*(i-1)}{seq2}\n")
if match_count > best_match[0]:
best_match = (match_count, A, B)
return best_match
def best_alignment(seq1: str, seq2: str) -> BEST_ALIGNMENT:
best_match: BEST_ALIGNMENT = (0, slice(0), slice(0))
for i in range(1, len(seq1)+1):
A = slice(len(seq1)-i, min(len(seq1), len(seq1)-i+len(seq2)), 1)
B = slice(0, min(len(seq2), i), 1)
match_count = 0
for j in range(len(seq1[A])):
if seq1[A][j] == seq2[B][j]:
match_count += 1
if match_count > best_match[0]:
best_match = (match_count, A, B)
for i in range(1, len(seq2))[::-1]:
A = slice(0, min(len(seq1), i), 1)
B = slice(len(seq2)-i, min(len(seq2), len(seq2)-i+len(seq1)), 1)
match_count = 0
for j in range(len(seq1[A])):
if seq1[A][j] == seq2[B][j]:
match_count += 1
if match_count > best_match[0]:
best_match = (match_count, A, B)
return best_match
from collections import defaultdict
from typing import DefaultDict, Tuple
def count_alignment_visualized(seq1: str, seq2: str, verbose: bool = True) -> DefaultDict[int, int]:
alignment_counts: DefaultDict[int, int] = defaultdict(lambda: 0)
if verbose:
print(f"A: {seq1}\nB: {seq2}\n")
for i in range(1, len(seq1)+1):
A = slice(len(seq1)-i, min(len(seq1), len(seq1)-i+len(seq2)), 1)
B = slice(0, min(len(seq2), i), 1)
match_string = ""
match_count = 0
for j in range(len(seq1[A])):
if seq1[A][j] == seq2[B][j]:
match_count += 1
match_string += "|"
else:
match_string += " "
if verbose:
spacer = ' '*(len(seq1)-i+len(seq2)-1)
print(f"Matches: {match_count}")
print(f"1: {' '*(len(seq2)-1)}{seq1}")
print(f"A: {spacer}{seq1[A]} {A}")
print(f"M: {spacer}{match_string}")
print(f"B: {spacer}{seq2[B]} {B}")
print(f"2: {spacer}{seq2}\n")
alignment_counts[match_count] += 1
for i in range(1, len(seq2))[::-1]:
A = slice(0, min(len(seq1), i), 1)
B = slice(len(seq2)-i, min(len(seq2), len(seq2)-i+len(seq1)), 1)
match_string = ""
match_count = 0
for j in range(len(seq1[A])):
if seq1[A][j] == seq2[B][j]:
match_count += 1
match_string += "|"
else:
match_string += " "
if verbose:
spacer = ' '*(len(seq2)-1)
print(f"Matches: {match_count}")
print(f"1: {' '*(len(seq2)-1)}{seq1}")
print(f"A: {spacer}{seq1[A]} {A}")
print(f"M: {spacer}{match_string}")
print(f"B: {spacer}{seq2[B]} {B}")
print(f"2: {' '*(i-1)}{seq2}\n")
alignment_counts[match_count] += 1
return alignment_counts
def count_alignment(seq1: str, seq2: str) -> DefaultDict[int, int]:
alignment_counts: DefaultDict[int, int] = defaultdict(lambda: 0)
for i in range(1, len(seq1)+1):
A = slice(len(seq1)-i, min(len(seq1), len(seq1)-i+len(seq2)), 1)
B = slice(0, min(len(seq2), i), 1)
match_count = 0
for j in range(len(seq1[A])):
if seq1[A][j] == seq2[B][j]:
match_count += 1
alignment_counts[match_count] += 1
for i in range(1, len(seq2))[::-1]:
A = slice(0, min(len(seq1), i), 1)
B = slice(len(seq2)-i, min(len(seq2), len(seq2)-i+len(seq1)), 1)
match_count = 0
for j in range(len(seq1[A])):
if seq1[A][j] == seq2[B][j]:
match_count += 1
alignment_counts[match_count] += 1
return alignment_counts
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment