Skip to content

Instantly share code, notes, and snippets.

@linnil1
Last active October 18, 2020 16:11
Show Gist options
  • Save linnil1/f6ce6001257b41c5196cf7c65e6f237b to your computer and use it in GitHub Desktop.
Save linnil1/f6ce6001257b41c5196cf7c65e6f237b to your computer and use it in GitHub Desktop.
Demo of bwa
from collections import defaultdict
from copy import deepcopy
from pprint import pprint
import pandas as pd
ref = "mississippi$"
print("Reference", ref)
# Build suffix array
sa = list(range(len(ref)))
print("Suffix Array");
for i in sa: print(i, ref[i:] + ref[:i])
sa.sort(key=lambda i: ref[i:] + ref[:i]) # O(n^2 log(n))
print("Sorted Suffix Array")
for ind, i in enumerate(sa): print(ind, i, ref[i:] + ref[:i])
# Build BWA index
index, new_s = zip(*[(i, ref[i - 1]) for i in sa])
# Find interval of each character
interval = {}
for i in range(1, len(new_s)):
if ref[index[i]] not in interval:
interval[ref[index[i]]] = [i, i + 1]
else:
interval[ref[index[i]]][1] = i + 1
# Calculate the occurance of each four characters
occurences = [{k: 0 for k in interval}]
for i in range(len(new_s)):
d = deepcopy(occurences[-1])
if new_s[i] != "$":
d[new_s[i]] += 1
occurences.append(d)
print("First letter index interval")
print(interval)
print("Occurence Array")
print(pd.DataFrame(occurences))
# Query a sequence
search_s = "mis"
print(f"Query", search_s)
a, b = 0, len(ref) # interval of occurence in single character
for i in range(len(search_s) - 1, -1, -1):
# Calculate the index interval
pa = interval[search_s[i]][0] + a
pb = interval[search_s[i]][0] + b
pb = min(interval[search_s[i]][1], pb)
print(pa, pb)
# Find it or cannot find
if pa >= pb:
break
if i == 0:
for pa in range(pa, pb):
print(index[pa], ref[index[pa]:])
break
# interval of occurence in single character inside interval
a = occurences[pa][search_s[i - 1]]
b = occurences[pb][search_s[i - 1]]
@linnil1
Copy link
Author

linnil1 commented Oct 18, 2020

Output

Suffix Array
0 mississippi$
1 ississippi$m
2 ssissippi$mi
3 sissippi$mis
4 issippi$miss
5 ssippi$missi
6 sippi$missis
7 ippi$mississ
8 ppi$mississi
9 pi$mississip
10 i$mississipp
11 $mississippi
Sorted Suffix Array
11 $mississippi
10 i$mississipp
7 ippi$mississ
4 issippi$miss
1 ississippi$m
0 mississippi$
9 pi$mississip
8 ppi$mississi
6 sippi$missis
3 sissippi$mis
5 ssippi$missi
2 ssissippi$mi
{'i': [1, 5], 'm': [5, 6], 'p': [6, 8], 's': [8, 12]}
    i  m  p  s
0   0  0  0  0
1   1  0  0  0
2   1  0  1  0
3   1  0  1  1
4   1  0  1  2
5   1  1  1  2
6   1  1  1  2
7   1  1  2  2
8   2  1  2  2
9   2  1  2  3
10  2  1  2  4
11  3  1  2  4
12  4  1  2  4
Query sis
8 12
3 5
9 10
3 sissippi$

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment