Last active
October 18, 2020 16:11
-
-
Save linnil1/f6ce6001257b41c5196cf7c65e6f237b to your computer and use it in GitHub Desktop.
Demo of bwa
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
from copy import deepcopy | |
from pprint import pprint | |
import pandas as pd | |
ref = "mississippi$" | |
print("Reference", ref) | |
# Build suffix array | |
sa = list(range(len(ref))) | |
print("Suffix Array"); | |
for i in sa: print(i, ref[i:] + ref[:i]) | |
sa.sort(key=lambda i: ref[i:] + ref[:i]) # O(n^2 log(n)) | |
print("Sorted Suffix Array") | |
for ind, i in enumerate(sa): print(ind, i, ref[i:] + ref[:i]) | |
# Build BWA index | |
index, new_s = zip(*[(i, ref[i - 1]) for i in sa]) | |
# Find interval of each character | |
interval = {} | |
for i in range(1, len(new_s)): | |
if ref[index[i]] not in interval: | |
interval[ref[index[i]]] = [i, i + 1] | |
else: | |
interval[ref[index[i]]][1] = i + 1 | |
# Calculate the occurance of each four characters | |
occurences = [{k: 0 for k in interval}] | |
for i in range(len(new_s)): | |
d = deepcopy(occurences[-1]) | |
if new_s[i] != "$": | |
d[new_s[i]] += 1 | |
occurences.append(d) | |
print("First letter index interval") | |
print(interval) | |
print("Occurence Array") | |
print(pd.DataFrame(occurences)) | |
# Query a sequence | |
search_s = "mis" | |
print(f"Query", search_s) | |
a, b = 0, len(ref) # interval of occurence in single character | |
for i in range(len(search_s) - 1, -1, -1): | |
# Calculate the index interval | |
pa = interval[search_s[i]][0] + a | |
pb = interval[search_s[i]][0] + b | |
pb = min(interval[search_s[i]][1], pb) | |
print(pa, pb) | |
# Find it or cannot find | |
if pa >= pb: | |
break | |
if i == 0: | |
for pa in range(pa, pb): | |
print(index[pa], ref[index[pa]:]) | |
break | |
# interval of occurence in single character inside interval | |
a = occurences[pa][search_s[i - 1]] | |
b = occurences[pb][search_s[i - 1]] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Output