Created
August 23, 2013 21:29
-
-
Save Puriney/6324227 to your computer and use it in GitHub Desktop.
Python: Lazy FM-index.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
EOS = "$" | |
''' | |
All sort algorithm here use the Built-in `sort` function inside Python | |
''' | |
refSeq = raw_input("Your ref: ") | |
if not refSeq: | |
refSeq = "ACAACG" | |
def print_table(table): | |
''' | |
print BWT table | |
''' | |
for i in table : | |
print i | |
def print_dictionary(dictionary): | |
for i in sorted(dictionary.keys()): | |
print str(i)+'\t'+str(dictionary[i]) | |
def bwt_table_construction(x): | |
''' | |
bwt index force construction | |
''' | |
x = x + "$" | |
bwtArray =[] | |
bwtArray.append(x) | |
for i in range(1,len(x),1): | |
bwtArray.append(x[i::]+x[:i:]) | |
''' | |
Built-in stupid merge_sort lexicographically | |
''' | |
bwtArray.sort() | |
return bwtArray | |
def bwt_string(array): | |
''' | |
report BWT string | |
''' | |
output_string ="" | |
for i in array: | |
output_string += i[-1] | |
return output_string | |
def sa_array(x): | |
''' | |
Suffix Array construction and report suffix array | |
''' | |
saDic = {} | |
saArray = [] | |
for i in range(len(x)+1): | |
saDic[x[i::]] = i | |
# Stupid merge sort | |
for seq in sorted(saDic.keys()): | |
saArray.append(saDic[seq]) | |
return saArray | |
def bwt_table_construction_equation(x,suffixArray): | |
''' | |
report BTW string without building up SA, but based on the equation of SA and BWT | |
BWT[i] = "$" if S[i] ==0 | |
BWT[i] = X[S[i]-1] else | |
''' | |
bwtString = [] | |
for i in range(len(x)+1): | |
if suffixArray[i]==0: | |
bwtString.append("$") | |
else: | |
bwtString.append(x[suffixArray[i]-1]) | |
# bwtStringbwtString[i] | |
return ''.join(bwtString) | |
saArray = sa_array(refSeq) | |
print '**Suffix Array:' + str(saArray) | |
bwtArray = bwt_table_construction(refSeq) | |
print '**raw bwt is' | |
print_table(bwtArray) | |
print '**equation bwt string is' | |
print bwt_table_construction_equation(refSeq,saArray) | |
bwtString = bwt_string(bwtArray) | |
print "**BWT string is " + bwtString | |
def count_table(bwtString): | |
''' | |
counts of letters less than given letter | |
''' | |
countTable = {} | |
alphbet = sorted(set(bwtString)) | |
alphbetRev = alphbet | |
alphbetRev.reverse() | |
print alphbetRev | |
for i in alphbet: | |
countTable[i]=0 | |
# counts = 0 | |
for letter in bwtString: | |
countTable[letter] += 1 | |
# print countTable | |
bwtStringLetterCounts = len(bwtString) | |
for letter in alphbetRev: | |
# print "before: " + str(countTable[letter]) | |
countTable[letter] = bwtStringLetterCounts - countTable[letter] | |
# print "after: " + str(countTable[letter]) | |
bwtStringLetterCounts = countTable[letter] | |
return countTable | |
countTable = count_table(bwtString) | |
print_dictionary(countTable) | |
def occ_table(bwtString): | |
occTable = {} | |
alphbet = sorted(set(bwtString)) | |
# print alphbet | |
for nt in alphbet: | |
occTable[nt]= [0]*len(bwtString) | |
# print occTable | |
for step in range(len(bwtString)): | |
nt = bwtString[step] | |
occTable[nt][step]=1 | |
# print occTable | |
for nt in alphbet: | |
for step in range(1,len(bwtString),1): | |
occTable[nt][step] += occTable[nt][step-1] | |
# print occTable | |
return occTable | |
occTable = occ_table(bwtString) | |
print_dictionary(occTable) | |
def first_col (saArray,refSeq): | |
x = "" | |
refSeq = refSeq+"$" | |
for i in saArray: | |
x = x + str(refSeq[i]) | |
return x | |
firstCol = first_col(saArray,refSeq) | |
def letter_range_in_string(letter,string): | |
# string = string[begin:end+1] | |
# string = list(string) | |
letterPositions =[] | |
letterPositions= [i for i, x in enumerate(list(string)) if x == letter] | |
# return [min(letterPositions),max(letterPositions)] | |
return letterPositions | |
def letter_LF(letter,oldposition,bwtString,saArray,countTable,occTable): | |
# print firstCol[oldposition] | |
# print bwtString[oldposition] | |
# print countTable[bwtString[oldposition]] | |
# print occTable[bwtString[oldposition]][oldposition] | |
#newposition= countTable[bwtString[oldposition]]+occTable[bwtString[oldposition]][oldposition]-1 | |
newposition= countTable[letter]+occTable[letter][oldposition]-1 | |
return newposition | |
query = raw_input('Your query:') | |
def string_search (query, firstCol, bwtString,saArray,countTable,occTable): | |
hitSApositions = [] | |
queryLength = len(query) | |
query = list(query) | |
initLetter = query.pop() | |
startOld = min(letter_range_in_string(initLetter,firstCol)) | |
endOld = max(letter_range_in_string(initLetter,firstCol)) | |
if queryLength ==1: | |
hitSApositions = letter_range_in_string(initLetter,firstCol) | |
else: | |
i = 2 | |
while i <= queryLength: | |
letter = query.pop() | |
# print letter | |
# print [startOld,endOld] | |
startOld = letter_LF(letter,startOld,bwtString,saArray,countTable,occTable) | |
endOld = letter_LF(letter,endOld,bwtString,saArray,countTable,occTable) | |
i+=1 | |
# print [startOld,endOld] | |
hitSApositions= [startOld,endOld] | |
# return hitSApositions | |
for i in range(len(hitSApositions)): | |
hitSApositions[i]=saArray[hitSApositions[i]] | |
return list(set(hitSApositions)) | |
result = string_search(query, firstCol, bwtString,saArray,countTable,occTable) | |
print "Your exact alignment is located at: " | |
print result |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment