Skip to content

Instantly share code, notes, and snippets.

@Puriney
Created August 23, 2013 21:29
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Puriney/6324227 to your computer and use it in GitHub Desktop.
Save Puriney/6324227 to your computer and use it in GitHub Desktop.
Python: Lazy FM-index.py
EOS = "$"
'''
All sort algorithm here use the Built-in `sort` function inside Python
'''
refSeq = raw_input("Your ref: ")
if not refSeq:
refSeq = "ACAACG"
def print_table(table):
'''
print BWT table
'''
for i in table :
print i
def print_dictionary(dictionary):
for i in sorted(dictionary.keys()):
print str(i)+'\t'+str(dictionary[i])
def bwt_table_construction(x):
'''
bwt index force construction
'''
x = x + "$"
bwtArray =[]
bwtArray.append(x)
for i in range(1,len(x),1):
bwtArray.append(x[i::]+x[:i:])
'''
Built-in stupid merge_sort lexicographically
'''
bwtArray.sort()
return bwtArray
def bwt_string(array):
'''
report BWT string
'''
output_string =""
for i in array:
output_string += i[-1]
return output_string
def sa_array(x):
'''
Suffix Array construction and report suffix array
'''
saDic = {}
saArray = []
for i in range(len(x)+1):
saDic[x[i::]] = i
# Stupid merge sort
for seq in sorted(saDic.keys()):
saArray.append(saDic[seq])
return saArray
def bwt_table_construction_equation(x,suffixArray):
'''
report BTW string without building up SA, but based on the equation of SA and BWT
BWT[i] = "$" if S[i] ==0
BWT[i] = X[S[i]-1] else
'''
bwtString = []
for i in range(len(x)+1):
if suffixArray[i]==0:
bwtString.append("$")
else:
bwtString.append(x[suffixArray[i]-1])
# bwtStringbwtString[i]
return ''.join(bwtString)
saArray = sa_array(refSeq)
print '**Suffix Array:' + str(saArray)
bwtArray = bwt_table_construction(refSeq)
print '**raw bwt is'
print_table(bwtArray)
print '**equation bwt string is'
print bwt_table_construction_equation(refSeq,saArray)
bwtString = bwt_string(bwtArray)
print "**BWT string is " + bwtString
def count_table(bwtString):
'''
counts of letters less than given letter
'''
countTable = {}
alphbet = sorted(set(bwtString))
alphbetRev = alphbet
alphbetRev.reverse()
print alphbetRev
for i in alphbet:
countTable[i]=0
# counts = 0
for letter in bwtString:
countTable[letter] += 1
# print countTable
bwtStringLetterCounts = len(bwtString)
for letter in alphbetRev:
# print "before: " + str(countTable[letter])
countTable[letter] = bwtStringLetterCounts - countTable[letter]
# print "after: " + str(countTable[letter])
bwtStringLetterCounts = countTable[letter]
return countTable
countTable = count_table(bwtString)
print_dictionary(countTable)
def occ_table(bwtString):
occTable = {}
alphbet = sorted(set(bwtString))
# print alphbet
for nt in alphbet:
occTable[nt]= [0]*len(bwtString)
# print occTable
for step in range(len(bwtString)):
nt = bwtString[step]
occTable[nt][step]=1
# print occTable
for nt in alphbet:
for step in range(1,len(bwtString),1):
occTable[nt][step] += occTable[nt][step-1]
# print occTable
return occTable
occTable = occ_table(bwtString)
print_dictionary(occTable)
def first_col (saArray,refSeq):
x = ""
refSeq = refSeq+"$"
for i in saArray:
x = x + str(refSeq[i])
return x
firstCol = first_col(saArray,refSeq)
def letter_range_in_string(letter,string):
# string = string[begin:end+1]
# string = list(string)
letterPositions =[]
letterPositions= [i for i, x in enumerate(list(string)) if x == letter]
# return [min(letterPositions),max(letterPositions)]
return letterPositions
def letter_LF(letter,oldposition,bwtString,saArray,countTable,occTable):
# print firstCol[oldposition]
# print bwtString[oldposition]
# print countTable[bwtString[oldposition]]
# print occTable[bwtString[oldposition]][oldposition]
#newposition= countTable[bwtString[oldposition]]+occTable[bwtString[oldposition]][oldposition]-1
newposition= countTable[letter]+occTable[letter][oldposition]-1
return newposition
query = raw_input('Your query:')
def string_search (query, firstCol, bwtString,saArray,countTable,occTable):
hitSApositions = []
queryLength = len(query)
query = list(query)
initLetter = query.pop()
startOld = min(letter_range_in_string(initLetter,firstCol))
endOld = max(letter_range_in_string(initLetter,firstCol))
if queryLength ==1:
hitSApositions = letter_range_in_string(initLetter,firstCol)
else:
i = 2
while i <= queryLength:
letter = query.pop()
# print letter
# print [startOld,endOld]
startOld = letter_LF(letter,startOld,bwtString,saArray,countTable,occTable)
endOld = letter_LF(letter,endOld,bwtString,saArray,countTable,occTable)
i+=1
# print [startOld,endOld]
hitSApositions= [startOld,endOld]
# return hitSApositions
for i in range(len(hitSApositions)):
hitSApositions[i]=saArray[hitSApositions[i]]
return list(set(hitSApositions))
result = string_search(query, firstCol, bwtString,saArray,countTable,occTable)
print "Your exact alignment is located at: "
print result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment