Skip to content

Instantly share code, notes, and snippets.

# Puriney/FM-index.py Created Aug 23, 2013

Python: Lazy FM-index.py
 EOS = "\$" ''' All sort algorithm here use the Built-in `sort` function inside Python ''' refSeq = raw_input("Your ref: ") if not refSeq: refSeq = "ACAACG" def print_table(table): ''' print BWT table ''' for i in table : print i def print_dictionary(dictionary): for i in sorted(dictionary.keys()): print str(i)+'\t'+str(dictionary[i]) def bwt_table_construction(x): ''' bwt index force construction ''' x = x + "\$" bwtArray =[] bwtArray.append(x) for i in range(1,len(x),1): bwtArray.append(x[i::]+x[:i:]) ''' Built-in stupid merge_sort lexicographically ''' bwtArray.sort() return bwtArray def bwt_string(array): ''' report BWT string ''' output_string ="" for i in array: output_string += i[-1] return output_string def sa_array(x): ''' Suffix Array construction and report suffix array ''' saDic = {} saArray = [] for i in range(len(x)+1): saDic[x[i::]] = i # Stupid merge sort for seq in sorted(saDic.keys()): saArray.append(saDic[seq]) return saArray def bwt_table_construction_equation(x,suffixArray): ''' report BTW string without building up SA, but based on the equation of SA and BWT BWT[i] = "\$" if S[i] ==0 BWT[i] = X[S[i]-1] else ''' bwtString = [] for i in range(len(x)+1): if suffixArray[i]==0: bwtString.append("\$") else: bwtString.append(x[suffixArray[i]-1]) # bwtStringbwtString[i] return ''.join(bwtString) saArray = sa_array(refSeq) print '**Suffix Array:' + str(saArray) bwtArray = bwt_table_construction(refSeq) print '**raw bwt is' print_table(bwtArray) print '**equation bwt string is' print bwt_table_construction_equation(refSeq,saArray) bwtString = bwt_string(bwtArray) print "**BWT string is " + bwtString def count_table(bwtString): ''' counts of letters less than given letter ''' countTable = {} alphbet = sorted(set(bwtString)) alphbetRev = alphbet alphbetRev.reverse() print alphbetRev for i in alphbet: countTable[i]=0 # counts = 0 for letter in bwtString: countTable[letter] += 1 # print countTable bwtStringLetterCounts = len(bwtString) for letter in alphbetRev: # print "before: " + str(countTable[letter]) countTable[letter] = bwtStringLetterCounts - countTable[letter] # print "after: " + str(countTable[letter]) bwtStringLetterCounts = countTable[letter] return countTable countTable = count_table(bwtString) print_dictionary(countTable) def occ_table(bwtString): occTable = {} alphbet = sorted(set(bwtString)) # print alphbet for nt in alphbet: occTable[nt]= [0]*len(bwtString) # print occTable for step in range(len(bwtString)): nt = bwtString[step] occTable[nt][step]=1 # print occTable for nt in alphbet: for step in range(1,len(bwtString),1): occTable[nt][step] += occTable[nt][step-1] # print occTable return occTable occTable = occ_table(bwtString) print_dictionary(occTable) def first_col (saArray,refSeq): x = "" refSeq = refSeq+"\$" for i in saArray: x = x + str(refSeq[i]) return x firstCol = first_col(saArray,refSeq) def letter_range_in_string(letter,string): # string = string[begin:end+1] # string = list(string) letterPositions =[] letterPositions= [i for i, x in enumerate(list(string)) if x == letter] # return [min(letterPositions),max(letterPositions)] return letterPositions def letter_LF(letter,oldposition,bwtString,saArray,countTable,occTable): # print firstCol[oldposition] # print bwtString[oldposition] # print countTable[bwtString[oldposition]] # print occTable[bwtString[oldposition]][oldposition] #newposition= countTable[bwtString[oldposition]]+occTable[bwtString[oldposition]][oldposition]-1 newposition= countTable[letter]+occTable[letter][oldposition]-1 return newposition query = raw_input('Your query:') def string_search (query, firstCol, bwtString,saArray,countTable,occTable): hitSApositions = [] queryLength = len(query) query = list(query) initLetter = query.pop() startOld = min(letter_range_in_string(initLetter,firstCol)) endOld = max(letter_range_in_string(initLetter,firstCol)) if queryLength ==1: hitSApositions = letter_range_in_string(initLetter,firstCol) else: i = 2 while i <= queryLength: letter = query.pop() # print letter # print [startOld,endOld] startOld = letter_LF(letter,startOld,bwtString,saArray,countTable,occTable) endOld = letter_LF(letter,endOld,bwtString,saArray,countTable,occTable) i+=1 # print [startOld,endOld] hitSApositions= [startOld,endOld] # return hitSApositions for i in range(len(hitSApositions)): hitSApositions[i]=saArray[hitSApositions[i]] return list(set(hitSApositions)) result = string_search(query, firstCol, bwtString,saArray,countTable,occTable) print "Your exact alignment is located at: " print result
to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.