Skip to content

Instantly share code, notes, and snippets.

@1328
Created April 19, 2015 02:39
Show Gist options
  • Save 1328/8dd6f49d4260725bcdd4 to your computer and use it in GitHub Desktop.
Save 1328/8dd6f49d4260725bcdd4 to your computer and use it in GitHub Desktop.
tester
import time
import os.path
from pprint import pprint
from fsearch.index import FastaIndex, MemIndex
TASK = '''
Here is a link to some sample data. I want to find where each datapoint from
sequence column of inputfile can be found in the RefSeq file, and extract other
info from the RefSeq file based on that. I did try sqlite3 but found it to be
very slow.
I think right now it is acceptable to just loop through a list loaded with the
RefSeq file for each line in the inputfile as it is actually the fastest method
I've tried so far. It takes 8-10 minutes with one core, which isn't too bad.
Though if there is a method out there that makes it reasonably faster and is
easy to implement then I would be interested in checking it out.
'''
REFERENCE = 'data/human_RefSeq_20130704.tab'
CROSS = 'data/inputfile.txt'
INDEX = os.path.normpath('e:/tmp/')
def read_reference(fn):
result = {}
with open(fn, mode ='r') as fh:
next(fh) # skips header
for idx, line in enumerate(fh):
try:
# had some trouble with your data, so I just compress all the
# non-fasta information into one field. I figure you already
# know what to do there
*info, fasta = [i.strip() for i in line.split()]
result[fasta] = info
except ValueError:
print('cannot parse line {}:'.format(idx))
#pprint(line.split())
#break
return result
def read_cross(fn):
'''
just grab the third item for cross referencing
which looks to be between 7 and 55 characters
'''
result = []
with open(fn, mode ='r') as fh:
next(fh) # skips header
for idx, line in enumerate(fh):
try:
result.append(line.split()[3])
except ValueError:
print('cannot parse line {}:'.format(idx))
pprint(line.split())
break
return result
def match(index, genetics, find_me):
hit, miss = 0,0
for target in find_me:
found = index[target]
if found:
hit += len(found)
#pprint(target)
#pprint(found)
#break
else:
#print('miss {}'.format(target))
miss += 1
print('got {} hits and {} misses'.format(hit, miss))
def main():
start = time.time()
genetics = read_reference(REFERENCE)
find_me = read_cross(CROSS)
print('took {} to load files'.format(time.time() - start))
print('getting index')
#index = FastaIndex.create(genetics, INDEX, 5)
# NOTE: the indexer runs a variable length window across the longer protein
# strings to allow for better matching. The window needs to be smaller than
# your shortest search (here 7). The bigger the window the faster you can
# search, but the larger your index will be (and the longer it will take to
# index). After some trial and error, an index of 5 allows for an almost
# immediate (half-a-second) cross.
# an index length of 6 adds four seconds to indexing, but only drops
# crossing by .3 seconds.
index = MemIndex(genetics, 6)
print('matching')
match_start = time.time()
match(index, genetics, find_me)
print('took {} to match'.format(time.time() - match_start))
print('took {} overall'.format(time.time() - start))
index.close()
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment