Created
April 19, 2015 02:39
-
-
Save 1328/8dd6f49d4260725bcdd4 to your computer and use it in GitHub Desktop.
tester
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import os.path | |
from pprint import pprint | |
from fsearch.index import FastaIndex, MemIndex | |
TASK = ''' | |
Here is a link to some sample data. I want to find where each datapoint from | |
sequence column of inputfile can be found in the RefSeq file, and extract other | |
info from the RefSeq file based on that. I did try sqlite3 but found it to be | |
very slow. | |
I think right now it is acceptable to just loop through a list loaded with the | |
RefSeq file for each line in the inputfile as it is actually the fastest method | |
I've tried so far. It takes 8-10 minutes with one core, which isn't too bad. | |
Though if there is a method out there that makes it reasonably faster and is | |
easy to implement then I would be interested in checking it out. | |
''' | |
REFERENCE = 'data/human_RefSeq_20130704.tab' | |
CROSS = 'data/inputfile.txt' | |
INDEX = os.path.normpath('e:/tmp/') | |
def read_reference(fn): | |
result = {} | |
with open(fn, mode ='r') as fh: | |
next(fh) # skips header | |
for idx, line in enumerate(fh): | |
try: | |
# had some trouble with your data, so I just compress all the | |
# non-fasta information into one field. I figure you already | |
# know what to do there | |
*info, fasta = [i.strip() for i in line.split()] | |
result[fasta] = info | |
except ValueError: | |
print('cannot parse line {}:'.format(idx)) | |
#pprint(line.split()) | |
#break | |
return result | |
def read_cross(fn): | |
''' | |
just grab the third item for cross referencing | |
which looks to be between 7 and 55 characters | |
''' | |
result = [] | |
with open(fn, mode ='r') as fh: | |
next(fh) # skips header | |
for idx, line in enumerate(fh): | |
try: | |
result.append(line.split()[3]) | |
except ValueError: | |
print('cannot parse line {}:'.format(idx)) | |
pprint(line.split()) | |
break | |
return result | |
def match(index, genetics, find_me): | |
hit, miss = 0,0 | |
for target in find_me: | |
found = index[target] | |
if found: | |
hit += len(found) | |
#pprint(target) | |
#pprint(found) | |
#break | |
else: | |
#print('miss {}'.format(target)) | |
miss += 1 | |
print('got {} hits and {} misses'.format(hit, miss)) | |
def main(): | |
start = time.time() | |
genetics = read_reference(REFERENCE) | |
find_me = read_cross(CROSS) | |
print('took {} to load files'.format(time.time() - start)) | |
print('getting index') | |
#index = FastaIndex.create(genetics, INDEX, 5) | |
# NOTE: the indexer runs a variable length window across the longer protein | |
# strings to allow for better matching. The window needs to be smaller than | |
# your shortest search (here 7). The bigger the window the faster you can | |
# search, but the larger your index will be (and the longer it will take to | |
# index). After some trial and error, an index of 5 allows for an almost | |
# immediate (half-a-second) cross. | |
# an index length of 6 adds four seconds to indexing, but only drops | |
# crossing by .3 seconds. | |
index = MemIndex(genetics, 6) | |
print('matching') | |
match_start = time.time() | |
match(index, genetics, find_me) | |
print('took {} to match'.format(time.time() - match_start)) | |
print('took {} overall'.format(time.time() - start)) | |
index.close() | |
if __name__ == '__main__': | |
main() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment