1328/test.py

## test.py

import time
import os.path
from pprint import pprint

from fsearch.index import FastaIndex, MemIndex

TASK = '''
Here is a link to some sample data. I want to find where each datapoint from
sequence column of inputfile can be found in the RefSeq file, and extract other
info from the RefSeq file based on that. I did try sqlite3 but found it to be
very slow.

I think right now it is acceptable to just loop through a list loaded with the
RefSeq file for each line in the inputfile as it is actually the fastest method
I've tried so far. It takes 8-10 minutes with one core, which isn't too bad.
Though if there is a method out there that makes it reasonably faster and is
easy to implement then I would be interested in checking it out.
'''

REFERENCE = 'data/human_RefSeq_20130704.tab'
CROSS = 'data/inputfile.txt'
INDEX = os.path.normpath('e:/tmp/')


def read_reference(fn):
    result = {}
    with open(fn, mode ='r') as fh:
        next(fh) # skips header
        for idx, line in enumerate(fh):
            try:
                # had some trouble with your data, so I just compress all the
                # non-fasta information into one field.  I figure you already
                # know what to do there
                *info, fasta = [i.strip() for i in line.split()]
                result[fasta] = info
            except ValueError:
                print('cannot parse line {}:'.format(idx))
                #pprint(line.split())
                #break
    return result

def read_cross(fn):
    '''
    just grab the third item for cross referencing
    which looks to be between 7 and 55 characters
    '''
    result = []
    with open(fn, mode ='r') as fh:
        next(fh) # skips header
        for idx, line in enumerate(fh):
            try:
                result.append(line.split()[3])
            except ValueError:
                print('cannot parse line {}:'.format(idx))
                pprint(line.split())
                break

    return result

def match(index, genetics, find_me):
    hit, miss = 0,0
    for target in find_me:
        found = index[target]
        if found:
            hit += len(found)
            #pprint(target)
            #pprint(found)
            #break
        else:
            #print('miss {}'.format(target))
            miss += 1

    print('got {} hits and {} misses'.format(hit, miss))

def main():

    start = time.time()
    genetics = read_reference(REFERENCE)
    find_me = read_cross(CROSS)

    print('took {} to load files'.format(time.time() - start))

    print('getting index')
    #index = FastaIndex.create(genetics, INDEX, 5)

    # NOTE:  the indexer runs a variable length window across the longer protein
    # strings to allow for better matching.  The window needs to be smaller than
    # your shortest search (here 7).  The bigger the window the faster you can
    # search, but the larger your index will be (and the longer it will take to
    # index).  After some trial and error, an index of 5 allows for an almost
    # immediate (half-a-second) cross.
    # an index length of 6 adds four seconds to indexing, but only drops
    # crossing by .3 seconds.

    index = MemIndex(genetics, 6)

    print('matching')
    match_start = time.time()
    match(index, genetics, find_me)
    print('took {} to match'.format(time.time() - match_start))

    print('took {} overall'.format(time.time() - start))
    index.close()


if __name__ == '__main__':
    main()

	import time
	import os.path
	from pprint import pprint

	from fsearch.index import FastaIndex, MemIndex

	TASK = '''
	Here is a link to some sample data. I want to find where each datapoint from
	sequence column of inputfile can be found in the RefSeq file, and extract other
	info from the RefSeq file based on that. I did try sqlite3 but found it to be
	very slow.

	I think right now it is acceptable to just loop through a list loaded with the
	RefSeq file for each line in the inputfile as it is actually the fastest method
	I've tried so far. It takes 8-10 minutes with one core, which isn't too bad.
	Though if there is a method out there that makes it reasonably faster and is
	easy to implement then I would be interested in checking it out.
	'''

	REFERENCE = 'data/human_RefSeq_20130704.tab'
	CROSS = 'data/inputfile.txt'
	INDEX = os.path.normpath('e:/tmp/')


	def read_reference(fn):
	result = {}
	with open(fn, mode ='r') as fh:
	next(fh) # skips header
	for idx, line in enumerate(fh):
	try:
	# had some trouble with your data, so I just compress all the
	# non-fasta information into one field. I figure you already
	# know what to do there
	*info, fasta = [i.strip() for i in line.split()]
	result[fasta] = info
	except ValueError:
	print('cannot parse line {}:'.format(idx))
	#pprint(line.split())
	#break
	return result

	def read_cross(fn):
	'''
	just grab the third item for cross referencing
	which looks to be between 7 and 55 characters
	'''
	result = []
	with open(fn, mode ='r') as fh:
	next(fh) # skips header
	for idx, line in enumerate(fh):
	try:
	result.append(line.split()[3])
	except ValueError:
	print('cannot parse line {}:'.format(idx))
	pprint(line.split())
	break

	return result

	def match(index, genetics, find_me):
	hit, miss = 0,0
	for target in find_me:
	found = index[target]
	if found:
	hit += len(found)
	#pprint(target)
	#pprint(found)
	#break
	else:
	#print('miss {}'.format(target))
	miss += 1

	print('got {} hits and {} misses'.format(hit, miss))

	def main():

	start = time.time()
	genetics = read_reference(REFERENCE)
	find_me = read_cross(CROSS)

	print('took {} to load files'.format(time.time() - start))

	print('getting index')
	#index = FastaIndex.create(genetics, INDEX, 5)

	# NOTE: the indexer runs a variable length window across the longer protein
	# strings to allow for better matching. The window needs to be smaller than
	# your shortest search (here 7). The bigger the window the faster you can
	# search, but the larger your index will be (and the longer it will take to
	# index). After some trial and error, an index of 5 allows for an almost
	# immediate (half-a-second) cross.
	# an index length of 6 adds four seconds to indexing, but only drops
	# crossing by .3 seconds.

	index = MemIndex(genetics, 6)

	print('matching')
	match_start = time.time()
	match(index, genetics, find_me)
	print('took {} to match'.format(time.time() - match_start))

	print('took {} overall'.format(time.time() - start))
	index.close()






	if __name__ == '__main__':
	main()