Skip to content

Instantly share code, notes, and snippets.

@eparker05
Last active August 29, 2015 14:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save eparker05/8825aa93c987e6730a9c to your computer and use it in GitHub Desktop.
Save eparker05/8825aa93c987e6730a9c to your computer and use it in GitHub Desktop.
A short script to compare Biopython SeqIO performance with and without lazy loading
""" SeqIO_lazy_performance_test.py
This file runs a quick performance test to compare parsing with SeqIO using
the lazy loading indexing functions and using the traditional SeqIO parsing
functions.
Usage:
SeqIO_lazy_performance_test.py <sequence file> <format> [-r N]
sequence file: The file you will use for performance testing
format: The format of that file
-r: The number of repeats to average (defaults to 3, at most 10)
"""
import argparse
import os
import sys
import tempfile
from timeit import timeit

try:
    from time import clock
except ImportError:
    # time.clock was removed in Python 3.8; fall back to perf_counter under
    # the same name so the timing helpers below keep working unchanged.
    from time import perf_counter as clock

from Bio import SeqIO

try:
    from Bio.SeqIO import _lazy
except ImportError:
    raise ImportError("This performance test requires Bio.SeqIO._lazy")
# Parse the command line once at start-up. The timing helpers below read the
# resulting FILENAME / FILEFMT / REPEATS module-level constants.
parser = argparse.ArgumentParser(
    description='Run a performance test on the lazy loading parser.')
parser.add_argument('filename', type=str,
                    help='name and path of a sequence file')
parser.add_argument('fileformat', type=str,
                    help='format of the sequence file')
# nargs=1 makes argparse store a one-element list, hence the [3] default and
# the [0] lookup below.
parser.add_argument('-r', dest='repeats', metavar="N", type=int, nargs=1,
                    default=[3],
                    help="the number of repeats averaged for the performance "
                         "test")
args = parser.parse_args()
FILENAME = args.filename
FILEFMT = args.fileformat
REPEATS = args.repeats[0]
if REPEATS > 10:
    # parser.error() reports on stderr and exits with a non-zero status,
    # unlike the interactive exit() helper which reported success.
    parser.error("Please use 10 or fewer repeats")
if REPEATS < 1:
    # With zero or negative repeats no measurement would run, and the report
    # would then divide the accumulated times by that count.
    parser.error("Please use at least 1 repeat")
def parseold():
    """Time fetching the first record with the traditional parser.

    The original note on this helper: it parses all record components into
    memory.

    Returns the difference between two ``clock()`` readings taken around
    ``next(SeqIO.parse(FILENAME, FILEFMT))``.
    """
    started = clock()
    # The record stays bound to a local so it is only released after the
    # second clock reading has been taken.
    first_record = next(SeqIO.parse(FILENAME, FILEFMT))
    elapsed = clock() - started
    return elapsed
def parse_lazy_make_index(index):
    """Time building an index with ``SeqIO.index_db(..., lazy=True)``.

    index: path of the index file handed to ``SeqIO.index_db``.

    Returns the difference between two ``clock()`` readings taken around the
    ``index_db`` call.
    """
    started = clock()
    # Kept bound so the returned object outlives the second clock reading.
    lazy_records = SeqIO.index_db(index, FILENAME, FILEFMT, lazy=True)
    elapsed = clock() - started
    return elapsed
def parse_indexdb_make_index(index):
    """Time building an index with ``SeqIO.index_db(..., lazy=False)``.

    index: path of the index file handed to ``SeqIO.index_db``.

    Returns the difference between two ``clock()`` readings taken around the
    ``index_db`` call.
    """
    started = clock()
    # Kept bound so the returned object outlives the second clock reading.
    indexed_records = SeqIO.index_db(index, FILENAME, FILEFMT, lazy=False)
    elapsed = clock() - started
    return elapsed
def parse_indexdb_fetch(index):
    """Time opening a non-lazy ``index_db`` and fetching its first record.

    index: path of the index file handed to ``SeqIO.index_db``.

    The timed region covers the ``index_db`` call, listing the keys, and the
    lookup of the first key.

    Returns the difference between two ``clock()`` readings taken around
    those steps.
    """
    started = clock()
    indexed_records = SeqIO.index_db(index, FILENAME, FILEFMT, lazy=False)
    all_keys = list(indexed_records.keys())
    first_record = indexed_records[all_keys[0]]
    elapsed = clock() - started
    return elapsed
def lazyfetch_from_index(index):
    """Time fetching the first record from ``SeqIO.parse(..., lazy=index)``.

    index: index file path, passed to ``SeqIO.parse`` as the ``lazy``
        argument.

    Returns the difference between two ``clock()`` readings taken around the
    fetch.
    """
    started = clock()
    # Kept bound so the record is released only after timing has finished.
    first_record = next(SeqIO.parse(FILENAME, FILEFMT, lazy=index))
    elapsed = clock() - started
    return elapsed
def lazyfetch_no_index():
    """Time fetching the first record from ``SeqIO.parse(..., lazy=True)``.

    Unlike ``lazyfetch_from_index`` no index file path is supplied; ``lazy``
    is simply ``True``.

    Returns the difference between two ``clock()`` readings taken around the
    fetch.
    """
    started = clock()
    # Kept bound so the record is released only after timing has finished.
    first_record = next(SeqIO.parse(FILENAME, FILEFMT, lazy=True))
    elapsed = clock() - started
    return elapsed
def parselazyseq(index):
    """Time reading ``.seq`` on a lazily parsed record.

    index: index file path, passed to ``SeqIO.parse`` as the ``lazy``
        argument.

    The record is fetched before the timer starts, so only the attribute
    access is measured.

    Returns the difference between two ``clock()`` readings taken around the
    ``.seq`` access.
    """
    record = next(SeqIO.parse(FILENAME, FILEFMT, lazy=index))
    started = clock()
    sequence = record.seq
    elapsed = clock() - started
    return elapsed
def parselazyfeatures(index):
    """Time reading ``.features`` on a lazily parsed record.

    index: index file path, passed to ``SeqIO.parse`` as the ``lazy``
        argument.

    The record is fetched before the timer starts, so only the attribute
    access is measured.

    Returns the difference between two ``clock()`` readings taken around the
    ``.features`` access.
    """
    record = next(SeqIO.parse(FILENAME, FILEFMT, lazy=index))
    started = clock()
    feature_list = record.features
    elapsed = clock() - started
    return elapsed
def parselazyfeatures_5ppc(index):
    """Time reading ``.features`` from a 5% slice of a lazily parsed record.

    index: index file path, passed to ``SeqIO.parse`` as the ``lazy``
        argument.

    The slice spans from three to four twentieths of the record length. The
    record fetch and the slice bounds are computed before the timer starts;
    the slicing itself and the ``.features`` access are timed.

    Returns the difference between two ``clock()`` readings taken around the
    slice and attribute access.
    """
    record = next(SeqIO.parse(FILENAME, FILEFMT, lazy=index))
    twentieth = int(len(record)/20)
    slice_start = 3*twentieth
    slice_stop = 4*twentieth
    started = clock()
    feature_list = record[slice_start:slice_stop].features
    elapsed = clock() - started
    return elapsed
def parselazyseq_5ppc(index):
    """Time reading ``.seq`` from a 5% slice of a lazily parsed record.

    index: index file path, passed to ``SeqIO.parse`` as the ``lazy``
        argument.

    The slice spans from three to four twentieths of the record length. The
    record fetch and the slice bounds are computed before the timer starts;
    the slicing itself and the ``.seq`` access are timed.

    Returns the difference between two ``clock()`` readings taken around the
    slice and attribute access.
    """
    record = next(SeqIO.parse(FILENAME, FILEFMT, lazy=index))
    twentieth = int(len(record)/20)
    slice_start = 3*twentieth
    slice_stop = 4*twentieth
    started = clock()
    sequence = record[slice_start:slice_stop].seq
    elapsed = clock() - started
    return elapsed
def make_index_file():
    """Create an empty temporary file and return its path.

    ``tempfile.mkstemp`` hands back an open OS-level descriptor together with
    the path; the descriptor is closed straight away because callers only
    need the name. The file itself is left on disk for the caller to remove.
    """
    descriptor, path = tempfile.mkstemp()
    os.close(descriptor)
    return path
def delete_index_file(fname):
    """Remove the file at path ``fname`` from disk.

    Any error raised by the underlying OS call (for example when the file
    does not exist) propagates to the caller.
    """
    # os.unlink is the same operation as os.remove under its Unix name.
    os.unlink(fname)
if __name__ == "__main__":
    # Driver: run every timing helper REPEATS times, accumulate the elapsed
    # times, and print the per-repeat averages.
    rep = REPEATS

    # Prefill the OS file buffer by reading the whole file once, so the first
    # timed run is not penalised for a cold cache. The context manager closes
    # the handle before any measurement starts.
    with open(FILENAME, "rb") as handle:
        for _line in handle:
            pass

    # Accumulated elapsed time (seconds) per measurement, summed over repeats.
    indexdbfetch = 0
    newindexing = 0
    oldindexing = 0
    recfetch = 0
    seqtime = 0
    fttime = 0
    ft5ppc = 0
    sq5ppc = 0

    for _ in range(rep):
        # Fresh index files for every repeat.
        lazy_index = make_index_file()
        old_index = make_index_file()
        try:
            # Only the temporary *name* is kept for the non-lazy index; the
            # placeholder file is removed before index_db sees the path.
            # NOTE(review): presumably non-lazy index_db needs to create the
            # file itself -- confirm.
            delete_index_file(old_index)

            # Timer functions.
            newindexing += parse_lazy_make_index(lazy_index)
            oldindexing += parse_indexdb_make_index(old_index)
            # Using parseold() because parse_indexdb_fetch fails on files
            # > 300 MB; parseold approximates old index_db fetching but is a
            # tad faster. parse_indexdb_fetch(old_index) is therefore left
            # disabled.
            indexdbfetch += parseold()
            recfetch += lazyfetch_from_index(lazy_index)
            seqtime += parselazyseq(lazy_index)
            fttime += parselazyfeatures(lazy_index)
            ft5ppc += parselazyfeatures_5ppc(lazy_index)
            sq5ppc += parselazyseq_5ppc(lazy_index)
        finally:
            # Clean up whichever index files exist, even when a timing helper
            # raised, so a failed run leaves no temporary files behind.
            for index_path in (lazy_index, old_index):
                if os.path.exists(index_path):
                    delete_index_file(index_path)

    print("Showing average time for {0} repeats (in seconds):".format(rep))
    print("Old index_db time = {0:.3f}".format(oldindexing/rep))
    print("Old index_db fetch time = {0:.3f}".format(indexdbfetch/rep))
    print("Old index_db index + full fetch = {0:.3f}"
          .format((oldindexing+indexdbfetch)/rep))
    print("Lazy record indexing time = {0:.3f}".format(newindexing/rep))
    print("Lazy record db-fetch time = {0:.3f}".format(recfetch/rep))
    print("Sequence fetch time = {0:.3f}".format(seqtime/rep))
    print("Features fetch time = {0:.3f}".format(fttime/rep))
    print("lazy indexing + full fetch = {0:.3f}"
          .format((newindexing+recfetch+seqtime+fttime)/rep))
    print("5% feature fetch time = {0:.3f}".format(ft5ppc/rep))
    print("5% sequence fetch time = {0:.3f}".format(sq5ppc/rep))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment