Last active
August 29, 2015 14:04
-
-
Save eparker05/8825aa93c987e6730a9c to your computer and use it in GitHub Desktop.
A short script to compare Biopython SeqIO performance with and without lazy loading
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""SeqIO_lazy_performance_test.py

This file runs a quick performance test that compares parsing with SeqIO
using the lazy loading indexing functions against parsing with the
traditional SeqIO parsing functions.

Usage:
    SeqIO_lazy_performance_test.py <sequence file> <format> [-r N]

    sequence file: The file you will use for performance testing
    format:        The format of that file
    -r N:          The number of repeats to average (defaults to 3;
                   more than 10 is rejected)
"""
from timeit import timeit | |
from time import clock | |
import sys | |
import tempfile | |
import os | |
import argparse | |
from Bio import SeqIO | |
try: | |
from Bio.SeqIO import _lazy | |
except ImportError: | |
raise ImportError("This performance test requires Bio.SeqIO._lazy") | |
# Parse the command line into the module-level settings FILENAME, FILEFMT
# and REPEATS that the timing functions below read.
parser = argparse.ArgumentParser(
    description='Run a performance test on the lazy loading parser.')
parser.add_argument('filename', type=str,
                    help='name and path of a sequence file')
parser.add_argument('fileformat', type=str,
                    help='format of the sequence file')
# nargs=1 makes argparse store a one-element list, hence the list default
# and the [0] lookup below.
parser.add_argument('-r', dest='repeats', metavar="N", type=int, nargs=1,
                    default=[3],
                    help="the number of repeats averaged for the performance test")
args = parser.parse_args()
FILENAME = args.filename
FILEFMT = args.fileformat
REPEATS = args.repeats[0]
if REPEATS > 10:
    # sys.exit with a message writes it to stderr and exits with status 1,
    # so a rejected invocation is visible to calling scripts.
    sys.exit("Please use 10 or fewer repeats")
if REPEATS < 1:
    # The averages are computed by dividing by REPEATS, so zero (or a
    # negative count) cannot produce a meaningful result.
    sys.exit("Please use at least 1 repeat")
def parseold():
    """Time the traditional parser fetching the first record of FILENAME.

    Calls ``SeqIO.parse(FILENAME, FILEFMT)`` without the ``lazy`` argument
    and pulls a single record from it (parsing all record components into
    memory).

    Returns:
        The difference between the ``clock()`` readings taken immediately
        before and immediately after the fetch.
    """
    started = clock()
    # The record is bound to a name but not used further; only the elapsed
    # time is reported.
    first_record = next(SeqIO.parse(FILENAME, FILEFMT))
    elapsed = clock() - started
    return elapsed
def parse_lazy_make_index(index):
    """Time ``SeqIO.index_db`` on FILENAME with ``lazy=True``.

    Args:
        index: Passed to ``SeqIO.index_db`` as its first argument; the
            driver supplies the path of a temporary index file.

    Returns:
        The difference between the ``clock()`` readings taken immediately
        before and immediately after the ``index_db`` call.
    """
    started = clock()
    # The returned mapping is kept in a local but never queried here.
    lazy_records = SeqIO.index_db(index, FILENAME, FILEFMT, lazy=True)
    elapsed = clock() - started
    return elapsed
def parse_indexdb_make_index(index):
    """Time ``SeqIO.index_db`` on FILENAME with ``lazy=False``.

    Args:
        index: Passed to ``SeqIO.index_db`` as its first argument; the
            driver supplies the path of a temporary index file.

    Returns:
        The difference between the ``clock()`` readings taken immediately
        before and immediately after the ``index_db`` call.
    """
    started = clock()
    # The returned mapping is kept in a local but never queried here.
    indexed_records = SeqIO.index_db(index, FILENAME, FILEFMT, lazy=False)
    elapsed = clock() - started
    return elapsed
def parse_indexdb_fetch(index):
    """Time opening a non-lazy ``index_db`` index and fetching one record.

    Args:
        index: Passed to ``SeqIO.index_db`` as its first argument; expected
            to be the path of an index file.

    Returns:
        The difference between the ``clock()`` readings taken before the
        ``index_db`` call and after the first record has been looked up.

    Raises:
        StopIteration: If the index contains no keys.
    """
    timei = clock()
    record_dict = SeqIO.index_db(index, FILENAME, FILEFMT, lazy=False)
    # Take the first key by iteration rather than building a list of every
    # key in the index just to read element 0.
    key = next(iter(record_dict))
    # The fetched record is bound but not used; only the timing matters.
    record = record_dict[key]
    return clock() - timei
def lazyfetch_from_index(index):
    """Time fetching the first record via ``SeqIO.parse(..., lazy=index)``.

    Args:
        index: Passed to ``SeqIO.parse`` as the ``lazy`` argument; the
            driver supplies the path of the lazy index file.

    Returns:
        The difference between the ``clock()`` readings taken immediately
        before and immediately after the fetch.
    """
    started = clock()
    # Bound but not inspected; only the elapsed time is reported.
    first_record = next(SeqIO.parse(FILENAME, FILEFMT, lazy=index))
    elapsed = clock() - started
    return elapsed
def lazyfetch_no_index():
    """Time fetching the first record via ``SeqIO.parse(..., lazy=True)``.

    Unlike ``lazyfetch_from_index`` no index path is supplied; ``lazy`` is
    simply ``True``.

    Returns:
        The difference between the ``clock()`` readings taken immediately
        before and immediately after the fetch.
    """
    started = clock()
    # Bound but not inspected; only the elapsed time is reported.
    first_record = next(SeqIO.parse(FILENAME, FILEFMT, lazy=True))
    elapsed = clock() - started
    return elapsed
def parselazyseq(index):
    """Time reading ``.seq`` from the first lazily parsed record.

    The record itself is fetched before the timer starts, so only the
    attribute access is measured.

    Args:
        index: Passed to ``SeqIO.parse`` as the ``lazy`` argument.

    Returns:
        The difference between the ``clock()`` readings taken immediately
        before and immediately after the ``.seq`` access.
    """
    lazy_record = next(SeqIO.parse(FILENAME, FILEFMT, lazy=index))
    started = clock()
    sequence = lazy_record.seq
    elapsed = clock() - started
    return elapsed
def parselazyfeatures(index):
    """Time reading ``.features`` from the first lazily parsed record.

    The record itself is fetched before the timer starts, so only the
    attribute access is measured.

    Args:
        index: Passed to ``SeqIO.parse`` as the ``lazy`` argument.

    Returns:
        The difference between the ``clock()`` readings taken immediately
        before and immediately after the ``.features`` access.
    """
    lazy_record = next(SeqIO.parse(FILENAME, FILEFMT, lazy=index))
    started = clock()
    feature_list = lazy_record.features
    elapsed = clock() - started
    return elapsed
def parselazyfeatures_5ppc(index):
    """Time reading ``.features`` from a 5% slice of the first lazy record.

    The slice runs from 3/20 to 4/20 of the record length. Fetching the
    record and computing the slice bounds happen before the timer starts;
    the slicing and the ``.features`` access are what is measured.

    Args:
        index: Passed to ``SeqIO.parse`` as the ``lazy`` argument.

    Returns:
        The difference between the ``clock()`` readings taken immediately
        before and immediately after the sliced ``.features`` access.
    """
    lazy_record = next(SeqIO.parse(FILENAME, FILEFMT, lazy=index))
    twentieth = len(lazy_record) // 20
    lower = 3 * twentieth
    upper = 4 * twentieth
    started = clock()
    feature_list = lazy_record[lower:upper].features
    elapsed = clock() - started
    return elapsed
def parselazyseq_5ppc(index):
    """Time reading ``.seq`` from a 5% slice of the first lazy record.

    The slice runs from 3/20 to 4/20 of the record length. Fetching the
    record and computing the slice bounds happen before the timer starts;
    the slicing and the ``.seq`` access are what is measured.

    Args:
        index: Passed to ``SeqIO.parse`` as the ``lazy`` argument.

    Returns:
        The difference between the ``clock()`` readings taken immediately
        before and immediately after the sliced ``.seq`` access.
    """
    lazy_record = next(SeqIO.parse(FILENAME, FILEFMT, lazy=index))
    twentieth = len(lazy_record) // 20
    lower = 3 * twentieth
    upper = 4 * twentieth
    started = clock()
    sequence = lazy_record[lower:upper].seq
    elapsed = clock() - started
    return elapsed
def make_index_file():
    """Create an empty temporary file and return its path.

    The file is created with ``tempfile.mkstemp()`` and its OS-level
    descriptor is closed straight away; removing the file afterwards is
    left to the caller.

    Returns:
        The path of the newly created temporary file.
    """
    descriptor, path = tempfile.mkstemp()
    # Only the path is handed back, so the open descriptor is released here.
    os.close(descriptor)
    return path
def delete_index_file(fname):
    """Remove the file at ``fname`` from the filesystem.

    Args:
        fname: Path of the file to delete.

    Raises:
        OSError: Propagated from the operating system if the file cannot
            be removed (for example because it does not exist).
    """
    # os.unlink is the same operation as os.remove under its other name.
    os.unlink(fname)
def main():
    """Run every timing function REPEATS times and print the averages.

    Each repeat creates fresh temporary index files, accumulates the time
    reported by each timing function, and removes the index files again
    (also when a timing step raises). After the last repeat the accumulated
    times are divided by the repeat count and printed in seconds.
    """
    rep = REPEATS

    # Prefill the OS file buffer by reading the whole file once, so the
    # first timed step is not charged for the initial disk read.
    with open(FILENAME, "rb") as handle:
        for _ in handle:
            pass

    # Accumulated time for each measurement across all repeats.
    indexdbfetch = 0
    newindexing = 0
    oldindexing = 0
    recfetch = 0
    seqtime = 0
    fttime = 0
    ft5ppc = 0
    sq5ppc = 0

    for _ in range(rep):
        # Make index files. The lazy index is handed over as an existing
        # empty file, while the old-style index path is deleted again before
        # use (presumably so index_db builds it from scratch -- TODO confirm).
        lazy_index = make_index_file()
        old_index = make_index_file()
        delete_index_file(old_index)
        try:
            newindexing += parse_lazy_make_index(lazy_index)
            oldindexing += parse_indexdb_make_index(old_index)
            # parseold() is used because parse_indexdb_fetch() fails on
            # files > 300 MB; it approximates old index_db fetching but is
            # slightly faster.
            indexdbfetch += parseold()
            recfetch += lazyfetch_from_index(lazy_index)
            seqtime += parselazyseq(lazy_index)
            fttime += parselazyfeatures(lazy_index)
            ft5ppc += parselazyfeatures_5ppc(lazy_index)
            sq5ppc += parselazyseq_5ppc(lazy_index)
        finally:
            # Always remove the temporary index files; skip any that were
            # never (re)created so cleanup does not mask the real error.
            for index_path in (lazy_index, old_index):
                if os.path.exists(index_path):
                    delete_index_file(index_path)

    print("Showing average time for {0} repeats (in seconds):".format(rep))
    print("Old index_db time = {0:.3f}".format(oldindexing/rep))
    print("Old index_db fetch time = {0:.3f}".format(indexdbfetch/rep))
    print("Old index_db index + full fetch = {0:.3f}"
          .format((oldindexing+indexdbfetch)/rep))
    print("Lazy record indexing time = {0:.3f}".format(newindexing/rep))
    print("Lazy record db-fetch time = {0:.3f}".format(recfetch/rep))
    print("Sequence fetch time = {0:.3f}".format(seqtime/rep))
    print("Features fetch time = {0:.3f}".format(fttime/rep))
    print("lazy indexing + full fetch = {0:.3f}"
          .format((newindexing+recfetch+seqtime+fttime)/rep))
    print("5% feature fetch time = {0:.3f}".format(ft5ppc/rep))
    print("5% sequence fetch time = {0:.3f}".format(sq5ppc/rep))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment