Skip to content

Instantly share code, notes, and snippets.

@zachcp
Last active August 29, 2015 14:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zachcp/cfbbdeecb89bdd27a000 to your computer and use it in GitHub Desktop.
Save zachcp/cfbbdeecb89bdd27a000 to your computer and use it in GitHub Desktop.
simple benchmarks for BioPython I/O with python2, python3, pypy2, pypy3
#!/usr/bin/env python
import random
letters = ['A','C','T','G']
qualchars = """!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~"""
def makeseq(length=300,letters=letters):
seq = [random.choice(letters) for x in range(length)]
return ''.join(seq)
def make_qual(length=300,qualchars=qualchars):
seq = [random.choice(qualchars) for x in range(length)]
return ''.join(seq)
def make_fasta(sequences=1000000, seqlength=300):
with open('test.fasta', 'w') as f:
for i in range(sequences):
f.write(">sequence_{}\n{}\n".format(i,makeseq()))
def make_fastq(sequences=1000000, seqlength=300):
with open('test.fastq', 'w') as f:
for i in range(sequences):
f.write("@seq_{}\n{}\n+\n{}\n".format(i,makeseq(),make_qual()))
if __name__ == "__main__":
make_fasta()
make_fastq()
#install pypy
#brew install pypy pypy3
#/usr/local/share/pypy/pip install biopython
#/usr/local/share/pypy3/pip install biopython
#biopython v1.65
#set python 3
py3 = ~/anaconda/envs/py3k/bin/python3
#make a fasta/fastq dataset
test.fasta:
# make a million fasta and fastq sequences
# could take while
python makefasta.py
tests: test.fasta
#seqtk
time seqtk seq test.fasta > out.fasta
time seqtk seq test.fastq > out.fastq
#I/O for fasta: SeqIO.parse()
time python read_write_fasta.py
time pypy read_write_fasta.py
time ${py3} read_write_fasta.py
time pypy3 read_write_fasta.py
# I/O for fastq: SeqIO.parse()
time python read_write_fastq.py
time pypy read_write_fastq.py
time ${py3} read_write_fastq.py
time pypy3 read_write_fastq.py
# I/O for fastq: readfq.py
time python read_write_fastq2.py
time pypy read_write_fastq2.py
time ${py3} read_write_fastq2.py
time pypy3 read_write_fastq2.py
# I/O for fastq: fastq_general_iterator
time python read_write_fastq3.py
time pypy read_write_fastq3.py
time ${py3} read_write_fastq3.py
time pypy3 read_write_fastq3.py
#!/usr/bin/env python
from Bio import SeqIO
fasta_records = SeqIO.parse(open("test.fasta",'r'),'fasta')
SeqIO.write(fasta_records, "out.fasta","fasta")
#!/usr/bin/env python
from Bio import SeqIO
fastq_records = SeqIO.parse(open("test.fastq",'r'),'fastq')
SeqIO.write(fastq_records, "out.fastq","fastq")
#!/usr/bin/env python
#based on lh3's readfq script https://github.com/lh3/readfq/blob/master/readfq.py
def readfq(fp): # this is a generator function
last = None # this is a buffer keeping the last unprocessed line
while True: # mimic closure; is it a bad idea?
if not last: # the first record or a record following a fastq
for l in fp: # search for the start of the next record
if l[0] in '>@': # fasta/q header line
last = l[:-1] # save this line
break
if not last: break
name, seqs, last = last[1:].partition(" ")[0], [], None
for l in fp: # read the sequence
if l[0] in '@+>':
last = l[:-1]
break
seqs.append(l[:-1])
if not last or last[0] != '+': # this is a fasta record
yield name, ''.join(seqs), None # yield a fasta record
if not last: break
else: # this is a fastq record
seq, leng, seqs = ''.join(seqs), 0, []
for l in fp: # read the quality
seqs.append(l[:-1])
leng += len(l) - 1
if leng >= len(seq): # have read enough quality
last = None
yield name, seq, ''.join(seqs); # yield a fastq record
break
if last: # reach EOF before reading enough quality
yield name, seq, None # yield a fasta record instead
break
fastq_records = readfq(open("test.fastq",'r'))
with open("out.fastq",'w') as f:
for name, seq, qual in fastq_records:
f.write("@seq_{}\n{}\n+\n{}\n".format(name,seq,qual))
#!/usr/bin/env python
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqIO.QualityIO import FastqGeneralIterator
fastq_records = FastqGeneralIterator(open("test.fastq",'r'))
with open("out.fastq",'w') as f:
for name, seq, qual in fastq_records:
f.write("@seq_{}\n{}\n+\n{}\n".format(name,seq,qual))
make tests
#seqtk
time seqtk seq test.fasta > out.fasta
real 0m5.012s
user 0m0.617s
sys 0m0.509s
time seqtk seq test.fastq > out.fastq
real 0m7.737s
user 0m1.078s
sys 0m0.967s
#I/O for fasta: SeqIO.parse()
time python read_write_fasta.py
28.01 real 26.58 user 0.85 sys
time pypy read_write_fasta.py
6.52 real 5.88 user 0.56 sys
time ~/anaconda/envs/py3k/bin/python3 read_write_fasta.py
real 0m27.944s
user 0m26.993s
sys 0m0.590s
time pypy3 read_write_fasta.py
12.44 real 11.57 user 0.72 sys
# I/O for fastq: SeqIO.parse()
time python read_write_fastq.py
105.75 real 103.37 user 1.77 sys
time pypy read_write_fastq.py
30.10 real 28.78 user 1.12 sys
time ~/anaconda/envs/py3k/bin/python3 read_write_fastq.py
real 1m43.242s
user 1m41.732s
sys 0m1.182s
time pypy3 read_write_fastq.py
43.74 real 40.68 user 1.42 sys
# I/O for fastq: readfq.py
time python read_write_fastq2.py
11.32 real 5.22 user 1.22 sys
time pypy read_write_fastq2.py
9.89 real 2.98 user 0.90 sys
time ~/anaconda/envs/py3k/bin/python3 read_write_fastq2.py
real 0m11.068s
user 0m7.858s
sys 0m0.963s
time pypy3 read_write_fastq2.py
16.50 real 14.49 user 1.30 sys
# I/O for fastq: fastq_general_iterator
time python read_write_fastq3.py
11.93 real 10.11 user 1.34 sys
time pypy read_write_fastq3.py
8.59 real 3.32 user 0.91 sys
time ~/anaconda/envs/py3k/bin/python3 read_write_fastq3.py
real 0m8.000s
user 0m6.709s
sys 0m0.969s
time pypy3 read_write_fastq3.py
16.89 real 15.50 user 1.28 sys
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment