Last active
August 29, 2015 14:17
-
-
Save zachcp/cfbbdeecb89bdd27a000 to your computer and use it in GitHub Desktop.
simple benchmarks for BioPython I/O with python2, python3, pypy2, pypy3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import random | |
letters = ['A','C','T','G'] | |
qualchars = """!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~""" | |
def makeseq(length=300,letters=letters): | |
seq = [random.choice(letters) for x in range(length)] | |
return ''.join(seq) | |
def make_qual(length=300,qualchars=qualchars): | |
seq = [random.choice(qualchars) for x in range(length)] | |
return ''.join(seq) | |
def make_fasta(sequences=1000000, seqlength=300): | |
with open('test.fasta', 'w') as f: | |
for i in range(sequences): | |
f.write(">sequence_{}\n{}\n".format(i,makeseq())) | |
def make_fastq(sequences=1000000, seqlength=300): | |
with open('test.fastq', 'w') as f: | |
for i in range(sequences): | |
f.write("@seq_{}\n{}\n+\n{}\n".format(i,makeseq(),make_qual())) | |
if __name__ == "__main__": | |
make_fasta() | |
make_fastq() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#install pypy | |
#brew install pypy pypy3 | |
#/usr/local/share/pypy/pip install biopython | |
#/usr/local/share/pypy3/pip install biopython | |
#biopython v1.65 | |
#set python 3 | |
py3 = ~/anaconda/envs/py3k/bin/python3 | |
#make a fasta/fastq dataset | |
test.fasta: | |
# make a million fasta and fastq sequences | |
# could take while | |
python makefasta.py | |
tests: test.fasta | |
#seqtk | |
time seqtk seq test.fasta > out.fasta | |
time seqtk seq test.fastq > out.fastq | |
#I/O for fasta: SeqIO.parse() | |
time python read_write_fasta.py | |
time pypy read_write_fasta.py | |
time ${py3} read_write_fasta.py | |
time pypy3 read_write_fasta.py | |
# I/O for fastq: SeqIO.parse() | |
time python read_write_fastq.py | |
time pypy read_write_fastq.py | |
time ${py3} read_write_fastq.py | |
time pypy3 read_write_fastq.py | |
# I/O for fastq: readfq.py | |
time python read_write_fastq2.py | |
time pypy read_write_fastq2.py | |
time ${py3} read_write_fastq2.py | |
time pypy3 read_write_fastq2.py | |
# I/O for fastq: fastq_general_iterator | |
time python read_write_fastq3.py | |
time pypy read_write_fastq3.py | |
time ${py3} read_write_fastq3.py | |
time pypy3 read_write_fastq3.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from Bio import SeqIO | |
fasta_records = SeqIO.parse(open("test.fasta",'r'),'fasta') | |
SeqIO.write(fasta_records, "out.fasta","fasta") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from Bio import SeqIO | |
fastq_records = SeqIO.parse(open("test.fastq",'r'),'fastq') | |
SeqIO.write(fastq_records, "out.fastq","fastq") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
#based on lh3's readfq script https://github.com/lh3/readfq/blob/master/readfq.py | |
def readfq(fp): # this is a generator function | |
last = None # this is a buffer keeping the last unprocessed line | |
while True: # mimic closure; is it a bad idea? | |
if not last: # the first record or a record following a fastq | |
for l in fp: # search for the start of the next record | |
if l[0] in '>@': # fasta/q header line | |
last = l[:-1] # save this line | |
break | |
if not last: break | |
name, seqs, last = last[1:].partition(" ")[0], [], None | |
for l in fp: # read the sequence | |
if l[0] in '@+>': | |
last = l[:-1] | |
break | |
seqs.append(l[:-1]) | |
if not last or last[0] != '+': # this is a fasta record | |
yield name, ''.join(seqs), None # yield a fasta record | |
if not last: break | |
else: # this is a fastq record | |
seq, leng, seqs = ''.join(seqs), 0, [] | |
for l in fp: # read the quality | |
seqs.append(l[:-1]) | |
leng += len(l) - 1 | |
if leng >= len(seq): # have read enough quality | |
last = None | |
yield name, seq, ''.join(seqs); # yield a fastq record | |
break | |
if last: # reach EOF before reading enough quality | |
yield name, seq, None # yield a fasta record instead | |
break | |
fastq_records = readfq(open("test.fastq",'r')) | |
with open("out.fastq",'w') as f: | |
for name, seq, qual in fastq_records: | |
f.write("@seq_{}\n{}\n+\n{}\n".format(name,seq,qual)) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from Bio import SeqIO | |
from Bio.Seq import Seq | |
from Bio.SeqIO.QualityIO import FastqGeneralIterator | |
fastq_records = FastqGeneralIterator(open("test.fastq",'r')) | |
with open("out.fastq",'w') as f: | |
for name, seq, qual in fastq_records: | |
f.write("@seq_{}\n{}\n+\n{}\n".format(name,seq,qual)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
make tests | |
#seqtk | |
time seqtk seq test.fasta > out.fasta | |
real 0m5.012s | |
user 0m0.617s | |
sys 0m0.509s | |
time seqtk seq test.fastq > out.fastq | |
real 0m7.737s | |
user 0m1.078s | |
sys 0m0.967s | |
#I/O for fasta: SeqIO.parse() | |
time python read_write_fasta.py | |
28.01 real 26.58 user 0.85 sys | |
time pypy read_write_fasta.py | |
6.52 real 5.88 user 0.56 sys | |
time ~/anaconda/envs/py3k/bin/python3 read_write_fasta.py | |
real 0m27.944s | |
user 0m26.993s | |
sys 0m0.590s | |
time pypy3 read_write_fasta.py | |
12.44 real 11.57 user 0.72 sys | |
# I/O for fastq: SeqIO.parse() | |
time python read_write_fastq.py | |
105.75 real 103.37 user 1.77 sys | |
time pypy read_write_fastq.py | |
30.10 real 28.78 user 1.12 sys | |
time ~/anaconda/envs/py3k/bin/python3 read_write_fastq.py | |
real 1m43.242s | |
user 1m41.732s | |
sys 0m1.182s | |
time pypy3 read_write_fastq.py | |
43.74 real 40.68 user 1.42 sys | |
# I/O for fastq: readfq.py | |
time python read_write_fastq2.py | |
11.32 real 5.22 user 1.22 sys | |
time pypy read_write_fastq2.py | |
9.89 real 2.98 user 0.90 sys | |
time ~/anaconda/envs/py3k/bin/python3 read_write_fastq2.py | |
real 0m11.068s | |
user 0m7.858s | |
sys 0m0.963s | |
time pypy3 read_write_fastq2.py | |
16.50 real 14.49 user 1.30 sys | |
# I/O for fastq: fastq_general_iterator | |
time python read_write_fastq3.py | |
11.93 real 10.11 user 1.34 sys | |
time pypy read_write_fastq3.py | |
8.59 real 3.32 user 0.91 sys | |
time ~/anaconda/envs/py3k/bin/python3 read_write_fastq3.py | |
real 0m8.000s | |
user 0m6.709s | |
sys 0m0.969s | |
time pypy3 read_write_fastq3.py | |
16.89 real 15.50 user 1.28 sys | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment