Skip to content

Instantly share code, notes, and snippets.

@shenwei356
Last active September 17, 2017 03:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shenwei356/70d016694e5ebffe833289ccaad0bdce to your computer and use it in GitHub Desktop.
Save shenwei356/70d016694e5ebffe833289ccaad0bdce to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
# Benchmark: converting a gzipped FASTQ file to tabular form with
#   - gzip | paste | cut
#   - pigz | paste | cut
#   - seqkit fx2tab
#   - seqtk_tab
#
# Usage: bash <this script> [reads.fq.gz]    (default input: test.fq.gz)
#
# Bash is required: `time ( ... )` relies on the `time` reserved word, which
# is not available in every /bin/sh (e.g. dash).
#
# `set -e` is intentionally not used: when one tool is missing or fails, the
# remaining benchmarks should still run.

# Test data
#
# Retrieve 1M reads from any Illumina reads
#
# seqkit head -n 1000000 xxxx_1.fq.gz -o test.fq.gz
#
# Or
#
# wget http://app.shenwei.me/data/test.fq.gz
#
read=${1:-test.fq.gz}

# Download seqkit
#
# wget http://app.shenwei.me/data/seqkit/seqkit_linux_amd64.tar.gz
# tar -zxvf seqkit_linux_amd64.tar.gz
#
seqkit=./seqkit

# seqtk_tab, which utilizes klib like seqtk
#
# wget 'https://github.com/jameslz/seqtk_utils/blob/master/seqtk_tab?raw=true' -O seqtk_tab
# chmod a+x seqtk_tab
#
seqtk_tab=./seqtk_tab

# Statistics of the test data. This header line shows the expanded command;
# the headers below print the command text literally.
printf '$ time %s stats %s\n' "$seqkit" "$read"
time "$seqkit" stats "$read"

# Baseline: decompress with gzip, join every 4 FASTQ lines into one row,
# keep columns 1, 2 and 4.
echo
echo '$ time (gzip -c -d $read | paste - - - - | cut -f 1,2,4 > /dev/null)'
time (gzip -c -d "$read" | paste - - - - | cut -f 1,2,4 > /dev/null)

# Same pipeline with pigz doing the decompression.
echo
echo '$ time (pigz -c -d $read | paste - - - - | cut -f 1,2,4 > /dev/null)'
time (pigz -c -d "$read" | paste - - - - | cut -f 1,2,4 > /dev/null)

# seqkit fx2tab
echo
echo '$ time $seqkit fx2tab $read > /dev/null'
time "$seqkit" fx2tab "$read" > /dev/null

# seqtk_tab
echo
echo '$ time $seqtk_tab $read > /dev/null'
time "$seqtk_tab" "$read" > /dev/null
# Ubuntu 14.04
$ time ./seqkit stats test.fq.gz
file format type num_seqs sum_len min_len avg_len max_len
test.fq.gz FASTQ DNA 1,000,000 125,959,012 114 126 126
real 0m3.651s
user 0m1.031s
sys 0m0.594s
$ time (gzip -c -d $read | paste - - - - | cut -f 1,2,4 > /dev/null)
real 0m3.173s
user 0m5.188s
sys 0m2.297s
$ time (pigz -c -d $read | paste - - - - | cut -f 1,2,4 > /dev/null)
real 0m2.717s
user 0m4.406s
sys 0m2.781s
$ time $seqkit fx2tab $read > /dev/null
real 0m4.029s
user 0m1.406s
sys 0m0.469s
$ time $seqtk_tab $read > /dev/null
real 0m2.348s
user 0m2.125s
sys 0m0.219s
$ time ./seqkit stats test.fq.gz
file format type num_seqs sum_len min_len avg_len max_len
test.fq.gz FASTQ DNA 1,000,000 177,535,684 100 177.5 230
real 0m2.008s
user 0m1.054s
sys 0m0.283s
$ time (gzip -c -d $read | paste - - - - | cut -f 1,2,4 > /dev/null)
real 0m3.504s
user 0m6.826s
sys 0m0.527s
$ time (pigz -c -d $read | paste - - - - | cut -f 1,2,4 > /dev/null)
real 0m2.463s
user 0m5.963s
sys 0m0.627s
$ time $seqkit fx2tab $read > /dev/null
real 0m2.333s
user 0m1.504s
sys 0m0.308s
$ time $seqtk_tab $read > /dev/null
real 0m2.610s
user 0m2.540s
sys 0m0.058s
# CentOS 6.8 HP DL580G7 Server
$ time ./seqkit stats test.fq.gz
file format type num_seqs sum_len min_len avg_len max_len
test.fq.gz FASTQ DNA 1,000,000 125,959,012 114 126 126
real 0m2.200s
user 0m1.689s
sys 0m0.322s
$ time (gzip -c -d $read | paste - - - - | cut -f 1,2,4 > /dev/null)
real 0m16.919s
user 0m22.515s
sys 0m0.976s
$ time (pigz -c -d $read | paste - - - - | cut -f 1,2,4 > /dev/null)
real 0m17.306s
user 0m26.074s
sys 0m1.368s
$ time $seqkit fx2tab $read > /dev/null
real 0m2.590s
user 0m2.112s
sys 0m0.287s
$ time $seqtk_tab $read > /dev/null
./seqtk_tab: /lib64/libc.so.6: version `GLIBC_2.14' not found (required by ./seqtk_tab)
real 0m0.001s
user 0m0.000s
sys 0m0.000s
$ time ./seqkit stats test.fq.gz
file format type num_seqs sum_len min_len avg_len max_len
test.fq.gz FASTQ DNA 1,000,000 177,535,684 100 177.5 230
real 0m1.902s
user 0m1.032s
sys 0m0.258s
$ time (gzip -c -d $read | paste - - - - | cut -f 1,2,4 > /dev/null)
real 0m3.283s
user 0m6.412s
sys 0m0.390s
$ time (pigz -c -d $read | paste - - - - | cut -f 1,2,4 > /dev/null)
real 0m2.048s
user 0m5.677s
sys 0m0.780s
$ time $seqkit fx2tab $read > /dev/null
real 0m1.949s
user 0m1.561s
sys 0m0.306s
$ time $seqtk_tab $read > /dev/null
real 0m2.874s
user 0m2.841s
sys 0m0.028s
$ time ./seqkit stats test.fq.gz
file format type num_seqs sum_len min_len avg_len max_len
test.fq.gz FASTQ DNA 1,000,000 150,000,000 150 150 150
real 0m4.556s
user 0m0.855s
sys 0m0.197s
$ time (gzip -c -d $read | paste - - - - | cut -f 1,2,4 > /dev/null)
real 0m5.049s
user 0m6.354s
sys 0m0.718s
$ time (pigz -c -d $read | paste - - - - | cut -f 1,2,4 > /dev/null)
real 0m3.475s
user 0m5.338s
sys 0m1.632s
$ time $seqkit fx2tab $read > /dev/null
real 0m4.917s
user 0m1.294s
sys 0m0.207s
$ time $seqtk_tab $read > /dev/null
real 0m2.708s
user 0m2.666s
sys 0m0.040s
# DELL R730 server with HDD.
# From https://github.com/shenwei356/seqkit/issues/25#issuecomment-329976934
$ seqkit stats EAOA2.fq.gz
file format type num_seqs sum_len min_len avg_len max_len
EAOA2.fq.gz FASTQ DNA 16,017,817 2,402,672,550 150 150 150
real 0m0.000s # placeholder values, not measured: the reporter did not provide timings for this command.
user 0m0.000s
sys 0m0.000s
$ time (gzip -c -d EAOA2.fq.gz | paste - - - - | cut -f 1,2,4 > /dev/null)
real 0m38.195s
user 1m7.577s
sys 0m5.141s
$ time (pigz -c -d EAOA2.fq.gz | paste - - - - | cut -f 1,2,4 > /dev/null)
real 0m20.407s
user 1m2.547s
sys 0m8.182s
# with pigz
$ time seqkit fx2tab EAOA2.fq.gz > /dev/null
real 0m21.934s
user 0m17.370s
sys 0m3.735s
$ time seqtk_tab EAOA2.fq.gz >/dev/null
real 0m29.091s
user 0m28.610s
sys 0m0.453s
$ time ./seqkit stats test.fq.gz
file format type num_seqs sum_len min_len avg_len max_len
test.fq.gz FASTQ DNA 1,000,000 177,535,684 100 177.5 230
real 0m4.740s
user 0m1.088s
sys 0m0.467s
$ time (gzip -c -d $read | paste - - - - | cut -f 1,2,4 > /dev/null)
real 0m7.929s
user 0m7.409s
sys 0m0.446s
$ time (pigz -c -d $read | paste - - - - | cut -f 1,2,4 > /dev/null)
real 0m6.299s
user 0m5.858s
sys 0m0.389s
$ time $seqkit fx2tab $read > /dev/null
real 0m4.840s
user 0m1.386s
sys 0m0.401s
$ time $seqtk_tab $read > /dev/null
real 0m3.776s
user 0m3.657s
sys 0m0.071s
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment