martijnvermaat/.gitignore

## .gitignore
*.vcf.gz

## README.md

      
    Raw
  

              README.md
            
          
    Benchmark: CyVCF versus PyVCF

A quick and dirty benchmark of parsing performance on a 1KG complete genotypes
file.
To repeat, have virtualenv installed and run the
run.sh script. Plot the results with %run plot.ipy from IPython.
The following libraries are benchmarked:

CyVCF
PyVCF, with/without Cython
PyVCF with lazy computation of call
stats (proof of concept),
with/without Cython

Three modes are benchmarked:

Only parsing
Parsing and getting some call stats (num_hom_ref, num_het, num_hom_alt)
Parsing and getting all call stats (+ num_called, call_rate, num_unknown)

See the results.csv file for results.

  
## parse.py
#!/usr/bin/env python
import sys


USAGE = """usage:
  %s <library> <stats> <file>
where
  <library> can be 'pyvcf', 'pyvcf_lazy' or 'cyvcf'
  <stats> can be 'no', 'some' or 'all'
  <file> is a VCF file""" % sys.argv[0]

def error(message):
    sys.stderr.write(message + '\n')
    sys.exit(1)


if len(sys.argv) != 4:
    error(USAGE)

if sys.argv[1] in ('pyvcf', 'pyvcf_lazy'):
    from vcf import Reader
elif sys.argv[1] == 'cyvcf':
    from cyvcf import Reader
else:
    error(USAGE)

if sys.argv[2] not in ('no', 'some', 'all'):
    error(USAGE)


for record in Reader(open(sys.argv[3])):
    if sys.argv[2] == 'some':
        _ = (record.num_hom_ref,
             record.num_het,
             record.num_hom_alt)
    elif sys.argv[2] == 'all':
        _ = (record.num_called,
             record.call_rate,
             record.num_hom_ref,
             record.num_het,
             record.num_hom_alt,
             record.num_unknown)

## plot.ipy
# Run in IPython with %run plot.ipy
%pylab
results = open('results.csv')
headers = next(results)[1:-1].split('\t')
data = [dict(zip(headers, line.rstrip().split('\t'))) for line in results]
barh(range(0, len(data), 3), [float(d['time'].split()[3]) for d in data if d['callstats'] == 'no'], align='center', label='no callstats', color='#eeeeee')
barh(range(1, len(data), 3), [float(d['time'].split()[3]) for d in data if d['callstats'] == 'some'], align='center', label='some callstats', color='#eeaaaa')
barh(range(2, len(data), 3), [float(d['time'].split()[3]) for d in data if d['callstats'] == 'all'], align='center', label='all callstats', color='#ee6666')
yticks(range(1, len(data), 3), ['{library} (cython {cython})'.format(**d) for d in data])
legend()
grid(True, axis='x')
tight_layout()

## results.csv
#library	cython	callstats	time
pyvcf	no	no	real 161.84 user 161.43 sys 0.25
pyvcf	yes	no	real 136.70 user 136.35 sys 0.22
pyvcf_lazy	no	no	real 165.07 user 164.43 sys 0.44
pyvcf_lazy	yes	no	real 140.08 user 139.57 sys 0.36
cyvcf	yes	no	real 251.10 user 250.38 sys 0.48
pyvcf	no	some	real 298.38 user 297.68 sys 0.45
pyvcf	yes	some	real 270.32 user 269.72 sys 0.34
pyvcf_lazy	no	some	real 221.27 user 220.82 sys 0.24
pyvcf_lazy	yes	some	real 192.47 user 191.91 sys 0.36
cyvcf	yes	some	real 250.08 user 249.57 sys 0.27
pyvcf	no	all	real 352.17 user 351.51 sys 0.40
pyvcf	yes	all	real 321.58 user 320.57 sys 0.66
pyvcf_lazy	no	all	real 219.99 user 219.62 sys 0.15
pyvcf_lazy	yes	all	real 190.60 user 190.34 sys 0.11
cyvcf	yes	all	real 251.68 user 251.25 sys 0.21

## results.png

      
    Raw
  

              results.png
            
          
## run.sh
#!/bin/bash

set -e

TEST_VCF="ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.chr22.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz"
TEST_SIZE=20000
RESULTS="results.csv"

if [ ! -f test.vcf.gz ]; then
    wget $TEST_VCF -O - | zcat | head -$TEST_SIZE | gzip > test.vcf.gz
fi

function time_parse_vcf {
    LIBRARY="$1"
    CYTHON="$2"
    STATS="$3"
    if [ $LIBRARY = "pyvcf" ]; then
        REPO="https://github.com/jamescasbon/PyVCF.git"
        BRANCH="master"
    elif [ $LIBRARY = "pyvcf_lazy" ]; then
        REPO="https://github.com/martijnvermaat/PyVCF.git"
        BRANCH="lazy-call-stats"
    elif [ $LIBRARY = "cyvcf" ]; then
        REPO="https://github.com/arq5x/cyvcf.git"
        BRANCH="master"
    fi
    CLONE=$(mktemp -d)
    virtualenv environment
    . environment/bin/activate
    if [ $CYTHON = "yes" ]; then
        pip install cython
    fi
    git clone $REPO $CLONE
    pushd $CLONE
    git checkout $BRANCH
    python setup.py install
    popd
    TIME=$(/usr/bin/time -f "real %e user %U sys %S" ./parse.py $LIBRARY $STATS test.vcf.gz 2>&1)
    deactivate
    rm -Rf environment
    echo -e "$LIBRARY\t$CYTHON\t$STATS\t$TIME" >> $RESULTS
}

echo -e "#library\tcython\tcallstats\ttime" > $RESULTS

# Without call stats
time_parse_vcf "pyvcf" "no" "no"
time_parse_vcf "pyvcf" "yes" "no"
time_parse_vcf "pyvcf_lazy" "no" "no"
time_parse_vcf "pyvcf_lazy" "yes" "no"
time_parse_vcf "cyvcf" "yes" "no"

# With some call stats
time_parse_vcf "pyvcf" "no" "some"
time_parse_vcf "pyvcf" "yes" "some"
time_parse_vcf "pyvcf_lazy" "no" "some"
time_parse_vcf "pyvcf_lazy" "yes" "some"
time_parse_vcf "cyvcf" "yes" "some"

# With all call stats
time_parse_vcf "pyvcf" "no" "all"
time_parse_vcf "pyvcf" "yes" "all"
time_parse_vcf "pyvcf_lazy" "no" "all"
time_parse_vcf "pyvcf_lazy" "yes" "all"
time_parse_vcf "cyvcf" "yes" "all"
	#!/usr/bin/env python
	import sys


	USAGE = """usage:
	%s <library> <stats> <file>
	where
	<library> can be 'pyvcf', 'pyvcf_lazy' or 'cyvcf'
	<stats> can be 'no', 'some' or 'all'
	<file> is a VCF file""" % sys.argv[0]

	def error(message):
	sys.stderr.write(message + '\n')
	sys.exit(1)



	if len(sys.argv) != 4:
	error(USAGE)

	if sys.argv[1] in ('pyvcf', 'pyvcf_lazy'):
	from vcf import Reader
	elif sys.argv[1] == 'cyvcf':
	from cyvcf import Reader
	else:
	error(USAGE)

	if sys.argv[2] not in ('no', 'some', 'all'):
	error(USAGE)


	for record in Reader(open(sys.argv[3])):
	if sys.argv[2] == 'some':
	_ = (record.num_hom_ref,
	record.num_het,
	record.num_hom_alt)
	elif sys.argv[2] == 'all':
	_ = (record.num_called,
	record.call_rate,
	record.num_hom_ref,
	record.num_het,
	record.num_hom_alt,
	record.num_unknown)
	# Run in IPython with %run plot.ipy
	%pylab
	results = open('results.csv')
	headers = next(results)[1:-1].split('\t')
	data = [dict(zip(headers, line.rstrip().split('\t'))) for line in results]
	barh(range(0, len(data), 3), [float(d['time'].split()[3]) for d in data if d['callstats'] == 'no'], align='center', label='no callstats', color='#eeeeee')
	barh(range(1, len(data), 3), [float(d['time'].split()[3]) for d in data if d['callstats'] == 'some'], align='center', label='some callstats', color='#eeaaaa')
	barh(range(2, len(data), 3), [float(d['time'].split()[3]) for d in data if d['callstats'] == 'all'], align='center', label='all callstats', color='#ee6666')
	yticks(range(1, len(data), 3), ['{library} (cython {cython})'.format(**d) for d in data])
	legend()
	grid(True, axis='x')
	tight_layout()
	#library cython callstats time
	pyvcf no no real 161.84 user 161.43 sys 0.25
	pyvcf yes no real 136.70 user 136.35 sys 0.22
	pyvcf_lazy no no real 165.07 user 164.43 sys 0.44
	pyvcf_lazy yes no real 140.08 user 139.57 sys 0.36
	cyvcf yes no real 251.10 user 250.38 sys 0.48
	pyvcf no some real 298.38 user 297.68 sys 0.45
	pyvcf yes some real 270.32 user 269.72 sys 0.34
	pyvcf_lazy no some real 221.27 user 220.82 sys 0.24
	pyvcf_lazy yes some real 192.47 user 191.91 sys 0.36
	cyvcf yes some real 250.08 user 249.57 sys 0.27
	pyvcf no all real 352.17 user 351.51 sys 0.40
	pyvcf yes all real 321.58 user 320.57 sys 0.66
	pyvcf_lazy no all real 219.99 user 219.62 sys 0.15
	pyvcf_lazy yes all real 190.60 user 190.34 sys 0.11
	cyvcf yes all real 251.68 user 251.25 sys 0.21
	#!/bin/bash

	set -e

	TEST_VCF="ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.chr22.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz"
	TEST_SIZE=20000
	RESULTS="results.csv"

	if [ ! -f test.vcf.gz ]; then
	wget $TEST_VCF -O - \| zcat \| head -$TEST_SIZE \| gzip > test.vcf.gz
	fi

	function time_parse_vcf {
	LIBRARY="$1"
	CYTHON="$2"
	STATS="$3"
	if [ $LIBRARY = "pyvcf" ]; then
	REPO="https://github.com/jamescasbon/PyVCF.git"
	BRANCH="master"
	elif [ $LIBRARY = "pyvcf_lazy" ]; then
	REPO="https://github.com/martijnvermaat/PyVCF.git"
	BRANCH="lazy-call-stats"
	elif [ $LIBRARY = "cyvcf" ]; then
	REPO="https://github.com/arq5x/cyvcf.git"
	BRANCH="master"
	fi
	CLONE=$(mktemp -d)
	virtualenv environment
	. environment/bin/activate
	if [ $CYTHON = "yes" ]; then
	pip install cython
	fi
	git clone $REPO $CLONE
	pushd $CLONE
	git checkout $BRANCH
	python setup.py install
	popd
	TIME=$(/usr/bin/time -f "real %e user %U sys %S" ./parse.py $LIBRARY $STATS test.vcf.gz 2>&1)
	deactivate
	rm -Rf environment
	echo -e "$LIBRARY\t$CYTHON\t$STATS\t$TIME" >> $RESULTS
	}

	echo -e "#library\tcython\tcallstats\ttime" > $RESULTS

	# Without call stats
	time_parse_vcf "pyvcf" "no" "no"
	time_parse_vcf "pyvcf" "yes" "no"
	time_parse_vcf "pyvcf_lazy" "no" "no"
	time_parse_vcf "pyvcf_lazy" "yes" "no"
	time_parse_vcf "cyvcf" "yes" "no"

	# With some call stats
	time_parse_vcf "pyvcf" "no" "some"
	time_parse_vcf "pyvcf" "yes" "some"
	time_parse_vcf "pyvcf_lazy" "no" "some"
	time_parse_vcf "pyvcf_lazy" "yes" "some"
	time_parse_vcf "cyvcf" "yes" "some"

	# With all call stats
	time_parse_vcf "pyvcf" "no" "all"
	time_parse_vcf "pyvcf" "yes" "all"
	time_parse_vcf "pyvcf_lazy" "no" "all"
	time_parse_vcf "pyvcf_lazy" "yes" "all"
	time_parse_vcf "cyvcf" "yes" "all"