Skip to content

Instantly share code, notes, and snippets.

@adamewing
Created September 3, 2013 01:41
Show Gist options
  • Save adamewing/6418833 to your computer and use it in GitHub Desktop.
Save adamewing/6418833 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import vcf
import sys
import traceback
import gzip
'''
validatevcf.py
Contact: Adam Ewing (ewingad@soe.ucsc.edu)
Attempts to validate a VCF that may contain SNVs, INDELs, and SVs
The main test is whether the VCF can be successfully parsed by PyVCF,
a few specific fields are check for as well (VCFs should define somatic,
imprecise breakends should have confidence intervals)
'''
class ValidationError(Exception):
def __init__(self, value):
self.value = value
def __str__(self):
return repr(self.value)
def context(vcf, recnum):
vcf_h = None
ctbuf = 5
if vcf.endswith('.gz'):
vcf_h = gzip.open(vcf, 'rb')
else:
vcf_h = open(vcf, 'r')
n = 0
for line in vcf_h:
if not line.startswith('#'):
n += 1
if n >= recnum-ctbuf and n <= recnum+ctbuf:
print n,':',line.strip()
found = True
if n > recnum + ctbuf:
vcf_h.close()
return
if len(sys.argv) == 2:
assert sys.argv[1].endswith('.vcf') or sys.argv[1].endswith('.vcf.gz')
vcfin = vcf.Reader(filename=sys.argv[1])
recnum = 0
try:
use_info_somatic = False
use_info_ss = False
use_info_loh = False
use_fmt_ss = False
indel_count = 0
snv_count = 0
sv_count = 0
for rec in vcfin:
''' try to detemine how germline vs. somatic is specified '''
recnum += 1
if rec.FILTER == 'GERMLINE' or rec.FILTER == 'SOMATIC':
raise ValidationError('GERMLINE or SOMATIC in FILTER field: invalid VCF')
if rec.INFO.get('SOMATIC'):
use_info_somatic=True
if str(rec.INFO.get('SS')).upper() == 'SOMATIC':
use_info_ss=True
if str(rec.INFO.get('SS')).upper() == 'LOH':
use_info_loh=True
if rec.is_snp:
snv_count += 1
if rec.is_indel:
indel_count += 1
if rec.is_sv:
sv_count += 1
if rec.INFO.get('IMPRECISE'):
if not (rec.INFO.get('CIPOS') and rec.INFO.get('CIEND')):
raise ValidationError('Imprecise SV record without CIPOS and CIEND: ' + str(rec))
for call in rec.samples:
data = call.data
if 'SS' in data._fields:
use_fmt_ss = True
if not (use_info_somatic or use_info_ss or use_fmt_ss):
raise ValidationError('No somatic records found using INFO or FORMAT fields')
print "Validation complete."
print "Total records:",recnum
print "-"*60
print "uses INFO/SOMATIC:", use_info_somatic
print "uses INFO/SS=Somatic/Germline:", use_info_ss
print "uses INFO/SS=LOH:",use_info_loh
print "uses FORMAT/SS=0,1,2,...:", use_fmt_ss
print "-"*60
print "SNV count:",snv_count
print "INDEL count:",indel_count
print "SV count:",sv_count
print "-"*60
except:
raise
recnum += 1
print "parse error in VCF on line",recnum
print '-'*60
traceback.print_exc(file=sys.stdout)
print '-'*60
print "context:"
context(sys.argv[1], recnum)
else:
print "usage:",sys.argv[0],"<vcf or vcf.gz>"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment