jamescasbon/melt_vcf.py

## melt_vcf.py
#!/usr/bin/env python
""" Melt a VCF file into a tab delimited set of calls, one per line

VCF files have all the calls from different samples on one line.  This
script reads vcf on stdin and writes all calls to stdout in tab delimited
format with one call in one sample per line.  This makes it easy to find
a given sample's genotype with, say, grep.
"""

import sys
import csv

# All output rows are written to stdout as tab-separated values.
out = csv.writer(sys.stdout, delimiter='\t')

# Parser state, filled in while the input is read: the fixed column names and
# the sample names come from the header line, the FORMAT keys from the first
# record.
fixed = samples = format = None

def parse_samples(line):
    """Split the column header line into fixed column names and sample names.

    The first character of ``line`` (the leading ``#``) is dropped and the
    rest is split on tabs.  Returns a ``(fixed, samples)`` pair: the first
    nine names, and every name after them.
    """
    columns = line[1:].split('\t')
    return columns[:9], columns[9:]

def parse_format(line):
    """Return the keys of the ninth tab-separated column, split on ``:``.

    Raises IndexError when ``line`` has fewer than nine columns.
    """
    columns = line.split('\t')
    format_column = columns[8]
    return format_column.split(':')

def parse_variant(line):
    """Return the first eight tab-separated columns of a record line."""
    # Only the leading eight fields are wanted, so stop splitting after them;
    # the unsplit remainder (if any) is discarded by the slice.
    return line.split('\t', 8)[:8]

def parse_calls(line):
    """Yield one list of ``:``-separated fields per column after the ninth."""
    sample_columns = line.split('\t')[9:]
    for column in sample_columns:
        fields = column.split(':')
        yield fields

# Melt the VCF read from stdin: every (record, sample) pair becomes one
# tab-separated output row of the form [sample] + call fields + variant columns.
for line in sys.stdin:
    line = line.rstrip()
    if not line:
        # Blank lines carry no calls; skip them instead of failing in
        # parse_format with an IndexError.
        continue

    if line.startswith('#'):
        # '##' lines are skipped; the line starting with a single '#' carries
        # the column names and the sample names.
        if not line.startswith('##'):
            fixed, samples = parse_samples(line)
        continue

    if fixed is None or samples is None:
        # Without the header line the sample names are unknown.  Fail with a
        # clear message rather than a TypeError further down.
        sys.exit('melt_vcf: no header line (starting with a single #) '
                 'found before the first record')

    if format is None:
        # NOTE: the FORMAT keys of the first record define the output columns
        # for the whole input; later records are not re-parsed.
        format = parse_format(line)
        # Data rows carry only the 8 columns returned by parse_variant, so the
        # header must leave out the 9th fixed name; otherwise the header is
        # one column wider than every data row.
        out.writerow(['SAMPLE'] + format + fixed[:8])

    variant = parse_variant(line)
    for sample, call in zip(samples, parse_calls(line)):
        # Pad calls that have fewer fields than FORMAT keys so that the
        # variant columns stay aligned with the header.
        missing = len(format) - len(call)
        if missing > 0:
            call.extend(['.'] * missing)

        out.writerow([sample] + call + variant)
	#!/usr/bin/env python
	""" Melt a VCF file into a tab delimited set of calls, one per line

	VCF files have all the calls from different samples on one line. This
	script reads vcf on stdin and writes all calls to stdout in tab delimited
	format with one call in one sample per line. This makes it easy to find
	a given sample's genotype with, say, grep.
	"""

	import sys
	import csv

	# All output rows are written to stdout as tab-separated values.
	out = csv.writer(sys.stdout, delimiter='\t')

	# Parser state, filled in while the input is read: the fixed column names
	# and the sample names come from the header line, the FORMAT keys from the
	# first record.
	fixed = samples = format = None

	def parse_samples(line):
	    """Split the column header line into fixed column names and sample names.

	    The first character of ``line`` (the leading ``#``) is dropped and the
	    rest is split on tabs.  Returns a ``(fixed, samples)`` pair: the first
	    nine names, and every name after them.
	    """
	    toks = line[1:].split('\t')
	    fixed, samples = toks[:9], toks[9:]
	    return fixed, samples

	def parse_format(line):
	    """Return the keys of the ninth tab-separated column, split on ``:``.

	    Raises IndexError when ``line`` has fewer than nine columns.
	    """
	    return line.split('\t')[8].split(':')

	def parse_variant(line):
	    """Return the first eight tab-separated columns of a record line."""
	    return line.split('\t')[:8]

	def parse_calls(line):
	    """Yield one list of ``:``-separated fields per column after the ninth."""
	    for call in line.split('\t')[9:]:
	        yield call.split(':')

	# Melt the VCF read from stdin: every (record, sample) pair becomes one
	# tab-separated output row of the form [sample] + call fields + variant
	# columns.
	for line in sys.stdin:
	    line = line.rstrip()
	    if not line:
	        # Blank lines carry no calls; skip them instead of failing in
	        # parse_format with an IndexError.
	        continue

	    if line.startswith('#'):
	        # '##' lines are skipped; the line starting with a single '#'
	        # carries the column names and the sample names.
	        if not line.startswith('##'):
	            fixed, samples = parse_samples(line)
	        continue

	    if fixed is None or samples is None:
	        # Without the header line the sample names are unknown.  Fail with
	        # a clear message rather than a TypeError further down.
	        sys.exit('melt_vcf: no header line (starting with a single #) '
	                 'found before the first record')

	    if format is None:
	        # NOTE: the FORMAT keys of the first record define the output
	        # columns for the whole input; later records are not re-parsed.
	        format = parse_format(line)
	        # Data rows carry only the 8 columns returned by parse_variant, so
	        # the header must leave out the 9th fixed name; otherwise the
	        # header is one column wider than every data row.
	        out.writerow(['SAMPLE'] + format + fixed[:8])

	    variant = parse_variant(line)
	    for sample, call in zip(samples, parse_calls(line)):
	        # Pad calls that have fewer fields than FORMAT keys so that the
	        # variant columns stay aligned with the header.
	        missing = len(format) - len(call)
	        if missing > 0:
	            call.extend(['.'] * missing)

	        out.writerow([sample] + call + variant)