Skip to content

Instantly share code, notes, and snippets.

@arundurvasula
Created February 24, 2016 22:29
Show Gist options
  • Save arundurvasula/bb60b3c27462e2491da5 to your computer and use it in GitHub Desktop.
Save arundurvasula/bb60b3c27462e2491da5 to your computer and use it in GitHub Desktop.
A script to extract a chromosome from a multisample VCF and create a fasta file
import gzip
import csv
import argparse
import sys
parser = argparse.ArgumentParser(description="script to convert an all sites vcf to FASTA format. FASTA description will be the sample name in the VCF header.Only does one chromosome/region at a time.")
parser.add_argument("-v", "--vcf", action="store", required=True, help="Input VCF file. Should be a multisample vcf, though it should theoretically work with a single sample.")
parser.add_argument("-o", "--out", action="store", required=True, help="Output filename")
parser.add_argument("-c", "--chromosome", action="store", required=True, help="Chromosome to output. Should be something in the first column of the vcf.")
parser.add_argument("-g", "--gzip", action="store_true", required=False, help="Set if the VCF is gzipped.")
args = parser.parse_args()
vcf_in = args.vcf
out_name = args.out
out_chr = args.chromosome
sample_names = []
sample_seqs = []
if args.gzip:
opener = gzip.open
else:
opener = open
with opener(vcf_in, 'r') as tsvin:
tsvin = csv.reader(tsvin, delimiter='\t')
for row in tsvin:
if any('##' in strings for strings in row):
continue
if any('#CHROM' in strings for strings in row):
sample_names = row[9:]
for sample in sample_names:
sample_seqs.append([""])
continue
chrom,pos,id,ref,alt,qual,filter,info,format=row[0:9]
haplotypes = row[9:]
if chrom == out_chr:
alt_list = alt.split(",")
for index,haplotype in enumerate(haplotypes):
# if haplotype == '0|0':
if haplotype.split(":")[0] == '0':
base = ref
# elif haplotype == ".|.":
elif haplotype.split(":")[0] == ".":
base = "N"
else:
call = haplotype.split(":")[0]
base = alt_list[int(call)-1]
sample_seqs[index][0] = sample_seqs[index][0]+base
else:
continue
fasta_out = open(out_name, 'w')
for i in range(len(sample_seqs)):
fasta_out.write(">"+str(sample_names[i])+"\n"+sample_seqs[i][0]+"\n")
fasta_out.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment