Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save brantfaircloth/10688524 to your computer and use it in GitHub Desktop.
Save brantfaircloth/10688524 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
(c) 2014 Brant Faircloth || http://faircloth-lab.org/
All rights reserved.
This code is distributed under a 3-clause BSD license. Please see
LICENSE.txt for more information.
Created on 14 April 2014 12:57 PDT (-0700)
"""
import sys
import vcf
import argparse
import pandas as pd
#import pdb
def get_args():
parser = argparse.ArgumentParser(description='Parse fastq files and drop reads containing Ns.')
parser.add_argument(
'--input',
required=True,
help="The VCF file to process"
)
parser.add_argument(
'--output',
required=True,
help="The STRUCTURE file to write"
)
parser.add_argument(
"--filter-identical",
action="store_true",
default=False,
help="""Remove sites with only a single heterozygote genotype.""",
)
return parser.parse_args()
def identical(items):
return all(x == items[0] for x in items if x != "-9,-9")
def progress(cnt):
if cnt % 100 == 0:
sys.stdout.write(".")
sys.stdout.flush()
def main():
args = get_args()
alleles = {"A":1, "T":2, "G":3, "C":4}
sys.stdout.write("Working")
sys.stdout.flush()
with open(args.input, 'r') as infile:
vcf_reader = vcf.Reader(infile)
# create data frame with indiv as row id
data = pd.DataFrame()
for cnt, record in enumerate(vcf_reader):
# prog
progress(cnt)
# get ref/alt alleles and make those into a list - integer values
# in genotypes refer to 0-index positions of this list.
reference = [record.REF] + [str(i) for i in record.ALT]
# we'll keep a list of all genotypes
all_genotypes = []
for sample in record.samples:
# deal with missing data
if sample.data.GT == None:
gt = ",".join(["-9","-9"])
# bung those into list
all_genotypes.append(gt)
else:
# split the integer code
coded_gt = [int(i) for i in sample.data.GT.split("/")]
# get the ref/alt allele based on integer code and lookup
# new integer value in dict/hash table
gt = sorted([alleles[reference[i]] for i in coded_gt])
# join those to a string
gt = ",".join([str(i) for i in gt])
# bung those into list
all_genotypes.append(gt)
# filter monomorphic loci
if args.filter_identical and not identical(all_genotypes):
temp_df = pd.DataFrame({cnt:pd.Series(all_genotypes,index=vcf_reader.samples)})
data = pd.concat([data, temp_df], axis=1)
elif not args.filter_identical:
temp_df = pd.DataFrame({cnt:pd.Series(all_genotypes,index=vcf_reader.samples)})
data = pd.concat([data, temp_df], axis=1)
with open(args.output, 'w') as outf:
loci_header = ",".join(["{0}a,{0}b".format(i) for i in data.columns.values.tolist()])
outf.write("indiv,{}\n".format(loci_header))
for row in data.iterrows():
outf.write("{0},{1}\n".format(
row[0],
",".join(row[1])
))
# flush
print ""
print "Data contain {} individuals and {} loci output to {}.".format(
len(data),
len(data.columns.values.tolist()),
args.output
)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment