Created
September 10, 2015 14:28
-
-
Save indraniel/c75d6bfe373e8fd90b1e to your computer and use it in GitHub Desktop.
annotate a VCF with allelic frequency information (AF, AC, & AN) on total and subpopulation sets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
/* | |
Building & Usage | |
---------------- | |
go build -o foo subpopulation-annotate.go | |
./foo --in /path/to/input/test.vcf --sample-population-map /path/to/20120131-omni-2141-sample-superpopulation.dat --out /path/to/out.vcf.gz | |
Where "20120131-omni-2141-sample-superpopulation.dat" is a tsv file like: | |
HG00096 GBR EUR | |
HG00097 GBR EUR | |
HG00098 GBR EUR | |
HG00099 GBR EUR | |
HG00100 GBR EUR | |
HG00101 GBR EUR | |
HG00102 GBR EUR | |
HG00103 GBR EUR | |
HG00104 GBR EUR | |
HG00105 GBR EUR | |
*/ | |
import ( | |
"bufio" | |
"fmt" | |
"log" | |
"os" | |
"sort" | |
"strconv" | |
"strings" | |
"github.com/brentp/vcfgo" | |
"github.com/brentp/xopen" | |
"gopkg.in/alecthomas/kingpin.v2" | |
) | |
type EmptyInterface interface{} | |
type SampleInfo struct { | |
Name string | |
Nationality string | |
SuperPopulation string | |
} | |
type Samples map[string]SampleInfo | |
type PopulationIndex struct { | |
Label string | |
Indexes []int | |
} | |
func NewPopulationIndex(label string) *PopulationIndex { | |
idx := PopulationIndex{Label: label, Indexes: make([]int, 0)} | |
return &idx | |
} | |
func (pi PopulationIndex) String() string { | |
ids := make([]string, 0) | |
for _, val := range pi.Indexes { | |
ids = append(ids, strconv.Itoa(val)) | |
} | |
full := strings.Join(ids, ",") | |
str := "Label: '%s' - Indexes: %s" | |
out := fmt.Sprintf(str, pi.Label, full) | |
return out | |
} | |
func (s SampleInfo) String() string { | |
str := "Name: '%s' - Nationality: '%s' - SuperPopulation: '%s'" | |
out := fmt.Sprintf(str, s.Name, s.Nationality, s.SuperPopulation) | |
return out | |
} | |
var ( | |
debug = kingpin.Flag("debug", "Enable debug mode.").Bool() | |
inputVCF = kingpin.Flag("in", "Input VCF").Required().String() | |
outVCF = kingpin.Flag("out", "Output VCF").Required().String() | |
inputSamplePop = kingpin.Flag("sample-population-map", "Sample to Population Map File"). | |
Required().String() | |
) | |
func main() { | |
kingpin.Version("0.0.1") | |
kingpin.Parse() | |
validateInputVCF(*inputVCF) | |
fmt.Println("The input file is: ", *inputVCF) | |
validateSamplePopulationMapFile(*inputSamplePop) | |
fmt.Println("The sample file is: ", *inputSamplePop) | |
validateOutputVCF(*outVCF) | |
fmt.Println("The output file is: ", *outVCF) | |
populationData := parseSamplePopMapFile(*inputSamplePop) | |
log.Println("Processing", *inputVCF) | |
processVCF(*inputVCF, *outVCF, populationData) | |
log.Println("All Done!") | |
} | |
func parseSamplePopMapFile(inputFile string) Samples { | |
file, err := os.Open(inputFile) | |
if err != nil { | |
log.Fatalf("Couldn't open file: %s", err) | |
} | |
defer file.Close() | |
samples := make(Samples) | |
scanner := bufio.NewScanner(file) | |
for scanner.Scan() { | |
line := scanner.Text() | |
data := strings.Split(line, "\t") | |
s := SampleInfo{ | |
Name: data[0], | |
Nationality: data[1], | |
SuperPopulation: data[2], | |
} | |
samples[data[0]] = s | |
} | |
return samples | |
} | |
func processVCF(inputVCF string, outVCF string, sampleInfo Samples) { | |
f, err := xopen.Ropen(inputVCF) | |
if err != nil { | |
log.Fatalf("Couldn't open file: %s", err) | |
} | |
defer f.Close() | |
outf, err := xopen.Wopen(outVCF) | |
if err != nil { | |
log.Fatalf("Couldn't open file: %s", err) | |
} | |
defer outf.Close() | |
rdr, err := vcfgo.NewReader(f, false) | |
if err != nil { | |
log.Fatalf("Couldn't open vcf reader : %s", err) | |
} | |
var groups []*PopulationIndex | |
parsedSamples := false | |
initVCFWriter := false | |
iter := 0 | |
for { | |
iter++ | |
variant := rdr.Read() | |
if variant == nil { | |
break | |
} | |
if !parsedSamples { | |
groups = parseVCFSamples(variant.Header.SampleNames, sampleInfo) | |
parsedSamples = true | |
} | |
// super population allelic frequencies | |
for _, group := range groups { | |
data := calculatePopulationAlleleFrequency(group, variant) | |
mk := make([]string, len(data)) | |
i := 0 | |
for k := range data { | |
mk[i] = k | |
i++ | |
} | |
sort.Strings(mk) | |
for _, key := range mk { | |
val := data[key] | |
variant.Info.Add(key, val) | |
} | |
} | |
// overall allelic frequencies | |
data := calculateAlleleFrequency(variant) | |
variant.Info.Add("AC", data["AC"]) | |
variant.Info.Add("AF", data["AF"]) | |
variant.Info.Add("AN", data["AN"]) | |
// fmt.Printf("%s\t%d\t%s\t%s\t%s\n", variant.Chromosome, variant.Pos, variant.Ref, variant.Alt[0], variant.Info.String()) | |
if !initVCFWriter { | |
_, err = vcfgo.NewWriter(outf, variant.Header) | |
if err != nil { | |
log.Fatalf("Couldn't open vcf writer : %s", err) | |
} | |
initVCFWriter = true | |
} | |
fmt.Fprintln(outf, variant) | |
} | |
} | |
func calculatePopulationAlleleFrequency( | |
popIdx *PopulationIndex, | |
variant *vcfgo.Variant) map[string]string { | |
totalAlleles := 0 | |
altAlleles := 0 | |
for _, idx := range popIdx.Indexes { | |
gt := variant.Samples[idx].Fields["GT"] | |
if (gt == "0/0") || (gt == "0|0") { | |
totalAlleles += 2 | |
} else if (gt == "0/1") || (gt == "0|1") || | |
(gt == "1/0") || (gt == "1|0") { | |
totalAlleles += 2 | |
altAlleles += 1 | |
} else if (gt == "1/1") || (gt == "1|1") { | |
totalAlleles += 2 | |
altAlleles += 2 | |
} | |
} | |
af := 0.0 | |
if totalAlleles > 0 { | |
af = (float64(altAlleles) / float64(totalAlleles)) | |
} | |
afStr := strconv.FormatFloat(af, 'f', 5, 64) | |
out := make(map[string]string) | |
label := strings.Join([]string{popIdx.Label, "AF"}, "_") | |
out[label] = afStr | |
label = strings.Join([]string{popIdx.Label, "AC"}, "_") | |
out[label] = strconv.Itoa(altAlleles) | |
label = strings.Join([]string{popIdx.Label, "AN"}, "_") | |
out[label] = strconv.Itoa(totalAlleles) | |
return out | |
} | |
func calculateAlleleFrequency(variant *vcfgo.Variant) map[string]string { | |
totalAlleles := 0 | |
altAlleles := 0 | |
for _, v := range variant.Samples { | |
gt := v.Fields["GT"] | |
if (gt == "0/0") || (gt == "0|0") { | |
totalAlleles += 2 | |
} else if (gt == "0/1") || (gt == "0|1") || | |
(gt == "1/0") || (gt == "1|0") { | |
totalAlleles += 2 | |
altAlleles += 1 | |
} else if (gt == "1/1") || (gt == "1|1") { | |
totalAlleles += 2 | |
altAlleles += 2 | |
} | |
} | |
af := 0.0 | |
if totalAlleles > 0 { | |
af = (float64(altAlleles) / float64(totalAlleles)) | |
} | |
afStr := strconv.FormatFloat(af, 'f', 5, 64) | |
out := make(map[string]string) | |
out["AF"] = afStr | |
out["AC"] = strconv.Itoa(altAlleles) | |
out["AN"] = strconv.Itoa(totalAlleles) | |
return out | |
} | |
func parseVCFSamples(sampleNames []string, sampleInfo Samples) []*PopulationIndex { | |
europeans := NewPopulationIndex("EUR") | |
africans := NewPopulationIndex("AFR") | |
restOfWorld := NewPopulationIndex("ROW") | |
for i, full_sample_name := range sampleNames { | |
names := strings.Split(full_sample_name, "_") | |
sample := names[0] | |
info, ok := sampleInfo[sample] | |
if !ok { | |
msg := "Yikes! Didn't find sample '%s' " + | |
"in the population set!" | |
log.Fatalf(msg, sample) | |
} | |
if info.SuperPopulation == "EUR" { | |
europeans.Indexes = append(europeans.Indexes, i) | |
} else if info.SuperPopulation == "AFR" { | |
africans.Indexes = append(africans.Indexes, i) | |
} else { | |
restOfWorld.Indexes = append(restOfWorld.Indexes, i) | |
} | |
} | |
groups := make([]*PopulationIndex, 0) | |
groups = append(groups, europeans, africans, restOfWorld) | |
if *debug == true { | |
sum := 0 | |
for _, superPop := range groups { | |
fmt.Println(superPop) | |
sum += len(superPop.Indexes) | |
} | |
fmt.Println("Total Samples:", sum) | |
} | |
return groups | |
} | |
func validateSamplePopulationMapFile(input string) { | |
checkFileType(input, ".dat") | |
checkExists(input) | |
} | |
func validateOutputVCF(outVCF string) { | |
checkFileType(outVCF, ".vcf", ".vcf.gz") | |
} | |
func validateInputVCF(inputVCF string) { | |
checkFileType(inputVCF, ".vcf", ".vcf.gz") | |
checkExists(inputVCF) | |
} | |
func checkExists(file string) { | |
if _, err := os.Stat(file); os.IsNotExist(err) { | |
log.Fatalf( | |
"Could not find '%s' on file system: %s", | |
file, err, | |
) | |
} | |
} | |
func checkFileType(path string, suffixTypes ...string) { | |
validType := false | |
for _, suffix := range suffixTypes { | |
if strings.HasSuffix(path, suffix) { | |
validType = true | |
break | |
} | |
} | |
all := strings.Join(suffixTypes, ", ") | |
if !validType { | |
log.Fatalf("'%s' isn't a valid %s file!", | |
path, | |
all, | |
) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment