Skip to content

Instantly share code, notes, and snippets.

@indraniel
Created September 10, 2015 14:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save indraniel/c75d6bfe373e8fd90b1e to your computer and use it in GitHub Desktop.
Save indraniel/c75d6bfe373e8fd90b1e to your computer and use it in GitHub Desktop.
annotate a VCF with allelic frequency information (AF, AC, & AN) on total and subpopulation sets
package main
/*
Building & Usage
----------------
go build -o foo subpopulation-annotate.go
./foo --in /path/to/input/test.vcf --sample-population-map /path/to/20120131-omni-2141-sample-superpopulation.dat --out /path/to/out.vcf.gz
Where "20120131-omni-2141-sample-superpopulation.dat" is a tsv file like:
HG00096 GBR EUR
HG00097 GBR EUR
HG00098 GBR EUR
HG00099 GBR EUR
HG00100 GBR EUR
HG00101 GBR EUR
HG00102 GBR EUR
HG00103 GBR EUR
HG00104 GBR EUR
HG00105 GBR EUR
*/
import (
"bufio"
"fmt"
"log"
"os"
"sort"
"strconv"
"strings"
"github.com/brentp/vcfgo"
"github.com/brentp/xopen"
"gopkg.in/alecthomas/kingpin.v2"
)
type EmptyInterface interface{}
type SampleInfo struct {
Name string
Nationality string
SuperPopulation string
}
type Samples map[string]SampleInfo
type PopulationIndex struct {
Label string
Indexes []int
}
func NewPopulationIndex(label string) *PopulationIndex {
idx := PopulationIndex{Label: label, Indexes: make([]int, 0)}
return &idx
}
func (pi PopulationIndex) String() string {
ids := make([]string, 0)
for _, val := range pi.Indexes {
ids = append(ids, strconv.Itoa(val))
}
full := strings.Join(ids, ",")
str := "Label: '%s' - Indexes: %s"
out := fmt.Sprintf(str, pi.Label, full)
return out
}
func (s SampleInfo) String() string {
str := "Name: '%s' - Nationality: '%s' - SuperPopulation: '%s'"
out := fmt.Sprintf(str, s.Name, s.Nationality, s.SuperPopulation)
return out
}
var (
debug = kingpin.Flag("debug", "Enable debug mode.").Bool()
inputVCF = kingpin.Flag("in", "Input VCF").Required().String()
outVCF = kingpin.Flag("out", "Output VCF").Required().String()
inputSamplePop = kingpin.Flag("sample-population-map", "Sample to Population Map File").
Required().String()
)
func main() {
kingpin.Version("0.0.1")
kingpin.Parse()
validateInputVCF(*inputVCF)
fmt.Println("The input file is: ", *inputVCF)
validateSamplePopulationMapFile(*inputSamplePop)
fmt.Println("The sample file is: ", *inputSamplePop)
validateOutputVCF(*outVCF)
fmt.Println("The output file is: ", *outVCF)
populationData := parseSamplePopMapFile(*inputSamplePop)
log.Println("Processing", *inputVCF)
processVCF(*inputVCF, *outVCF, populationData)
log.Println("All Done!")
}
func parseSamplePopMapFile(inputFile string) Samples {
file, err := os.Open(inputFile)
if err != nil {
log.Fatalf("Couldn't open file: %s", err)
}
defer file.Close()
samples := make(Samples)
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
data := strings.Split(line, "\t")
s := SampleInfo{
Name: data[0],
Nationality: data[1],
SuperPopulation: data[2],
}
samples[data[0]] = s
}
return samples
}
func processVCF(inputVCF string, outVCF string, sampleInfo Samples) {
f, err := xopen.Ropen(inputVCF)
if err != nil {
log.Fatalf("Couldn't open file: %s", err)
}
defer f.Close()
outf, err := xopen.Wopen(outVCF)
if err != nil {
log.Fatalf("Couldn't open file: %s", err)
}
defer outf.Close()
rdr, err := vcfgo.NewReader(f, false)
if err != nil {
log.Fatalf("Couldn't open vcf reader : %s", err)
}
var groups []*PopulationIndex
parsedSamples := false
initVCFWriter := false
iter := 0
for {
iter++
variant := rdr.Read()
if variant == nil {
break
}
if !parsedSamples {
groups = parseVCFSamples(variant.Header.SampleNames, sampleInfo)
parsedSamples = true
}
// super population allelic frequencies
for _, group := range groups {
data := calculatePopulationAlleleFrequency(group, variant)
mk := make([]string, len(data))
i := 0
for k := range data {
mk[i] = k
i++
}
sort.Strings(mk)
for _, key := range mk {
val := data[key]
variant.Info.Add(key, val)
}
}
// overall allelic frequencies
data := calculateAlleleFrequency(variant)
variant.Info.Add("AC", data["AC"])
variant.Info.Add("AF", data["AF"])
variant.Info.Add("AN", data["AN"])
// fmt.Printf("%s\t%d\t%s\t%s\t%s\n", variant.Chromosome, variant.Pos, variant.Ref, variant.Alt[0], variant.Info.String())
if !initVCFWriter {
_, err = vcfgo.NewWriter(outf, variant.Header)
if err != nil {
log.Fatalf("Couldn't open vcf writer : %s", err)
}
initVCFWriter = true
}
fmt.Fprintln(outf, variant)
}
}
func calculatePopulationAlleleFrequency(
popIdx *PopulationIndex,
variant *vcfgo.Variant) map[string]string {
totalAlleles := 0
altAlleles := 0
for _, idx := range popIdx.Indexes {
gt := variant.Samples[idx].Fields["GT"]
if (gt == "0/0") || (gt == "0|0") {
totalAlleles += 2
} else if (gt == "0/1") || (gt == "0|1") ||
(gt == "1/0") || (gt == "1|0") {
totalAlleles += 2
altAlleles += 1
} else if (gt == "1/1") || (gt == "1|1") {
totalAlleles += 2
altAlleles += 2
}
}
af := 0.0
if totalAlleles > 0 {
af = (float64(altAlleles) / float64(totalAlleles))
}
afStr := strconv.FormatFloat(af, 'f', 5, 64)
out := make(map[string]string)
label := strings.Join([]string{popIdx.Label, "AF"}, "_")
out[label] = afStr
label = strings.Join([]string{popIdx.Label, "AC"}, "_")
out[label] = strconv.Itoa(altAlleles)
label = strings.Join([]string{popIdx.Label, "AN"}, "_")
out[label] = strconv.Itoa(totalAlleles)
return out
}
func calculateAlleleFrequency(variant *vcfgo.Variant) map[string]string {
totalAlleles := 0
altAlleles := 0
for _, v := range variant.Samples {
gt := v.Fields["GT"]
if (gt == "0/0") || (gt == "0|0") {
totalAlleles += 2
} else if (gt == "0/1") || (gt == "0|1") ||
(gt == "1/0") || (gt == "1|0") {
totalAlleles += 2
altAlleles += 1
} else if (gt == "1/1") || (gt == "1|1") {
totalAlleles += 2
altAlleles += 2
}
}
af := 0.0
if totalAlleles > 0 {
af = (float64(altAlleles) / float64(totalAlleles))
}
afStr := strconv.FormatFloat(af, 'f', 5, 64)
out := make(map[string]string)
out["AF"] = afStr
out["AC"] = strconv.Itoa(altAlleles)
out["AN"] = strconv.Itoa(totalAlleles)
return out
}
func parseVCFSamples(sampleNames []string, sampleInfo Samples) []*PopulationIndex {
europeans := NewPopulationIndex("EUR")
africans := NewPopulationIndex("AFR")
restOfWorld := NewPopulationIndex("ROW")
for i, full_sample_name := range sampleNames {
names := strings.Split(full_sample_name, "_")
sample := names[0]
info, ok := sampleInfo[sample]
if !ok {
msg := "Yikes! Didn't find sample '%s' " +
"in the population set!"
log.Fatalf(msg, sample)
}
if info.SuperPopulation == "EUR" {
europeans.Indexes = append(europeans.Indexes, i)
} else if info.SuperPopulation == "AFR" {
africans.Indexes = append(africans.Indexes, i)
} else {
restOfWorld.Indexes = append(restOfWorld.Indexes, i)
}
}
groups := make([]*PopulationIndex, 0)
groups = append(groups, europeans, africans, restOfWorld)
if *debug == true {
sum := 0
for _, superPop := range groups {
fmt.Println(superPop)
sum += len(superPop.Indexes)
}
fmt.Println("Total Samples:", sum)
}
return groups
}
func validateSamplePopulationMapFile(input string) {
checkFileType(input, ".dat")
checkExists(input)
}
func validateOutputVCF(outVCF string) {
checkFileType(outVCF, ".vcf", ".vcf.gz")
}
func validateInputVCF(inputVCF string) {
checkFileType(inputVCF, ".vcf", ".vcf.gz")
checkExists(inputVCF)
}
func checkExists(file string) {
if _, err := os.Stat(file); os.IsNotExist(err) {
log.Fatalf(
"Could not find '%s' on file system: %s",
file, err,
)
}
}
func checkFileType(path string, suffixTypes ...string) {
validType := false
for _, suffix := range suffixTypes {
if strings.HasSuffix(path, suffix) {
validType = true
break
}
}
all := strings.Join(suffixTypes, ", ")
if !validType {
log.Fatalf("'%s' isn't a valid %s file!",
path,
all,
)
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment