Skip to content

Instantly share code, notes, and snippets.

@cth
Created April 12, 2016 14:08
Show Gist options
  • Save cth/1b4533c9e8a694071934743772de4e50 to your computer and use it in GitHub Desktop.
Save cth/1b4533c9e8a694071934743772de4e50 to your computer and use it in GitHub Desktop.
#!/home/fng514/bin/julia
#$ -S /home/fng514/bin/julia
#$ -cwd
#
# Split multi allelic sites:
# 1. Split multi-allelic sites into multiple VCF lines, s.t., a) each line has one unique alternative allele, and b) if person has an other alternative allele than the one specified for the line in question, then that particular genotype is coded as missing in that line.
# 2. Run bi-allelic hardy weinberg test for each of these lines.
#
# Christian Theil Have, 2016.
function split_multiallelic(outvcf,line)
fields = split(chomp(line))
alt_alleles = split(fields[5],",")
split_lines = Array{ASCIIString}(length(alt_alleles),length(fields)-9)
gtidx = first(find(x -> x=="GT", split(fields[9],':')))
for field_idx in 10:length(fields)
field = fields[field_idx]
field_values = split(field,':')
gt = field_values[gtidx]
rmatch = match(r"(.*)/(.*)", gt)
for i in 1:length(alt_alleles)
if rmatch[2] == "0"
split_lines[i,field_idx-9] = join(field_values,":")
elseif rmatch[2] == alt_alleles[i]
field_values_copy = copy(field_values)
field_values_copy[gtidx] = rmatch[1]==rmatch[2] ? "1/1" : "0/1"
split_lines[i,field_idx-9] = join(field_values_copy,":")
else
field_values_copy = copy(field_values)
field_values_copy[gtidx] = "./."
split_lines[i,field_idx-9] = join(field_values_copy,":")
end
end
end
# Construct each line from the split_lines array
for i in 1:length(alt_alleles)
info_part = copy(fields[1:9])
info_part[5] = alt_alleles[i]
write(outvcf,string(join(info_part,'\t'),"\t", join(split_lines[i,:], "\t"), "\n"))
end
end
if (length(ARGS) != 2) || (ARGS[1] == ARGS[2])
throw("usage: split_multiallelic in.vcf out.vcf")
end
open(ARGS[1]) do vcf
open(ARGS[2],"w") do outvcf
lines_read = 0
line_refs = []
removed_by_variant = []
for line in eachline(vcf)
if ismatch(r"^chr", line)
if length(line) > 1200
fields = split(line[1:1200], "\t")
else
fields = split(line[1:end], "\t")
end
if ismatch(r",", fields[5])
split_multiallelic(outvcf,line)
else
write(outvcf,line)
end
else
write(outvcf,line)
end
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment