Last active
July 17, 2020 08:33
-
-
Save kantale/2812a46a02bd0e9b151cd35b07f5678b to your computer and use it in GitHub Desktop.
Create random VCXF file from 1000 Genomes Project
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
import re | |
import gzip | |
import glob | |
def get_chr(x): | |
return int(re.search(r'chr([\d]+)', x).group(1)) | |
def fil(lg): | |
ret = [x for x in lg if re.search(r'chr[\d]+', x)] | |
ret = sorted(ret, key = get_chr) | |
return ret | |
def do(r=0.01, index=1, output_fn = 'output.vcf', strand=0.01): | |
lg = glob.glob('1000GP_Phase3/1000GP_Phase3_chr*.legend.gz') | |
lg = fil(lg) | |
#print (lg) | |
h = glob.glob('1000GP_Phase3/1000GP_Phase3_chr*.hap.gz') | |
h = fil(h) | |
#print (h) | |
ind_2 = index-1 | |
output_f = open(output_fn, 'w') | |
for leg, hap in zip(lg, h): | |
print ('Legend:', leg) | |
print ('hap:', hap) | |
with gzip.open(leg, 'rt') as fl, gzip.open(hap, 'rt') as fh: | |
fl.readline() # Ignore firt line of legend (header) | |
# Write vcf header | |
output_f.write('##fileformat=VCFv4.2\n') | |
output_f.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tMITSOS\n') | |
chromosome = get_chr(leg) | |
c = 0 | |
ins_c = 0 | |
s_err = 0 | |
while True: | |
c += 1 | |
l_line = fl.readline() | |
h_line = fh.readline() | |
if c % 100000 == 0: | |
print (f'{chromosome} {c} {ins_c} s_err:{s_err}') | |
if not l_line or not h_line: | |
break | |
if random.random() < r: | |
ins_c += 1 | |
_, pos, ref, alt, *rest = l_line.split() | |
if random.random() < strand: | |
# Add a strand error | |
ref, alt = alt, ref | |
s_err += 1 | |
c1, c2 = h_line.split()[(ind_2*2):(ind_2*2)+2] | |
#c1, c2 = h_line[:3].split() | |
GT = f'{c1}/{c2}' | |
if GT == '1/0': | |
GT = '0/1' | |
assert GT in ['0/0', '0/1', '1/1'] | |
#print (chromosome, pos, ref, alt, c1, c2) | |
output_f.write(f'chr{chromosome}\t{pos}\t.\t{ref}\t{alt}\t.\t.\t.\tGT\t{GT}\n') | |
output_f.close() | |
if __name__ == '__main__': | |
do() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment