Skip to content

Instantly share code, notes, and snippets.

@kantale
Last active July 17, 2020 08:33
Show Gist options
  • Save kantale/2812a46a02bd0e9b151cd35b07f5678b to your computer and use it in GitHub Desktop.
Save kantale/2812a46a02bd0e9b151cd35b07f5678b to your computer and use it in GitHub Desktop.
Create random VCXF file from 1000 Genomes Project
import random
import re
import gzip
import glob
def get_chr(x):
return int(re.search(r'chr([\d]+)', x).group(1))
def fil(lg):
ret = [x for x in lg if re.search(r'chr[\d]+', x)]
ret = sorted(ret, key = get_chr)
return ret
def do(r=0.01, index=1, output_fn = 'output.vcf', strand=0.01):
lg = glob.glob('1000GP_Phase3/1000GP_Phase3_chr*.legend.gz')
lg = fil(lg)
#print (lg)
h = glob.glob('1000GP_Phase3/1000GP_Phase3_chr*.hap.gz')
h = fil(h)
#print (h)
ind_2 = index-1
output_f = open(output_fn, 'w')
for leg, hap in zip(lg, h):
print ('Legend:', leg)
print ('hap:', hap)
with gzip.open(leg, 'rt') as fl, gzip.open(hap, 'rt') as fh:
fl.readline() # Ignore firt line of legend (header)
# Write vcf header
output_f.write('##fileformat=VCFv4.2\n')
output_f.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tMITSOS\n')
chromosome = get_chr(leg)
c = 0
ins_c = 0
s_err = 0
while True:
c += 1
l_line = fl.readline()
h_line = fh.readline()
if c % 100000 == 0:
print (f'{chromosome} {c} {ins_c} s_err:{s_err}')
if not l_line or not h_line:
break
if random.random() < r:
ins_c += 1
_, pos, ref, alt, *rest = l_line.split()
if random.random() < strand:
# Add a strand error
ref, alt = alt, ref
s_err += 1
c1, c2 = h_line.split()[(ind_2*2):(ind_2*2)+2]
#c1, c2 = h_line[:3].split()
GT = f'{c1}/{c2}'
if GT == '1/0':
GT = '0/1'
assert GT in ['0/0', '0/1', '1/1']
#print (chromosome, pos, ref, alt, c1, c2)
output_f.write(f'chr{chromosome}\t{pos}\t.\t{ref}\t{alt}\t.\t.\t.\tGT\t{GT}\n')
output_f.close()
if __name__ == '__main__':
do()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment