Skip to content

Instantly share code, notes, and snippets.

@KlausTrainer
Created January 27, 2020 22:34
Show Gist options
  • Save KlausTrainer/26c2996cf32677e4c107bd8aaee67794 to your computer and use it in GitHub Desktop.
Save KlausTrainer/26c2996cf32677e4c107bd8aaee67794 to your computer and use it in GitHub Desktop.
Converts a VCF SNP file from Dante Labs to the 23andme file format.
import re
import vcf
"""
Converts a VCF SNP file from Dante Labs to the 23andme file format.
As the files from Dante Labs don't contain SNP IDs, they are mapped
via variant summary data extracted from dbSNP (see
https://genome.ucsc.edu/cgi-bin/hgTables).
The variant summary data file is expected to have three tab-separated
columns, i.e., chromosome, postion, and rsID.
"""
DB_SNP_FILE = '/home/klaus/Downloads/clinvar-snps.txt'
VCF_FILE = '/home/klaus/Downloads/60820188484212.filtered.snp.vcf'
snp_dict = {}
with open(DB_SNP_FILE, 'r') as snp_file:
for line in snp_file:
if line[0] == '#':
continue
(chromosome, position, rsid) = line.rstrip().split('\t')
match = re.match('^chr([0-9]+|[MXY])$', chromosome)
if not match:
continue
chromosome_number = match[1]
if chromosome_number == 'M':
chromosome_number = 'MT'
snp_dict[(chromosome_number, int(position))] = rsid
print('rsid\tchromosome\tposition\tallele1\tallele2')
for record in vcf.Reader(filename=VCF_FILE):
key = (record.CHROM, record.POS)
if key in snp_dict:
record.ID = snp_dict[key]
allel_count = record.INFO['AC'][0] if 'AC' in record.INFO else 1
if len(record.ALT) == 2:
print('{}\t{}\t{}\t{}\t{}'.format(record.ID, record.CHROM, record.POS, record.ALT[0], record.ALT[1]))
elif allel_count == 2:
print('{}\t{}\t{}\t{}\t{}'.format(record.ID, record.CHROM, record.POS, record.ALT[0], record.ALT[0]))
else:
print('{}\t{}\t{}\t{}\t{}'.format(record.ID, record.CHROM, record.POS, record.ALT[0], record.REF))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment