Skip to content

Instantly share code, notes, and snippets.

@informationsea
Created October 26, 2018 02:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save informationsea/1012f44ca09efa777c5315917c58daee to your computer and use it in GitHub Desktop.
Save informationsea/1012f44ca09efa777c5315917c58daee to your computer and use it in GitHub Desktop.
Simple Repeat Checker
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import cyvcf2
import pyfaidx
import sys
def _main():
parser = argparse.ArgumentParser(description="Simple Repeat Variant Detector")
parser.add_argument('reference')
parser.add_argument('vcf')
parser.add_argument('--output', '-o', default=sys.stdout)
parser.add_argument('--minimum-repeat-times', default=4, type=int)
options = parser.parse_args()
reference = pyfaidx.Fasta(options.reference)
reader = cyvcf2.VCF(options.vcf)
reader.add_info_to_header({'ID': 'SR', 'Number': '0', 'Type': 'Flag', 'Description': 'Simple repeat variant'})
writer = cyvcf2.Writer(options.output, reader)
for record in reader:
for onealt in record.ALT:
if len(record.REF) > len(onealt) and record.REF.startswith(onealt): # deletion
if repeat_check(reference, record.CHROM, record.POS + len(onealt), record.REF[len(onealt):], options.minimum_repeat_times, 'ins'):
record.INFO['SR'] = True
elif len(record.REF) < len(onealt) and onealt.startswith(record.REF): # insertion
if repeat_check(reference, record.CHROM, record.POS + len(record.REF), onealt[len(record.REF):], options.minimum_repeat_times, 'del'):
record.INFO['SR'] = True
writer.write_record(record)
writer.close()
def repeat_check(reference, chrom, pos, seq, minimum_repeat_times, variant_type):
ref_seq = reference[chrom][pos-1:pos+3*minimum_repeat_times]
repeat_type = None
# single repeat
single = str(ref_seq[0])
doublet = str(ref_seq[0:2])
triplet = str(ref_seq[0:3])
if str(ref_seq).startswith(single*minimum_repeat_times) and seq == single*len(seq):
#print('single repeat', single)
repeat_type = 'single repeat ' + single
# doublet repeat
elif str(ref_seq).startswith(doublet*minimum_repeat_times) and len(seq) % 2 == 0 and seq == doublet*(int(len(seq)/2)):
repeat_type = 'doublet repeat ' + doublet
# triplet repeat
elif str(ref_seq).startswith(triplet*minimum_repeat_times) and len(seq) % 3 == 0 and seq == triplet*(int(len(seq)/3)):
repeat_type = 'triplet repeat ' + triplet
#print(chrom, pos, seq, reference[chrom][pos-1:pos+10], repeat_type, variant_type)
return repeat_type != None
if __name__ == '__main__':
_main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment