Skip to content

Instantly share code, notes, and snippets.

@jsun
Last active August 31, 2018 03:46
Show Gist options
  • Save jsun/9a58baeeb0018d07285bdb48443445df to your computer and use it in GitHub Desktop.
Save jsun/9a58baeeb0018d07285bdb48443445df to your computer and use it in GitHub Desktop.
Convert coordinates in GTF file with VCF file.
import sys
import os
## Description:
## This script is used for converting the coordinates in GTF file
## with the VCF file.
##
## Usage:
## python convert_gtf.py sampl.vcf sampl.gtf > modified_sampl.gtf
##
def convert_coordinates(vcf_path):
'''
Create a dictionary with a chromosome name as a key and a list as a value.
The index of a list indicates the coordiantes in original GTF file, and the
value of the associating index indicates the converted coordinates.
'''
pos_dict = {}
diff_from_ref = 0
with open(vcf_path, 'r') as vcffh:
for buf in vcffh:
buf = buf.replace('\n', '')
if buf[0:1] != '#':
records = buf.split('\t')
chrname = records[0]
pos = int(records[1])
ref = records[3]
alt = records[4]
if chrname not in pos_dict:
pos_dict[chrname] = []
while len(pos_dict[chrname]) <= pos:
pos_dict[chrname].append(len(pos_dict[chrname]) + diff_from_ref)
diff_from_ref = diff_from_ref + (len(alt) - len(ref))
pos_dict[chrname].append(len(pos_dict[chrname]) + diff_from_ref)
return pos_dict
def modify_gtf(gtf_path, pos_dict):
'''
Replace the coordinates in the original GTF file to new coordinates.
'''
with open(gtf_path, 'r') as gtffh:
for buf in gtffh:
buf = buf.replace('\n', '')
if buf[0:1] == '#':
print(buf)
else:
r = buf.split('\t')
chrname = r[0]
while len(pos_dict[chrname]) <= int(r[3]):
pos_dict[chrname].append(pos_dict[chrname][-1] + 1)
while len(pos_dict[chrname]) <= int(r[4]):
pos_dict[chrname].append(pos_dict[chrname][-1] + 1)
r[3] = str(pos_dict[chrname][int(r[3])])
r[4] = str(pos_dict[chrname][int(r[4])])
buf_new = '\t'.join(r)
print(buf_new)
if __name__ == '__main__':
pos_dict = convert_coordinates(sys.argv[1])
modify_gtf(sys.argv[2], pos_dict)
chr1 biopapyrus exon 11 20
chr1 biopapyrus exon 31 50
chr1 biopapyrus exon 61 70
chr1 biopapyrus exon 91 100
chr1 5 . A T
chr1 8 . C G
chr1 15 . A T
chr1 25 . G CCC
chr1 40 . A CT
chr1 55 . ACA T
chr1 80 . ACCT G
chr1 90 . A TCAA
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment