Skip to content

Instantly share code, notes, and snippets.

@chilampoon
Created December 8, 2022 04:51
Show Gist options
  • Save chilampoon/18dc42230f2ec5c7df936a3ccdf1fd33 to your computer and use it in GitHub Desktop.
Save chilampoon/18dc42230f2ec5c7df936a3ccdf1fd33 to your computer and use it in GitHub Desktop.
Fix the T2T gff3 file to prevent errors when getting a gffutils db
#!/usr/bin/env python
# fix t2t gff3 error
# now the problem is ids for some start&stop condons/exons are not unique
# 11/20/2022
import sys,gzip
from collections import defaultdict
def sep_info(info):
items = info.split(';')
items_tuple = [(i.split('=')[0], i.split('=')[1]) for i in items]
return items_tuple
def comb_info(items):
res = [f'{i[0]}={i[1]}' for i in items]
return ';'.join(res)
def update_id(r, info, count, replace=False):
(idx, sid) = [(i, j) for i,j in enumerate(info) if j[0] == 'ID'][0]
if replace:
tmp = sid[1].split(':')
new_id = f'{":".join(tmp[:-1])}:{count}'
else:
new_id = f'{sid[1]}:{count}'
info[idx] = (sid[0], new_id)
r[8] = comb_info(info)
return r
def fix_id(gff3, out):
start_codon_cnt = defaultdict(lambda:0)
stop_codon_cnt = defaultdict(lambda:0)
liftoff_id_cnt = defaultdict(lambda:0)
with open(gff3, 'r') as f, gzip.open(out, 'wb') as out:
for row in f:
if row.startswith('#'):
out.write(row.encode())
continue
r = row.rstrip().split('\t')
if r[2] == 'start_codon':
info = sep_info(r[8])
tx_id = [i[1] for i in info if i[0] == 'transcript_id'][0]
new_r = update_id(r, info, start_codon_cnt[tx_id])
row = '\t'.join(new_r) + '\n'
start_codon_cnt[tx_id] += 1
elif r[2] == 'stop_codon':
info = sep_info(r[8])
tx_id = [i[1] for i in info if i[0] == 'transcript_id'][0]
new_r = update_id(r, info, stop_codon_cnt[tx_id])
row = '\t'.join(new_r) + '\n'
stop_codon_cnt[tx_id] += 1
if r[1] == 'Liftoff' and r[2] not in ['gene', 'transcript']:
info = sep_info(r[8])
old_id = [i[1] for i in info if i[0] == 'ID'][0]
tmp = old_id.split(':')
if len(tmp) == 2:
id_num = 0
this_id = old_id
repl = False
elif len(tmp) == 3:
id_num = int(tmp[-1])
this_id = ':'.join(tmp[:-1])
repl = True
else:
print(f'check {old_id}')
if id_num != liftoff_id_cnt[this_id]:
new_r = update_id(r, info, liftoff_id_cnt[this_id], replace=repl)
row = '\t'.join(new_r) + '\n'
liftoff_id_cnt[this_id] += 1
out.write(row.encode())
gff3_file = sys.argv[1]
out_file = sys.argv[2]
fix_id(gff3_file, out_file)
# run: python fix_gff3.py chm13.draft_v2.0.gene_annotation.gff3 chm13.draft_v2.0.gene_annotation.fixed.gff3.gz
@chilampoon
Copy link
Author

For another gff3:

but I'd stick to the gencode one^
def fix_id(gff3, out):
    id_cnt = set()

    with open(gff3, 'r') as f, open(out, 'w') as out:
        for row in f:
            if row.startswith('#'): 
                out.write(row.encode())
                continue

            r = row.rstrip().split('\t')
            chrom = r[0]
            info = r[8]
            idx, sid = [(i, j) for i,j in enumerate(info) if j[0] == 'ID'][0]
            if sid[1] in id_cnt:
                new_id = f'{chrom}_{sid[1]}'
                info[idx] = (sid[0], new_id)
                r[8] = comb_info(info)
                row = '\t'.join(r) + '\n'
            else:
                id_cnt.add(sid[1])_

            out.write(row.encode())

gff3_file = sys.argv[1]
out_file = sys.argv[2]
fix_id(gff3_file, out_file)

# run: python fix_gff3.py chm13v2.0_RefSeq_Liftoff_v4.gff3 chm13v2.0_RefSeq_Liftoff_v4.fixed.gff3

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment