Skip to content

Instantly share code, notes, and snippets.

@gpertea
Last active December 9, 2023 06:13
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save gpertea/4207fa9cb30fe7fec0eb52bd29b9a976 to your computer and use it in GitHub Desktop.
Save gpertea/4207fa9cb30fe7fec0eb52bd29b9a976 to your computer and use it in GitHub Desktop.
appending ref_gene_id (or gene_name) info to MSTRG gene_ids in stringtie --merge output
#!/bin/env python3
#Usage: mstrg_prep.py merged.gtf > merged_prep.gtf
import re, fileinput
g={} #gene_id => {ref_gene_ids}
prep=[] #array of [line, mstrg_id]
for line in fileinput.input():
line=line.rstrip()
t=line.split('\t')
if len(t)<9:
print(line)
continue
mgid=re.search('gene_id "(MSTRG\.\d+)"', t[8])
if mgid:
gid=mgid.group(1)
prep.append([line, gid])
mrn = re.search('ref_gene_id "([^"]+)', t[8])
#or if you want gene_name:
#mrn = re.search('gene_name "([^"]+)', t[8])
if mrn:
rn = mrn.group(1)
h=g.get(gid)
if h: h.add(rn)
else: g[gid]={ rn }
else:
print(line)
prevgid, gadd='', ''
for [line, gid] in prep:
if prevgid!=gid:
gadd=''
h=g.get(gid)
if h:
gadd='|'+'|'.join(sorted(g[gid]))
if len(gadd)>0:
line=re.sub('gene_id "MSTRG\.\d+',
'gene_id "'+gid+gadd, line)
print(line)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment