Skip to content

Instantly share code, notes, and snippets.

@wflynny
Created July 16, 2020 16:51
Show Gist options
  • Save wflynny/a4c1bde3ac60bd5f3c88976a55035d27 to your computer and use it in GitHub Desktop.
Save wflynny/a4c1bde3ac60bd5f3c88976a55035d27 to your computer and use it in GitHub Desktop.
import re
import argparse
# Matches a "gene" feature record (the literal "gene" sits between two tabs)
# and captures its gene_id and Name attribute values. The captured values
# INCLUDE the surrounding double quotes.
gene_matcher = re.compile(r'\tgene\t.*gene_id (".*?");.*Name (".*?");')
# Matches any other record that carries both a gene_id and a Parent attribute.
parent_matcher = re.compile(r'gene_id (".*?");.*Parent (".*?");')
# Output template: original record text, attribute key, attribute value.
new_line_fmt = '{} {} {};\n'
# Value written when a record refers to a gene_id that has not been seen yet.
# Stored names keep their quotes, so the placeholder is a quoted empty string;
# a bare empty string would produce the malformed attribute `gene_name ;`.
missing_gene_name = '""'


def add_gene_names(lines):
    """Yield each annotation record with a ``gene_name`` attribute appended.

    Input is presumably a GTF/GFF-style annotation file (tab-delimited
    records with ``key "value";`` attributes) -- confirm against the data
    this is run on.

    Args:
        lines: Iterable of text lines (for example an open file).

    Yields:
        One output line, terminated by a newline, per accepted record:
        the record text followed by `` gene_name "<Name>";``.

    Notes:
        * Lines starting with ``#`` are skipped and not written out.
        * Lines matching neither pattern are reported on stdout with their
          1-based line number and are not written out.
        * A gene record must appear before the records that reference its
          gene_id; otherwise those records get an empty quoted name.
    """
    # Maps gene_id -> Name; both keep their surrounding double quotes.
    mapper = {}
    for lineno, line in enumerate(lines, start=1):
        if line.startswith("#"):
            continue
        entry = line.rstrip('\n')

        gmatch = gene_matcher.search(entry)
        if gmatch:
            gid, gname = gmatch.groups()
            mapper[gid] = gname
        else:
            pmatch = parent_matcher.search(entry)
            if not pmatch:
                print(f"found wrong thing on line: {lineno}")
                continue
            # Only the gene_id is needed; the Parent attribute is required
            # by the pattern but its value is not used.
            gname = mapper.get(pmatch.group(1), missing_gene_name)

        yield new_line_fmt.format(entry, "gene_name", gname)


def main(argv=None):
    """Parse command-line arguments and annotate the input file.

    Args:
        argv: Optional argument list; defaults to ``sys.argv[1:]`` when None.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True)
    parser.add_argument("-o", "--outfile", required=True)
    args = parser.parse_args(argv)

    with open(args.infile, "r") as fin, open(args.outfile, "w") as fout:
        fout.writelines(add_gene_names(fin))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment