Skip to content

Instantly share code, notes, and snippets.

@TTTPOB
Created February 28, 2023 10:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TTTPOB/214ca0b5abcc37fbc3bc77e9a6cef414 to your computer and use it in GitHub Desktop.
Save TTTPOB/214ca0b5abcc37fbc3bc77e9a6cef414 to your computer and use it in GitHub Desktop.
Extract the gene ID to gene name mapping from a GENCODE GTF file.
#!/usr/bin/env python3
import gzip
import sys
from pathlib import Path
def get_gene_lines(path: str):
    """Yield the raw "gene" feature lines of a gzip-compressed GTF file.

    Lines beginning with "#" (header and comment lines) are skipped, as are
    blank or truncated lines that have fewer than three tab-separated
    columns.

    Args:
        path: Path to the gzip-compressed GTF file; it is read in text mode.

    Yields:
        Every line, unmodified (trailing newline included), whose third
        tab-separated column is exactly "gene".
    """
    with gzip.open(path, "rt") as handle:
        for line in handle:
            if line.startswith("#"):
                continue
            # Only the feature column is inspected, so stop splitting after
            # it instead of breaking up the long attribute column as well.
            columns = line.split("\t", 3)
            # The length guard keeps blank/short lines from raising IndexError.
            if len(columns) > 2 and columns[2] == "gene":
                yield line
def get_attributes(l: str):
    """Parse the attribute column (ninth tab-separated field) of a GTF line.

    The column is split on ";" into ``key value`` entries. Each entry is
    split on its first space only, so values that themselves contain spaces
    (e.g. ``gene_name "Y RNA"``) are kept whole. Double quotes are removed
    from values. When a key occurs more than once, the last value wins.

    NOTE: a ";" inside a quoted value is still treated as an entry
    separator.

    Args:
        l: One GTF record line with at least nine tab-separated columns.

    Returns:
        A dict mapping attribute keys to their unquoted string values. A key
        that has no value maps to the empty string.

    Raises:
        IndexError: If the line has fewer than nine tab-separated columns.
    """
    attrs = {}
    for entry in l.split("\t")[8].split(";"):
        entry = entry.strip()
        if not entry:
            continue
        # partition() splits at the first space only; a plain split(" ")
        # fails to unpack when the value contains a space.
        key, _, value = entry.partition(" ")
        attrs[key] = value.strip().replace('"', "")
    return attrs
def get_gene_id_name_mapping(attr: dict):
    """Return the ``(gene_id, gene_name)`` pair from a parsed attribute dict.

    Args:
        attr: Attribute mapping containing "gene_id" and "gene_name" keys.

    Returns:
        A 2-tuple ``(attr["gene_id"], attr["gene_name"])``.

    Raises:
        KeyError: If either key is missing from ``attr``.
    """
    return (attr["gene_id"], attr["gene_name"])
def main():
    """Write a tab-separated gene_id/gene_name table for a gzipped GTF file.

    The input path is taken from the first command-line argument. One
    ``gene_id<TAB>gene_name`` line is written per "gene" record, in input
    order. If no argument is given, the script exits with a usage message.
    """
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <annotation.gtf.gz>")
    infile = sys.argv[1]
    # The output lands next to the input. After ".gtf.gz" is removed,
    # with_suffix() replaces the last remaining dotted component (if any),
    # e.g. "a.v1.annotation.gtf.gz" -> "a.v1.gene_id_name_mapping.txt" and
    # "genes.gtf.gz" -> "genes.gene_id_name_mapping.txt".
    outfile = Path(infile.replace(".gtf.gz", "")).with_suffix(
        ".gene_id_name_mapping.txt"
    )
    # The context manager closes the output even if a record fails to parse.
    with open(outfile, "w") as outhandle:
        for gene_line in get_gene_lines(infile):
            attr = get_attributes(gene_line)
            gene_id, gene_name = get_gene_id_name_mapping(attr)
            outhandle.write(f"{gene_id}\t{gene_name}\n")
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment