Skip to content

Instantly share code, notes, and snippets.

@alexpreynolds
Created November 18, 2024 05:34
Get per-gene longest isoform from GFF3 input
#!/usr/bin/env python
import sys
import pandas as pd
i_fn = sys.argv[1]
if not i_fn:
raise Exception("Error: Missing input file")
df = pd.read_csv(i_fn, delimiter='\t', header=None, comment='#')
# https://useast.ensembl.org/info/website/upload/gff3.html
df.columns = ['seqid',
'source',
'type',
'start',
'end',
'score',
'strand',
'phase',
'attributes']
transcript_ids = []
longest_isoform = {}
transcript_id_to_gene_id = {}
for index, row in df.iterrows():
attrs = {a_kv[0]:a_kv[1] for a_kv in [a_kvs.split('=') for a_kvs in row['attributes'].split(';')]}
transcript_id = attrs['transcript_id'] if 'transcript_id' in attrs else None
ensg_id = attrs['gene_id'] if 'gene_id' in attrs else None
if not transcript_id or not ensg_id: continue
if row['type'] == 'transcript':
txLength = int(row['end']) - int(row['start'])
if ensg_id not in longest_isoform:
longest_isoform[ensg_id] = {'row': row, 'txLength': txLength, 'transcript_id': transcript_id}
transcript_ids.append(transcript_id)
transcript_id_to_gene_id[transcript_id] = ensg_id
elif longest_isoform[ensg_id]['txLength'] < txLength:
transcript_id_to_filter = longest_isoform[ensg_id]['transcript_id']
idx_to_replace = transcript_ids.index(longest_isoform[ensg_id]['transcript_id'])
transcript_ids[idx_to_replace] = transcript_id
longest_isoform[ensg_id] = {'row': row, 'txLength': txLength, 'transcript_id': transcript_id}
transcript_id_to_gene_id[transcript_id] = ensg_id
for transcript_id in transcript_ids:
gene_id = transcript_id_to_gene_id[transcript_id]
for column in df.columns:
print(longest_isoform[gene_id]['row'][column], end="\t")
print()
@alexpreynolds
Copy link
Author

alexpreynolds commented Nov 18, 2024

$ wget -qO- "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.annotation.gff3.gz" | gunzip -c > gencode.v19.annotation.gff3
$ ./longest_isoform_from_gff3.py gencode.v19.annotation.gff3 | sort -k1,1 -k4,4n -k5,5n > gencode.v19.annotation.longestIsoform.gff3

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment