Skip to content

Instantly share code, notes, and snippets.

@mdshw5
Created June 26, 2019 15:54
Show Gist options
  • Save mdshw5/444e91ecf6d3e79466180457983b1155 to your computer and use it in GitHub Desktop.
Save mdshw5/444e91ecf6d3e79466180457983b1155 to your computer and use it in GitHub Desktop.
import gffutils
import os.path
from pyfaidx import Fasta
gtf_path = "/path/to/gencode.gtf"
gtf_db = gtf_path + ".db"
fasta_path = "/path/to/hg38.fasta"
if os.path.exists(gtf_db):
db = gffutils.FeatureDB(gtf_db)
# If gene and transcript tags are present in the GTF file, use them. Otherwise infer them if missing
# with disable_infer_genes/transcripts=False.
db = gffutils.create_db(gtf_path, gtf_db, disable_infer_genes=True, disable_infer_transcripts=True)
with Fasta(fasta_path) as fasta:
# Each of the following gene, transcript, and exon are objects of gffutils.Feature type
# https://pythonhosted.org/gffutils/autodocs/gffutils.Feature.html#gffutils.Feature
for gene in db.features_of_type('gene'):
transcripts = db.children(gene, featuretype='transcript')
for transcript in transcripts:
exons = db.children(transcript, featuretype='exon', order_by='start')
exon_sequences = [exon.sequence(fasta, use_strand=True) for exon in exons]
exon_coords = [(exon.start, exon.end) for exon in exons] # 1-based coordinates, from GTF/GFF3 spec
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment