Skip to content

Instantly share code, notes, and snippets.

@daler
Created January 4, 2014 18:54
Show Gist options
  • Save daler/8259098 to your computer and use it in GitHub Desktop.
Save daler/8259098 to your computer and use it in GitHub Desktop.
Creation of a gffutils database from the GENCODE v19 annotations
import gffutils
import datetime
# Input annotation file (decompressed GTF). The compressed original is at:
# ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz
v19gff = 'gencode.v19.annotation.gtf'

# Database file name: the GTF file name with '.db' appended.
v19db_filename = '{}.db'.format(v19gff)

# Feature types that `transform` returns unmodified.
gene_transcript = {'gene', 'transcript'}
def transform(f):
    """Attach a location-derived ``fancy_id`` attribute to sub-features.

    Features whose type is in ``gene_transcript`` are returned untouched.
    Every other feature gets ``f.attributes['fancy_id']`` set to a one-item
    list containing ``featuretype:seqid:start-stop:strand``; for ``CDS``
    features ``:frame`` is appended as well.  The feature itself (modified
    in place where applicable) is returned.
    """
    kind = f.featuretype
    if kind in gene_transcript:
        return f

    fancy = '{}:{}:{}-{}:{}'.format(kind, f.seqid, f.start, f.stop, f.strand)
    if kind == 'CDS':
        # The frame is appended so the CDS key also encodes its frame.
        fancy = fancy + ':' + f.frame
    f.attributes['fancy_id'] = [fancy]
    return f
# Print a timestamp before and after the build so the elapsed time can be
# read off the output.  The call form of print with a single argument
# behaves the same on Python 2 and Python 3; the original statement form
# was a SyntaxError on Python 3.
print(datetime.datetime.now())

# Genes and transcripts are keyed on their own ID attributes; every other
# listed feature type is keyed on the 'fancy_id' attribute.
# NOTE(review): newer gffutils releases appear to prefer
# disable_infer_genes / disable_infer_transcripts over infer_gene_extent --
# confirm against the installed version before switching.
v19db = gffutils.create_db(
    v19gff,
    v19db_filename,
    merge_strategy='merge',
    id_spec={
        'gene': 'gene_id',
        'transcript': 'transcript_id',
        'exon': 'fancy_id',
        'CDS': 'fancy_id',
        'start_codon': 'fancy_id',
        'stop_codon': 'fancy_id',
        'UTR': 'fancy_id',
    },
    transform=transform,
    force=True,
    verbose=True,
    infer_gene_extent=False,
    force_merge_fields=['source'])

print(datetime.datetime.now())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment