Created
July 14, 2020 14:41
-
-
Save flashton2003/b246ce509300a8669d27de7a4eb5c4c9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import gffutils | |
class PseudoFeature(): | |
def __init__(self, chrom, start, stop, strand): | |
self.chrom = chrom | |
self.start = start | |
self.stop = stop | |
self.strand = strand | |
class FeatureGroup(): | |
def __init__(self, gene_id): | |
## see here https://github.com/samtools/bcftools/issues/530#issuecomment-614410603 | |
self.id = gene_id | |
## the below should be lists of instances of Feature class. | |
self.gene = None # one feature group should have a single gene | |
self.transcript = None | |
## assuming here that ncRNA is more like transcript than exon | |
## i.e. there is only one ncRNA feature per feature group. | |
self.ncRNA = None | |
self.exons = [] | |
self.CDSs = [] | |
def add_gene(self, feature): | |
self.gene = feature | |
def add_transcript(self, feature): | |
self.transcript = feature # one feature group should have a single gene | |
def add_exon(self, feature): | |
self.exons.append(feature) | |
def add_cds(self, feature): | |
self.CDSs.append(feature) | |
def add_ncRNA(self, feature): | |
self.ncRNA = feature | |
def get_args(): | |
if len(sys.argv) != 3: | |
print('Usage: python convert_gff3_to_ensembl_gff3.py <gff_inhandle> </path/where/you/want/to/save/gffutils_db>') | |
sys.exit() | |
else: | |
return sys.argv[1], sys.argv[2] | |
def group_features(db): | |
feature_groups = {} | |
for feature in db.all_features(): | |
if feature.featuretype not in ('repeat_region', 'regulatory', 'stem_loop', 'gene_component_region', 'repeat_region'): | |
if feature.featuretype == 'gene': | |
## for any particular feature group, the gene doesn't necassarily come | |
## before the CDS in the gff, so need to have the if/else logic | |
gene_id = feature.id.split('.')[0] | |
if gene_id not in feature_groups: | |
feature_groups[gene_id] = FeatureGroup(gene_id) | |
feature_groups[gene_id].add_gene(feature) | |
else: | |
feature_groups[gene_id].add_gene(feature) | |
## in this gff, the transcripts are referred to as mRNA | |
## not all genes have mRNA records | |
elif feature.featuretype == 'mRNA': | |
gene_id = feature.id.split('.')[0] | |
# print(feature.id) | |
# print(gene_id) | |
feature_groups[gene_id].add_transcript(feature) | |
elif feature.featuretype == 'exon': | |
gene_id = feature.attributes['locus_tag'][0] | |
if gene_id not in feature_groups: | |
feature_groups[gene_id] = FeatureGroup(gene_id) | |
feature_groups[gene_id].add_exon(feature) | |
else: | |
feature_groups[gene_id].add_exon(feature) | |
elif feature.featuretype == 'CDS': | |
gene_id = feature.attributes['locus_tag'][0] | |
if gene_id not in feature_groups: | |
feature_groups[gene_id] = FeatureGroup(gene_id) | |
feature_groups[gene_id].add_cds(feature) | |
else: | |
feature_groups[gene_id].add_cds(feature) | |
elif feature.featuretype == 'ncRNA': | |
gene_id = feature.attributes['locus_tag'][0] | |
# print(gene_id) | |
if gene_id not in feature_groups: | |
feature_groups[gene_id] = FeatureGroup(gene_id) | |
feature_groups[gene_id].add_ncRNA(feature) | |
else: | |
feature_groups[gene_id].add_ncRNA(feature) | |
return feature_groups | |
def make_transcript_pseudofeature(CDSs): | |
## takes list of CDSs, list len will often be 1, but will sometimes be 2 | |
start = min([x.start for x in CDSs]) | |
stop = max([x.stop for x in CDSs]) | |
chrom = CDSs[0].chrom | |
## check that both on the same strand | |
assert len(set([x.strand for x in CDSs])) == 1 | |
strand = CDSs[0].strand | |
pf = PseudoFeature(chrom, start, stop, strand) | |
return pf | |
def check_feature_groups(feature_groups): | |
## checking that feature group has a gene, at least one CDS, and a transcript | |
## if doesn't have a transcript (as many don't), then make a pseudo feature corresponding to a transcript | |
## only checking that have transcript for everything | |
for gene_id in feature_groups: | |
## don't want to analyse ncRNAs here, but need to include them up | |
## to this point so that we know that that gene is an ncRNA gene | |
if feature_groups[gene_id].ncRNA != None: | |
continue | |
assert feature_groups[gene_id].gene != None | |
assert len(feature_groups[gene_id].CDSs) > 0 | |
if feature_groups[gene_id].transcript == None: | |
## using pseudofeature because the gffutils feature says it's not really intended | |
## for direct instantiation by users | |
feature_groups[gene_id].transcript = make_transcript_pseudofeature(feature_groups[gene_id].CDSs) | |
def print_features(feature_groups): | |
for gene_id in feature_groups: | |
feature_group = feature_groups[gene_id] | |
if feature_group.ncRNA != None: | |
continue | |
print('###') | |
name = feature_group.gene.attributes['Name'][0] | |
gene_attributes = f'ID=gene:{gene_id};Name={name};biotype=protein_coding;gene_id:{gene_id}' | |
print(feature_group.gene.chrom, 'EMBL', 'gene', feature_group.gene.start, feature_group.gene.stop, '.', feature_group.gene.strand, '.', gene_attributes, sep = '\t') | |
transcript_attributes = f'ID=transcript:{gene_id};Parent=gene:{gene_id};Name={name};biotype=protein_coding;transcript_id={gene_id}' | |
print(feature_group.transcript.chrom, 'EMBL', 'transcript', feature_group.transcript.start, feature_group.transcript.stop, '.', feature_group.transcript.strand, '.', transcript_attributes, sep = '\t') | |
cds_attributes = f'Parent=transcript:{gene_id};Name={name}' | |
for c in feature_group.CDSs: | |
print(c.chrom, 'EMBL', 'CDS', c.start, c.stop, '.', c.strand, '0', cds_attributes, sep = '\t') | |
def main(): | |
''' | |
Ensembl gff should have one gene and one transcript per "feature group" | |
Then, can have multiple CDS/exons | |
read in from the input gff, one FeatueGroup instance has one gene, one transcript and (potentially) | |
multiple CDS/exon | |
Exons aren't printed, as not needed bt bcftools csq | |
''' | |
gff_handle, db_handle = get_args() | |
fn = gffutils.example_filename(gff_handle) | |
db = gffutils.create_db(fn, dbfn=db_handle, force=True, keep_order=True,merge_strategy='merge', sort_attribute_values=True) | |
feature_groups = group_features(db) | |
check_feature_groups(feature_groups) | |
print_features(feature_groups) | |
## gff input is generated from a genbank file by bp_genbank2gff3.pl | |
if __name__ == '__main__': | |
main() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment