Skip to content

Instantly share code, notes, and snippets.

@johnsolk
Last active March 13, 2017 22:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save johnsolk/5375428dba77362c2a7875e9cf27e870 to your computer and use it in GitHub Desktop.
Save johnsolk/5375428dba77362c2a7875e9cf27e870 to your computer and use it in GitHub Desktop.
dammit_annotations_parsing
##gff-version 3.2.1
Transcript_0 transdecoder CDS 1664 2062 . + . ID=cds.Transcript_0|m.1;Parent=Transcript_0|m.1
Transcript_0 transdecoder exon 1 2064 . + . ID=Transcript_0|m.1.exon1;Parent=Transcript_0|m.1
Transcript_0 transdecoder five_prime_UTR 1 1663 . + . ID=Transcript_0|m.1.utr5p1;Parent=Transcript_0|m.1
Transcript_0 transdecoder gene 1 2064 . + . ID=Transcript_0|g.1;Name=ORF%20Transcript_0%7Cg.1%20Transcript_0%7Cm.1%20type%3A3prime_partial%20len%3A134%20%28%2B%29
Transcript_0 transdecoder mRNA 1 2064 . + . ID=Transcript_0|m.1;Parent=Transcript_0|g.1;Name=ORF%20Transcript_0%7Cg.1%20Transcript_0%7Cm.1%20type%3A3prime_partial%20len%3A134%20%28%2B%29
Transcript_0 transdecoder three_prime_UTR 2063 2064 . + . ID=Transcript_0|m.1.utr3p1;Parent=Transcript_0|m.1
Transcript_100002 transdecoder CDS 1 1047 . + . ID=cds.Transcript_100002|m.114344;Parent=Transcript_100002|m.114344
Transcript_100002 transdecoder CDS 1411 1722 . + . ID=cds.Transcript_100002|m.114346;Parent=Transcript_100002|m.114346
Transcript_100002 transdecoder CDS 936 1316 . - . ID=cds.Transcript_100002|m.114345;Parent=Transcript_100002|m.114345
Transcript_100002 transdecoder exon 1 1767 . + . ID=Transcript_100002|m.114344.exon1;Parent=Transcript_100002|m.114344
Transcript_100002 transdecoder exon 1 1767 . - . ID=Transcript_100002|m.114345.exon1;Parent=Transcript_100002|m.114345
Transcript_100002 transdecoder exon 1 1767 . + . ID=Transcript_100002|m.114346.exon1;Parent=Transcript_100002|m.114346
Transcript_100002 transdecoder five_prime_UTR 1 1410 . + . ID=Transcript_100002|m.114346.utr5p1;Parent=Transcript_100002|m.114346
Transcript_100002 transdecoder five_prime_UTR 1317 1767 . - . ID=Transcript_100002|m.114345.utr5p1;Parent=Transcript_100002|m.114345
Transcript_100002 transdecoder gene 1 1767 . + . ID=Transcript_100002|g.114344;Name=ORF%20Transcript_100002%7Cg.114344%20Transcript_100002%7Cm.114344%20type%3A5prime_partial%20len%3A349%20%28%2B%29
Transcript_100002 transdecoder gene 1 1767 . - . ID=Transcript_100002|g.114345;Name=ORF%20Transcript_100002%7Cg.114345%20Transcript_100002%7Cm.114345%20type%3Acomplete%20len%3A127%20%28-%29
Transcript_100002 transdecoder gene 1 1767 . + . ID=Transcript_100002|g.114346;Name=ORF%20Transcript_100002%7Cg.114346%20Transcript_100002%7Cm.114346%20type%3Acomplete%20len%3A104%20%28%2B%29
Transcript_100002 transdecoder mRNA 1 1767 . + . ID=Transcript_100002|m.114344;Parent=Transcript_100002|g.114344;Name=ORF%20Transcript_100002%7Cg.114344%20Transcript_100002%7Cm.114344%20type%3A5prime_partial%20len%3A349%20%28%2B%29
Transcript_100002 transdecoder mRNA 1 1767 . - . ID=Transcript_100002|m.114345;Parent=Transcript_100002|g.114345;Name=ORF%20Transcript_100002%7Cg.114345%20Transcript_100002%7Cm.114345%20type%3Acomplete%20len%3A127%20%28-%29
Transcript_100002 transdecoder mRNA 1 1767 . + . ID=Transcript_100002|m.114346;Parent=Transcript_100002|g.114346;Name=ORF%20Transcript_100002%7Cg.114346%20Transcript_100002%7Cm.114346%20type%3Acomplete%20len%3A104%20%28%2B%29
Transcript_100002 transdecoder three_prime_UTR 1048 1767 . + . ID=Transcript_100002|m.114344.utr3p1;Parent=Transcript_100002|m.114344
Transcript_100002 transdecoder three_prime_UTR 1723 1767 . + . ID=Transcript_100002|m.114346.utr3p1;Parent=Transcript_100002|m.114346
Transcript_100002 transdecoder three_prime_UTR 1 935 . - . ID=Transcript_100002|m.114345.utr3p1;Parent=Transcript_100002|m.114345
Transcript_100003 shmlast.LAST conditional_reciprocal_best_LAST 48 538 8.600000e-184 + . ID=homology:230094;Name=gi|768937493|ref|XP_011608972.1| PREDICTED: uncharacterized protein LOC105417370 [Takifugu rubripes];Target=gi|768937493|ref|XP_011608972.1| PREDICTED: uncharacterized protein LOC105417370 [Takifugu rubripes] 5 498 +;database=protein.fa
# loops through directories with annotations from multiple species
# sorts each by ID
# drops all rows not annotated against the custom reference protein database (keeps only rows whose Name starts with "gi")
# returns transcript ID and gene name
# writes to .csv file
import pandas as pd
import os
# requires dammit env:
# source activate py3.dammit
from dammit.fileio.gff3 import GFF3Parser
# Export per-species gene-name tables from dammit GFF3 annotations.
#
# Each entry of `dammit_dir` is treated as one species' dammit output
# directory named "<genus_species>.<anything>"; the GFF3 file inside it is
# read, sorted by transcript ID, and its (seqid, Name) columns are written
# to a CSV file in `names_out_dir`.
dammit_dir = '/home/ljcohen/osmotic_damit/'
names_out_dir = '/home/ljcohen/osmotic_assemblies_farm/'
dammit_dirs = os.listdir(dammit_dir)
print(dammit_dirs)
for dammit_dirname in dammit_dirs:
    # "sbatch_files" is not a species annotation directory
    # (presumably it holds job scripts -- TODO confirm).
    if dammit_dirname == "sbatch_files":
        continue

    # The species label is everything before the first "." in the dir name.
    genus_species = dammit_dirname.split(".")[0]
    dammit_gff = os.path.join(
        dammit_dir,
        dammit_dirname,
        genus_species + ".trinity_out.Trinity.fasta.dammit.gff3",
    )
    print(dammit_gff)

    # os.listdir() also returns plain files and directories without a
    # finished annotation; report and skip those instead of aborting the
    # whole batch on the first missing GFF3.
    if not os.path.isfile(dammit_gff):
        print("Skipping, GFF3 not found:", dammit_gff)
        continue

    annotations = GFF3Parser(filename=dammit_gff).read()

    # Every row (including rows without a Name), ordered by transcript ID.
    all_names = annotations.sort_values(by=['seqid'], ascending=True)[['seqid', 'Name']]

    # Subset whose Name starts with "gi": rows lacking a Name are dropped
    # first so that .str.startswith() yields a clean boolean mask.
    annotations = annotations.dropna(subset=['Name'])
    fund = annotations[annotations['Name'].str.startswith("gi")]
    names = fund.sort_values(by=['seqid'], ascending=True)[['seqid', 'Name']]
    names_out = os.path.join(
        names_out_dir,
        genus_species + '.trinity_out.Trinity.fasta.Fundulus.genenames.csv',
    )
    # NOTE: writing the "gi"-only table is currently disabled; re-enable
    # the two lines below to produce it.
    #names.to_csv(names_out)
    #print("Written:",names_out)

    all_names_out = os.path.join(
        names_out_dir,
        genus_species + '.trinity_out.Trinity.fasta.all_gene_names.csv',
    )
    all_names.to_csv(all_names_out)
    print("Written:", all_names_out)
# takes the dammit .gff3 annotation file for a Trinity .fasta assembly
# sorts by ID and e-value
# checks that score < 1e-05
# drops duplicates (picks lowest e-value)
# returns transcript ID and gene name
# writes to .csv file
import pandas as pd
from dammit.fileio.gff3 import GFF3Parser
# Read the dammit GFF3 annotations into a DataFrame.
gff_path = 'mahi.trinity_out.Trinity.fasta.dammit.gff3'
annotations = GFF3Parser(filename=gff_path).read()

# Discard the rows whose source is transdecoder; the rest are treated as
# alignment hits.
alignments = annotations.query('source != "transdecoder"')

# Order by transcript ID, then by ascending score, so that the first row of
# each transcript is its lowest-scoring hit.
ranked = alignments.sort_values(by=['seqid', 'score'], ascending=True)

# Keep only hits under the 1e-05 score cutoff.
significant = ranked.query('score < 1e-05')

# One row per transcript: the first (lowest-score) row of each seqid.
best_hits = significant.drop_duplicates(subset='seqid', keep='first')

# Write transcript ID and gene name only.
names = best_hits[['seqid', 'Name']]
names.to_csv('genenames.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment