Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save johnsolk/cf1b48e082f8bd42294a3fea37946f6f to your computer and use it in GitHub Desktop.
Save johnsolk/cf1b48e082f8bd42294a3fea37946f6f to your computer and use it in GitHub Desktop.
Transcriptome annotation is a mess.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"# requires dammit env\n",
"# source activate dammit\n",
"from dammit.fileio.gff3 import GFF3Parser"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"L_goodei_transfer_2\n",
"L_goodei_BW_3\n",
"L_goodei_BW_1\n",
"L_goodei_transfer_3\n",
"L_goodei_transfer_1\n",
"L_goodei_BW_2\n",
"L_goodei_FW_2\n",
"L_goodei_FW_1\n",
"L_goodei_FW_3\n",
"F_notti_FW_2\n",
"F_notti_FW_1\n",
"L_parva_FW_3\n",
"L_parva_FW_1\n",
"L_parva_transfer_3\n",
"L_parva_transfer_1\n",
"L_parva_FW_2\n",
"L_parva_transfer_2\n",
"L_parva_BW_2\n",
"L_parva_BW_1\n",
"L_parva_BW_3\n",
"F_similis_transfer_1\n",
"F_similis_BW_1\n",
"F_similis_transfer_3\n",
"F_similis_BW_3\n",
"F_similis_BW_2\n",
"F_similis_transfer_2\n",
"F_similis_FW_2\n",
"F_similis_FW_3\n",
"F_similis_FW_1\n",
"F_olivaceous_FW_2\n",
"F_olivaceous_transfer_2\n",
"F_olivaceous_FW_1\n",
"F_olivaceous_transfer_1\n",
"F_olivaceous_FW_3\n",
"F_olivaceous_BW_3\n",
"F_olivaceous_BW_1\n",
"F_olivaceous_BW_2\n",
"F_notatus_FW_2\n",
"F_notatus_FW_3\n",
"F_notatus_FW_1\n",
"F_notatus_transfer_1\n",
"F_notatus_transfer_3\n",
"F_notatus_BW_1\n",
"F_notatus_BW_3\n",
"F_notatus_transfer_2\n",
"F_notatus_BW_2\n",
"F_heteroclitusMDPP_FW_3\n",
"F_heteroclitusMDPP_FW_1\n",
"F_heteroclitusMDPP_FW_2\n",
"F_heteroclitusMDPP_BW_2\n",
"F_heteroclitusMDPP_transfer_2\n",
"F_heteroclitusMDPP_transfer_3\n",
"F_heteroclitusMDPP_BW_1\n",
"F_heteroclitusMDPP_transfer_1\n",
"F_heteroclitusMDPP_BW_3\n",
"F_heteroclitusMDPL_transfer_3\n",
"F_heteroclitusMDPL_FW_2\n",
"F_heteroclitusMDPL_transfer_1\n",
"F_heteroclitusMDPL_FW_3\n",
"F_heteroclitusMDPL_transfer_2\n",
"F_heteroclitusMDPL_FW_1\n",
"F_heteroclitusMDPL_BW_1\n",
"F_heteroclitusMDPL_BW_3\n",
"F_heteroclitusMDPL_BW_2\n",
"F_parvapinis_FW_1\n",
"F_parvapinis_transfer_2\n",
"F_parvapinis_FW_3\n",
"F_parvapinis_transfer_1\n",
"F_parvapinis_FW_2\n",
"F_parvapinis_BW_2\n",
"F_parvapinis_BW_3\n",
"F_parvapinis_BW_1\n",
"F_diaphanus_BW_1\n",
"F_diaphanus_BW_2\n",
"F_diaphanus_transfer_2\n",
"F_diaphanus_FW_2\n",
"F_diaphanus_transfer_1\n",
"F_diaphanus_FW_3\n",
"F_catanatus_BW_2\n",
"F_catanatus_transfer_1\n",
"F_catanatus_BW_3\n",
"F_catanatus_transfer_2\n",
"F_catanatus_BW_1\n",
"F_catanatus_FW_1\n",
"F_catanatus_FW_2\n",
"F_zebrinus_FW_2\n",
"F_zebrinus_FW_1\n",
"F_zebrinus_BW_1\n",
"F_zebrinus_BW_2\n",
"F_sciadicus_transfer_1\n",
"F_sciadicus_BW_1\n",
"F_sciadicus_FW_1\n",
"F_sciadicus_FW_2\n",
"F_grandis_transfer_3\n",
"F_grandis_FW_2\n",
"F_grandis_transfer_1\n",
"F_grandis_FW_3\n",
"F_grandis_transfer_2\n",
"F_grandis_FW_1\n",
"F_grandis_BW_1\n",
"F_grandis_BW_3\n",
"F_grandis_BW_2\n",
"F_rathbuni_BW_2\n",
"F_rathbuni_BW_3\n",
"F_rathbuni_BW_1\n",
"F_rathbuni_FW_1\n",
"F_rathbuni_transfer_2\n",
"F_rathbuni_FW_3\n",
"F_rathbuni_transfer_1\n",
"F_rathbuni_FW_2\n",
"F_rathbuni_transfer_3\n",
"F_chrysotus_BW_2\n",
"F_chrysotus_BW_1\n",
"F_chrysotus_BW_3\n",
"F_chrysotus_FW_3\n",
"F_chrysotus_transfer_1\n",
"F_chrysotus_FW_1\n",
"F_chrysotus_FW_2\n",
"F_chrysotus_transfer_2\n",
"A_xenica_FW_2\n",
"A_xenica_transfer_2\n",
"A_xenica_FW_1\n",
"A_xenica_transfer_3\n",
"A_xenica_transfer_1\n",
"A_xenica_FW_3\n",
"A_xenica_BW_3\n",
"A_xenica_BW_1\n",
"A_xenica_BW_2\n"
]
}
],
"source": [
"# Build a transcript-to-gene lookup table for each species from its Salmon quant files.\n",
"# Layout read here: <species_dirs>/<species>/<quant_dir>/<something>.sf\n",
"species_dirs = \"salmon_denovo_by_species\"\n",
"out_dir = \"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/\"\n",
"dirs = os.listdir(species_dirs)\n",
"for species in dirs:\n",
"    # Skip the macOS Finder metadata file that os.listdir also returns.\n",
"    if species == \".DS_Store\":\n",
"        continue\n",
"    species_gene_file = species + \"_gene_transcript_table.txt\"\n",
"    files = os.listdir(os.path.join(species_dirs, species))\n",
"    for quant_dir in files:\n",
"        if quant_dir == \".DS_Store\":\n",
"            continue\n",
"        # The replicate label is everything before the first '.' of the directory name.\n",
"        replicate = quant_dir.split(\".\")[0]\n",
"        print(replicate)\n",
"        quant_path = os.path.join(species_dirs, species, quant_dir)\n",
"        quant_files = os.listdir(quant_path)\n",
"        for file in quant_files:\n",
"            if not file.endswith(\".sf\"):\n",
"                continue\n",
"            with open(os.path.join(quant_path, file)) as qf:\n",
"                next(qf, None)  # discard the header row; tolerate an empty file\n",
"                expression_quant_data = qf.readlines()\n",
"            # NOTE: the table is opened in \"w\" mode for every replicate, so the last\n",
"            # replicate listed for a species is the one that ends up on disk.\n",
"            with open(out_dir + species_gene_file, \"w\") as gt:\n",
"                for line in expression_quant_data:\n",
"                    transcript = line.split(\"\\t\")[0]\n",
"                    # Drop the trailing isoform suffix (\"_i<N>\") to get the gene id.\n",
"                    # Slicing off a fixed 3 characters broke on isoforms >= 10 (\"_i12\").\n",
"                    # Assumes Trinity-style ids (..._g1_i1) -- TODO confirm.\n",
"                    gene = transcript.rsplit(\"_i\", 1)[0]\n",
"                    gt.write(transcript + \"\\t\" + gene + \"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['F_notatus.trinity_out.Trinity.fasta.dammit.gff3', '.DS_Store', 'F_rathbuni.trinity_out.Trinity.fasta.dammit.gff3', 'F_diaphanus.trinity_out.Trinity.fasta.dammit.gff3', 'F_chrysotus.trinity_out.Trinity.fasta.dammit.gff3', 'L_goodei.trinity_out.Trinity.fasta.dammit.gff3', 'F_heteroclitusMDPL.trinity_out.Trinity.fasta.dammit.gff3', 'F_heteroclitusMDPP.trinity_out.Trinity.fasta.dammit.gff3', 'F_similis.trinity_out.Trinity.fasta.dammit.gff3', 'L_parva.trinity_out.Trinity.fasta.dammit.gff3', 'A_xenica.trinity_out.Trinity.fasta.dammit.gff3', 'F_catanatus.trinity_out.Trinity.fasta.dammit.gff3', 'F_zebrinus.trinity_out.Trinity.fasta.dammit.gff3', 'F_parvapinis.trinity_out.Trinity.fasta.dammit.gff3', 'F_grandis.trinity_out.Trinity.fasta.dammit.gff3', 'F_sciadicus.trinity_out.Trinity.fasta.dammit.gff3', 'F_olivaceous.trinity_out.Trinity.fasta.dammit.gff3', 'F_notti.trinity_out.Trinity.fasta.dammit.gff3']\n",
"F_notatus.trinity_out.Trinity.fasta.dammit.gff3\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/johnsolk/anaconda3/lib/python3.6/site-packages/dammit/fileio/gff3.py:73: ParserWarning: Both a converter and dtype were specified for column attributes - only the converter will be used\n",
" dtype=dict(self.columns)):\n",
"/Users/johnsolk/anaconda3/lib/python3.6/site-packages/dammit/fileio/gff3.py:73: ParserWarning: Both a converter and dtype were specified for column attributes - only the converter will be used\n",
" dtype=dict(self.columns)):\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"F_rathbuni.trinity_out.Trinity.fasta.dammit.gff3\n",
"F_diaphanus.trinity_out.Trinity.fasta.dammit.gff3\n",
"F_chrysotus.trinity_out.Trinity.fasta.dammit.gff3\n",
"L_goodei.trinity_out.Trinity.fasta.dammit.gff3\n",
"F_heteroclitusMDPL.trinity_out.Trinity.fasta.dammit.gff3\n",
"F_heteroclitusMDPP.trinity_out.Trinity.fasta.dammit.gff3\n",
"F_similis.trinity_out.Trinity.fasta.dammit.gff3\n",
"L_parva.trinity_out.Trinity.fasta.dammit.gff3\n",
"A_xenica.trinity_out.Trinity.fasta.dammit.gff3\n",
"F_catanatus.trinity_out.Trinity.fasta.dammit.gff3\n",
"F_zebrinus.trinity_out.Trinity.fasta.dammit.gff3\n",
"F_parvapinis.trinity_out.Trinity.fasta.dammit.gff3\n",
"F_grandis.trinity_out.Trinity.fasta.dammit.gff3\n",
"F_sciadicus.trinity_out.Trinity.fasta.dammit.gff3\n",
"F_olivaceous.trinity_out.Trinity.fasta.dammit.gff3\n",
"F_notti.trinity_out.Trinity.fasta.dammit.gff3\n"
]
}
],
"source": [
"# Parse every dammit GFF3 annotation file and write two per-species tables:\n",
"#   <species>_Fhet_genenames.csv           -- all rows whose Name starts with \"gi\"\n",
"#   <species>_onegenenamepertranscript.csv -- one (seqid, Name) row per seqid\n",
"annotations_dir = \"/Users/johnsolk/Documents/UCDavis/Whitehead/gff_annotations/\"\n",
"intermediate_dir = \"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/\"\n",
"annotation_files = os.listdir(annotations_dir)\n",
"print(annotation_files)\n",
"for annotation in annotation_files:\n",
"    # Skip the macOS Finder metadata file that os.listdir also returns.\n",
"    if annotation == \".DS_Store\":\n",
"        continue\n",
"    species = annotation.split(\".\")[0]\n",
"    print(annotation)\n",
"    # Keep the parsed table under its own name: rebinding the list being iterated\n",
"    # (as the previous version did) leaves confusing state behind in the kernel.\n",
"    gff = GFF3Parser(filename=annotations_dir + annotation).read()\n",
"    named = gff.dropna(subset=['Name'])\n",
"    # Lowest-score row per seqid among rows with score < 1e-05\n",
"    # (score is presumably an e-value -- TODO confirm against dammit's GFF3 output).\n",
"    pickonename = (\n",
"        named.sort_values(by=['seqid', 'score'], ascending=True)\n",
"        .query('score < 1e-05')\n",
"        .drop_duplicates(subset='seqid')[['seqid', 'Name']]\n",
"        .dropna(axis=0, how=\"all\")\n",
"    )\n",
"    # Names starting with \"gi\" -- presumably the F. heteroclitus reference hits,\n",
"    # judging by the output file name; verify.\n",
"    fund = named[named['Name'].str.startswith(\"gi\")]\n",
"    names = fund.sort_values(by=['seqid'], ascending=True)[['seqid', 'Name']]\n",
"    fund_names = intermediate_dir + species + \"_Fhet_genenames.csv\"\n",
"    pickone = intermediate_dir + species + \"_onegenenamepertranscript.csv\"\n",
"    # The index column is written on purpose: the merge step reads these files\n",
"    # back and drops the resulting 'Unnamed: 0' column.\n",
"    names.to_csv(fund_names)\n",
"    pickonename.to_csv(pickone)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_diaphanus_namemap2.csv\n",
"(384218, 4)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_diaphanus_Fhet_genenames.csv\n",
"(134129, 3)\n",
"(384218, 6)\n",
"(455093, 7)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_chrysotus_namemap2.csv\n",
"(396400, 4)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_chrysotus_Fhet_genenames.csv\n",
"(168861, 3)\n",
"(396400, 6)\n",
"(485204, 7)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/L_goodei_namemap2.csv\n",
"(385476, 4)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/L_goodei_Fhet_genenames.csv\n",
"(168019, 3)\n",
"(385476, 6)\n",
"(478428, 7)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_heteroclitusMDPL_namemap2.csv\n",
"(592419, 4)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_heteroclitusMDPL_Fhet_genenames.csv\n",
"(149975, 3)\n",
"(592419, 6)\n",
"(663574, 7)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_grandis_namemap2.csv\n",
"(809060, 4)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_grandis_Fhet_genenames.csv\n",
"(182607, 3)\n",
"(809060, 6)\n",
"(896995, 7)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_olivaceous_namemap2.csv\n",
"(350265, 4)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_olivaceous_Fhet_genenames.csv\n",
"(157612, 3)\n",
"(350265, 6)\n",
"(438933, 7)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_zebrinus_namemap2.csv\n",
"(266978, 4)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_zebrinus_Fhet_genenames.csv\n",
"(119383, 3)\n",
"(266978, 6)\n",
"(329326, 7)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_catanatus_namemap2.csv\n",
"(405866, 4)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_catanatus_Fhet_genenames.csv\n",
"(157130, 3)\n",
"(405866, 6)\n",
"(484458, 7)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_sciadicus_namemap2.csv\n",
"(241279, 4)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_sciadicus_Fhet_genenames.csv\n",
"(118097, 3)\n",
"(241279, 6)\n",
"(304539, 7)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_heteroclitusMDPP_namemap2.csv\n",
"(668487, 4)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_heteroclitusMDPP_Fhet_genenames.csv\n",
"(146014, 3)\n",
"(668487, 6)\n",
"(736813, 7)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_similis_namemap2.csv\n",
"(520319, 4)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_similis_Fhet_genenames.csv\n",
"(161532, 3)\n",
"(520319, 6)\n",
"(607296, 7)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/A_xenica_namemap2.csv\n",
"(362783, 4)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/A_xenica_Fhet_genenames.csv\n",
"(161802, 3)\n",
"(362783, 6)\n",
"(451127, 7)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_parvapinis_namemap2.csv\n",
"(352346, 4)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_parvapinis_Fhet_genenames.csv\n",
"(146111, 3)\n",
"(352346, 6)\n",
"(435605, 7)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_notatus_namemap2.csv\n",
"(416299, 4)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_notatus_Fhet_genenames.csv\n",
"(178608, 3)\n",
"(416299, 6)\n",
"(512589, 7)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/L_parva_namemap2.csv\n",
"(409543, 4)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/L_parva_Fhet_genenames.csv\n",
"(159764, 3)\n",
"(409543, 6)\n",
"(495881, 7)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_notti_namemap2.csv\n",
"(159771, 4)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_notti_Fhet_genenames.csv\n",
"(91102, 3)\n",
"(159771, 6)\n",
"(209036, 7)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_rathbuni_namemap2.csv\n",
"(501222, 4)\n",
"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_rathbuni_Fhet_genenames.csv\n",
"(176180, 3)\n",
"(501222, 6)\n",
"(595482, 7)\n"
]
}
],
"source": [
"# For every dammit name map: rewrite it with gene-level ids, then merge it with\n",
"# the per-species annotation tables and write <species>_gene_names.csv.\n",
"namemap_dir = \"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/\"\n",
"gene_names_dir = \"/Users/johnsolk/Documents/UCDavis/Whitehead/annotation_gene_names/\"\n",
"maps = os.listdir(namemap_dir)\n",
"for namemap in maps:\n",
"    if not namemap.endswith(\"dammit.namemap.csv\"):\n",
"        continue\n",
"    species = namemap.split(\".\")[0]\n",
"    namemap2 = namemap_dir + species + \"_namemap2.csv\"\n",
"    with open(namemap_dir + namemap) as f:\n",
"        header = next(f)\n",
"        # Two extra columns, both filled with the gene id below.\n",
"        new_header = header.strip() + \",seqid,Name\"\n",
"        ids = f.readlines()\n",
"    with open(namemap2, \"w\") as n:\n",
"        n.write(new_header + \"\\n\")\n",
"        for line in ids:\n",
"            if not line.strip():\n",
"                continue  # ignore blank lines instead of emitting empty rows\n",
"            fields = line.split(\",\")\n",
"            renamed = fields[-1].strip()\n",
"            # First field is the full original header; keep only the contig id\n",
"            # (text before the first space, without the CSV quote).\n",
"            trinity_contig = fields[0].split(\" \")[0].strip('\"')\n",
"            # Drop the trailing isoform suffix (\"_i<N>\") to get the gene id.\n",
"            # Slicing off a fixed 3 characters broke on isoforms >= 10 (\"_i12\").\n",
"            gene = trinity_contig.rsplit(\"_i\", 1)[0]\n",
"            n.write(\",\".join([trinity_contig, renamed, gene, gene]) + \"\\n\")\n",
"    onename_path = namemap_dir + species + \"_onegenenamepertranscript.csv\"\n",
"    Fhet = namemap_dir + species + \"_Fhet_genenames.csv\"\n",
"    ann = pd.read_csv(onename_path)\n",
"    dammit_Trinity = pd.read_csv(namemap2)\n",
"    print(namemap2)\n",
"    print(dammit_Trinity.shape)\n",
"    species_Fhet = pd.read_csv(Fhet)\n",
"    print(Fhet)\n",
"    print(species_Fhet.shape)\n",
"    # 'Unnamed: 0' is the index column saved when these CSVs were written.\n",
"    species_Fhet = species_Fhet.drop(['Unnamed: 0'], axis=1)\n",
"    ann = ann.drop(['Unnamed: 0'], axis=1)\n",
"    # Both frames carry seqid/Name, so pandas suffixes them _x (name map) and _y (annotation).\n",
"    combined = pd.merge(dammit_Trinity, ann, how='outer', left_on=\"renamed\", right_on=\"seqid\")\n",
"    print(combined.shape)\n",
"    combined = combined.drop(['renamed'], axis=1)\n",
"    combined = pd.merge(combined, species_Fhet, how=\"outer\", left_on=\"seqid_y\", right_on=\"seqid\")\n",
"    print(combined.shape)\n",
"    # Where the annotation supplied no Name, fall back to the gene id.\n",
"    combined['Name_y'] = combined['Name_y'].fillna(combined['seqid_x'])\n",
"    combined = combined.drop(['Name_x'], axis=1)\n",
"    combined.to_csv(gene_names_dir + species + \"_gene_names.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment