Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created April 15, 2018 10:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/83acefb2af564a11d22b4bbc9e0aac18 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/83acefb2af564a11d22b4bbc9e0aac18 to your computer and use it in GitHub Desktop.
Script written by Diogo Ribeiro to identify gene family expansions in an in-house Compara database
#!/usr/bin/env python3
#25-Feb-2015 dr7
#Analysis suite for exploring Compara families. The purpose is to gather information about Compara trees to that we can mine this large set of information and select the most interesting families to study.
import sys
import os
import re
import gzip
import random
import pickle
from numpy import mean, median, std #numpy.std gets the population standard deviation, not the sample standard deviation
from glob import glob
from pprint import pprint
from time import clock
from warnings import warn
#dependency
import findLastCommonAncestor #dr7 deployed and tested code
from Bio import Phylo
time0 = clock()
class Run(object):
def __init__(self, comparaFamilies,familiesPerTaxonomyLevel,geneDuplicationsFolder,geneLossesFolder,speciesNameTag,geneIDMappingFolder, \
interproscanGFFFolder,branchLengths,removeOutgroupsFile,speciesTreeFile,verbose,outputFile,dumpFamilies,flagFamilyList,speciesClade, \
geneFilterList,dumpsFolder,nodeInfo,proteinLengthsFolder):
"""Initialization of all class variables. Explains data structures."""
#inputs / arguments
self.comparaFamiliesFile = open(comparaFamilies,"r")
self.familiesPerTaxonomyLevelFile = familiesPerTaxonomyLevel
self.speciesTreeFile = speciesTreeFile
self.geneDuplicationsFolder = geneDuplicationsFolder
self.geneLossesFolder = geneLossesFolder
self.speciesNameTagFile = open(speciesNameTag,"r")
self.geneIDMappingFolder = geneIDMappingFolder
self.interproscanGFFFolder = interproscanGFFFolder
self.branchLengthsFile = open(branchLengths,"r")
self.cladeFile = open(speciesClade,"r")
self.nodeInfoFile = open(nodeInfo,"r")
#optional inputs / arguments
self.verbose = verbose
self.outputFile = open(outputFile,"w")
self.removeOutgroupsFile = removeOutgroupsFile
self.dumpFamiliesFile = dumpFamilies
self.flagFamilyListFile = flagFamilyList
self.geneFilterListFile = geneFilterList
self.dumpsFolder = dumpsFolder
self.proteinLengthsFolder = proteinLengthsFolder
#hard-coded 'arguments'
self.jobID = str(random.randint(1,100000))
self.targetSources = ["Pfam","TMHMM","SignalP_EUK"] #the types of GFF source entries that we are interested in storing in memory. #this is kept here as it may be modified, but not by user.
self.freeLivingSpecies = ["caenorhabditis_elegans","panagrellus_redivivus","rhabditophanes_kr3021","pristionchus_pacificus","schmidtea_mediterranea"] # 50HG Species that are completely free-living, so that I can distinguish parasitic-specific gene families. #This is just for helminth species, other non-helminth outgroups would be free-living too.
#data structures #main connector is familyID or species
self.familiesDict = {} #key -> familyID, val -> dict: key -> species, val -> list of genes
self.familiesTaxon = {} #key -> familyID, val -> taxonomy level
self.numTaxonBelow = {} #key -> taxon ID, val -> number species below
self.familiesGeneDuplication = {} #key -> familyID, val -> sum of duplications on throughout all species nodes
self.familiesGeneDuplicationMaxNode = {} #key -> familyID, val -> node ID with most duplication events
self.familiesGeneLosses = {} #key -> familyID, val -> sum of losses on throughout all species nodes
self.tagSpeciesName = {} #key -> species_tag, val -> species_name #this is to connect species to interproscan files
self.geneIDMapping = {} #key -> species, val -> dict: key -> geneID(interpro), val -> list of other geneIDs #this is to connect genes in interproscan to genes in families.txt
self.geneIDMappingAllEntries = {} #key -> mRNA ID, val -> list of other IDs
self.interproscanData = {} #key -> species, value -> dict; key -> gene, value -> dict; key-> source, val-> all lines attributed to that gene/source
self.branchLengths = {} #key -> familyID, val -> average branch length in family
self.cladePerSpecies = {} #key -> species, val -> clade
self.speciesPerClade = {} #key -> clade, value -> number of species in the clade (based on the species tree)
self.nodeInfo = {} #key -> node ID, list of clades potentially present under that node. e.g. 'bilateria40001012': {'I', 'IV', 'Flatworm', 'Flukes', 'Tapeworm', 'V', 'III'}
self.nodeInfoSpp = {} #key -> node ID, list of species present under that node.
self.proteinLengths = {} #key -> species, val -> dict ; key -> gene, val -> protein length in aa
self.familiesDictLengths = {} #key -> familyID, val -> dict: key -> species, val -> sum of protein length of proteins in that family and species.
#optional data structures
self.speciesTree = {} #key -> parent, value -> child. Done from a newick tree with external script "findLastCommonAncestor"
self.removeOutgroups = [] #list of node/leaf ids + species/node-name to remove from analysis
self.speciesToRemove = [] #list of species to remove from analysis #to apply when processing families.txt
self.familiesToFlag = [] #list of family IDs of families to have 'exclude' flag
self.geneFilterList = [] #list of gene IDs to filter out
self.listOfSpecies = [] #list of species we process here #based on the species tree!
##Notes:
#The species name should be lower case and with underscore e.g. panagrellus_redivivus
#The family IDs should be an int
#geneFilterListFile: added gene filtering into the readFamily step. Most measures will be (re)calculated in this script itself (e.g. gene tree root is recalculated). Note though that the following measures won't be correct/updated if gene filtering: Total_duplications, Total_losses, Max_duplications_node, Median_branch_length. These are external measures and calculated only before the gene filtering.
def readFamilies(self):
"""Parse compara families file. This file can contain all species and genes (i.e. can be Compara families raw file).
Also does the species filtering (i.e. only counts genes in species that are not listed to remove).
Also does gene filtering, if given a list of genes to discard.
Only retains families that have at least 2 genes after filtering.
Also uses protein lengths instead of number of genes. instead of counting number of genes it sums up protein length of gene. (e.g. a species, instead of having 10 genes in that family has 3000 aa).
E.g. input file line: family 3 : SSTP_0000649300.1 (strongyloides_stercoralis) DME_0000314201-mRNA-1 (dracunculus_medinensis) TELCIR_18205 (teladorsagia_circumcincta) HPBE_0001583601-mRNA-1 (heligmosomoides_bakeri) """
if self.verbose: print ("Reading compara families files..")
comparaFamiliesFile = self.comparaFamiliesFile
speciesToRemove = self.speciesToRemove
geneFilterList = self.geneFilterList
proteinLengths = self.proteinLengths
familiesDict = {}
familiesDictLengths = {}
for line in comparaFamiliesFile: #for each family
if "----------" not in line:
line = line.strip()
spl = line.split(":",1) #doing only max of one split, this is because some IDs also contain ":"
assert (len(spl) == 2)
familyID = int(spl[0].split(" ")[1])
speciesDict = {} #key -> sp, val -> list of genes
speciesDictLengths = {} #same as previous but stores cumulative lengths of genes of a species as an integer
genes = spl[1].split(" ")[1:]
for i in range(0,len(genes),2):
gene,sp = genes[i],genes[i+1]
if gene not in geneFilterList: #otherwise it is ignored, as if not existing. After this there should be a condition excluding potentially empty families.
sp = sp.replace("(","").replace(")","")
if sp not in speciesToRemove: #validated that this works well by looking at a couple of families
if sp not in speciesDict:
speciesDict[sp] = []
speciesDictLengths[sp] = 0
speciesDict[sp].append(gene)
#for protein lengths instead of gene numbers
proteinLength = int(proteinLengths[sp][gene] )
speciesDictLengths[sp]+= proteinLength
lens = [len(speciesDict[sp]) for sp in speciesDict] #numbers of genes
sumLens = sum(lens)
if len(speciesDict) > 0 and sumLens > 1: #and sum(list(speciesDict.values()) ) > 1: #there can be empty families, if all species of that tree are to be removed #also there could be single gene 'families' after filtering.
familiesDict[familyID] = speciesDict
familiesDictLengths[familyID] = speciesDictLengths
if self.verbose: print ("Total number compara families (from families.txt) to be processed, after filtering:\t%s" % len(familiesDict)) #this number does not have to match the total number of families, in case species are being removed
assert (len(familiesDict) == len(familiesDictLengths))
self.familiesDict = familiesDict
self.familiesDictLengths = familiesDictLengths
def readFamiliesPerTaxonomyLevel(self):
"""Reads families per taxonomy level. Note that, if the families.txt is filterd in this script, the input for this function may be a temporary file (corrected for new families) created in this script itself.
E.g. Hymenolepis microstoma 40001043 : 841230,912843,1018527,1058154,1059819,1081886,1082962,1090840,1117204,1117804,1118812"""
if self.verbose: print ("Reading %s file.." % self.familiesPerTaxonomyLevelFile)
familiesPerTaxonomyLevelFile = open(self.familiesPerTaxonomyLevelFile,"r")
familiesTaxon = {}
for line in familiesPerTaxonomyLevelFile:
line = line.strip()
taxon,families = line.split(":")
taxon = taxon.lower().strip().split(" ") #may need lower()
taxonNum = taxon[-1]
taxon = "_".join(taxon[:-1])+taxonNum #e.g. Nippostrongylus_brasiliensis40001108
families = families.strip().split(",")
for family in families:
family = int(family)
if family not in familiesTaxon:
familiesTaxon[family] = taxon
else:
print ("Two roots for family?",family)
if self.verbose: print ("Total families in families_per_taxonomy_level:\t%s" % len(familiesTaxon))
self.familiesTaxon = familiesTaxon
def _readTaxonFiles(self,inFolder):
"""E.g filename: echinostomatoidea40001020.txt. Inside file: 66838 2 (familyID\tNum_Events).
What I want is to combine all events for each family."""
removeOutgroups = self.removeOutgroups
outDict = {} #key -> family ID, val -> tot events
outDictNode = {} #key -> family ID, val -> dict; key -> nodeID, val -> tot events #to have node with most events
filesToProcess = glob(inFolder+"/*.txt")
for fi in filesToProcess:
fileName = fi.split("/")[-1].replace(".txt","")
if fileName in removeOutgroups: #exclude data from nodes/species that we want to exclude
continue
# #checking if taxon IDs match between files
# print (self.numTaxonBelow[fileName])
f = open(fi,"r")
for line in f:
if "#" in line:
continue
line = line.strip()
spl = line.split("\t")
familyID = int(spl[0])
events = int(spl[1])
if familyID not in outDict:
outDict[familyID] = 0
outDictNode[familyID] = {}
if fileName not in outDictNode[familyID]:
outDictNode[familyID][fileName] = 0
outDict[familyID]+=events
outDictNode[familyID][fileName]+=events
f.close()
return outDict,outDictNode
def readGeneDuplicationsFolder(self):
"""Process all files in Gene Duplications folder"""
if self.verbose: print ("Reading gene duplication files..")
geneDuplicationsFolder = self.geneDuplicationsFolder
run = self._readTaxonFiles(geneDuplicationsFolder)
familiesGeneDuplication = run[0]
nodeDict = run[1]
#getting most frequent node
familiesGeneDuplicationMaxNode = {}
for familyID in nodeDict:
maxVal = 0
maxNode = ""
if familyID not in familiesGeneDuplicationMaxNode:
familiesGeneDuplicationMaxNode[familyID] = ""
for node in nodeDict[familyID]:
val = nodeDict[familyID][node]
if val > maxVal:
maxVal = val
maxNode = node
familiesGeneDuplicationMaxNode[familyID] = str(maxVal)+"X:"+maxNode
##validation
# dr7@farm3-head2:/nfs/helminths02/analysis/50HGP/00ANALYSES/final_gene_duplications/cutoffm1$ g 1178810 *
# crassostrea_gigas40001175.txt:1178810 1
# schmidtea_mediterranea40001061.txt:1178810 2
# Family 126577: '37X:schmidtea_mediterranea40001061'
# dr7@farm3-head2:/nfs/helminths02/analysis/50HGP/00ANALYSES/final_gene_duplications/cutoffm1$ grep 126577 *
# ...
# ascaridoidea40001165.txt:126577 4
# bilateria40001006.txt:126577 12
# ...
# schmidtea_mediterranea40001061.txt:126577 37
# ...
if self.verbose: print ("Total families with gene duplication info:\t%s" % len(familiesGeneDuplication))
self.familiesGeneDuplication = familiesGeneDuplication
self.familiesGeneDuplicationMaxNode = familiesGeneDuplicationMaxNode
def readGeneLossesFolder(self):
"""Process all files in Gene Losses folder"""
if self.verbose: print ("Reading gene loss files..")
geneLossesFolder = self.geneLossesFolder
familiesGeneLosses = self._readTaxonFiles(geneLossesFolder)[0]
#Validation: 1135828: 6
# dr7@pcs5b:/nfs/helminths02/analysis/50HGP/00ANALYSES/final_gene_losses$ g 1135828 *
# chromadorea40001076.txt:1135828 1
# platyhelminthes40001014.txt:1135828 1
# romanomermis_culicivorax40001064.txt:1135828 1
# soboliphyme_baturini40001066.txt:1135828 1
# trichuris_muris40001075.txt:1135828 1
# trichuris_trichiura40001074.txt:1135828 1
if self.verbose: print ("Total families with gene losses info:\t%s" % len(familiesGeneLosses))
self.familiesGeneLosses = familiesGeneLosses
def readSpeciesNameTag(self):
"""Reads tag-species_name correspondence. E.g. PREFIX Genus species VERSION
ACAC Angiostrongylus cantonensis 1.5.4"""
if self.verbose: print ("Reading species name mapping files..")
speciesNameTagFile = self.speciesNameTagFile
tagSpeciesName = {}
for line in speciesNameTagFile:
if line.startswith('"This') or line.startswith("PREFIX") or "#" in line:
continue
line = line.strip()
spl = line.split("\t")
tag = spl[0]
sp = spl[1].lower().strip()+"_"+spl[2].strip() #strip because sometimes there is some empty spaces
tagSpeciesName[tag] = sp
if self.verbose: print ("Total species with locus tag info:\t%s" % len(tagSpeciesName))
self.tagSpeciesName = tagSpeciesName
def readGeneIDMappingFolder(self):
"""Reads Gene ID mapping files
E.g. acanthocheilonema_viteae nAv.1.0.1.g00113 nAv.1.0.1.t00113-RA nAv.1.0.1.t00113-RA nAv.1.0.1.t00113-RA nAvx1x0x1xt00113-RA"""
if self.verbose: print ("Reading id mapping files..")
geneIDMappingFolder = self.geneIDMappingFolder
geneIDMapping = {} #key -> species, val -> dict: key -> geneID(interpro), val -> list of other geneIDs #this is to connect genes in interproscan to genes in families.txt
geneIDMappingAllEntries = {} #key -> mRNA ID, val -> list of other IDs
#Note: not all genes are present in the interproscan files, so the total number of genes for a species here is not representative of the full dataset.
filesToProcess = glob(geneIDMappingFolder+"/*_id_mapping.txt")
for fi in filesToProcess:
f = open(fi,"r")
fileName = fi.split("/")[-1].replace("_id_mapping.txt","")
if fileName not in geneIDMapping:
geneIDMapping[fileName] = {}
for line in f:
if line.startswith("SPECIES_NAME"):
continue
line = line.strip()
spl = line.split("\t")
iprID = spl[5]
mRNA = spl[2]
gene = spl[1]
geneIDMappingAllEntries[mRNA] = spl[1:]
if iprID != "NA":
if iprID not in geneIDMapping[fileName]:
geneIDMapping[fileName][iprID] = []
geneIDMapping[fileName][iprID] = spl[1:-1] #first column is species name
f.close()
if self.verbose: print ("Total species with id mapping info:\t%s" % (len(geneIDMapping)) )
if self.verbose: print ("Total genes with id mapping info:\t%s" % (len(geneIDMappingAllEntries)) )
self.geneIDMapping = geneIDMapping
self.geneIDMappingAllEntries = geneIDMappingAllEntries
def readInterproGFFFolder(self):
"""Reads interproscan GFF files for all species. The files can be gzipped."""
if self.verbose: print ("Reading interproscan files..")
interproscanGFFFolder = self.interproscanGFFFolder
tagSpeciesName = self.tagSpeciesName
geneIDMapping = self.geneIDMapping
interproscanData = {} #key -> species, value -> dict; key -> gene, value -> dict; key-> source, val-> all lines attributed to that gene/source
filesToProcess = glob(interproscanGFFFolder+"/*ipr*")
IDsNotFound = {}
for fi in filesToProcess:
if fi.endswith(".gz"):
# f = gzip.open(fi, 'rb').read().decode('ascii').split("\n") #this might be memory hungry
f = gzip.open(fi, 'rb').read().decode('utf-8').split("\n") #this might be memory hungry # bh4 changed this to utf-8 because ascii fails for the new ipr file for T.regenti
else:
f = open(fi,"r")
##e.g. file name /nfs/helminths02/analysis/50HGP/01INTERPRO/ACOC.protein.fa.gz.fas.ipr.gz
tag = fi.split("/")[-1].split(".")[0]
species = tagSpeciesName[tag]
if species not in interproscanData:
interproscanData[species] = {}
if species not in geneIDMapping:
warn("Warning: %s not in geneIDMapping" % species)
for line in f:
if "##FASTA" in line:
break #I do not want to read the fasta part of GFF. #I assume that after this line appears there will be no more GFF/Interproscan entries
if "##" in line:
continue
line = line.strip()
spl = line.split("\t")
if len(spl) < 8: #means is not proper GFF line
continue
source = spl[1]
if source not in self.targetSources:
continue
iprID = spl[0]
# if iprID not in geneIDMapping[species]:
# if species not in IDsNotFound:
# IDsNotFound[species] = set()
# IDsNotFound[species].add(iprID)
if iprID in geneIDMapping[species]: #It can happen that a gene is not present in geneIDMapping, this is because some interproscan files are not updated, containing genes that have since been removed for compara.
geneID = geneIDMapping[species][iprID][0]
# else:
# warn("IPR ID not in gene mapping file.")
if geneID not in interproscanData[species]:
interproscanData[species][geneID] = {}
if source not in interproscanData[species][geneID]:
interproscanData[species][geneID][source] = []
interproscanData[species][geneID][source].append(line)
#pprint(interproscanData[species])
#Validation, number of Pfam domains in file
#/nfs/helminths02/analysis/50HGP/01INTERPRO/ACOC.protein.fa.gz.fas.ipr.gz 13886
# dr7@farm3-head2:/lustre/scratch108/parasites/dr7/50HG/50HG_100K_families_parsing/interproscanGFFs$ grep Pfam ACOC.protein.fa.gz.fas.ipr | wc -l
# 13886
# print ("IDs on IPR GFF but not on ID Mapping file!",len(IDsNotFound))
# for sp in IDsNotFound:
# print (sp,len(list(IDsNotFound[sp]))
# print (IDsNotFound[sp])
if self.verbose: print ("Total species with interproscan data:\t%s" % len(interproscanData) )
self.interproscanData = interproscanData
def readBranchLengths(self):
"""Reads branch lengths file. Grabs median.
E.g. 22454 0.125 0.081 0.204589,0.263049,0.078644,0.113182,0.282062,0.03627,0.105361,0.177002,0.089029,0.129357,0.076138,0.102818"""
if self.verbose: print ("Reading family branch lengths file..")
branchLengthsFile = self.branchLengthsFile
branchLengths = {}
for line in branchLengthsFile:
line = line.strip()
familyID,mean,median,lista = line.split("\t")
familyID = int(familyID)
if familyID not in branchLengths:
branchLengths[familyID] = 0
branchLengths[familyID] = float(median)
if self.verbose: print ("Total families with branch lengths:\t%s" % len(branchLengths) )
self.branchLengths = branchLengths
def readOutgroupList(self):
"""Reads file with outgroup / node ids to exclude from analysis. ID\tspecies/node-name\twhether_is_species_or_node
e.g. 40001001 amphimedon_queenslandica species"""
if self.verbose: print ("Reading file with outgroups to remove..")
removeOutgroupsFile = open(self.removeOutgroupsFile,"r")
removeOutgroups = []
speciesToRemove = []
for line in removeOutgroupsFile:
line = line.strip()
spl = line.split("\t")
removeOutgroups.append(spl[1]+spl[0])
if spl[2] == "species":
speciesToRemove.append(spl[1])
if self.verbose: print ("Total outgroup nodes/leaves to be removed:\t%s" % len(removeOutgroups) )
if self.verbose: print ("Total species (leaves) to be removed:\t%s" % len(speciesToRemove) )
self.speciesToRemove = speciesToRemove
self.removeOutgroups = removeOutgroups
def readSpeciesTree(self):
"""Reads newick species tree. Node/leaf names must have at least one alphabethic character, otherwise if only numeric Phylo will read names as being branch lengths. Branch lengths are not required.
This outputs a tree with parent-child relationships. Will only work for simple binary trees.
Also returns number taxons below each node based on the tree."""
if self.verbose: print ("Reading species tree..")
speciesTreeFile = self.speciesTreeFile
numTaxonBelow = {}
sppTaxonBelow = {}
tree = Phylo.read(speciesTreeFile, 'newick')
speciesTree = findLastCommonAncestor.convertTreeToDict(tree) #in python dictionary format
reverseDict = findLastCommonAncestor.reverseDict(speciesTree)
allNodes = list(speciesTree.keys())
allNodes.extend(list(speciesTree.values()) )
allNodes = set(allNodes)
for node in allNodes:
numTaxonBelow[node] = int(findLastCommonAncestor.countLeaves(reverseDict,node))
if self.verbose: print ("Total nodes/leaves in species tree:\t%s" % len(speciesTree) )
if self.verbose: print ("Total nodes/leaves with number of taxon below:\t%s" % len(numTaxonBelow) )
self.numTaxonBelow = numTaxonBelow
self.speciesTree = speciesTree
leaves = [leaf.name for leaf in tree.get_terminals()]
listOfSpecies = []
for leaf in leaves:
listOfSpecies.append(leaf.split("400")[0])
self.listOfSpecies = listOfSpecies
def writeFamiliesPerTaxonomyLevel(self):
"""If filtering out some species/nodes, I need to calculate new root for these altered compara families, and create an updated familiesPerTaxonomyLevel. The root (branchName) will then be taken from this file.
This has to be run after readFamilies. The external 'findLastCommonAncestor' functions have been tested previously.
This function has been properly validated: the created file is the same as the one from Compara, if not removing any outgroups."""
#e.g.
#Hymenolepis microstoma 40001043 : 841230,912843,1018527,1058154,1059819
#Strongyloides 40001088 : 269778,518849,529306,576845,647355,668201,705124,735608
if self.verbose: print ("Creating familiesPerTaxonomyLevel.txt with recalculated roots..")
familiesDict = self.familiesDict
speciesTree = self.speciesTree
tmpFile = open("tmp_familiesPerTaxonomyLevel_"+self.jobID+".txt","w")
speciesKeys = list(speciesTree.keys())
familiesTaxonLevel = {} #this is temporary structure to store what will be written in tmp file
#loop through each family, calculate new root and write in file.
for familyID in familiesDict:
species = list(familiesDict[familyID].keys()) #these are e.g. clonorchis_sinensis, but we need to match to speciesTree clonorchis_sinensis40001019
speciesKey = ""
renamedSpecies = []
for sp in species:
for key in speciesKeys:
if sp in key:
renamedSpecies.append(key)
assert (len(species) == len(renamedSpecies)) #otherwise species-tree species names do not match families.txt species names..
rootNode = findLastCommonAncestor.run(renamedSpecies,speciesTree)
#rename rootNode so that it resembles the same as in familiesPerTaxonomyLevel.
#This bit may be particular to 50HG Compara. But it should not matter if other database/ID type, simply the file with look differently.
if "4000" in rootNode:
spl = rootNode.partition("4000")
name = spl[0]
ID = "".join(spl[1:])
if "_" in name:
name = name.replace("_"," ")
name = name[0].upper()+name[1:]
rootNode = name+" "+ID
if rootNode not in familiesTaxonLevel:
familiesTaxonLevel[rootNode] = []
familiesTaxonLevel[rootNode].append(str(familyID) )
#write to new file
for taxon in familiesTaxonLevel:
tmpFile.write(taxon+" : "+",".join(familiesTaxonLevel[taxon])+"\n")
tmpFile.close()
if self.verbose: print ("Total number of node/species in new familiesPerTaxonomyLevel file:\t%s" % len(familiesTaxonLevel)) #this should be equal to: total nodes+species in species tree (180 for 50HG) minus the removed ones.
self.familiesPerTaxonomyLevelFile = tmpFile.name #change familiesPerTaxonomyLevelFile file pointer to newly created file
def readFlagFamilyList(self):
"""Read file with list of families to be flagged."""
if self.verbose: print ("Reading file with families to flag with 'exclude'..")
flagFamilyListFile = open(self.flagFamilyListFile,"r")
familiesToFlag = [int(familyID.strip()) for familyID in flagFamilyListFile]
self.familiesToFlag = familiesToFlag
if self.verbose: print ("Total number of families to be flagged: %s" % (len(familiesToFlag)) )
def readGeneFilterList(self):
"""Read file with list of genes to be excluded from measures."""
if self.verbose: print ("Reading file with genes to exclude..")
geneIDMappingAllEntries = self.geneIDMappingAllEntries #mRNA ID to other IDs
geneFilterListFile = open(self.geneFilterListFile,"r")
#convert mRNA ID to gene ID, to match families
geneFilterList = {geneIDMappingAllEntries[geneID.strip()][0] for geneID in geneFilterListFile}
self.geneFilterList = geneFilterList
if self.verbose: print ("Total number of genes to be excluded: %s" % (len(geneFilterList)) )
def readCladeFile(self):
"""Read clade file, there must be a clade per each species"""
if self.verbose: print ("Reading clade file..")
cladeFile = self.cladeFile
listOfSpecies = self.listOfSpecies
cladePerSpecies = {} #key -> species, val -> clade
speciesPerClade = {} #key -> clade, value -> species in the clade (based on the species tree)
for line in cladeFile:
line = line.strip()
spl = line.split("\t")
sp = spl[0]
clade = spl[1]
sp = sp.lower()
sp = sp.replace(" ","_")
if sp in listOfSpecies:
cladePerSpecies[sp] = clade
if clade not in speciesPerClade:
speciesPerClade[clade] = []
speciesPerClade[clade].append(sp)
self.cladePerSpecies = cladePerSpecies
self.speciesPerClade = speciesPerClade
if self.verbose: print ("Total number of species in clade file: %s" % (len(cladePerSpecies)) )
def readNodeInfoFile(self):
"""Reads file with information on leaves for each node. E.g.
node amphimedon_queenslandica40001001 has parent metazoa40001000
node euteleostomi40001177 has descendants danio_rerio40001178,homo_sapiens40001179
Retrieves the set of Clades present for each node. All species of that clade need to be present for a clade to be considered (e.g. paraphyletic)."""
if self.verbose: print ("Reading node info file..")
nodeInfoFile = self.nodeInfoFile
speciesPerClade = self.speciesPerClade
cladePerSpecies = self.cladePerSpecies
nodeInfo = {} #key -> node ID, list of clades potentially present under that node.
nodeInfoSpp = {} #key -> node ID, list of species present under that node.
for line in nodeInfoFile:
line = line.strip()
if "descendants" in line:
spl = line.split(" ")
nodeID = spl[1]
descendants = spl[4].split(",")
cladeList = []
for des in descendants:
des = des.split("400")[0]
cladeList.append(cladePerSpecies[des])
descendantClades = set()
for clade in speciesPerClade:
if cladeList.count(clade) == len(speciesPerClade[clade]): #to make sure the whole clade is complete. I should only count a descendant clade if all species are present
descendantClades.add(clade)
nodeInfo[nodeID] = sorted(list(descendantClades))
nodeInfoSpp[nodeID] = [des.split("400")[0] for des in descendants]
self.nodeInfo = nodeInfo
self.nodeInfoSpp = nodeInfoSpp
if self.verbose: print ("Total number of tree nodes with leaf info: %s" % (len(nodeInfo)) )
def readProteinLengthFiles(self):
"""Read list of all genes and their length"""
if self.verbose: print ("Reading protein lengths folder..")
proteinLengthsFolder = self.proteinLengthsFolder
proteinLengths = {} #key -> species, val -> dict. Key -> gene id, val -> length
filesToProcess = glob(proteinLengthsFolder+"/*_protein_lengths.txt")
count = 0
for f in filesToProcess:
inFile = open(f,"r")
spName = f.split("/")[-1].replace("_protein_lengths.txt","")
if spName not in proteinLengths:
proteinLengths[spName] = {}
for line in inFile:
gene,geneLength = line.strip().split("\t")
proteinLengths[spName][gene] = geneLength
count+=1
inFile.close()
self.proteinLengths = proteinLengths
if self.verbose: print ("Total number of species with protein lengths: %s and total genes: %s" % (len(proteinLengths),count) )
def statsPerFamily(self):
"""Main function to write stats for each family"""
if self.verbose: print ("Making stats..")
#1) all global data structures in use #just wrote this to easier localization and make sure I don't change the values
familiesDict = self.familiesDict
familiesTaxon = self.familiesTaxon
numTaxonBelow = self.numTaxonBelow
familiesGeneDuplication = self.familiesGeneDuplication
familiesGeneDuplicationMaxNode = self.familiesGeneDuplicationMaxNode
familiesGeneLosses = self.familiesGeneLosses
interproscanData = self.interproscanData
freeLivingSpecies = self.freeLivingSpecies
branchLengths = self.branchLengths
familiesToFlag = self.familiesToFlag
cladePerSpecies = self.cladePerSpecies
speciesPerClade = self.speciesPerClade
listOfSpecies = self.listOfSpecies
nodeInfo = self.nodeInfo
nodeInfoSpp = self.nodeInfoSpp
familiesDictLengths = self.familiesDictLengths
#2) output file headers
self.outputFile.write("familyID\tFlag\tn_species\tn_genes\tMean_genes_per_species\tMedian_genes_per_species\tVariation_coefficient_n_genes_per_species\tn_paralogs\tBranch_name\tCompleteness_score\tTotal_duplications\tMax_duplications_node\tMost_frequent_species\t")
for species in sorted(freeLivingSpecies):
self.outputFile.write(species+"\t")
self.outputFile.write("Total_losses\tMedian_branch_length\tPfam_perc_genes\tTMHMM_perc_genes\tSignalP_perc_genes\tPfam_perc_in_family\tPfam_all_in_family\n")
outputFileMeasures = open(self.outputFile.name+"_measures","w") #special file just for the measures
outputFileMeasures.write("familyID\tFlag\tn_species\tn_genes\tmean_prot_len\tsum_prot_len\tSpecies_var_coef\tSpecies_var_coef_zeroes\tVar_coef_lengths_zeroes\t")
for clade in sorted(speciesPerClade):
outputFileMeasures.write("zscore_%s\t" % clade)
outputFileMeasures.write("Max_zscore_clade\tMax_zscore\t")
for clade in sorted(speciesPerClade):
outputFileMeasures.write("enrich_%s\t" % (clade) )
outputFileMeasures.write("Max_enrich_clade\tMax_enrich\t")
outputFileMeasures.write("Pfam_perc_in_family\tPfam_all_in_family\n")
#3) Loop through each compara family, and produce all stats for this family
count = 0 #count processed families
for family in familiesDict:
count+=1
#3.1) General basic stats
nSpecies = len(familiesDict[family]) #note that this familiesDict only contain species entry if any gene in that family (does not count zeroes)
nGenesPerSpecies = [len(familiesDict[family][sp]) for sp in familiesDict[family]]
proteinLengthsSpecies = [familiesDictLengths[family][sp] for sp in familiesDictLengths[family]] #this is list of protein lenghts sums
nGenes = sum(nGenesPerSpecies)
sumProteinLengths = sum(proteinLengthsSpecies)
avgProteinLenPerGene = "%.1f" % (sumProteinLengths/nGenes)
nParalogs = nGenes - nSpecies
avgGenesPerSpecies = "%.1f" % (mean(nGenesPerSpecies))
medianGenesPerSpecies = "%.1f" % (median(nGenesPerSpecies))
stdevGenesPerSpecies = std(nGenesPerSpecies,ddof=1)
# 21-Aug-2015: Variation coefficient = Stdev of number of genes in species with at least one gene / mean of number of genes in species with at least one gene
# this excludes the zeroes (species without any gene in that family)
stdevDivMean = "%.1f" % (stdevGenesPerSpecies/float(avgGenesPerSpecies)) # Variations in numbers of genes per family across species -> stdev / mean
#3.2) tag families to exclude
if family in familiesToFlag:
familyFlag = "exclude"
else:
familyFlag = ""
#3.3) Calculate/retrieve Branch name, median branch length and completeness score
branchName = familiesTaxon[family]
if family in branchLengths:
medianBranchLength = branchLengths[family]
else: #this should not happen except for a couple of known exceptions e.g. family 1109776 when only 33 species, because pruning did not work. #as I do not know if there are/will be other exceptions I decided to put "NA" when it does not work.
medianBranchLength = "NA"
nSpeciesBelowTaxon = numTaxonBelow[branchName]
completeness = "%.2f" % (nSpecies/nSpeciesBelowTaxon)
#Validation completeness for family: 80624. On James plot shows 0.5 completeness score. VERIFIED 80624 41 319 7.8 2.0 278 Metazoa40001000 0.5
#Validation completeness for family: 63382. On James plot shows <0.25 completeness score. VERIFIED 63382 10 393 39.3 36.5 383 Chromadorea40001076 0.2
#3.4) recalculate species variation coefficient, having insight from phylogeny.
#filling the zeroes when a species should have a gene (based on tree root) but does not
zeroesMissing = nSpeciesBelowTaxon - nSpecies #species that should have a gene but don't, based on the family species root
assert(zeroesMissing >= 0)
nGenesPerSpeciesZeroes = nGenesPerSpecies[:] #nGenesPerSpeciesZeroes = nGenesPerSpecies[:]
for i in range(zeroesMissing):
nGenesPerSpeciesZeroes.append(0)
stdnGenesPerSpeciesZeroes = std(nGenesPerSpeciesZeroes,ddof=1)
meannGenesPerSpeciesZeroes = mean(nGenesPerSpeciesZeroes)
speciesVarCoefZeroes = "%.1f" % (stdnGenesPerSpeciesZeroes / meannGenesPerSpeciesZeroes )
#variation coefficient for protein lengths
proteinLengthsSpeciesZeroes = proteinLengthsSpecies[:] #nGenesPerSpeciesZeroes = nGenesPerSpecies[:]
for i in range(zeroesMissing):
proteinLengthsSpeciesZeroes.append(0)
stdProteinLengthsSpeciesZeroes = std(proteinLengthsSpeciesZeroes,ddof=1)
meanProteinLengthsSpeciesZeroes = mean(proteinLengthsSpeciesZeroes)
speciesVarCoefZeroesLengths = "%.1f" % (stdProteinLengthsSpeciesZeroes / meanProteinLengthsSpeciesZeroes )
#3.5) retrieve gene losses and gene duplication data
if family in familiesGeneDuplication:
duplications = familiesGeneDuplication[family]
else:
duplications = 0
if family in familiesGeneLosses:
losses = familiesGeneLosses[family]
else:
losses = 0
if family in familiesGeneDuplicationMaxNode:
maxDuplicationNode = familiesGeneDuplicationMaxNode[family]
else:
maxDuplicationNode = "None"
#initialize interproscan-related items and others
nGenesWithPfam = 0 #at least 1 Pfam entry
nGenesWithSignalP = 0 #at least 1 SignalP entry
nGenesWithTMHMM = 0 #at least 1 TMHMM entry
pfamTerms = [] # this is to store one pfam domain example per gene
pfamTermsAll = [] # this is to store all pfam domains in a gene (not just one per gene)
maxSp = "" #the species with most number of genes
maxSpProtLen = 0
protLenPerClade = {} #key -> clade, val -> list of protein lengths in species of that clade . note that I only initialize a clade if any species on that clade, and only contains data on species with at least one gene.
protLenPerSp = {} #key -> sp, val -> prot length
#3.6) Loop through each species in gene family. To retrieve interproscan info and make other calculations
for species in sorted(familiesDict[family]):
#ngenesSp = len(familiesDict[family][species]) #if calculating measures based on gene numbers
protLenSp = familiesDictLengths[family][species] #if calculating measures using protein length instead of gene numbers, without changing all variable names..
protLenPerSp[species] = familiesDictLengths[family][species] #redundant but needed #only initializes species if any gene
#3.7) Get genes per clade
clade = cladePerSpecies[species]
if clade not in protLenPerClade:
protLenPerClade[clade] = []
protLenPerClade[clade].append(protLenSp)
#3.8) get measure of species with most genes in family
if protLenSp > maxSpProtLen:
maxSpProtLen = protLenSp
maxSp = species
# #testing: block start. Uncomment this when running for real
#3.9) get interproscan data
for gene in familiesDict[family][species]:
if gene in interproscanData[species]: #it is normal that a gene may not have any interproscan entries.
#Validation. Gene present in IPR file and with correct description
#Python: syphacia_muris SMUV_0000960301 {'Pfam': ['SMUV_0000960301-mRNA-1\tPfam\tprotein_match\t71\t295\t4.6E-58\t+\t.\tName=PF00089;signature_desc=Trypsin;Target=SMUV_0000960301-mRNA-1 71 295;status=T;ID=match$342_71_295;Ontology_term="GO:0004252","GO:0006508";date=04-06-2014;Dbxref="InterPro:IPR001254"']}
#Farm: /lustre/scratch108/parasites/dr7/50HG/50HG_100K_families_parsing/interproscanGFFs_testing$ grep SMUV_0000960301 SMUV.protein.fa.gz.fas.ipr
# SMUV_0000960301-mRNA-1 Pfam protein_match 71 295 4.6E-58 + . Name=PF00089;signature_desc=Trypsin;Target=SMUV_0000960301-mRNA-1 71 295;status=T;ID=match$342_71_295;Ontology_term="GO:0004252","GO:0006508";date=04-06-2014;Dbxref="InterPro:IPR001254"
typesOfEntries = list(interproscanData[species][gene].keys()) #this is dictionary entries, therefore no duplications
assert (len(typesOfEntries) <= len(self.targetSources))
for entryType in typesOfEntries:
if "Pfam" in entryType:
nGenesWithPfam+=1
spl = "".join(interproscanData[species][gene][entryType]).split(";") #there may be several lines of pfam domains for same gene
#Note: the presence of pfam entries for the same gene in the IPR GFF is random, (i.e. the last entry is not always the most N-terminal domain etc, its just random),
#Note(cont): meaning that picking only 1 pfam example for each gene is randomized (but the same everytime we run this)
##originally I was picking only one pfam domain per gene
desc = ""
pfamTermsSet = set()
for item in spl:
if item.startswith("signature_desc="):
desc = item.replace("signature_desc=","")
pfamTermsSet.add(desc)
for term in pfamTermsSet:
pfamTerms.append(term)
##then I also added column where we have all pfam domains in gene
for item in spl:
desc = ""
if item.startswith("signature_desc="):
desc = item.replace("signature_desc=","")
pfamTermsAll.append(desc)
if "SignalP" in entryType: #this can in principle match any SignalP_* entry
nGenesWithSignalP+=1
if "TMHMM" in entryType:
nGenesWithTMHMM+=1
# else: #dr7@farm3-head2:/lustre/scratch108/parasites/dr7/50HG/50HG_100K_families_parsing/interproscanGFFs_testing$ grep HNAJ_0000929301 HNAJ.protein.fa.gz.fas.fas.sl.ipr
# print (species,gene)
#testing: block end. Uncomment this when running for real
mostFrequentSpecies = str(maxSpProtLen)+"X:"+maxSp #species with most genes/prot length, and how many
#3.10) calculations on interproscan data #note this really looks at genes, not at protein length. Stats are gene-based, not altered for protein length.
assert (nGenesWithPfam <= nGenes)
assert (nGenesWithTMHMM <= nGenes)
assert (nGenesWithSignalP <= nGenes)
percPfam = "%.1f" % (nGenesWithPfam*100.0/nGenes)
percSignalP = "%.1f" % (nGenesWithSignalP*100.0/nGenes)
percTMHMM = "%.1f" % (nGenesWithTMHMM*100.0/nGenes)
##Getting 3 most common pfam terms, and % genes in family with that pfam
pfams = {}
pfamText = ""
for term in pfamTerms:
if term not in pfams:
pfams[term] = float("%.1f" % (pfamTerms.count(term)*100.0/nGenes ) )
pfamsSorted = sorted(pfams, key=pfams.get, reverse=True) #need numeric sorting
for term in pfamsSorted[0:3]: #top 3 pfam domains #if less than 3 exist will simply pick less
pfamText+=str(pfams[term])+"%:"+term+";"
if pfamText == "":
pfamText = "100%:None"
##Getting all pfam domains in family statistics, and how many times
pfamsAll = {}
pfamTextAll = ""
for term in pfamTermsAll:
if term not in pfamsAll:
pfamsAll[term] = pfamTermsAll.count(term)
pfamsSortedAll = sorted(pfamsAll, key=pfamsAll.get, reverse=True)
for term in pfamsSortedAll:
pfamTextAll+=str(pfamsAll[term])+"X:"+term+";"
if pfamTextAll == "":
pfamTextAll = "0X:None"
#3.11) calculate ranking measures
### IMPORTANT NOTE ### 14-Sep: Even though variable names and comments still say "genes", it may actually be using protein lengths.
#Note that all these measures count zeroes.
#E.g. I know the species tree node corresponding to root of the gene tree. Therefore I always take into account all the species AND clades that SHOULD be present in the calculation.
##3.11.1) Calculate Z-score for each clade. Z-score = (mean prot length per species in a clade - mean prot length per species)/(std of prot length per species)
#This only works if more than 1 clade. Otherwise, all entries have 'NA'
#I always check root and calculate for each clade descendant of the root. Other clades will have 'NA'.
#note that I always divide by all species in the clade, regardless all of them have a gene or not
#note: standard deviation can be zero (e.g. all species having same number prot length), and we cannot divide by zero. I tag these cases with "NA_std_is_zero"
#get clades that should be present.
if branchName in nodeInfo: #if not, it means it is a leaf, not an internal node
descendantClades = nodeInfo[branchName]
else:
assert("_" in branchName)
descendantClades = []
#calcualte mean genes per clade #adding zeroes if species does not have gene
meanProtLenPerClade = {} #key -> clade, value -> mean prot length per species in that clade (total, including zeroes) #note this dict only contains clade entries if they should have any value
sdProtLenOutsideClade = {} #key -> clade, value -> std dev of prot length per species that are *not* in this clade. Added Avril Coghlan 14-Jan-2016
for clade in descendantClades:
if clade in protLenPerClade:
meanProtLenPerClade[clade] = sum(protLenPerClade[clade]) / float(len(speciesPerClade[clade] ) )
# calculate the standard deviation of protein lengths in species outside this clade. Added Avril Coghlan 14-Jan-2016
otherSpecies = [] #list of protein lengths for the species not in current clade, will include zeroes. Added Avril Coghlan 14-Jan-2016
for species in nodeInfoSpp[branchName]: #species under the tree root. Added Avril Coghlan 14-Jan-2016.
if cladePerSpecies[species] == clade: # Added Avril Coghlan 14-Jan-2016
continue # Added Avril Coghlan 14-Jan-2016
else: # Added Avril Coghlan 14-Jan-2016
if species in protLenPerSp: #protLenPerSp only contains species entry if any gene. Added Avril Coghlan 14-Jan-2016.
otherSpecies.append(protLenPerSp[species]) # Added Avril Coghlan 14-Jan-2016.
else: # Added Avril Coghlan 14-Jan-2016.
otherSpecies.append(0) # Added Avril Coghlan 14-Jan-2016.
stdProteinLengthsSpeciesZeroesOutsideClade = std(otherSpecies, ddof=1) # Added Avril Coghlan 14-Jan-2016. Used ddof=1 as Diogo has used this elsewhere for std. dev.
sdProtLenOutsideClade[clade] = stdProteinLengthsSpeciesZeroesOutsideClade # Added Avril Coghlan 14-Jan-2016.
else: #if no species have it
meanProtLenPerClade[clade] = 0
sdProtLenOutsideClade[clade] = 0 # Added by bh4 - 17-Jan-2016
assert(set(meanProtLenPerClade.keys()) == set(descendantClades) )
#actually calculate z-scores
zscoreResults = {} #key -> clade, val -> zscore
maxZscore = 0 #will only count positive Zscores
maxZclade = "NA"
for clade in sorted(speciesPerClade): #just so I loop over all clades
if len(descendantClades) < 2: #cannot calculate clade Z-score if only one clade
zscoreResults[clade] = "NA"
else:
if clade in meanProtLenPerClade:
# use the standard deviation of protein lengths in species outside this clade. Added Avril Coghlan 14-Jan-2016.
stdProteinLengthsSpeciesZeroesOutsideClade = sdProtLenOutsideClade[clade] # Added Avril Coghlan 14-Jan-2016.
# if stdnGenesPerSpeciesZeroes == 0: Commented out Avril Coghlan 14-Jan-2016
if stdProteinLengthsSpeciesZeroesOutsideClade == 0: # Added Avril Coghlan 14-Jan-2016
zscoreResults[clade] = "NA_std_is_zero"
else:
# use the standard deviation of protein lengths in species outside this clade. Added Avril Coghlan 14-Jan-2016.
zscore = (meanProtLenPerClade[clade] - meanProteinLengthsSpeciesZeroes) / stdProteinLengthsSpeciesZeroesOutsideClade # Added Avril Coghlan 14-Jan-2016
# zscore = (meanProtLenPerClade[clade] - meanProteinLengthsSpeciesZeroes) / stdProteinLengthsSpeciesZeroes. Commented out Avril Coghlan 14-Jan-2016
#validated using http://ncalculators.com/statistics/z-score-calculator.htm
zscoreResults[clade] = zscore
if zscore > maxZscore:
maxZscore = zscore
maxZclade = clade
else:
zscoreResults[clade] = "NA"
maxZscore = "%.2f" % maxZscore
#Z-score calculation validation:
# ##Mono-clade family
# family 1212416 : NECAME_04199 (necator_americanus) NBR_0000394301 (nippostrongylus_brasiliensis) NBR_0001482701 (nippostrongylus_brasiliensis) NBR_0002215501 (nippostrongylus_brasiliensis)
# 1212416 0.5 0.5 NA NA NA NA NA NA NA 0.0
# #CORRECT
# ##Two-neighbour-clade family
# family 851973 : csin104847 (clonorchis_sinensis) csin112867 (clonorchis_sinensis) D915_09852 (fasciola_hepatica) D915_09853 (fasciola_hepatica) SSLN_0001350701 (schistocephalus_solidus) Sjp_0074790 (schistosoma_japonicum) Sjp_0074800 (schistosoma_japonicum) Smp_166880 (schistosoma_mansoni)
# Em Ts Hm Mc Ss Cs Fh Sj Sm
# 0 0 0 0 1 2 2 2 1
# avg 0.888888889 zscore tapeworm -0.787400787
# std 0.874889764 zscore fluke 0.984250984
# tapeworm avg 0.2
# fluke avg 1.75
# #VALUES VALIDATED, MANUAL CALCULATION!
# ##Root with 6 clades, but genes in only 3 clades.
# descendants: {'IV', 'V', 'Tapeworm', 'III', 'I', 'Flukes'} data keys: ['I', 'IV', 'Flukes']
# family 919890 : csin102733 (clonorchis_sinensis) D915_07255 (fasciola_hepatica) MhA1_Contig762.frz3.gene6 (meloidogyne_hapla) nRc.2.0.1.g27258 (romanomermis_culicivorax) Sjp_0091230 (schistosoma_japonicum) Smp_090010 (schistosoma_mansoni)
# #VALUES VALIDADED, MANUAL CALCULATION!
##3.11.2) Calculate clade gene enrichement (Avril's/Bhavana's measure)
# mean prot length per species in a clade/mean prot length per species in all other species
#Note that I use root to see which clades could be present, and I count zeroes on clades and species. When I say "all other species" it consists of the species that could have a gene based on root.
# enrichResults = {} #key -> clade, val -> zscore
# maxEnrich = 0 #will only count positive Zscores
# maxEnrichClade = "NA"
# for clade in sorted(speciesPerClade): #just so I loop over all clades
# if len(descendantClades) < 2: #cannot calculate if only one clade
# enrichResults[clade] = "NA"
# else:
# if clade in meanProtLenPerClade: #clades that should have genes (but may not have)
# otherSpecies = [] #list of number of genes for the species not in current clade, will include zeroes
# for cladeDesc in descendantClades: #loop over all clades that should/could contain a gene based on root, not just the ones that do have a gene
# if cladeDesc == clade: #if current clade
# continue
# if cladeDesc in protLenPerClade: #this means that there will be at least one species with a gene here
# otherSpecies.extend(protLenPerClade[cladeDesc])
# missingZeroes = speciesPerClade[cladeDesc] - len(protLenPerClade[cladeDesc]) #the protLenPerClade only contains data on species with genes, therefore need to add zeroes
# for m in range(missingZeroes):
# otherSpecies.append(0)
# else: #add zeroes accordingly
# l = [0]*speciesPerClade[cladeDesc]
# otherSpecies.extend(l)
# #print (family, branchName, clade, len(otherSpecies),speciesPerClade[clade],nSpeciesBelowTaxon)
# assert (len(otherSpecies) + speciesPerClade[clade] == nSpeciesBelowTaxon)
# enrich = meanProtLenPerClade[clade] / mean(otherSpecies) #meannGenesPerSpeciesZeroes #mean(otherSpecies)
# enrichResults[clade] = enrich
# if enrich > maxEnrich:
# maxEnrich = enrich
# maxEnrichClade = clade
# else: #so that I write NA in clades which should not have genes in this family
# enrichResults[clade] = "NA"
# assert (len(enrichResults) == len(speciesPerClade))
# maxEnrich = "%.2f" % maxEnrich
# ##Another approach to calculate clade enrichment, allowing paraphyletic groups.
#Note that if gene tree root is in between a paraphyletic group, the score is calculated for all members of the paraphyletic clade.
enrichResults = {} #key -> clade, val -> zscore
maxEnrich = 0 #max will only count positive Zscores
maxEnrichClade = "NA"
for clade in sorted(speciesPerClade): #loop over all clades, regardless of having gene or not, to fill all columns in output
if len(descendantClades) < 2: #cannot calculate if only one clade
enrichResults[clade] = "NA"
else: #calculate enrichment
if clade in meanProtLenPerClade: #clades that should have genes (but may not have) #note that meanProtLenPerClade already accounts for zeroes
otherSpecies = [] #list of protein lengths for the species not in current clade, will include zeroes
countSppInClade = 0 #just for validation #because of paraphyletic groups, the number of species in a clade may be different from number of species below gene tree root.
for species in nodeInfoSpp[branchName]: #species under the tree root
if cladePerSpecies[species] == clade:
countSppInClade+=1
continue
else:
if species in protLenPerSp: #protLenPerSp only contains species entry if any gene
otherSpecies.append(protLenPerSp[species])
else:
otherSpecies.append(0)
assert (len(otherSpecies) + countSppInClade == nSpeciesBelowTaxon)
enrich = meanProtLenPerClade[clade] / mean(otherSpecies)
enrichResults[clade] = enrich
if enrich > maxEnrich:
maxEnrich = enrich
maxEnrichClade = clade
else: #so that I write NA in clades which do not have genes in this family
enrichResults[clade] = "NA"
assert (len(enrichResults) == len(speciesPerClade))
maxEnrich = "%.2f" % maxEnrich
##Confirmed that this approach gives the same results as the previous, if using same dataset.
#4) Writing to output file, one family at a time
#main output file
self.outputFile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t" % (family,familyFlag,nSpecies,nGenes,avgGenesPerSpecies,medianGenesPerSpecies,stdevDivMean,nParalogs,branchName,completeness,duplications,maxDuplicationNode,mostFrequentSpecies) )
##Presence in free-living species
for species in sorted(freeLivingSpecies):
if species in familiesDict[family]:
self.outputFile.write("%s\t" % len(familiesDict[family][species]))
else:
self.outputFile.write("0\t")
self.outputFile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (losses,medianBranchLength,percPfam,percTMHMM,percSignalP,pfamText,pfamTextAll) )
#measures file
outputFileMeasures.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t" % (family,familyFlag,nSpecies,nGenes,avgProteinLenPerGene,sumProteinLengths,stdevDivMean,speciesVarCoefZeroes,speciesVarCoefZeroesLengths) )
for clade in sorted(speciesPerClade):
outputFileMeasures.write("%s\t" % (zscoreResults[clade]) )
outputFileMeasures.write("%s\t%s\t" % (maxZclade,maxZscore) )
for clade in sorted(speciesPerClade):
outputFileMeasures.write("%s\t" % (enrichResults[clade]) )
outputFileMeasures.write("%s\t%s\t" % (maxEnrichClade,maxEnrich) )
outputFileMeasures.write("%s\t%s\n" % (pfamText,pfamTextAll) )
self.outputFile.close()
outputFileMeasures.close()
if self.verbose: print ("Total number of families processed:\t%s" % count )
def writeFamilies(self):
"""(Optional function) Writes the families (like in families.txt file) after all the filterings, the way they are actually processed.
On the original files there is no apparent gene order sorting. Here there will be species sorting.
Intended output example
-------------------------------------------------------------------------------
family 1584328 : ASU_08438 (ascaris_suum) ALUE_0001329201 (ascaris_lumbricoides)
-------------------------------------------------------------------------------
family 1584332 : nRc.2.0.1.g04834 (romanomermis_culicivorax) nRc.2.0.1.g36069 (romanomermis_culicivorax)
"""
if self.verbose: print ("Writing new families.txt file..")
familiesDict = self.familiesDict
dumpFamiliesFile = open(self.dumpFamiliesFile,"w")
count = 0
for family in sorted(familiesDict):
count+=1
text = "-------------------------------------------------------------------------------\n"
text+= "family %s :" % family
for species in sorted(familiesDict[family]):
for gene in sorted(familiesDict[family][species]):
text+=" %s (%s)" % (gene,species)
text+="\n"
dumpFamiliesFile.write(text)
dumpFamiliesFile.close()
if self.verbose: print ("Wrote %s families at %s" % (count,dumpFamiliesFile.name))
def cleanUp(self):
"""removes temporary files"""
#pass
os.system("rm tmp_familiesPerTaxonomyLevel_"+self.jobID+".txt")
def dumpAll(self):
if self.verbose: print ("Writing pickle dump files..")
pickle.dump(self.familiesDict,open(self.dumpsFolder+"/familiesDict","wb") )
pickle.dump(self.familiesTaxon,open(self.dumpsFolder+"/familiesTaxon","wb") )
pickle.dump(self.numTaxonBelow,open(self.dumpsFolder+"/numTaxonBelow","wb") )
pickle.dump(self.familiesGeneDuplication,open(self.dumpsFolder+"/familiesGeneDuplication","wb") )
pickle.dump(self.familiesGeneDuplicationMaxNode,open(self.dumpsFolder+"/familiesGeneDuplicationMaxNode","wb") )
pickle.dump(self.familiesGeneLosses,open(self.dumpsFolder+"/familiesGeneLosses","wb") )
pickle.dump(self.tagSpeciesName,open(self.dumpsFolder+"/tagSpeciesName","wb") )
pickle.dump(self.geneIDMapping,open(self.dumpsFolder+"/geneIDMapping","wb") )
pickle.dump(self.interproscanData,open(self.dumpsFolder+"/interproscanData","wb") )
pickle.dump(self.branchLengths,open(self.dumpsFolder+"/branchLengths","wb") )
pickle.dump(self.cladePerSpecies,open(self.dumpsFolder+"/cladePerSpecies","wb") )
pickle.dump(self.speciesPerClade,open(self.dumpsFolder+"/speciesPerClade","wb") )
pickle.dump(self.speciesTree,open(self.dumpsFolder+"/speciesTree","wb") )
pickle.dump(self.removeOutgroups,open(self.dumpsFolder+"/removeOutgroups","wb") )
pickle.dump(self.speciesToRemove,open(self.dumpsFolder+"/speciesToRemove","wb") )
pickle.dump(self.familiesToFlag,open(self.dumpsFolder+"/familiesToFlag","wb") )
pickle.dump(self.listOfSpecies,open(self.dumpsFolder+"/listOfSpecies","wb") )
pickle.dump(self.geneIDMappingAllEntries,open(self.dumpsFolder+"/geneIDMappingAllEntries","wb") )
pickle.dump(self.geneFilterList,open(self.dumpsFolder+"/geneFilterList","wb") )
pickle.dump(self.nodeInfo,open(self.dumpsFolder+"/nodeInfo","wb") )
pickle.dump(self.nodeInfoSpp,open(self.dumpsFolder+"/nodeInfoSpp","wb") )
pickle.dump(self.proteinLengths,open(self.dumpsFolder+"/proteinLengths","wb") )
pickle.dump(self.familiesDictLengths,open(self.dumpsFolder+"/familiesDictLengths","wb") )
# for key in vars(self):
# pickle.dump(eval("self."+key),open("dumps/"+str(key),"wb") )
def loadAll(self,dumpPath):
if self.verbose: print ("Loading pickle dump files..")
#loading only the required data
self.familiesDict=pickle.load(open(dumpPath+"/familiesDict","rb") )
self.familiesTaxon=pickle.load(open(dumpPath+"/familiesTaxon","rb") )
self.numTaxonBelow=pickle.load(open(dumpPath+"/numTaxonBelow","rb") )
self.familiesGeneDuplication=pickle.load(open(dumpPath+"/familiesGeneDuplication","rb") )
self.familiesGeneDuplicationMaxNode=pickle.load(open(dumpPath+"/familiesGeneDuplicationMaxNode","rb") )
self.familiesGeneLosses=pickle.load(open(dumpPath+"/familiesGeneLosses","rb") )
self.interproscanData=pickle.load(open(dumpPath+"/interproscanData","rb") ) #can comment this to run faster, uncomment when running for real
self.branchLengths=pickle.load(open(dumpPath+"/branchLengths","rb") )
self.cladePerSpecies=pickle.load(open(dumpPath+"/cladePerSpecies","rb") )
self.speciesPerClade=pickle.load(open(dumpPath+"/speciesPerClade","rb") )
self.familiesToFlag=pickle.load(open(dumpPath+"/familiesToFlag","rb") )
self.listOfSpecies=pickle.load(open(dumpPath+"/listOfSpecies","rb") )
self.nodeInfo=pickle.load(open(dumpPath+"/nodeInfo","rb") )
self.nodeInfoSpp=pickle.load(open(dumpPath+"/nodeInfoSpp","rb") )
self.familiesDictLengths=pickle.load(open(dumpPath+"/familiesDictLengths","rb") )
def run(self):
"""Runs the functions in order."""
self.readGeneIDMappingFolder() #this has to be run before the readGeneFilterList()
if self.removeOutgroupsFile != "": self.readOutgroupList() #this has to be run before readFamilies()
if self.geneFilterListFile != "": self.readGeneFilterList() #this has to be run before readFamilies()
self.readProteinLengthFiles() #this has to be run before readFamilies
self.readFamilies()
self.readSpeciesTree()
if self.removeOutgroupsFile != "": self.writeFamiliesPerTaxonomyLevel() #this has to be run after readFamilies()
if self.flagFamilyListFile != "": self.readFlagFamilyList() #this can be run anywhere before statsPerFamily()
self.readFamiliesPerTaxonomyLevel() #note that this will read a file created in this script
self.readGeneDuplicationsFolder()
self.readGeneLossesFolder()
self.readSpeciesNameTag()
self.readInterproGFFFolder()
self.readBranchLengths()
self.readCladeFile()
self.readNodeInfoFile() #has to be run after readCladeFile
if self.dumpsFolder != "": self.dumpAll()
self.statsPerFamily()
if self.dumpFamiliesFile != "": self.writeFamilies()
self.cleanUp()
def runLoad(self,dumpPath):
"""For script development. To use if just changing the statsPerFamily function."""
if self.verbose: print ("Loading dumps..")
self.loadAll(dumpPath) #this loads are required objects to runs stats on them
self.statsPerFamily()
if self.dumpFamiliesFile != "": self.writeFamilies()
def main():
import argparse
parser = argparse.ArgumentParser(description='Analysis suite for exploring Compara families.')
#positional args
parser.add_argument('comparaFamilies', metavar='comparaFamilies', type=str, help='Flat file. Compara families file. One family per line, list of genes/species. Output from get_all_50HG_families_v75.pl.')
parser.add_argument('familiesPerTaxonomyLevel', metavar='familiesPerTaxonomyLevel', type=str, help='Flat file. One taxonomy level per line, list of families. Output from get_families_per_taxonomy_level_v75.pl.')
parser.add_argument('speciesTree', metavar='speciesTree', type=str, help='Species tree in newick format. Each node should have an unique name, this expects species/node+id (e.g. dorylaimia40001065, romanomermis_culicivorax40001064). Output from get_species_tree_v75.pl.')
parser.add_argument('geneDuplicationsFolder', metavar='geneDuplicationsFolder', type=str, help='Folder with a file for each taxon/node. File names should be e.g. ciona_intestinalis40001180.txt. Output from get_gene_duplications_per_taxonomy_level_v75.pl.')
parser.add_argument('geneLossesFolder', metavar='geneLossesFolder', type=str, help='Folder with a file for each taxon/node. File names should be e.g. ciona_intestinalis40001180.txt. Output from get_gene_losses_per_taxonomy_level_v75.pl.')
parser.add_argument('speciesNameTag', metavar='speciesNameTag', type=str, help='TSV file. Locus tag - Species name correspondence. PREFIX Genus species VERSION.')
parser.add_argument('geneIDMappingFolder', metavar='geneIDMappingFolder', type=str, help='Folder with a *_id_mapping.txt TSV file for each species, with Gene IDs from different sources. SPECIES_NAME GENEMEMBER_ID SEQMEMBER_ID TRANSCRIPT_ID PROTEIN_ID IPRSCAN_GFF_ID.')
parser.add_argument('interproscanGFFFolder', metavar='interproscanGFFFolder', type=str, help='Folder with a *ipr* GFF file for each species. Files can be gzipped. File names should be e.g. ciona_intestinalis40001180.txt. Output from get_gene_losses_per_taxonomy_level_v75.pl.')
parser.add_argument('branchLenghts', metavar='branchLenghts', type=str, help='TSV file. One line per Compara family, ID in first column, average branch length of tree in second column, median in third column. Output from get_compara_branch_length.py.')
parser.add_argument('speciesClade', metavar='speciesClade', type=str, help='TSV file. One line per species, first column species name as in comparaFamilies, second column the clade name/number, third column either Parasitic or Freeliving. E.g. loa_loa\tIII\tParasitic')
parser.add_argument('nodeInfo', metavar='nodeInfo', type=str, help='File with leaves per each node in the tree. Works even if whole species tree was used (no need to prune). Produced by ~alc/Documents/git/Python/parse_tree_with_ETE.py')
parser.add_argument('proteinLengthsFolder', metavar='proteinLengthsFolder', type=str, help='Path to folder with a file for each species. Files need to have same names as species names and contain a list of all proteins and their respective length, in amino acids. Used also to recreate singletons lists. From get_protein_lengths_all_species_v75.pl.')
#optional argument
parser.add_argument('--outputFile', metavar='outputFile', default = os.getcwd()+"/comparaFamiliesParser.out", type=str, help='File where important output will be written (but you should also store stdout and stderr). If none given it will created file called comparaFamiliesParser.out in cwd.')
parser.add_argument('--verbose', metavar='verbose', default = 1, type=int, help='Verbose mode (1 = Yes, 0 = No). Default = 1.')
parser.add_argument('--removeOutgroups', metavar='removeOutgroups', default = "", type=str, help='Whether to remove outgropup species from results. Give file with list of species/tree nodes to remove/ignore from analysis, one per line. First column is nodeID (e.g. 400010010), second column is the node/species name, third column whether this is "species" or "node". Remember this must include also internal nodes. Default = "" (off)')
parser.add_argument('--dumpFamilies', metavar='dumpFamilies', default = "", type=str, help='Option to create output file with the post-filtered Compara families used here. Introduce wanted file name after argument. Default = No')
parser.add_argument('--flagFamilyList', metavar='flagFamilyList', default = "", type=str, help='Option to flag with "exclude" specific families provided in a flat file, with one family ID per line. Default = No')
parser.add_argument('--geneFilterList', metavar='geneFilterList', default = "", type=str, help='Option to exclude the listed genes from all output measures. Use mRNA IDs (SEQMEMBER_ID), which may be different from families file. Default = No')
parser.add_argument('--loadDumps', metavar='loadDumps', default = "", type=str, help='Instead of running the script from scratch, load already dumped python objects. Provide dumps folder path. Default = No')
parser.add_argument('--dumpsFolder', metavar='dumpsFolder', default = "dumps", type=str, help='Give folder path to create object dumps. Default = create dumps in folder called "dumps".')
args = parser.parse_args() #gets the arguments
print (args)
#any better way to pass arguments? possible with *args or **kwargs? couldn't make it work using eval
run = Run(args.comparaFamilies,args.familiesPerTaxonomyLevel,args.geneDuplicationsFolder,args.geneLossesFolder,args.speciesNameTag,args.geneIDMappingFolder,\
args.interproscanGFFFolder,args.branchLenghts,args.removeOutgroups,args.speciesTree,args.verbose,args.outputFile,args.dumpFamilies,args.flagFamilyList,\
args.speciesClade,args.geneFilterList,args.dumpsFolder,args.nodeInfo,args.proteinLengthsFolder)
if args.loadDumps != "":
#to speed up processing, just load data and run stats
run.runLoad(args.loadDumps)
else:
#create dump folder, write parameter file and run all
if not os.path.exists(args.dumpsFolder):
os.mkdir(args.dumpsFolder)
with open(args.dumpsFolder+"/parameters.txt","w") as parametersFile:
parametersFile.write(str(args) )
run.run()
print ("Finished in %.1f seconds." % (clock()-time0))
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment