avrilcoghlan/comparaFamiliesAnalysis.py

## comparaFamiliesAnalysis.py
#!/usr/bin/env python3
#25-Feb-2015 dr7
#Analysis suite for exploring Compara families. The purpose is to gather information about Compara trees so that we can mine this large set of information and select the most interesting families to study.

import sys
import os
import re
import gzip
import random
import pickle
from numpy import mean, median, std #numpy.std gets the population standard deviation, not the sample standard deviation
from glob import glob
from pprint import pprint
from time import clock
from warnings import warn

#dependency
import findLastCommonAncestor #dr7 deployed and tested code

from Bio import Phylo

#Timestamp taken at import time with time.clock (see the imports above); presumably used further down the file to report elapsed run time.
#NOTE(review): time.clock was removed in Python 3.8 - confirm the interpreter version this script targets, or switch the import to time.perf_counter.
time0 = clock()

class Run(object):

	def __init__(self, comparaFamilies,familiesPerTaxonomyLevel,geneDuplicationsFolder,geneLossesFolder,speciesNameTag,geneIDMappingFolder, \
		interproscanGFFFolder,branchLengths,removeOutgroupsFile,speciesTreeFile,verbose,outputFile,dumpFamilies,flagFamilyList,speciesClade, \
		geneFilterList,dumpsFolder,nodeInfo,proteinLengthsFolder):

		"""Store all inputs and create the (initially empty) data structures filled by the read* methods.

		Several input files are opened right here and kept open as attributes; the read* methods
		iterate over these handles. The output file is opened in write mode, so it is created/truncated
		as soon as the object is built. None of these handles is closed within this constructor.

		Arguments opened here for reading (paths): comparaFamilies, speciesNameTag, branchLengths,
		speciesClade, nodeInfo.
		Argument opened here for writing (path): outputFile.
		Arguments stored as given and opened/globbed later by the matching read* method:
		familiesPerTaxonomyLevel, speciesTreeFile, geneDuplicationsFolder, geneLossesFolder,
		geneIDMappingFolder, interproscanGFFFolder, removeOutgroupsFile, flagFamilyList,
		geneFilterList, proteinLengthsFolder.
		Arguments stored but not used by the methods visible in this part of the file:
		dumpFamilies, dumpsFolder.
		verbose: when truthy, the read* methods print progress and summary counts."""

		#inputs / arguments
		self.comparaFamiliesFile = open(comparaFamilies,"r")
		self.familiesPerTaxonomyLevelFile = familiesPerTaxonomyLevel
		self.speciesTreeFile = speciesTreeFile
		self.geneDuplicationsFolder = geneDuplicationsFolder
		self.geneLossesFolder = geneLossesFolder
		self.speciesNameTagFile = open(speciesNameTag,"r")
		self.geneIDMappingFolder = geneIDMappingFolder
		self.interproscanGFFFolder = interproscanGFFFolder
		self.branchLengthsFile = open(branchLengths,"r")
		self.cladeFile = open(speciesClade,"r")
		self.nodeInfoFile = open(nodeInfo,"r")

		#optional inputs / arguments
		self.verbose = verbose
		self.outputFile = open(outputFile,"w")
		self.removeOutgroupsFile = removeOutgroupsFile
		self.dumpFamiliesFile = dumpFamilies
		self.flagFamilyListFile = flagFamilyList
		self.geneFilterListFile = geneFilterList
		self.dumpsFolder = dumpsFolder
		self.proteinLengthsFolder = proteinLengthsFolder

		#hard-coded 'arguments'
		self.jobID = str(random.randint(1,100000)) #random suffix used in the name of the temporary familiesPerTaxonomyLevel file
		self.targetSources = ["Pfam","TMHMM","SignalP_EUK"] #the types of GFF source entries that we are interested in storing in memory. #this is kept here as it may be modified, but not by user.
		self.freeLivingSpecies = ["caenorhabditis_elegans","panagrellus_redivivus","rhabditophanes_kr3021","pristionchus_pacificus","schmidtea_mediterranea"] # 50HG Species that are completely free-living, so that I can distinguish parasitic-specific gene families. #This is just for helminth species, other non-helminth outgroups would be free-living too.

		#data structures #main connector is familyID or species
		self.familiesDict = {} #key -> familyID, val -> dict: key -> species, val -> list of genes
		self.familiesTaxon = {} #key -> familyID, val -> taxonomy level
		self.numTaxonBelow = {} #key -> taxon ID, val -> number species below
		self.familiesGeneDuplication = {} #key -> familyID, val -> sum of duplications throughout all species nodes
		self.familiesGeneDuplicationMaxNode = {} #key -> familyID, val -> string "<count>X:<node ID>" for the node with most duplication events
		self.familiesGeneLosses = {} #key -> familyID, val -> sum of losses throughout all species nodes
		self.tagSpeciesName = {} #key -> species_tag, val -> species_name #this is to connect species to interproscan files
		self.geneIDMapping = {} #key -> species, val -> dict: key -> geneID(interpro), val -> list of other geneIDs #this is to connect genes in interproscan to genes in families.txt
		self.geneIDMappingAllEntries = {} #key -> mRNA ID, val -> list of other IDs
		self.interproscanData = {} #key -> species, value -> dict; key -> gene, value -> dict; key-> source, val-> all lines attributed to that gene/source
		self.branchLengths = {} #key -> familyID, val -> median branch length in family (third column of the branch lengths file)
		self.cladePerSpecies = {} #key -> species, val -> clade
		self.speciesPerClade = {} #key -> clade, value -> list of species in the clade (restricted to species present in the species tree)
		self.nodeInfo = {} #key -> node ID, list of clades potentially present under that node. e.g. 'bilateria40001012': {'I', 'IV', 'Flatworm', 'Flukes', 'Tapeworm', 'V', 'III'}
		self.nodeInfoSpp = {} #key -> node ID, list of species present under that node.
		self.proteinLengths = {} #key -> species, val -> dict ; key -> gene, val -> protein length in aa
		self.familiesDictLengths = {} #key -> familyID, val -> dict: key -> species, val -> sum of protein length of proteins in that family and species.

		#optional data structures
		self.speciesTree = {} #key -> parent, value -> child. Done from a newick tree with external script "findLastCommonAncestor"
		self.removeOutgroups = [] #list of node/leaf ids + species/node-name to remove from analysis
		self.speciesToRemove = [] #list of species to remove from analysis #to apply when processing families.txt
		self.familiesToFlag = [] #list of family IDs of families to have 'exclude' flag
		self.geneFilterList = [] #gene IDs to filter out (replaced by a set in readGeneFilterList)
		self.listOfSpecies = [] #list of species we process here #based on the species tree!

		##Notes:
		#The species name should be lower case and with underscore e.g. panagrellus_redivivus
		#The family IDs should be an int
		#geneFilterListFile: added gene filtering into the readFamily step. Most measures will be (re)calculated in this script itself (e.g. gene tree root is recalculated). Note though that the following measures won't be correct/updated if gene filtering: Total_duplications, Total_losses, Max_duplications_node, Median_branch_length. These are external measures and calculated only before the gene filtering.


	def readFamilies(self):
		"""Parse the compara families file into per-family, per-species gene lists and protein-length sums.

		The file can contain all species and genes (i.e. it can be the raw Compara families file).
		While parsing, genes listed in self.geneFilterList and genes of species listed in
		self.speciesToRemove are ignored. Only families that keep at least 2 genes after this
		filtering are retained.
		Besides the gene lists, the cumulative protein length per species is stored (e.g. a species,
		instead of having 10 genes in that family, has 3000 aa). Lengths come from self.proteinLengths.
		Blank lines and separator lines containing "----------" are skipped.

		E.g. input file line: family 3 : SSTP_0000649300.1 (strongyloides_stercoralis) DME_0000314201-mRNA-1 (dracunculus_medinensis)

		Sets self.familiesDict (familyID -> species -> list of genes) and
		self.familiesDictLengths (familyID -> species -> summed protein length).

		Raises ValueError if a line has no ":" or if its gene/species tokens are not paired.
		Raises KeyError if a retained gene has no entry in self.proteinLengths."""

		if self.verbose: print ("Reading compara families files..")

		#sets give constant-time membership tests inside the per-gene loop
		speciesToRemove = set(self.speciesToRemove)
		geneFilterList = set(self.geneFilterList)
		proteinLengths = self.proteinLengths

		familiesDict = {}
		familiesDictLengths = {}

		for line in self.comparaFamiliesFile: #for each family
			line = line.strip()
			if not line or "----------" in line: #blank and separator lines carry no family
				continue

			#split on the first ":" only, because some gene IDs also contain ":"
			header,sep,members = line.partition(":")
			if not sep:
				raise ValueError("Malformed families line (no ':' found): %r" % line)

			familyID = int(header.split()[1])

			tokens = members.split() #alternating gene ID and "(species)" tokens
			if len(tokens) % 2 != 0:
				raise ValueError("Malformed families line (unpaired gene/species token) in family %s" % familyID)

			speciesDict = {} #key -> sp, val -> list of genes
			speciesDictLengths = {} #key -> sp, val -> cumulative protein length (int)

			for gene,sp in zip(tokens[0::2],tokens[1::2]):
				if gene in geneFilterList: #filtered genes are treated as if they did not exist
					continue

				sp = sp.replace("(","").replace(")","")
				if sp in speciesToRemove:
					continue

				speciesDict.setdefault(sp,[]).append(gene)
				speciesDictLengths[sp] = speciesDictLengths.get(sp,0) + int(proteinLengths[sp][gene])

			#families can end up empty or with a single gene after filtering; those are dropped
			nGenes = sum(len(geneList) for geneList in speciesDict.values())
			if nGenes > 1:
				familiesDict[familyID] = speciesDict
				familiesDictLengths[familyID] = speciesDictLengths

		if self.verbose:	print ("Total number compara families (from families.txt) to be processed, after filtering:\t%s" % len(familiesDict)) #this number does not have to match the total number of families, in case species are being removed

		self.familiesDict = familiesDict
		self.familiesDictLengths = familiesDictLengths


	def readFamiliesPerTaxonomyLevel(self):
		"""Reads families per taxonomy level. Note that, if the families.txt is filterd in this script, the input for this function may be a temporary file (corrected for new families) created in this script itself.
		E.g. Hymenolepis microstoma 40001043 : 841230,912843,1018527,1058154,1059819,1081886,1082962,1090840,1117204,1117804,1118812"""

		if self.verbose: print ("Reading %s file.." % self.familiesPerTaxonomyLevelFile)

		familiesPerTaxonomyLevelFile = open(self.familiesPerTaxonomyLevelFile,"r")

		familiesTaxon = {}

		for line in familiesPerTaxonomyLevelFile:
			line = line.strip()

			taxon,families = line.split(":")

			taxon = taxon.lower().strip().split(" ")  #may need lower()
			taxonNum = taxon[-1]

			taxon = "_".join(taxon[:-1])+taxonNum  #e.g. Nippostrongylus_brasiliensis40001108

			families = families.strip().split(",")

			for family in families:
				family = int(family)

				if family not in familiesTaxon:
					familiesTaxon[family] = taxon
				else:
					print ("Two roots for family?",family)


		if self.verbose:	print ("Total families in families_per_taxonomy_level:\t%s" % len(familiesTaxon))

		self.familiesTaxon = familiesTaxon


	def _readTaxonFiles(self,inFolder):
		"""E.g filename: echinostomatoidea40001020.txt. Inside file: 66838   2    (familyID\tNum_Events).
		What I want is to combine all events for each family."""

		removeOutgroups = self.removeOutgroups

		outDict = {} #key -> family ID, val -> tot events
		outDictNode = {} #key -> family ID, val -> dict; key -> nodeID, val -> tot events #to have node with most events

		filesToProcess = glob(inFolder+"/*.txt")

		for fi in filesToProcess:
			fileName = fi.split("/")[-1].replace(".txt","")

			if fileName in removeOutgroups: #exclude data from nodes/species that we want to exclude
				continue

			# #checking if taxon IDs match between files
			# print (self.numTaxonBelow[fileName])

			f = open(fi,"r")

			for line in f:
				if "#" in line:
					continue
				line = line.strip()
				spl = line.split("\t")
				familyID = int(spl[0])
				events = int(spl[1])

				if familyID not in outDict:
					outDict[familyID] = 0
					outDictNode[familyID] = {}

				if fileName not in outDictNode[familyID]:
					outDictNode[familyID][fileName] = 0

				outDict[familyID]+=events
				outDictNode[familyID][fileName]+=events


			f.close()

		return outDict,outDictNode


	def readGeneDuplicationsFolder(self):
		"""Process all files in Gene Duplications folder"""

		if self.verbose: print ("Reading gene duplication files..")

		geneDuplicationsFolder = self.geneDuplicationsFolder

		run = self._readTaxonFiles(geneDuplicationsFolder)
		familiesGeneDuplication = run[0]
		nodeDict = run[1]

		#getting most frequent node
		familiesGeneDuplicationMaxNode = {}
		for familyID in nodeDict:
			maxVal = 0
			maxNode = ""
			if familyID not in familiesGeneDuplicationMaxNode:
				familiesGeneDuplicationMaxNode[familyID] = ""
			for node in nodeDict[familyID]:
				val = nodeDict[familyID][node]
				if val > maxVal:
					maxVal = val
					maxNode = node

				familiesGeneDuplicationMaxNode[familyID] = str(maxVal)+"X:"+maxNode

		##validation
		# dr7@farm3-head2:/nfs/helminths02/analysis/50HGP/00ANALYSES/final_gene_duplications/cutoffm1$ g 1178810 *
		# crassostrea_gigas40001175.txt:1178810	1
		# schmidtea_mediterranea40001061.txt:1178810	2

		# Family 126577: '37X:schmidtea_mediterranea40001061'
 		# dr7@farm3-head2:/nfs/helminths02/analysis/50HGP/00ANALYSES/final_gene_duplications/cutoffm1$ grep 126577 *
		# ...
		# ascaridoidea40001165.txt:126577	4
		# bilateria40001006.txt:126577	12
		# ...
		# schmidtea_mediterranea40001061.txt:126577	37
		# ...

		if self.verbose:	print ("Total families with gene duplication info:\t%s" % len(familiesGeneDuplication))

		self.familiesGeneDuplication = familiesGeneDuplication
		self.familiesGeneDuplicationMaxNode = familiesGeneDuplicationMaxNode


	def readGeneLossesFolder(self):
		"""Process all files in Gene Losses folder"""

		if self.verbose: print ("Reading gene loss files..")

		geneLossesFolder = self.geneLossesFolder

		familiesGeneLosses = self._readTaxonFiles(geneLossesFolder)[0]

		#Validation: 1135828: 6
		# dr7@pcs5b:/nfs/helminths02/analysis/50HGP/00ANALYSES/final_gene_losses$ g 1135828 *
		# chromadorea40001076.txt:1135828	1
		# platyhelminthes40001014.txt:1135828	1
		# romanomermis_culicivorax40001064.txt:1135828	1
		# soboliphyme_baturini40001066.txt:1135828	1
		# trichuris_muris40001075.txt:1135828	1
		# trichuris_trichiura40001074.txt:1135828	1

		if self.verbose:	print ("Total families with gene losses info:\t%s" % len(familiesGeneLosses))

		self.familiesGeneLosses = familiesGeneLosses


	def readSpeciesNameTag(self):
		"""Read the tag -> species_name correspondence from self.speciesNameTagFile.

		E.g. PREFIX  Genus   species VERSION
		ACAC    Angiostrongylus cantonensis     1.5.4
		Columns are tab separated: tag, genus, species. The species name is built as
		lower-cased genus + "_" + species. Header lines (starting with '"This' or "PREFIX"),
		lines containing "#" and blank lines are skipped.

		Sets self.tagSpeciesName (tag -> species_name)."""

		if self.verbose: print ("Reading species name mapping files..")

		tagSpeciesName = {}

		for line in self.speciesNameTagFile:
			if line.startswith(('"This',"PREFIX")) or "#" in line:
				continue
			line = line.strip()
			if not line: #a blank line has no columns to read
				continue
			spl = line.split("\t")

			tag = spl[0]
			tagSpeciesName[tag] = spl[1].lower().strip()+"_"+spl[2].strip() #strip because sometimes there are some empty spaces

		if self.verbose:	print ("Total species with locus tag info:\t%s" % len(tagSpeciesName))

		self.tagSpeciesName = tagSpeciesName


	def readGeneIDMappingFolder(self):
		"""Reads Gene ID mapping files
		E.g. acanthocheilonema_viteae	nAv.1.0.1.g00113	nAv.1.0.1.t00113-RA	nAv.1.0.1.t00113-RA	nAv.1.0.1.t00113-RA	nAvx1x0x1xt00113-RA"""

		if self.verbose: print ("Reading id mapping files..")

		geneIDMappingFolder = self.geneIDMappingFolder

		geneIDMapping = {}  #key -> species, val -> dict: key -> geneID(interpro), val -> list of other geneIDs #this is to connect genes in interproscan to genes in families.txt
		geneIDMappingAllEntries = {} #key -> mRNA ID, val -> list of other IDs

		#Note: not all genes are present in the interproscan files, so the total number of genes for a species here is not representative of the full dataset.

		filesToProcess = glob(geneIDMappingFolder+"/*_id_mapping.txt")

		for fi in filesToProcess:
			f = open(fi,"r")

			fileName = fi.split("/")[-1].replace("_id_mapping.txt","")

			if fileName not in geneIDMapping:
				geneIDMapping[fileName] = {}

			for line in f:
				if line.startswith("SPECIES_NAME"):
					continue
				line = line.strip()
				spl = line.split("\t")
				iprID = spl[5]
				mRNA = spl[2]
				gene = spl[1]

				geneIDMappingAllEntries[mRNA] = spl[1:]

				if iprID != "NA":
					if iprID not in geneIDMapping[fileName]:
						geneIDMapping[fileName][iprID] = []
					geneIDMapping[fileName][iprID] = spl[1:-1] #first column is species name

			f.close()

		if self.verbose:	print ("Total species with id mapping info:\t%s" % (len(geneIDMapping)) )
		if self.verbose:	print ("Total genes with id mapping info:\t%s" % (len(geneIDMappingAllEntries)) )

		self.geneIDMapping = geneIDMapping
		self.geneIDMappingAllEntries = geneIDMappingAllEntries


	def readInterproGFFFolder(self):
		"""Reads interproscan GFF files for all species. The files can be gzipped."""

		if self.verbose: print ("Reading interproscan files..")

		interproscanGFFFolder = self.interproscanGFFFolder
		tagSpeciesName = self.tagSpeciesName
		geneIDMapping = self.geneIDMapping

		interproscanData = {} #key -> species, value -> dict; key -> gene, value -> dict; key-> source, val-> all lines attributed to that gene/source

		filesToProcess = glob(interproscanGFFFolder+"/*ipr*")

		IDsNotFound = {}

		for fi in filesToProcess:
			if fi.endswith(".gz"):
                # f = gzip.open(fi, 'rb').read().decode('ascii').split("\n") #this might be memory hungry
				f = gzip.open(fi, 'rb').read().decode('utf-8').split("\n") #this might be memory hungry         # bh4 changed this to utf-8 because ascii fails for the new ipr file for T.regenti
			else:
				f = open(fi,"r")
			##e.g. file name /nfs/helminths02/analysis/50HGP/01INTERPRO/ACOC.protein.fa.gz.fas.ipr.gz

			tag = fi.split("/")[-1].split(".")[0]
			species = tagSpeciesName[tag]

			if species not in interproscanData:
				interproscanData[species] = {}

			if species not in geneIDMapping:
				warn("Warning: %s not in geneIDMapping" % species)

			for line in f:
				if "##FASTA" in line:
					break #I do not want to read the fasta part of GFF. #I assume that after this line appears there will be no more GFF/Interproscan entries
				if "##" in line:
					continue
				line = line.strip()

				spl = line.split("\t")

				if len(spl) < 8: #means is not proper GFF line
					continue

				source = spl[1]

				if source not in self.targetSources:
					continue

				iprID = spl[0]

				# if iprID not in geneIDMapping[species]:
				# 	if species not in IDsNotFound:
				# 		IDsNotFound[species] = set()
				# 	IDsNotFound[species].add(iprID)

				if iprID in geneIDMapping[species]: #It can happen that a gene is not present in geneIDMapping, this is because some interproscan files are not updated, containing genes that have since been removed for compara.
					geneID = geneIDMapping[species][iprID][0]
				# else:
				# 	warn("IPR ID not in gene mapping file.")

					if geneID not in interproscanData[species]:
						interproscanData[species][geneID] = {}

					if source not in interproscanData[species][geneID]:
						interproscanData[species][geneID][source] = []

					interproscanData[species][geneID][source].append(line)

				#pprint(interproscanData[species])

				#Validation, number of Pfam domains in file
				#/nfs/helminths02/analysis/50HGP/01INTERPRO/ACOC.protein.fa.gz.fas.ipr.gz 13886
 				# dr7@farm3-head2:/lustre/scratch108/parasites/dr7/50HG/50HG_100K_families_parsing/interproscanGFFs$ grep Pfam ACOC.protein.fa.gz.fas.ipr | wc -l
 				# 13886


		# print ("IDs on IPR GFF but not on ID Mapping file!",len(IDsNotFound))
		# for sp in IDsNotFound:
		# 	print (sp,len(list(IDsNotFound[sp]))
		# 	print (IDsNotFound[sp])


		if self.verbose: print ("Total species with interproscan data:\t%s" % len(interproscanData) )

		self.interproscanData = interproscanData


	def readBranchLengths(self):
		"""Read the branch lengths file and keep the median branch length of every family.

		Each line holds four tab-separated fields: familyID, mean, median, list of lengths.
		E.g. 22454   0.125   0.081   0.204589,0.263049,0.078644,0.113182,0.282062,0.03627
		Only the familyID and the median are used; a family seen twice keeps its last median.

		Sets self.branchLengths (familyID -> median as float)."""

		if self.verbose: print ("Reading family branch lengths file..")

		medianPerFamily = {}

		for row in self.branchLengthsFile:
			#exactly four fields are expected; mean and the raw list are read but not stored
			famField,_meanField,medianField,_allLengths = row.strip().split("\t")
			famID = int(famField)
			medianPerFamily[famID] = float(medianField)

		if self.verbose: print ("Total families with branch lengths:\t%s" % len(medianPerFamily) )

		self.branchLengths = medianPerFamily


	def readOutgroupList(self):
		"""Reads file with outgroup / node ids to exclude from analysis. ID\tspecies/node-name\twhether_is_species_or_node
		e.g. 40001001        amphimedon_queenslandica        species"""

		if self.verbose: print ("Reading file with outgroups to remove..")

		removeOutgroupsFile = open(self.removeOutgroupsFile,"r")

		removeOutgroups = []
		speciesToRemove = []

		for line in removeOutgroupsFile:
			line = line.strip()
			spl = line.split("\t")
			removeOutgroups.append(spl[1]+spl[0])

			if spl[2] == "species":
				speciesToRemove.append(spl[1])

		if self.verbose: print ("Total outgroup nodes/leaves to be removed:\t%s" % len(removeOutgroups) )
		if self.verbose: print ("Total species (leaves) to be removed:\t%s" % len(speciesToRemove) )

		self.speciesToRemove = speciesToRemove
		self.removeOutgroups = removeOutgroups


	def readSpeciesTree(self):
		"""Read the newick species tree and derive the structures that depend on it.

		Node/leaf names must have at least one alphabetic character, otherwise Phylo reads purely
		numeric names as branch lengths. Branch lengths are not required. The tree is converted to
		a dictionary by the external findLastCommonAncestor module (will only work for simple binary trees).

		Sets self.speciesTree (dictionary form of the tree), self.numTaxonBelow (node/leaf ->
		number of leaves below it) and self.listOfSpecies (leaf names cut at the first "400")."""

		if self.verbose: print ("Reading species tree..")

		tree = Phylo.read(self.speciesTreeFile, 'newick')
		speciesTree = findLastCommonAncestor.convertTreeToDict(tree) #in python dictionary format
		childrenLookup = findLastCommonAncestor.reverseDict(speciesTree)

		#every name that appears in the tree dictionary, either as key or as value
		everyNode = set(speciesTree.keys())
		everyNode.update(speciesTree.values())

		numTaxonBelow = {node: int(findLastCommonAncestor.countLeaves(childrenLookup,node)) for node in everyNode}

		if self.verbose: print ("Total nodes/leaves in species tree:\t%s" % len(speciesTree) )
		if self.verbose: print ("Total nodes/leaves with number of taxon below:\t%s" % len(numTaxonBelow) )

		self.numTaxonBelow = numTaxonBelow
		self.speciesTree = speciesTree

		#leaf names look like clonorchis_sinensis40001019; keep only the part before the first "400"
		leafNames = [terminal.name for terminal in tree.get_terminals()]
		self.listOfSpecies = [leafName.split("400")[0] for leafName in leafNames]


	def writeFamiliesPerTaxonomyLevel(self):
		"""Recalculate the root of every family and write an updated familiesPerTaxonomyLevel file.

		Needed when species/nodes are filtered out: the root (branch name) of the altered families
		is then taken from the file written here. Has to be run after readFamilies and after the
		species tree was read. The root is obtained with the external findLastCommonAncestor module.
		Author's note: the created file was validated to be the same as the one from Compara when
		no outgroups are removed.

		Output lines look like:
		Hymenolepis microstoma 40001043 : 841230,912843,1018527,1058154,1059819
		Strongyloides 40001088 : 269778,518849,529306,576845,647355,668201,705124,735608

		The file is named tmp_familiesPerTaxonomyLevel_<jobID>.txt and is created in the current
		working directory; self.familiesPerTaxonomyLevelFile is switched to that file name."""

		if self.verbose: print ("Creating familiesPerTaxonomyLevel.txt with recalculated roots..")

		familiesDict = self.familiesDict
		speciesTree = self.speciesTree

		#the temporary file is created up front, before any root is computed
		tmpFile = open("tmp_familiesPerTaxonomyLevel_"+self.jobID+".txt","w")

		treeLabels = list(speciesTree.keys())

		familiesPerRoot = {} #root label as written in the file -> list of family IDs (as strings)

		for familyID,perSpecies in familiesDict.items():
			familySpecies = list(perSpecies.keys()) #e.g. clonorchis_sinensis, to be matched to tree labels such as clonorchis_sinensis40001019

			treeNames = []
			for sp in familySpecies:
				treeNames.extend(label for label in treeLabels if sp in label)

			assert (len(familySpecies) == len(treeNames)) #otherwise species-tree species names do not match families.txt species names..

			rootNode = findLastCommonAncestor.run(treeNames,speciesTree)

			#rename the root so that it looks like the entries of familiesPerTaxonomyLevel:
			#"hymenolepis_microstoma40001043" -> "Hymenolepis microstoma 40001043".
			#This bit may be particular to 50HG Compara IDs; other ID types are written unchanged.
			if "4000" in rootNode:
				prefix,marker,suffix = rootNode.partition("4000")
				prefix = prefix.replace("_"," ")
				rootNode = prefix[0].upper()+prefix[1:]+" "+marker+suffix

			familiesPerRoot.setdefault(rootNode,[]).append(str(familyID) )

		#write to new file
		tmpFile.writelines(taxon+" : "+",".join(members)+"\n" for taxon,members in familiesPerRoot.items())
		tmpFile.close()

		if self.verbose: print ("Total number of node/species in new familiesPerTaxonomyLevel file:\t%s" % len(familiesPerRoot)) #this should be equal to: total nodes+species in species tree (180 for 50HG) minus the removed ones.

		self.familiesPerTaxonomyLevelFile = tmpFile.name #change familiesPerTaxonomyLevelFile file pointer to newly created file


	def readFlagFamilyList(self):
		"""Read file with list of families to be flagged."""

		if self.verbose: print ("Reading file with families to flag with 'exclude'..")

		flagFamilyListFile = open(self.flagFamilyListFile,"r")

		familiesToFlag = [int(familyID.strip()) for familyID in flagFamilyListFile]

		self.familiesToFlag = familiesToFlag

		if self.verbose: print ("Total number of families to be flagged: %s" % (len(familiesToFlag)) )


	def readGeneFilterList(self):
		"""Read file with list of genes to be excluded from measures."""

		if self.verbose: print ("Reading file with genes to exclude..")

		geneIDMappingAllEntries = self.geneIDMappingAllEntries #mRNA ID to other IDs

		geneFilterListFile = open(self.geneFilterListFile,"r")

		#convert mRNA ID to gene ID, to match families
		geneFilterList = {geneIDMappingAllEntries[geneID.strip()][0] for geneID in geneFilterListFile}

		self.geneFilterList = geneFilterList

		if self.verbose: print ("Total number of genes to be excluded: %s" % (len(geneFilterList)) )


	def readCladeFile(self):
		"""Read the clade of every species from self.cladeFile.

		Each line holds the species name and its clade, separated by a tab. The species name is
		lower-cased and its spaces are replaced by "_". Species that are not in
		self.listOfSpecies (i.e. not in the species tree) are ignored.

		Sets self.cladePerSpecies (species -> clade) and
		self.speciesPerClade (clade -> list of its species, in file order)."""

		if self.verbose: print ("Reading clade file..")

		knownSpecies = self.listOfSpecies

		cladeOf = {} #key -> species, val -> clade
		membersOf = {} #key -> clade, val -> species of that clade

		for row in self.cladeFile:
			fields = row.strip().split("\t")
			clade = fields[1]
			name = fields[0].lower().replace(" ","_")

			if name not in knownSpecies:
				continue

			cladeOf[name] = clade
			membersOf.setdefault(clade,[]).append(name)

		self.cladePerSpecies = cladeOf
		self.speciesPerClade = membersOf

		if self.verbose: print ("Total number of species in clade file: %s" % (len(cladeOf)) )


	def readNodeInfoFile(self):
		"""Read, for every internal node, the species below it and the clades fully contained in it.

		Input lines (self.nodeInfoFile) look like:
		node amphimedon_queenslandica40001001 has parent metazoa40001000
		node euteleostomi40001177 has descendants danio_rerio40001178,homo_sapiens40001179
		Only the lines containing "descendants" are used. Descendant names are cut at the first
		"400" to get the species name. A clade is reported for a node only when the number of
		descendants belonging to it equals the number of species of that clade in
		self.speciesPerClade (i.e. the whole clade is below the node).
		Has to be run after readCladeFile.

		Sets self.nodeInfo (node ID -> sorted list of complete clades) and
		self.nodeInfoSpp (node ID -> list of species below the node).

		Raises KeyError if a descendant species has no entry in self.cladePerSpecies."""

		if self.verbose: print ("Reading node info file..")

		speciesPerClade = self.speciesPerClade
		cladePerSpecies = self.cladePerSpecies

		cladesBelow = {} #key -> node ID, val -> sorted list of clades completely present under that node
		speciesBelow = {} #key -> node ID, val -> list of species present under that node

		for row in self.nodeInfoFile:
			row = row.strip()
			if "descendants" not in row:
				continue

			fields = row.split(" ")
			nodeID = fields[1]
			leafSpecies = [leaf.split("400")[0] for leaf in fields[4].split(",")]
			leafClades = [cladePerSpecies[sp] for sp in leafSpecies]

			#a clade only counts when all of its species are among the descendants
			completeClades = {clade for clade,members in speciesPerClade.items() if leafClades.count(clade) == len(members)}

			cladesBelow[nodeID] = sorted(completeClades)
			speciesBelow[nodeID] = leafSpecies

		self.nodeInfo = cladesBelow
		self.nodeInfoSpp = speciesBelow

		if self.verbose: print ("Total number of tree nodes with leaf info: %s" % (len(cladesBelow)) )


	def readProteinLengthFiles(self):
		"""Read list of all genes and their length"""

		if self.verbose: print ("Reading protein lengths folder..")

		proteinLengthsFolder = self.proteinLengthsFolder

		proteinLengths = {} #key -> species, val -> dict. Key -> gene id, val -> length

		filesToProcess = glob(proteinLengthsFolder+"/*_protein_lengths.txt")

		count = 0
		for f in filesToProcess:
			inFile = open(f,"r")
			spName = f.split("/")[-1].replace("_protein_lengths.txt","")

			if spName not in proteinLengths:
				proteinLengths[spName] = {}

			for line in inFile:
				gene,geneLength = line.strip().split("\t")

				proteinLengths[spName][gene] = geneLength

				count+=1

			inFile.close()

		self.proteinLengths = proteinLengths

		if self.verbose: print ("Total number of species with protein lengths: %s and total genes: %s" % (len(proteinLengths),count) )


	def statsPerFamily(self):
		"""Main function to write stats for each family"""

		if self.verbose: print ("Making stats..")

		#1) all global data structures in use #just wrote this to easier localization and make sure I don't change the values
		familiesDict = self.familiesDict
		familiesTaxon = self.familiesTaxon
		numTaxonBelow = self.numTaxonBelow
		familiesGeneDuplication = self.familiesGeneDuplication
		familiesGeneDuplicationMaxNode = self.familiesGeneDuplicationMaxNode
		familiesGeneLosses = self.familiesGeneLosses
		interproscanData = self.interproscanData
		freeLivingSpecies = self.freeLivingSpecies
		branchLengths = self.branchLengths
		familiesToFlag = self.familiesToFlag
		cladePerSpecies = self.cladePerSpecies
		speciesPerClade = self.speciesPerClade
		listOfSpecies = self.listOfSpecies
		nodeInfo = self.nodeInfo
		nodeInfoSpp = self.nodeInfoSpp
		familiesDictLengths = self.familiesDictLengths

		#2) output file headers
		self.outputFile.write("familyID\tFlag\tn_species\tn_genes\tMean_genes_per_species\tMedian_genes_per_species\tVariation_coefficient_n_genes_per_species\tn_paralogs\tBranch_name\tCompleteness_score\tTotal_duplications\tMax_duplications_node\tMost_frequent_species\t")
		for species in sorted(freeLivingSpecies):
			self.outputFile.write(species+"\t")
		self.outputFile.write("Total_losses\tMedian_branch_length\tPfam_perc_genes\tTMHMM_perc_genes\tSignalP_perc_genes\tPfam_perc_in_family\tPfam_all_in_family\n")

		outputFileMeasures = open(self.outputFile.name+"_measures","w") #special file just for the measures
		outputFileMeasures.write("familyID\tFlag\tn_species\tn_genes\tmean_prot_len\tsum_prot_len\tSpecies_var_coef\tSpecies_var_coef_zeroes\tVar_coef_lengths_zeroes\t")
		for clade in sorted(speciesPerClade):
			outputFileMeasures.write("zscore_%s\t" % clade)
		outputFileMeasures.write("Max_zscore_clade\tMax_zscore\t")
		for clade in sorted(speciesPerClade):
			outputFileMeasures.write("enrich_%s\t" % (clade) )
		outputFileMeasures.write("Max_enrich_clade\tMax_enrich\t")
		outputFileMeasures.write("Pfam_perc_in_family\tPfam_all_in_family\n")

		#3) Loop through each compara family, and produce all stats for this family
		count = 0 #count processed families
		for family in familiesDict:
			count+=1

			#3.1) General basic stats
			nSpecies = len(familiesDict[family]) #note that this familiesDict only contain species entry if any gene in that family (does not count zeroes)
			nGenesPerSpecies = [len(familiesDict[family][sp]) for sp in familiesDict[family]]
			proteinLengthsSpecies = [familiesDictLengths[family][sp] for sp in familiesDictLengths[family]] #this is list of protein lenghts sums

			nGenes = sum(nGenesPerSpecies)
			sumProteinLengths = sum(proteinLengthsSpecies)

			avgProteinLenPerGene = "%.1f" % (sumProteinLengths/nGenes)

			nParalogs = nGenes - nSpecies
			avgGenesPerSpecies = "%.1f" % (mean(nGenesPerSpecies))
			medianGenesPerSpecies = "%.1f" % (median(nGenesPerSpecies))
			stdevGenesPerSpecies = std(nGenesPerSpecies,ddof=1)
			# 21-Aug-2015: Variation coefficient = Stdev of number of genes in species with at least one gene / mean of number of genes in species with at least one gene
			# this excludes the zeroes (species without any gene in that family)
			stdevDivMean = "%.1f" % (stdevGenesPerSpecies/float(avgGenesPerSpecies)) # Variations in numbers of genes per family across species -> stdev / mean

			#3.2) tag families to exclude
			if family in familiesToFlag:
				familyFlag = "exclude"
			else:
				familyFlag = ""

			#3.3) Calculate/retrieve Branch name, median branch length and completeness score
			branchName = familiesTaxon[family]
			if family in branchLengths:
				medianBranchLength = branchLengths[family]
			else: #this should not happen except for a couple of known exceptions e.g. family 1109776 when only 33 species, because pruning did not work. #as I do not know if there are/will be other exceptions I decided to put "NA" when it does not work.
				medianBranchLength = "NA"
			nSpeciesBelowTaxon = numTaxonBelow[branchName]
			completeness = "%.2f" % (nSpecies/nSpeciesBelowTaxon)
			#Validation completeness for family: 80624. On James plot shows 0.5 completeness score. VERIFIED 80624 41 319 7.8 2.0 278 Metazoa40001000 0.5
			#Validation completeness for family: 63382. On James plot shows <0.25 completeness score. VERIFIED 63382 10 393 39.3 36.5 383 Chromadorea40001076 0.2

			#3.4) recalculate species variation coefficient, having insight from phylogeny.
			#filling the zeroes when a species should have a gene (based on tree root) but does not
			zeroesMissing = nSpeciesBelowTaxon - nSpecies #species that should have a gene but don't, based on the family species root
			assert(zeroesMissing >= 0)
			nGenesPerSpeciesZeroes = nGenesPerSpecies[:] #nGenesPerSpeciesZeroes = nGenesPerSpecies[:]
			for i in range(zeroesMissing):
				nGenesPerSpeciesZeroes.append(0)

			stdnGenesPerSpeciesZeroes = std(nGenesPerSpeciesZeroes,ddof=1)
			meannGenesPerSpeciesZeroes = mean(nGenesPerSpeciesZeroes)
			speciesVarCoefZeroes = "%.1f" % (stdnGenesPerSpeciesZeroes / meannGenesPerSpeciesZeroes )

			#variation coefficient for protein lengths
			proteinLengthsSpeciesZeroes = proteinLengthsSpecies[:] #nGenesPerSpeciesZeroes = nGenesPerSpecies[:]
			for i in range(zeroesMissing):
				proteinLengthsSpeciesZeroes.append(0)

			stdProteinLengthsSpeciesZeroes = std(proteinLengthsSpeciesZeroes,ddof=1)
			meanProteinLengthsSpeciesZeroes = mean(proteinLengthsSpeciesZeroes)
			speciesVarCoefZeroesLengths = "%.1f" % (stdProteinLengthsSpeciesZeroes / meanProteinLengthsSpeciesZeroes )

			#3.5) retrieve gene losses and gene duplication data
			if family in familiesGeneDuplication:
				duplications = familiesGeneDuplication[family]
			else:
				duplications = 0

			if family in familiesGeneLosses:
				losses = familiesGeneLosses[family]
			else:
				losses = 0

			if family in familiesGeneDuplicationMaxNode:
				maxDuplicationNode = familiesGeneDuplicationMaxNode[family]
			else:
				maxDuplicationNode = "None"

			#initialize interproscan-related items and others
			nGenesWithPfam = 0 #at least 1 Pfam entry
			nGenesWithSignalP = 0 #at least 1 SignalP entry
			nGenesWithTMHMM = 0 #at least 1 TMHMM entry
			pfamTerms = [] # this is to store one pfam domain example per gene
			pfamTermsAll = [] # this is to store all pfam domains in a gene (not just one per gene)

			maxSp = "" #the species with most number of genes
			maxSpProtLen = 0

			protLenPerClade = {} #key -> clade, val ->  list of protein lengths in species of that clade . note that I only initialize a clade if any species on that clade, and only contains data on species with at least one gene.
			protLenPerSp = {} #key -> sp, val -> prot length
			#3.6) Loop through each species in gene family. To retrieve interproscan info and make other calculations
			for species in sorted(familiesDict[family]):
				#ngenesSp = len(familiesDict[family][species]) #if calculating measures based on gene numbers
				protLenSp = familiesDictLengths[family][species] #if calculating measures using protein length instead of gene numbers, without changing all variable names..
				protLenPerSp[species] = familiesDictLengths[family][species] #redundant but needed #only initializes species if any gene

				#3.7) Get genes per clade
				clade = cladePerSpecies[species]
				if clade not in protLenPerClade:
					protLenPerClade[clade] = []
				protLenPerClade[clade].append(protLenSp)

				#3.8) get measure of species with most genes in family
				if protLenSp > maxSpProtLen:
					maxSpProtLen = protLenSp
					maxSp = species

				# #testing: block start. Uncomment this when running for real

				#3.9) get interproscan data
				for gene in familiesDict[family][species]:
					if gene in interproscanData[species]: #it is normal that a gene may not have any interproscan entries.
						#Validation. Gene present in IPR file and with correct description
						#Python: syphacia_muris SMUV_0000960301 {'Pfam': ['SMUV_0000960301-mRNA-1\tPfam\tprotein_match\t71\t295\t4.6E-58\t+\t.\tName=PF00089;signature_desc=Trypsin;Target=SMUV_0000960301-mRNA-1 71 295;status=T;ID=match$342_71_295;Ontology_term="GO:0004252","GO:0006508";date=04-06-2014;Dbxref="InterPro:IPR001254"']}
						#Farm: /lustre/scratch108/parasites/dr7/50HG/50HG_100K_families_parsing/interproscanGFFs_testing$ grep SMUV_0000960301 SMUV.protein.fa.gz.fas.ipr
						#	SMUV_0000960301-mRNA-1	Pfam	protein_match	71	295	4.6E-58	+	.	Name=PF00089;signature_desc=Trypsin;Target=SMUV_0000960301-mRNA-1 71 295;status=T;ID=match$342_71_295;Ontology_term="GO:0004252","GO:0006508";date=04-06-2014;Dbxref="InterPro:IPR001254"

						typesOfEntries = list(interproscanData[species][gene].keys()) #this is dictionary entries, therefore no duplications
						assert (len(typesOfEntries) <= len(self.targetSources))

						for entryType in typesOfEntries:
							if "Pfam" in entryType:
								nGenesWithPfam+=1

								spl = "".join(interproscanData[species][gene][entryType]).split(";") #there may be several lines of pfam domains for same gene
								#Note: the presence of pfam entries for the same gene in the IPR GFF is random, (i.e. the last entry is not always the most N-terminal domain etc, its just random),
								#Note(cont): meaning that picking only 1 pfam example for each gene is randomized (but the same everytime we run this)

								##originally I was picking only one pfam domain per gene
								desc = ""
								pfamTermsSet = set()
								for item in spl:
									if item.startswith("signature_desc="):
										desc = item.replace("signature_desc=","")
										pfamTermsSet.add(desc)
								for term in pfamTermsSet:
									pfamTerms.append(term)

								##then I also added column where we have all pfam domains in gene
								for item in spl:
									desc = ""
									if item.startswith("signature_desc="):
										desc = item.replace("signature_desc=","")
										pfamTermsAll.append(desc)

							if "SignalP" in entryType: #this can in principle match any SignalP_* entry
								nGenesWithSignalP+=1
							if "TMHMM" in entryType:
								nGenesWithTMHMM+=1
					# else: #dr7@farm3-head2:/lustre/scratch108/parasites/dr7/50HG/50HG_100K_families_parsing/interproscanGFFs_testing$ grep HNAJ_0000929301 HNAJ.protein.fa.gz.fas.fas.sl.ipr
					# 	print (species,gene)

					#testing: block end. Uncomment this when running for real


			mostFrequentSpecies = str(maxSpProtLen)+"X:"+maxSp #species with most genes/prot length, and how many

			#3.10) calculations on interproscan data #note this really looks at genes, not at protein length. Stats are gene-based, not altered for protein length.
			assert (nGenesWithPfam <= nGenes)
			assert (nGenesWithTMHMM <= nGenes)
			assert (nGenesWithSignalP <= nGenes)

			percPfam = "%.1f" % (nGenesWithPfam*100.0/nGenes)
			percSignalP = "%.1f" % (nGenesWithSignalP*100.0/nGenes)
			percTMHMM = "%.1f" % (nGenesWithTMHMM*100.0/nGenes)

			##Getting 3 most common pfam terms, and % genes in family with that pfam
			pfams = {}
			pfamText = ""
			for term in pfamTerms:
				if term not in pfams:
					pfams[term] = float("%.1f" % (pfamTerms.count(term)*100.0/nGenes ) )

			pfamsSorted = sorted(pfams, key=pfams.get, reverse=True) #need numeric sorting

			for term in pfamsSorted[0:3]: #top 3 pfam domains #if less than 3 exist will simply pick less
				pfamText+=str(pfams[term])+"%:"+term+";"

			if pfamText == "":
				pfamText = "100%:None"

			##Getting all pfam domains in family statistics, and how many times
			pfamsAll = {}
			pfamTextAll = ""
			for term in pfamTermsAll:
				if term not in pfamsAll:
					pfamsAll[term] = pfamTermsAll.count(term)

			pfamsSortedAll = sorted(pfamsAll, key=pfamsAll.get, reverse=True)

			for term in pfamsSortedAll:
				pfamTextAll+=str(pfamsAll[term])+"X:"+term+";"

			if pfamTextAll == "":
				pfamTextAll = "0X:None"


			#3.11) calculate ranking measures

			### IMPORTANT NOTE ### 14-Sep: Even though variable names and comments still say "genes", it may actually be using protein lengths.

			#Note that all these measures count zeroes.
			#E.g. I know the species tree node corresponding to root of the gene tree. Therefore I always take into account all the species AND clades that SHOULD be present in the calculation.

			##3.11.1) Calculate Z-score for each clade. Z-score = (mean prot length per species in a clade - mean prot length per species)/(std of prot length per species)
			#This only works if more than 1 clade. Otherwise, all entries have 'NA'
			#I always check root and calculate for each clade descendant of the root. Other clades will have 'NA'.
			#note that I always divide by all species in the clade, regardless all of them have a gene or not
			#note: standard deviation can be zero (e.g. all species having same number prot length), and we cannot divide by zero. I tag these cases with "NA_std_is_zero"

			#get clades that should be present.
			if branchName in nodeInfo: #if not, it means it is a leaf, not an internal node
				descendantClades = nodeInfo[branchName]
			else:
				assert("_" in branchName)
				descendantClades = []

			#calcualte mean genes per clade #adding zeroes if species does not have gene
			meanProtLenPerClade = {} #key -> clade, value -> mean prot length per species in that clade (total, including zeroes) #note this dict only contains clade entries if they should have any value
			sdProtLenOutsideClade = {} #key -> clade, value -> std dev of prot length per species that are *not* in this clade. Added Avril Coghlan 14-Jan-2016
			for clade in descendantClades:
				if clade in protLenPerClade:
					meanProtLenPerClade[clade] = sum(protLenPerClade[clade]) / float(len(speciesPerClade[clade] ) )

                                        # calculate the standard deviation of protein lengths in species outside this clade. Added Avril Coghlan 14-Jan-2016
					otherSpecies = [] #list of protein lengths for the species not in current clade, will include zeroes. Added Avril Coghlan 14-Jan-2016
					for species in nodeInfoSpp[branchName]: #species under the tree root. Added Avril Coghlan 14-Jan-2016.
						if cladePerSpecies[species] == clade: # Added Avril Coghlan 14-Jan-2016
							continue # Added Avril Coghlan 14-Jan-2016
						else: # Added Avril Coghlan 14-Jan-2016
							if species in protLenPerSp: #protLenPerSp only contains species entry if any gene. Added Avril Coghlan 14-Jan-2016.
								otherSpecies.append(protLenPerSp[species]) # Added Avril Coghlan 14-Jan-2016.
							else: # Added Avril Coghlan 14-Jan-2016.
								otherSpecies.append(0) # Added Avril Coghlan 14-Jan-2016.
					stdProteinLengthsSpeciesZeroesOutsideClade = std(otherSpecies, ddof=1) # Added Avril Coghlan 14-Jan-2016. Used ddof=1 as Diogo has used this elsewhere for std. dev.
					sdProtLenOutsideClade[clade] = stdProteinLengthsSpeciesZeroesOutsideClade # Added Avril Coghlan 14-Jan-2016.

				else: #if no species have it
					meanProtLenPerClade[clade] = 0
					sdProtLenOutsideClade[clade] = 0 # Added by bh4 - 17-Jan-2016
			assert(set(meanProtLenPerClade.keys()) == set(descendantClades) )

			#actually calculate z-scores
			zscoreResults = {} #key -> clade, val -> zscore
			maxZscore = 0 #will only count positive Zscores
			maxZclade = "NA"
			for clade in sorted(speciesPerClade): #just so I loop over all clades
				if len(descendantClades) < 2: #cannot calculate clade Z-score if only one clade
					zscoreResults[clade] = "NA"
				else:
					if clade in meanProtLenPerClade:
						# use the standard deviation of protein lengths in species outside this clade. Added Avril Coghlan 14-Jan-2016.
						stdProteinLengthsSpeciesZeroesOutsideClade = sdProtLenOutsideClade[clade] # Added Avril Coghlan 14-Jan-2016.
						# if stdnGenesPerSpeciesZeroes == 0: Commented out Avril Coghlan 14-Jan-2016
						if stdProteinLengthsSpeciesZeroesOutsideClade == 0: # Added Avril Coghlan 14-Jan-2016
							zscoreResults[clade] = "NA_std_is_zero"
						else:
							# use the standard deviation of protein lengths in species outside this clade. Added Avril Coghlan 14-Jan-2016.
							zscore = (meanProtLenPerClade[clade] - meanProteinLengthsSpeciesZeroes) / stdProteinLengthsSpeciesZeroesOutsideClade #       Added Avril Coghlan 14-Jan-2016
							# zscore = (meanProtLenPerClade[clade] - meanProteinLengthsSpeciesZeroes) / stdProteinLengthsSpeciesZeroes. Commented      out Avril Coghlan 14-Jan-2016
							#validated using http://ncalculators.com/statistics/z-score-calculator.htm
							zscoreResults[clade] = zscore
							if zscore > maxZscore:
								maxZscore = zscore
								maxZclade = clade
					else:
						zscoreResults[clade] = "NA"

			maxZscore = "%.2f" % maxZscore

			#Z-score calculation validation:
			# ##Mono-clade family
			# family 1212416 : NECAME_04199 (necator_americanus) NBR_0000394301 (nippostrongylus_brasiliensis) NBR_0001482701 (nippostrongylus_brasiliensis) NBR_0002215501 (nippostrongylus_brasiliensis)
			# 1212416 0.5     0.5     NA      NA      NA      NA      NA      NA      NA      0.0
			# #CORRECT

			# ##Two-neighbour-clade family
			# family 851973 : csin104847 (clonorchis_sinensis) csin112867 (clonorchis_sinensis) D915_09852 (fasciola_hepatica) D915_09853 (fasciola_hepatica) SSLN_0001350701 (schistocephalus_solidus) Sjp_0074790 (schistosoma_japonicum) Sjp_0074800 (schistosoma_japonicum) Smp_166880 (schistosoma_mansoni)
			# Em	Ts	Hm	Mc	Ss	Cs	Fh	Sj	Sm
			# 0	0	0	0	1	2	2	2	1
			# avg	0.888888889		zscore tapeworm	-0.787400787
			# std	0.874889764		zscore fluke	0.984250984
			# tapeworm avg	0.2
			# fluke avg	1.75
			# #VALUES VALIDATED, MANUAL CALCULATION!

			# ##Root with 6 clades, but genes in only 3 clades.
			# descendants: {'IV', 'V', 'Tapeworm', 'III', 'I', 'Flukes'} data keys: ['I', 'IV', 'Flukes']
			# family 919890 : csin102733 (clonorchis_sinensis) D915_07255 (fasciola_hepatica) MhA1_Contig762.frz3.gene6 (meloidogyne_hapla) nRc.2.0.1.g27258 (romanomermis_culicivorax) Sjp_0091230 (schistosoma_japonicum) Smp_090010 (schistosoma_mansoni)
			# #VALUES VALIDADED, MANUAL CALCULATION!


			##3.11.2) Calculate clade gene enrichement (Avril's/Bhavana's measure)
			# mean prot length per species in a clade/mean prot length per species in all other species
			#Note that I use root to see which clades could be present, and I count zeroes on clades and species. When I say "all other species" it consists of the species that could have a gene based on root.

			# enrichResults = {} #key -> clade, val -> zscore
			# maxEnrich = 0 #will only count positive Zscores
			# maxEnrichClade = "NA"
			# for clade in sorted(speciesPerClade): #just so I loop over all clades
			# 	if len(descendantClades) < 2: #cannot calculate if only one clade
			# 		enrichResults[clade] = "NA"
			# 	else:
			# 		if clade in meanProtLenPerClade: #clades that should have genes (but may not have)
			# 			otherSpecies = [] #list of number of genes for the species not in current clade, will include zeroes
			# 			for cladeDesc in descendantClades: #loop over all clades that should/could contain a gene based on root, not just the ones that do have a gene
			# 				if cladeDesc == clade: #if current clade
			# 					continue
			# 				if cladeDesc in protLenPerClade: #this means that there will be at least one species with a gene here
			# 					otherSpecies.extend(protLenPerClade[cladeDesc])
			# 					missingZeroes = speciesPerClade[cladeDesc] - len(protLenPerClade[cladeDesc]) #the protLenPerClade only contains data on species with genes, therefore need to add zeroes
			# 					for m in range(missingZeroes):
			# 						otherSpecies.append(0)
			# 				else: #add zeroes accordingly
			# 					l = [0]*speciesPerClade[cladeDesc]
			# 					otherSpecies.extend(l)

			# 			#print (family, branchName, clade, len(otherSpecies),speciesPerClade[clade],nSpeciesBelowTaxon)
			# 			assert (len(otherSpecies) + speciesPerClade[clade] == nSpeciesBelowTaxon)

			# 			enrich = meanProtLenPerClade[clade] / mean(otherSpecies) #meannGenesPerSpeciesZeroes #mean(otherSpecies)
			# 			enrichResults[clade] = enrich
			# 			if enrich > maxEnrich:
			# 				maxEnrich = enrich
			# 				maxEnrichClade = clade
			# 		else: #so that I write NA in clades which should not have genes in this family
			# 			enrichResults[clade] = "NA"

			# assert (len(enrichResults) == len(speciesPerClade))
			# maxEnrich = "%.2f" % maxEnrich

			# ##Another approach to calculate clade enrichment, allowing paraphyletic groups.
			#Note that if gene tree root is in between a paraphyletic group, the score is calculated for all members of the paraphyletic clade.

			enrichResults = {} #key -> clade, val -> zscore
			maxEnrich = 0 #max will only count positive Zscores
			maxEnrichClade = "NA"
			for clade in sorted(speciesPerClade): #loop over all clades, regardless of having gene or not, to fill all columns in output
				if len(descendantClades) < 2: #cannot calculate if only one clade
					enrichResults[clade] = "NA"
				else: #calculate enrichment
					if clade in meanProtLenPerClade: #clades that should have genes (but may not have) #note that meanProtLenPerClade already accounts for zeroes
						otherSpecies = [] #list of protein lengths for the species not in current clade, will include zeroes

						countSppInClade = 0 #just for validation #because of paraphyletic groups, the number of species in a clade may be different from number of species below gene tree root.
						for species in nodeInfoSpp[branchName]: #species under the tree root
							if cladePerSpecies[species] == clade:
								countSppInClade+=1
								continue
							else:
								if species in protLenPerSp: #protLenPerSp only contains species entry if any gene
									otherSpecies.append(protLenPerSp[species])
								else:
									otherSpecies.append(0)

						assert (len(otherSpecies) + countSppInClade == nSpeciesBelowTaxon)

						enrich = meanProtLenPerClade[clade] / mean(otherSpecies)
						enrichResults[clade] = enrich
						if enrich > maxEnrich:
							maxEnrich = enrich
							maxEnrichClade = clade
					else: #so that I write NA in clades which do not have genes in this family
						enrichResults[clade] = "NA"

			assert (len(enrichResults) == len(speciesPerClade))
			maxEnrich = "%.2f" % maxEnrich

			##Confirmed that this approach gives the same results as the previous, if using same dataset.


			#4) Writing to output file, one family at a time
			#main output file
			self.outputFile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t" % 	(family,familyFlag,nSpecies,nGenes,avgGenesPerSpecies,medianGenesPerSpecies,stdevDivMean,nParalogs,branchName,completeness,duplications,maxDuplicationNode,mostFrequentSpecies) )
			##Presence in free-living species
			for species in sorted(freeLivingSpecies):
				if species in familiesDict[family]:
					self.outputFile.write("%s\t" % len(familiesDict[family][species]))
				else:
					self.outputFile.write("0\t")
			self.outputFile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (losses,medianBranchLength,percPfam,percTMHMM,percSignalP,pfamText,pfamTextAll) )

			#measures file
			outputFileMeasures.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t" % (family,familyFlag,nSpecies,nGenes,avgProteinLenPerGene,sumProteinLengths,stdevDivMean,speciesVarCoefZeroes,speciesVarCoefZeroesLengths) )
			for clade in sorted(speciesPerClade):
				outputFileMeasures.write("%s\t" % (zscoreResults[clade]) )
			outputFileMeasures.write("%s\t%s\t" % (maxZclade,maxZscore) )
			for clade in sorted(speciesPerClade):
				outputFileMeasures.write("%s\t" % (enrichResults[clade]) )
			outputFileMeasures.write("%s\t%s\t" % (maxEnrichClade,maxEnrich) )
			outputFileMeasures.write("%s\t%s\n" % (pfamText,pfamTextAll) )


		self.outputFile.close()
		outputFileMeasures.close()

		if self.verbose: print ("Total number of families processed:\t%s" % count )


	def writeFamilies(self):
		"""(Optional function) Writes the families (like in families.txt file) after all the filterings, the way they are actually processed.
		On the original files there is no apparent gene order sorting. Here families, species and genes are all written in sorted order.

		Reads self.familiesDict (family -> species -> iterable of gene IDs) and writes to the path in self.dumpFamiliesFile.
		Progress messages are printed only when self.verbose is true.

		Intended output example
		-------------------------------------------------------------------------------
		family 1584328 : ASU_08438 (ascaris_suum) ALUE_0001329201 (ascaris_lumbricoides)
		-------------------------------------------------------------------------------
		family 1584332 : nRc.2.0.1.g04834 (romanomermis_culicivorax) nRc.2.0.1.g36069 (romanomermis_culicivorax)
		"""

		if self.verbose: print ("Writing new families.txt file..")

		familiesDict = self.familiesDict
		separator = "-------------------------------------------------------------------------------\n"

		count = 0
		#the with-block guarantees the file is flushed and closed even if a write fails half-way
		with open(self.dumpFamiliesFile,"w") as dumpFamiliesFile:
			for family in sorted(familiesDict):
				count+=1
				genesPerSpecies = familiesDict[family]

				#one " gene (species)" entry per gene, species sorted first, then genes within each species
				members = "".join(" %s (%s)" % (gene,species) for species in sorted(genesPerSpecies) for gene in sorted(genesPerSpecies[species]))

				dumpFamiliesFile.write("%sfamily %s :%s\n" % (separator,family,members))

		#the .name attribute of the file object is still readable after the file is closed
		if self.verbose: print ("Wrote %s families at %s" % (count,dumpFamiliesFile.name))


	def cleanUp(self):
		"""Removes the temporary file tmp_familiesPerTaxonomyLevel_<jobID>.txt from the current working directory.

		The file name is built from self.jobID (a string).
		NOTE(review): the file is presumably written by writeFamiliesPerTaxonomyLevel(), which run() only calls when outgroups are removed - confirm.
		Removal is best-effort: a file that does not exist is silently ignored, any other OS error only raises a warning.
		"""
		tmpFile = "tmp_familiesPerTaxonomyLevel_"+self.jobID+".txt"
		try:
			#os.remove instead of a shell "rm": no shell is spawned, so jobID is never interpreted as shell syntax
			os.remove(tmpFile)
		except FileNotFoundError:
			pass #nothing to clean up
		except OSError as err:
			warn("Could not remove temporary file %s: %s" % (tmpFile,err))


	def dumpAll(self):
		"""Writes the parsed data structures as pickle files into self.dumpsFolder.

		One file per attribute, named after the attribute (e.g. <dumpsFolder>/familiesDict).
		Attributes are dumped in the order listed below; a missing attribute raises AttributeError before its file is created.
		A progress message is printed only when self.verbose is true.
		"""

		if self.verbose: print ("Writing pickle dump files..")

		attributesToDump = (
			"familiesDict",
			"familiesTaxon",
			"numTaxonBelow",
			"familiesGeneDuplication",
			"familiesGeneDuplicationMaxNode",
			"familiesGeneLosses",
			"tagSpeciesName",
			"geneIDMapping",
			"interproscanData",
			"branchLengths",
			"cladePerSpecies",
			"speciesPerClade",
			"speciesTree",
			"removeOutgroups",
			"speciesToRemove",
			"familiesToFlag",
			"listOfSpecies",
			"geneIDMappingAllEntries",
			"geneFilterList",
			"nodeInfo",
			"nodeInfoSpp",
			"proteinLengths",
			"familiesDictLengths",
		)

		for attribute in attributesToDump:
			data = getattr(self,attribute) #looked up before opening, so a missing attribute leaves no empty file behind
			#the with-block closes (and therefore flushes) every dump file explicitly instead of relying on garbage collection
			with open(self.dumpsFolder+"/"+attribute,"wb") as dumpFile:
				pickle.dump(data,dumpFile)


        # for key in vars(self):
        #     pickle.dump(eval("self."+key),open("dumps/"+str(key),"wb") )


	def loadAll(self,dumpPath):
		"""Loads pickle files from the dumpPath folder and sets them as attributes of this object.

		Only the listed attributes are loaded; each one is read from <dumpPath>/<attribute name>.
		A progress message is printed only when self.verbose is true.

		SECURITY NOTE: pickle.load can execute arbitrary code while unpickling. Only point dumpPath at trusted dump files.
		"""

		if self.verbose: print ("Loading pickle dump files..")

		#loading only the required data
		attributesToLoad = (
			"familiesDict",
			"familiesTaxon",
			"numTaxonBelow",
			"familiesGeneDuplication",
			"familiesGeneDuplicationMaxNode",
			"familiesGeneLosses",
			"interproscanData", #can comment this to run faster, uncomment when running for real
			"branchLengths",
			"cladePerSpecies",
			"speciesPerClade",
			"familiesToFlag",
			"listOfSpecies",
			"nodeInfo",
			"nodeInfoSpp",
			"familiesDictLengths",
		)

		for attribute in attributesToLoad:
			#the with-block closes every dump file as soon as it has been read
			with open(dumpPath+"/"+attribute,"rb") as dumpFile:
				setattr(self,attribute,pickle.load(dumpFile))


	def run(self):
		"""Executes the whole pipeline: reads all inputs, optionally dumps the parsed data, computes the per-family stats, optionally writes the filtered families and finally cleans up.

		Optional steps are skipped when their controlling attribute is the empty string.
		The order of the calls matters; the known constraints are noted next to each call.
		"""

		#gene ID mapping goes first, readGeneFilterList() depends on it
		self.readGeneIDMappingFolder()

		#optional filters, both needed by readFamilies()
		if self.removeOutgroupsFile != "":
			self.readOutgroupList()
		if self.geneFilterListFile != "":
			self.readGeneFilterList()

		#protein lengths are also needed by readFamilies()
		self.readProteinLengthFiles()
		self.readFamilies()
		self.readSpeciesTree()

		#only possible once readFamilies() has run
		if self.removeOutgroupsFile != "":
			self.writeFamiliesPerTaxonomyLevel()

		#can be anywhere as long as it is before statsPerFamily()
		if self.flagFamilyListFile != "":
			self.readFlagFamilyList()

		#note that this reads a file created in this script
		self.readFamiliesPerTaxonomyLevel()
		self.readGeneDuplicationsFolder()
		self.readGeneLossesFolder()
		self.readSpeciesNameTag()
		self.readInterproGFFFolder()
		self.readBranchLengths()
		self.readCladeFile()
		#node info requires readCladeFile() to have run
		self.readNodeInfoFile()

		if self.dumpsFolder != "":
			self.dumpAll()

		self.statsPerFamily()

		if self.dumpFamiliesFile != "":
			self.writeFamilies()

		self.cleanUp()


	def runLoad(self,dumpPath):
		"""Development shortcut: restores the parsed data from the pickle dumps in dumpPath and goes straight to the stats.

		Handy when only statsPerFamily() is being changed. The families file is written afterwards only if self.dumpFamiliesFile is not the empty string.
		"""

		if self.verbose:
			print ("Loading dumps..")

		#restore the objects that the stats step works on
		self.loadAll(dumpPath)
		self.statsPerFamily()

		if self.dumpFamiliesFile != "":
			self.writeFamilies()


def main():
	"""Command-line entry point: parse the arguments, build a Run object and execute it.

	With --loadDumps the previously dumped objects are loaded and only the statistics are run
	(Run.runLoad). Otherwise the whole pipeline is run (Run.run); in that case the dumps folder
	is created if needed and the parsed arguments are recorded in <dumpsFolder>/parameters.txt.
	An empty --dumpsFolder disables dumping, so no folder or parameters file is created.
	"""

	import argparse

	parser = argparse.ArgumentParser(description='Analysis suite for exploring Compara families.')

	#positional args
	parser.add_argument('comparaFamilies', metavar='comparaFamilies', type=str, help='Flat file. Compara families file. One family per line, list of genes/species. Output from get_all_50HG_families_v75.pl.')
	parser.add_argument('familiesPerTaxonomyLevel', metavar='familiesPerTaxonomyLevel', type=str, help='Flat file. One taxonomy level per line, list of families. Output from get_families_per_taxonomy_level_v75.pl.')
	parser.add_argument('speciesTree', metavar='speciesTree', type=str, help='Species tree in newick format. Each node should have an unique name, this expects species/node+id (e.g. dorylaimia40001065, romanomermis_culicivorax40001064). Output from get_species_tree_v75.pl.')
	parser.add_argument('geneDuplicationsFolder', metavar='geneDuplicationsFolder', type=str, help='Folder with a file for each taxon/node. File names should be e.g. ciona_intestinalis40001180.txt. Output from get_gene_duplications_per_taxonomy_level_v75.pl.')
	parser.add_argument('geneLossesFolder', metavar='geneLossesFolder', type=str, help='Folder with a file for each taxon/node. File names should be e.g. ciona_intestinalis40001180.txt. Output from get_gene_losses_per_taxonomy_level_v75.pl.')
	parser.add_argument('speciesNameTag', metavar='speciesNameTag', type=str, help='TSV file. Locus tag - Species name correspondence. PREFIX  Genus   species VERSION.')
	parser.add_argument('geneIDMappingFolder', metavar='geneIDMappingFolder', type=str, help='Folder with a *_id_mapping.txt TSV file for each species, with Gene IDs from different sources. SPECIES_NAME    GENEMEMBER_ID   SEQMEMBER_ID    TRANSCRIPT_ID   PROTEIN_ID      IPRSCAN_GFF_ID.')
	parser.add_argument('interproscanGFFFolder', metavar='interproscanGFFFolder', type=str, help='Folder with a *ipr* GFF file for each species. Files can be gzipped. File names should be e.g. ciona_intestinalis40001180.txt. Output from get_gene_losses_per_taxonomy_level_v75.pl.')
	#NOTE: the misspelt name 'branchLenghts' is kept as it is the attribute read from args below
	parser.add_argument('branchLenghts', metavar='branchLenghts', type=str, help='TSV file. One line per Compara family, ID in first column, average branch length of tree in second column, median in third column. Output from get_compara_branch_length.py.')
	parser.add_argument('speciesClade', metavar='speciesClade', type=str, help='TSV file. One line per species, first column species name as in comparaFamilies, second column the clade name/number, third column either Parasitic or Freeliving. E.g. loa_loa\tIII\tParasitic')
	parser.add_argument('nodeInfo', metavar='nodeInfo', type=str, help='File with leaves per each node in the tree. Works even if whole species tree was used (no need to prune). Produced by ~alc/Documents/git/Python/parse_tree_with_ETE.py')
	parser.add_argument('proteinLengthsFolder', metavar='proteinLengthsFolder', type=str, help='Path to folder with a file for each species. Files need to have same names as species names and contain a list of all proteins and their respective length, in amino acids. Used also to recreate singletons lists. From get_protein_lengths_all_species_v75.pl.')

	#optional arguments
	parser.add_argument('--outputFile', metavar='outputFile', default = os.getcwd()+"/comparaFamiliesParser.out", type=str, help='File where important output will be written (but you should also store stdout and stderr). If none given it will created file called comparaFamiliesParser.out in cwd.')
	parser.add_argument('--verbose', metavar='verbose', default = 1, type=int, help='Verbose mode (1 = Yes, 0 = No). Default = 1.')
	parser.add_argument('--removeOutgroups', metavar='removeOutgroups', default = "", type=str, help='Whether to remove outgropup species from results. Give file with list of species/tree nodes to remove/ignore from analysis, one per line. First column is nodeID (e.g. 400010010), second column is the node/species name, third column whether this is "species" or "node". Remember this must include also internal nodes. Default = "" (off)')
	parser.add_argument('--dumpFamilies', metavar='dumpFamilies', default = "", type=str, help='Option to create output file with the post-filtered Compara families used here. Introduce wanted file name after argument. Default = No')
	parser.add_argument('--flagFamilyList', metavar='flagFamilyList', default = "", type=str, help='Option to flag with "exclude" specific families provided in a flat file, with one family ID per line. Default = No')
	parser.add_argument('--geneFilterList', metavar='geneFilterList', default = "", type=str, help='Option to exclude the listed genes from all output measures. Use mRNA IDs (SEQMEMBER_ID), which may be different from families file. Default = No')
	parser.add_argument('--loadDumps', metavar='loadDumps', default = "", type=str, help='Instead of running the script from scratch, load already dumped python objects. Provide dumps folder path. Default = No')
	parser.add_argument('--dumpsFolder', metavar='dumpsFolder', default = "dumps", type=str, help='Give folder path to create object dumps. Default = create dumps in folder called "dumps".')

	args = parser.parse_args() #gets the arguments
	print (args) #echo the effective parameters to stdout

	#Run takes its arguments positionally, in the order of its __init__ signature
	run = Run(args.comparaFamilies,args.familiesPerTaxonomyLevel,args.geneDuplicationsFolder,args.geneLossesFolder,args.speciesNameTag,args.geneIDMappingFolder,\
		args.interproscanGFFFolder,args.branchLenghts,args.removeOutgroups,args.speciesTree,args.verbose,args.outputFile,args.dumpFamilies,args.flagFamilyList,\
		args.speciesClade,args.geneFilterList,args.dumpsFolder,args.nodeInfo,args.proteinLengthsFolder)

	if args.loadDumps != "":
		#to speed up processing, just load data and run stats
		run.runLoad(args.loadDumps)
	else:
		#Run.run() only dumps when dumpsFolder is not empty, so an empty value means "no dumps":
		#skip the folder and the parameter file instead of failing on os.mkdir("")
		if args.dumpsFolder != "":
			#makedirs copes with nested paths and with a folder that already exists
			os.makedirs(args.dumpsFolder, exist_ok=True)
			with open(os.path.join(args.dumpsFolder,"parameters.txt"),"w") as parametersFile:
				parametersFile.write(str(args) )
		run.run()

	#NOTE(review): clock comes from the file-level "from time import clock", which was removed in
	#Python 3.8; the import and time0 need switching to time.perf_counter to run on newer interpreters
	print ("Finished in %.1f seconds." % (clock()-time0))


#only start the analysis when this file is executed as a script, not when it is imported
if __name__ == "__main__":
	main()