avrilcoghlan/retrieve_genetrees_from_wormbase_parasite.py

## retrieve_genetrees_from_wormbase_parasite.py
import os
import sys
import requests # this is used to access json files
from ete3 import Phyloxml
import datetime

# Note: this script must be run in Python2 because ete3 uses Python2

#====================================================================#

# use the wormbase parasite REST API to retrieve the gene tree containing a particular gene:

def retrieve_tree_from_wormbase_parasite(gene):

    server = "https://parasite.wormbase.org"
    ext = "/rest-13/genetree/member/id/%s?aligned=1" % gene # aligned=1 means that we should get the alignment for each gene (rather than just its sequence)

    r = requests.get(server+ext, headers={ "Content-Type" : "text/x-phyloxml+xml", "Accept" : ""})

    if not r.ok: # if a particular S. mansoni gene is not in any tree, we would get an error message back from the REST API:
        return("no_tree")

    # print(r.text) # this will be the XML version of the tree returned

    return r.text

#====================================================================#

# parse a PhyloXML tree to get info. on the genes in it and their sequences:

def parse_phylo_xml(phylo_xml_file):

    mylist = list()

    project = Phyloxml()
    project.build_from_file(phylo_xml_file)
    trees = project.get_phylogeny()
    tree_cnt = 0
    for tree in trees:
        tree_cnt += 1
        # get the topology id. for the tree:
        topology_id = tree.get_topology_id() # note: I couldn't figure out how to parse the Compara gene tree id. from the XML using the ete3 phyloxml parser
        for node in tree:
            # get the name of the gene:
            gene_name = node.name
            # get the species of the gene:
            clade = node.phyloxml_clade
            species_name = clade.taxonomy[0].get_common_name()
            species_name = species_name[0]
            # get the molecular sequence for the gene:
            seqs = clade.get_sequence()
            mol_seq = seqs[0].get_mol_seq()
            mol_seq_val = mol_seq.get_valueOf_()
            # make a tuple containing the topology_id, gene name, species, and sequence:
            mytuple = (gene_name, species_name, mol_seq_val)
            mylist.append(mytuple)
    # we should just have found one tree:
        assert(tree_cnt == 1)

    return(topology_id, mylist)

#====================================================================#

# read in the input list of S. mansoni genes of interest:

def read_input_list_of_genes(input_genelist_file, output_file):

    cnt = 0
    topologies = set()
    # open the output file:
    with open(output_file, 'w') as f:
        # read in the list of S. mansoni genes:
        inputfileObj = open(input_genelist_file, "r")
        for line in inputfileObj:
            line = line.rstrip()
            temp = line.split()
            # 1   Smp_078570
            gene = temp[1] # e.g. Smp_078570
            cnt += 1
            # get the phylogenetic tree for the gene tree containing this gene:
            print(cnt,"Finding the gene tree for gene",gene)
            phylo_xml = retrieve_tree_from_wormbase_parasite(gene)
            if phylo_xml != 'no_tree': # if this S. mansoni gene is not in a gene tree, we get back a 'no_tree' response
                # write the phyloxml text to a temporary file:
                phylo_xml_file = write_phylo_xml_file(phylo_xml)
                # now parse the PhyloXML tree to get information on the genes in it and their sequences:
                (topology_id, mylist) = parse_phylo_xml(phylo_xml_file) # 'mylist' is a list of tuples, each tuple containing the gene name, species and sequence
                # if we have not seen this gene tree already, write the tuples to the output file:
                if topology_id not in topologies:
                    topologies.add(topology_id) # add this gene tree to the list of gene trees we have seen
                    for mytuple in mylist:
                        (gene_name, species_name, mol_seq_val) = mytuple
                        output_line = "%s\t%s\t%s\t%s\n" % (topology_id, gene_name, species_name, mol_seq_val)
                        f.write(output_line)
                # delete the temporary file:
                os.unlink(phylo_xml_file)
            else:
                print("Gene is not in any tree:",gene)
        inputfileObj.close()

#====================================================================#

# define a function to make a temporary file name:

def make_temporary_filename(basename):

    # code taken from http://stackoverflow.com/questions/10501247/best-way-to-generate-random-file-names-in-python:
    suffix = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
    filename = "_".join([basename, suffix]) # e.g. 'mylogfile_120508_171442'
    assert(len(filename) > 0)

    return filename

#====================================================================#

# write the phyloxml text to a temporary file:

def write_phylo_xml_file(phylo_xml):

    # get a name for the temporary file:
    phylo_xml_file = make_temporary_filename("phyloxmltemp")

    phyloxmlfileObj = open(phylo_xml_file, "w")
    phyloxmlfileObj.write(phylo_xml)
    phyloxmlfileObj.close()

    return phylo_xml_file

#====================================================================#

def main():

    # check the command-line arguments:
    if len(sys.argv) != 3 or os.path.exists(sys.argv[1]) == False:
        print("Usage: %s input_genelist_file output_file" % sys.argv[0])
        print("Note: this script must be run in Python2 because ete3 uses Python2!")
        sys.exit(1)
    input_genelist_file = sys.argv[1] # input file with a list of S. mansoni genes of interest
    output_file = sys.argv[2]

    # read in the input list of S. mansoni genes of interest:
    print("Reading in gene list...")
    read_input_list_of_genes(input_genelist_file, output_file)

    print("FINISHED\n")

#====================================================================#

if __name__=="__main__":
    main()

#====================================================================#
	import os
	import sys
	import requests # this is used to access json files
	from ete3 import Phyloxml
	import datetime

	# Note: this script must be run in Python2 because ete3 uses Python2

	#====================================================================#

	# use the wormbase parasite REST API to retrieve the gene tree containing a particular gene:

	def retrieve_tree_from_wormbase_parasite(gene):

	server = "https://parasite.wormbase.org"
	ext = "/rest-13/genetree/member/id/%s?aligned=1" % gene # aligned=1 means that we should get the alignment for each gene (rather than just its sequence)

	r = requests.get(server+ext, headers={ "Content-Type" : "text/x-phyloxml+xml", "Accept" : ""})

	if not r.ok: # if a particular S. mansoni gene is not in any tree, we would get an error message back from the REST API:
	return("no_tree")

	# print(r.text) # this will be the XML version of the tree returned

	return r.text

	#====================================================================#

	# parse a PhyloXML tree to get info. on the genes in it and their sequences:

	def parse_phylo_xml(phylo_xml_file):

	mylist = list()

	project = Phyloxml()
	project.build_from_file(phylo_xml_file)
	trees = project.get_phylogeny()
	tree_cnt = 0
	for tree in trees:
	tree_cnt += 1
	# get the topology id. for the tree:
	topology_id = tree.get_topology_id() # note: I couldn't figure out how to parse the Compara gene tree id. from the XML using the ete3 phyloxml parser
	for node in tree:
	# get the name of the gene:
	gene_name = node.name
	# get the species of the gene:
	clade = node.phyloxml_clade
	species_name = clade.taxonomy[0].get_common_name()
	species_name = species_name[0]
	# get the molecular sequence for the gene:
	seqs = clade.get_sequence()
	mol_seq = seqs[0].get_mol_seq()
	mol_seq_val = mol_seq.get_valueOf_()
	# make a tuple containing the topology_id, gene name, species, and sequence:
	mytuple = (gene_name, species_name, mol_seq_val)
	mylist.append(mytuple)
	# we should just have found one tree:
	assert(tree_cnt == 1)

	return(topology_id, mylist)

	#====================================================================#

	# read in the input list of S. mansoni genes of interest:

	def read_input_list_of_genes(input_genelist_file, output_file):

	cnt = 0
	topologies = set()
	# open the output file:
	with open(output_file, 'w') as f:
	# read in the list of S. mansoni genes:
	inputfileObj = open(input_genelist_file, "r")
	for line in inputfileObj:
	line = line.rstrip()
	temp = line.split()
	# 1 Smp_078570
	gene = temp[1] # e.g. Smp_078570
	cnt += 1
	# get the phylogenetic tree for the gene tree containing this gene:
	print(cnt,"Finding the gene tree for gene",gene)
	phylo_xml = retrieve_tree_from_wormbase_parasite(gene)
	if phylo_xml != 'no_tree': # if this S. mansoni gene is not in a gene tree, we get back a 'no_tree' response
	# write the phyloxml text to a temporary file:
	phylo_xml_file = write_phylo_xml_file(phylo_xml)
	# now parse the PhyloXML tree to get information on the genes in it and their sequences:
	(topology_id, mylist) = parse_phylo_xml(phylo_xml_file) # 'mylist' is a list of tuples, each tuple containing the gene name, species and sequence
	# if we have not seen this gene tree already, write the tuples to the output file:
	if topology_id not in topologies:
	topologies.add(topology_id) # add this gene tree to the list of gene trees we have seen
	for mytuple in mylist:
	(gene_name, species_name, mol_seq_val) = mytuple
	output_line = "%s\t%s\t%s\t%s\n" % (topology_id, gene_name, species_name, mol_seq_val)
	f.write(output_line)
	# delete the temporary file:
	os.unlink(phylo_xml_file)
	else:
	print("Gene is not in any tree:",gene)
	inputfileObj.close()

	#====================================================================#

	# define a function to make a temporary file name:

	def make_temporary_filename(basename):

	# code taken from http://stackoverflow.com/questions/10501247/best-way-to-generate-random-file-names-in-python:
	suffix = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
	filename = "_".join([basename, suffix]) # e.g. 'mylogfile_120508_171442'
	assert(len(filename) > 0)

	return filename

	#====================================================================#

	# write the phyloxml text to a temporary file:

	def write_phylo_xml_file(phylo_xml):

	# get a name for the temporary file:
	phylo_xml_file = make_temporary_filename("phyloxmltemp")

	phyloxmlfileObj = open(phylo_xml_file, "w")
	phyloxmlfileObj.write(phylo_xml)
	phyloxmlfileObj.close()

	return phylo_xml_file

	#====================================================================#

	def main():

	# check the command-line arguments:
	if len(sys.argv) != 3 or os.path.exists(sys.argv[1]) == False:
	print("Usage: %s input_genelist_file output_file" % sys.argv[0])
	print("Note: this script must be run in Python2 because ete3 uses Python2!")
	sys.exit(1)
	input_genelist_file = sys.argv[1] # input file with a list of S. mansoni genes of interest
	output_file = sys.argv[2]

	# read in the input list of S. mansoni genes of interest:
	print("Reading in gene list...")
	read_input_list_of_genes(input_genelist_file, output_file)

	print("FINISHED\n")

	#====================================================================#

	if __name__=="__main__":
	main()

	#====================================================================#