Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/0d6573b1717688471bca9a3868905059 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/0d6573b1717688471bca9a3868905059 to your computer and use it in GitHub Desktop.
Retrieve, and parse, all the gene trees from WormBase ParaSite for a list of Schistosoma mansoni genes
import os
import sys
import requests # this is used to access json files
from ete3 import Phyloxml
import datetime
# Note: this script must be run in Python2 because ete3 uses Python2
#====================================================================#
# use the wormbase parasite REST API to retrieve the gene tree containing a particular gene:
def retrieve_tree_from_wormbase_parasite(gene):
server = "https://parasite.wormbase.org"
ext = "/rest-13/genetree/member/id/%s?aligned=1" % gene # aligned=1 means that we should get the alignment for each gene (rather than just its sequence)
r = requests.get(server+ext, headers={ "Content-Type" : "text/x-phyloxml+xml", "Accept" : ""})
if not r.ok: # if a particular S. mansoni gene is not in any tree, we would get an error message back from the REST API:
return("no_tree")
# print(r.text) # this will be the XML version of the tree returned
return r.text
#====================================================================#
# parse a PhyloXML tree to get info. on the genes in it and their sequences:
def parse_phylo_xml(phylo_xml_file):
mylist = list()
project = Phyloxml()
project.build_from_file(phylo_xml_file)
trees = project.get_phylogeny()
tree_cnt = 0
for tree in trees:
tree_cnt += 1
# get the topology id. for the tree:
topology_id = tree.get_topology_id() # note: I couldn't figure out how to parse the Compara gene tree id. from the XML using the ete3 phyloxml parser
for node in tree:
# get the name of the gene:
gene_name = node.name
# get the species of the gene:
clade = node.phyloxml_clade
species_name = clade.taxonomy[0].get_common_name()
species_name = species_name[0]
# get the molecular sequence for the gene:
seqs = clade.get_sequence()
mol_seq = seqs[0].get_mol_seq()
mol_seq_val = mol_seq.get_valueOf_()
# make a tuple containing the topology_id, gene name, species, and sequence:
mytuple = (gene_name, species_name, mol_seq_val)
mylist.append(mytuple)
# we should just have found one tree:
assert(tree_cnt == 1)
return(topology_id, mylist)
#====================================================================#
# read in the input list of S. mansoni genes of interest:
def read_input_list_of_genes(input_genelist_file, output_file):
cnt = 0
topologies = set()
# open the output file:
with open(output_file, 'w') as f:
# read in the list of S. mansoni genes:
inputfileObj = open(input_genelist_file, "r")
for line in inputfileObj:
line = line.rstrip()
temp = line.split()
# 1 Smp_078570
gene = temp[1] # e.g. Smp_078570
cnt += 1
# get the phylogenetic tree for the gene tree containing this gene:
print(cnt,"Finding the gene tree for gene",gene)
phylo_xml = retrieve_tree_from_wormbase_parasite(gene)
if phylo_xml != 'no_tree': # if this S. mansoni gene is not in a gene tree, we get back a 'no_tree' response
# write the phyloxml text to a temporary file:
phylo_xml_file = write_phylo_xml_file(phylo_xml)
# now parse the PhyloXML tree to get information on the genes in it and their sequences:
(topology_id, mylist) = parse_phylo_xml(phylo_xml_file) # 'mylist' is a list of tuples, each tuple containing the gene name, species and sequence
# if we have not seen this gene tree already, write the tuples to the output file:
if topology_id not in topologies:
topologies.add(topology_id) # add this gene tree to the list of gene trees we have seen
for mytuple in mylist:
(gene_name, species_name, mol_seq_val) = mytuple
output_line = "%s\t%s\t%s\t%s\n" % (topology_id, gene_name, species_name, mol_seq_val)
f.write(output_line)
# delete the temporary file:
os.unlink(phylo_xml_file)
else:
print("Gene is not in any tree:",gene)
inputfileObj.close()
#====================================================================#
# define a function to make a temporary file name:
def make_temporary_filename(basename):
# code taken from http://stackoverflow.com/questions/10501247/best-way-to-generate-random-file-names-in-python:
suffix = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
filename = "_".join([basename, suffix]) # e.g. 'mylogfile_120508_171442'
assert(len(filename) > 0)
return filename
#====================================================================#
# write the phyloxml text to a temporary file:
def write_phylo_xml_file(phylo_xml):
# get a name for the temporary file:
phylo_xml_file = make_temporary_filename("phyloxmltemp")
phyloxmlfileObj = open(phylo_xml_file, "w")
phyloxmlfileObj.write(phylo_xml)
phyloxmlfileObj.close()
return phylo_xml_file
#====================================================================#
def main():
# check the command-line arguments:
if len(sys.argv) != 3 or os.path.exists(sys.argv[1]) == False:
print("Usage: %s input_genelist_file output_file" % sys.argv[0])
print("Note: this script must be run in Python2 because ete3 uses Python2!")
sys.exit(1)
input_genelist_file = sys.argv[1] # input file with a list of S. mansoni genes of interest
output_file = sys.argv[2]
# read in the input list of S. mansoni genes of interest:
print("Reading in gene list...")
read_input_list_of_genes(input_genelist_file, output_file)
print("FINISHED\n")
#====================================================================#
if __name__=="__main__":
main()
#====================================================================#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment