Created
June 20, 2019 09:35
-
-
Save avrilcoghlan/0d6573b1717688471bca9a3868905059 to your computer and use it in GitHub Desktop.
Retrieve, and parse, all the gene trees from WormBase ParaSite for a list of Schistosoma mansoni genes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import requests # this is used to access json files | |
from ete3 import Phyloxml | |
import datetime | |
# Note: this script must be run in Python2 because ete3 uses Python2 | |
#====================================================================# | |
# use the wormbase parasite REST API to retrieve the gene tree containing a particular gene: | |
def retrieve_tree_from_wormbase_parasite(gene): | |
server = "https://parasite.wormbase.org" | |
ext = "/rest-13/genetree/member/id/%s?aligned=1" % gene # aligned=1 means that we should get the alignment for each gene (rather than just its sequence) | |
r = requests.get(server+ext, headers={ "Content-Type" : "text/x-phyloxml+xml", "Accept" : ""}) | |
if not r.ok: # if a particular S. mansoni gene is not in any tree, we would get an error message back from the REST API: | |
return("no_tree") | |
# print(r.text) # this will be the XML version of the tree returned | |
return r.text | |
#====================================================================# | |
# parse a PhyloXML tree to get info. on the genes in it and their sequences: | |
def parse_phylo_xml(phylo_xml_file): | |
mylist = list() | |
project = Phyloxml() | |
project.build_from_file(phylo_xml_file) | |
trees = project.get_phylogeny() | |
tree_cnt = 0 | |
for tree in trees: | |
tree_cnt += 1 | |
# get the topology id. for the tree: | |
topology_id = tree.get_topology_id() # note: I couldn't figure out how to parse the Compara gene tree id. from the XML using the ete3 phyloxml parser | |
for node in tree: | |
# get the name of the gene: | |
gene_name = node.name | |
# get the species of the gene: | |
clade = node.phyloxml_clade | |
species_name = clade.taxonomy[0].get_common_name() | |
species_name = species_name[0] | |
# get the molecular sequence for the gene: | |
seqs = clade.get_sequence() | |
mol_seq = seqs[0].get_mol_seq() | |
mol_seq_val = mol_seq.get_valueOf_() | |
# make a tuple containing the topology_id, gene name, species, and sequence: | |
mytuple = (gene_name, species_name, mol_seq_val) | |
mylist.append(mytuple) | |
# we should just have found one tree: | |
assert(tree_cnt == 1) | |
return(topology_id, mylist) | |
#====================================================================# | |
# read in the input list of S. mansoni genes of interest: | |
def read_input_list_of_genes(input_genelist_file, output_file): | |
cnt = 0 | |
topologies = set() | |
# open the output file: | |
with open(output_file, 'w') as f: | |
# read in the list of S. mansoni genes: | |
inputfileObj = open(input_genelist_file, "r") | |
for line in inputfileObj: | |
line = line.rstrip() | |
temp = line.split() | |
# 1 Smp_078570 | |
gene = temp[1] # e.g. Smp_078570 | |
cnt += 1 | |
# get the phylogenetic tree for the gene tree containing this gene: | |
print(cnt,"Finding the gene tree for gene",gene) | |
phylo_xml = retrieve_tree_from_wormbase_parasite(gene) | |
if phylo_xml != 'no_tree': # if this S. mansoni gene is not in a gene tree, we get back a 'no_tree' response | |
# write the phyloxml text to a temporary file: | |
phylo_xml_file = write_phylo_xml_file(phylo_xml) | |
# now parse the PhyloXML tree to get information on the genes in it and their sequences: | |
(topology_id, mylist) = parse_phylo_xml(phylo_xml_file) # 'mylist' is a list of tuples, each tuple containing the gene name, species and sequence | |
# if we have not seen this gene tree already, write the tuples to the output file: | |
if topology_id not in topologies: | |
topologies.add(topology_id) # add this gene tree to the list of gene trees we have seen | |
for mytuple in mylist: | |
(gene_name, species_name, mol_seq_val) = mytuple | |
output_line = "%s\t%s\t%s\t%s\n" % (topology_id, gene_name, species_name, mol_seq_val) | |
f.write(output_line) | |
# delete the temporary file: | |
os.unlink(phylo_xml_file) | |
else: | |
print("Gene is not in any tree:",gene) | |
inputfileObj.close() | |
#====================================================================# | |
# define a function to make a temporary file name: | |
def make_temporary_filename(basename): | |
# code taken from http://stackoverflow.com/questions/10501247/best-way-to-generate-random-file-names-in-python: | |
suffix = datetime.datetime.now().strftime("%y%m%d_%H%M%S") | |
filename = "_".join([basename, suffix]) # e.g. 'mylogfile_120508_171442' | |
assert(len(filename) > 0) | |
return filename | |
#====================================================================# | |
# write the phyloxml text to a temporary file: | |
def write_phylo_xml_file(phylo_xml): | |
# get a name for the temporary file: | |
phylo_xml_file = make_temporary_filename("phyloxmltemp") | |
phyloxmlfileObj = open(phylo_xml_file, "w") | |
phyloxmlfileObj.write(phylo_xml) | |
phyloxmlfileObj.close() | |
return phylo_xml_file | |
#====================================================================# | |
def main(): | |
# check the command-line arguments: | |
if len(sys.argv) != 3 or os.path.exists(sys.argv[1]) == False: | |
print("Usage: %s input_genelist_file output_file" % sys.argv[0]) | |
print("Note: this script must be run in Python2 because ete3 uses Python2!") | |
sys.exit(1) | |
input_genelist_file = sys.argv[1] # input file with a list of S. mansoni genes of interest | |
output_file = sys.argv[2] | |
# read in the input list of S. mansoni genes of interest: | |
print("Reading in gene list...") | |
read_input_list_of_genes(input_genelist_file, output_file) | |
print("FINISHED\n") | |
#====================================================================# | |
if __name__=="__main__": | |
main() | |
#====================================================================# |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment