Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Last active January 4, 2016 20:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/8671799 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/8671799 to your computer and use it in GitHub Desktop.
Python script to calculate the number of steps from a GO term to the top of the GO hierarchy, using breadth-first search
import sys
import os
from collections import defaultdict
class Error (Exception): pass
#====================================================================#
# define a function to read in the ancestors of each GO term in the GO hierarchy:
def read_go_ancestors(go_desc_file):
# first read in the input GO file, and make a list of all the GO terms, and the
# terms above them in the GO hierarchy:
# eg.
# [Term]
# id: GO:0004835
terms = [] # list of GO terms
ancestors = defaultdict(list) # ancestors of a particular GO term, in the hierarchy
take = 0
fileObj = open(go_desc_file, "r")
for line in fileObj:
line = line.rstrip()
temp = line.split()
if len(temp) == 1:
if temp[0] == '[Term]':
take = 1
elif len(temp) >= 2 and take == 1:
if temp[0] == 'id:':
go = temp[1]
terms.append(go) # append this to our list of terms
elif temp[0] == 'is_a:': # eg. is_a: GO:0048308 ! organelle inheritance
ancestor = temp[1]
ancestors[go].append(ancestor)
elif len(temp) == 0:
take = 0
fileObj.close()
return(ancestors, terms)
#====================================================================#
# Noel's function for finding the minimum distances from a query node
# to ancestor GO nodes. It returns a dictionary result{} that has the
# ancestors and the distances to them. This function uses a breadth-first
# search of the tree.
def BFS_dist_from_node(query_node, parents):
"""Return dictionary containing minimum distances of parent GO nodes from the query"""
result = {}
queue = []
queue.append( (query_node, 0) )
while queue:
node, dist = queue.pop(0) # Take the first item off the list 'queue' (and remove it)
result[node] = dist
if node in parents: # If the node *has* parents
for parent in parents[node]:
# Get the first member of each tuple, see
# http://stackoverflow.com/questions/12142133/how-to-get-first-element-in-a-list-of-tuples
queue_members = [x[0] for x in queue]
if parent not in result and parent not in queue_members: # Don't visit a second time
queue.append( (parent, dist+1) )
return result
#====================================================================#
def main():
# check the command-line arguments:
if len(sys.argv) != 2 or os.path.exists(sys.argv[1]) == False:
print("Usage: %s go_desc_file" % sys.argv[0])
sys.exit(1)
go_desc_file = sys.argv[1]
# read in the ancestors of each GO term in the GO hierarchy:
(ancestors, terms) = read_go_ancestors(go_desc_file)
# calculate the distance from each GO term to the three terms at the top of the hierarchy
# (biologial process, molecular function, cellular component):
for go in terms:
bfs_dist_dict = BFS_dist_from_node(go, ancestors)
dist1 = bfs_dist_dict['GO:0003674'] if 'GO:0003674' in bfs_dist_dict else 1e+10 # 'molecular function'
dist2 = bfs_dist_dict['GO:0005575'] if 'GO:0005575' in bfs_dist_dict else 1e+10 # 'cellular component'
dist3 = bfs_dist_dict['GO:0008150'] if 'GO:0008150' in bfs_dist_dict else 1e+10 # 'biological process'
dist = min(dist1, dist2, dist3)
print("%s %d" % (go, dist))
#====================================================================#
if __name__=="__main__":
main()
#====================================================================#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment