Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created January 28, 2014 17:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/8671775 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/8671775 to your computer and use it in GitHub Desktop.
Python script to calculate the number of steps from a GO term to the top of the GO hierarchy, using Dijkstra's algorithm
import sys
import os
from collections import defaultdict
from scipy.sparse import lil_matrix # needed for Dijkstra's algorithm
from scipy.sparse.csgraph import dijkstra # needed for Dijkstra's algorithm
class Error (Exception): pass
#====================================================================#
# define a function to read in the ancestors of each GO term in the GO hierarchy:
def read_go_ancestors(go_desc_file):
# first read in the input GO file, and make a list of all the GO terms, and the
# terms above them in the GO hierarchy:
# eg.
# [Term]
# id: GO:0004835
terms = [] # list of GO terms
ancestors = defaultdict(list) # ancestors of a particular GO term, in the hierarchy
take = 0
fileObj = open(go_desc_file, "r")
for line in fileObj:
line = line.rstrip()
temp = line.split()
if len(temp) == 1:
if temp[0] == '[Term]':
take = 1
elif len(temp) >= 2 and take == 1:
if temp[0] == 'id:':
go = temp[1]
terms.append(go) # append this to our list of terms
elif temp[0] == 'is_a:': # eg. is_a: GO:0048308 ! organelle inheritance
ancestor = temp[1]
ancestors[go].append(ancestor)
elif len(temp) == 0:
take = 0
fileObj.close()
return(ancestors, terms)
#====================================================================#
# define a function to make the graph of GO terms, ie. create the matrix
# containing the graph structure for running Dijkstra's algorithm:
def create_go_graph_structure(ancestors, terms):
num_terms = len(terms)
# create a list containing num_terms lists initialised to 0:
# OPTION 1 (use numpy):
# matrix = numpy.zeros( (num_terms, num_terms), "d")
# OPTION 2 (use scipy.sparse, apparently lil_matrix is good for building up a matrix iteratively):
matrix = lil_matrix( (num_terms, num_terms), dtype="float64")
# for each GO term X, make an edge to each term Y above it in the GO hierarchy:
# (there may be several GO terms above a particular term X in the GO hierarchy)
for go in terms:
if go in ancestors: # if there are ancestors defined for this go term
go_index = terms.index(go)
go_ancestors = ancestors[go]
for ancestor in go_ancestors:
ancestor_index = terms.index(ancestor)
matrix[go_index, ancestor_index] = 1
# Note: not all terms have ancestors defined, eg. the three terms at the top of the hierarcy,
# and obsolete terms, eg. GO:0000005
return(matrix)
#====================================================================#
# define a function to run Dijkstra's algorithm between each GO term and the three terms
# at the top of the hierarchy (biological process, molecular function, cellular component):
def run_dijkstras_for_terms(matrix, terms):
# create a dictionary to store the distance from each term to the top of the GO hierarchy:
distance = defaultdict(lambda: 1e+10)
# find the index in list 'terms' for each of the three terms at the top of the GO
# hierarchy:
ends = []
# Note: not all the top 3 GO terms are in 'terms' for the test cases:
if 'GO:0003674' in terms:
end1_index = terms.index('GO:0003674') # molecular function
ends.append(end1_index)
if 'GO:0005575' in terms:
end2_index = terms.index('GO:0005575') # cellular component
ends.append(end2_index)
if 'GO:0008150' in terms:
end3_index = terms.index('GO:0008150') # biological process
ends.append(end3_index)
for go in terms:
term_index = terms.index(go)
# run Dijkstra's algorithm, starting at index term_index
distances, predecessors = dijkstra(matrix, indices=term_index, return_predecessors=True, directed=True)
# find the distance to each of the three terms at the top of the hierarchy
min_dist = 1e+10
for end_index in ends:
dist = distances[end_index]
if dist < min_dist:
min_dist = dist
assert(go not in distance)
distance[go] = min_dist
return distance
#====================================================================#
def main():
# check the command-line arguments:
if len(sys.argv) != 2 or os.path.exists(sys.argv[1]) == False:
print("Usage: %s go_desc_file" % sys.argv[0])
sys.exit(1)
go_desc_file = sys.argv[1]
# read in the ancestors of each GO term in the GO hierarchy:
(ancestors, terms) = read_go_ancestors(go_desc_file)
# make the graph of GO terms, ie. create the matrix containing the graph structure for
# running Dijkstra's algorithm:
go_matrix = create_go_graph_structure(ancestors, terms)
# run Dijkstra's algorithm between each GO term and the three terms at the top of the hierarchy
# (biological process, molecular function, cellular component):
distance = run_dijkstras_for_terms(go_matrix, terms)
for go, dist in distance.items():
print("%s %d" % (go, dist))
# Note: Noel pointed out that I could have calculated the distance to the top of the hierarchy
# using a simpler approach (see calc_dists_to_top_of_GO_using_bfs.py).
#====================================================================#
if __name__=="__main__":
main()
#====================================================================#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment