Skip to content

Instantly share code, notes, and snippets.

@mtmorgan
Last active April 12, 2020 16:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mtmorgan/1900a67f4a17f075fa97df00a46e7991 to your computer and use it in GitHub Desktop.
Save mtmorgan/1900a67f4a17f075fa97df00a46e7991 to your computer and use it in GitHub Desktop.
construct distance between GO nodes from OBO file
obo = readLines("go.obo")
##
## Clean data
##
## blank lines separate different 'groups' in the obo
group <- cumsum(!nzchar(obo))
## keep groups that have the [Term] label
is_term = group[grepl("^\\[Term\\]$", obo)]
obo = obo[group %in% is_term]
group = group[group %in% is_term]
## what have we got?
length(unique(group)) # 47426 GO terms
unique(sub(":.*", "", obo)) # what keys are there?
## only keep 'id' and 'is_a' lines
id = obo[startsWith(obo, "id:")]
isa = obo[startsWith(obo, "is_a:")]
isa_group = group[startsWith(obo, "is_a:")]
## keep only GO ids
id <- sub(".*(GO:[[:digit:]]+).*", "\\1", id)
isa <- sub(".*(GO:[[:digit:]]+).*", "\\1", isa)
## create the map...
map0 = split(isa, factor(id[isa_group], levels = id))
length(map0) # 47426, same as number of GO terms
table(lengths(map0)) # id's have between 0 and 10 'is_a' relations
## ...and a data.frame representing an 'unlisted' version of the map
map = data.frame(
id = rep(names(map0), lengths(map0)),
is_a = unlist(map0, use.names = FALSE)
)
##
## Calculate the distance between each id and any other id reachable
## via 'is_a' relations
##
## distance from each term to itself...
step = 0L
distance = data.frame(term1 = id, term2 = id, step)
repeat {
## update
step = step + 1L
## each 'is_a' id has a number 'n' of is_a relations...
n = lengths(map0)[map$is_a]
## create a new data.frame representing the relationship between
## the original id and next level of is_a relationship
map = data.frame(
id = rep(map$id, n),
is_a = unlist(map0[map$is_a], use.names = FALSE)
)
## stop when there are no more is_a relationships to pursue
if (nrow(map) == 0L) break
message(nrow(map))
## update the table of distances
term1 = map$id
term2 = map$is_a
distance = rbind(distance, data.frame(term1, term2, step))
}
nrow(distance)
## keep the first (shortest) distance?
dup <- duplicated(distance[,1:2])
shortest <- distance[!dup,]
nrow(shortest)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment