Skip to content

Instantly share code, notes, and snippets.

@dwinston
Created April 15, 2022 18:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dwinston/fa74bb9dccb3a11fa5e7e8b6a4f57089 to your computer and use it in GitHub Desktop.
Save dwinston/fa74bb9dccb3a11fa5e7e8b6a4f57089 to your computer and use it in GitHub Desktop.
build a subsumption map scoped to ENVO terms in use by NMDC biosamples
"""
Build a subsumption map scoped to ENVO terms in use by NMDC biosamples
"""
from collections import defaultdict
import json
from rdflib import Graph
from rdflib.namespace import Namespace
from tqdm import tqdm
from nmdc_runtime.site.repository import run_config_frozen__normal_env
from nmdc_runtime.site.resources import get_mongo
# Fetch envo.owl over the network and parse it into an in-memory RDF graph.
g = Graph()
print("fetching and loading envo.owl...")
g.parse("http://purl.obolibrary.org/obo/envo.owl", format="xml")
print("done loading envo.owl")

# Dotted paths into biosample documents whose distinct values are gathered
# below (presumably ENVO term ids such as "ENVO:00000446" -- verify against
# the data).
nmdc_envo_fields = {
    "env_broad_scale.has_raw_value",
    "env_local_scale.has_raw_value",
    "env_medium.has_raw_value",
}

mongo = get_mongo(run_config_frozen__normal_env)
mdb = mongo.db

# Union of the distinct values found under each of the fields above.
nmdc_envo_terms = set()
for field in nmdc_envo_fields:
    nmdc_envo_terms.update(mdb.biosample_set.distinct(field))
print(len(nmdc_envo_terms))  # 36

# Namespace bound to the "obo" prefix when querying the graph.
OBO = Namespace("http://purl.obolibrary.org/obo/")
def supers_for(term):
    """Return the transitive superclasses of ``term`` found in graph ``g``.

    ``term`` is inserted verbatim as the subject of the SPARQL query, so it
    must already be written in valid SPARQL term syntax (this file passes
    ``obo:``-prefixed names; the ``obo`` prefix is supplied via ``initNs``).

    Returns a set of strings, one per distinct ``owl:Class`` reachable through
    one or more ``rdfs:subClassOf`` steps, each rendered with ``n3()`` using
    the graph's namespace manager.
    """
    # The query text is kept flush-left so the string sent to the engine is
    # unchanged.
    sparql = f"""
SELECT DISTINCT ?o
WHERE {{
{term} rdfs:subClassOf+ ?o .
?o a owl:Class .
}}
"""
    rows = g.query(sparql, initNs={"obo": OBO})
    superclasses = set()
    for row in rows:
        superclasses.add(row.o.n3(g.namespace_manager))
    return superclasses
def nmdc_to_obo(t):
    """Convert a colon-separated id into an ``obo:``-prefixed name.

    Every ``:`` in ``t`` becomes ``_`` and the result is prefixed with
    ``obo:``, e.g. ``"ENVO:00000446"`` -> ``"obo:ENVO_00000446"``.
    """
    local_name = t.replace(":", "_")
    return f"obo:{local_name}"
def obo_to_nmdc(t):
    """Convert an ``obo:``-prefixed name back into a colon-separated id.

    The first four characters are dropped unconditionally (the length of
    ``"obo:"``; no check is made that they actually are that prefix) and
    every ``_`` in the remainder becomes ``:``, e.g.
    ``"obo:ENVO_00000446"`` -> ``"ENVO:00000446"``.
    """
    without_prefix = t[4:]
    return without_prefix.replace("_", ":")
# For each term in use, the set of its superclasses converted back to
# colon-separated form.
term_supers = defaultdict(set)
# takes ~30 seconds in total
for term in tqdm(nmdc_envo_terms):
    term_supers[term] = {obo_to_nmdc(t) for t in supers_for(nmdc_to_obo(term))}

# Invert the mapping: superclass -> set of in-use terms it subsumes.
term_subs = defaultdict(set)
for sub, supers in term_supers.items():
    for sup in supers:
        term_subs[sup].add(sub)

# Keep only superclasses whose converted name still contains ":"; names that
# lost their colon when obo_to_nmdc stripped the first four characters are
# dropped here.
# Sets iterate in an order that can differ from run to run, so both the keys
# and each list of sub-terms are sorted to make the JSON file reproducible.
term_subs = {
    sup: sorted(subs)
    for sup, subs in sorted(term_subs.items())
    if subs and ":" in sup
}
print(len(term_subs))  # 68

with open("envo_term_subterms.json", "w", encoding="utf-8") as f:
    json.dump(term_subs, f, indent=4)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment