ncbo
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# import needed libraries
import glob
import json
import os
import pickle
import random
import requests
import time
from tqdm import tqdm  # type: ignore
from typing import Any, Dict, Set, Tuple

API_KEY = "34343b78-0515-43ee-b199-cb17cce7f83c"


def gets_json_results_from_api_call(url: str, api_key: str) -> Dict:
    """Function makes an API request and returns the results as JSON. API documentation can be found here:
    https://data.bioontology.org/documentation. If a 500 HTTP server-side code is returned in "status_code", then the
    function pauses for 90-120 seconds before trying the request again. By default, the process sleeps for 30-60
    seconds after each API call.

    Args:
        url: A string containing a URL to be run against an API.
        api_key: A string containing an API key.

    Returns:
        A json-formatted object containing the API results.
    """

    response = requests.get(url, headers={"Authorization": "apikey token=" + api_key})
    time.sleep(random.randint(30, 60))  # ease rate limiting by sleeping for random intervals
    if response.status_code == 500:
        time.sleep(random.randint(90, 120))  # back off for longer before retrying the failed request
        response = requests.get(url, headers={"Authorization": "apikey token=" + api_key})

    return json.loads(response.text)
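

# A minimal illustrative call (sketch only; the "MESH" acronym below is a placeholder, not part of the original gist):
#
#   url = "https://data.bioontology.org/ontologies/MESH/mappings/?page=1"
#   page = gets_json_results_from_api_call(url, API_KEY)
#   print(page["pageCount"])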


def writes_data_to_file(file_out: str, results: Set[Tuple[str, Any]]) -> None:
    """Function iterates over a set of tuples and writes the data to a local text file.

    Args:
        file_out: A filepath to write data to.
        results: A set of tuples, where each tuple represents a mapping between two identifiers.

    Returns:
        None.
    """

    print("Writing results to {location}".format(location=file_out))
    with open(file_out, "w") as outfile:
        for res in results:
            outfile.write(res[0] + "\t" + res[1] + "\n")

    return None
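

# For reference, writes_data_to_file produces one tab-separated identifier pair per line, shaped roughly like the
# line below (the URIs are illustrative placeholders only):
#
#   http://purl.bioontology.org/ontology/SOURCE1/ID1<TAB>http://purl.bioontology.org/ontology/SOURCE2/ID2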


def processes_api_page_results(content: Dict, source2: str) -> Set:
    """Takes a page of API results and processes it to capture only those mappings that exist between source1
    and source2. The method returns the results as a set of tuples. Between batches the calling function sleeps for
    60-90 seconds to ease the burden on the API.

    Args:
        content: A dictionary of API page results.
        source2: A string naming an ontology that you want to map identifiers from source1 to.

    Returns:
        unique_edges: A set of tuples, where each tuple is an edge representing a mapping between source1 and source2.
    """

    unique_edges = set()
    if "collection" not in content.keys():
        raise KeyError("Something went wrong: {}".format(content["error"]))
    else:
        for result in content["collection"]:
            if source2 in result["classes"][1]["links"]["ontology"]:
                source1_class, source2_class = result["classes"][0]["@id"], result["classes"][1]["@id"]
                if ".owl" not in source1_class and ".owl" not in source2_class:
                    unique_edges.add((source1_class, source2_class))

    return unique_edges
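

# For reference, each element of content["collection"] is expected to contain a two-item "classes" list shaped
# roughly like the sketch below (the identifiers and acronyms are illustrative assumptions, not real API output):
#
#   {"classes": [{"@id": "http://purl.bioontology.org/ontology/SOURCE1/ID1",
#                 "links": {"ontology": "https://data.bioontology.org/ontologies/SOURCE1"}},
#                {"@id": "http://purl.bioontology.org/ontology/SOURCE2/ID2",
#                 "links": {"ontology": "https://data.bioontology.org/ontologies/SOURCE2"}}]}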


def extracts_mapping_data(api_key: str, source1: str, source2: str, file_out: str) -> None:
    """Function uses the BioPortal API to retrieve mappings between two sources. The function batch processes the
    result pages in chunks of 50 (or 100 when there are more than 5,000 pages), writes each batch's data to its own
    file and then, once all batches have been processed, the data can be concatenated into a single file.

    Args:
        api_key: A string containing a user BioPortal API key.
        source1: A string naming a source ontology that you want to map from.
        source2: A string naming an ontology that you want to map identifiers from source1 to.
        file_out: A filepath to write data to.

    Returns:
        None.
    """

    print("=" * 50 + "\nRetrieving - {src1} - {src2} Mappings\n".format(src1=source1, src2=source2) + "=" * 50)

    # get the available resources for mappings to source
    ont_source = "https://data.bioontology.org/ontologies/{source}/mappings/".format(source=source1)
    api_results = gets_json_results_from_api_call(ont_source, api_key)
    print("Processing {} Pages of Results".format(api_results["pageCount"]))

    # create temp progress directory to store pages
    temp_progress_storage = "/".join(file_out.split("/")[:-1]) + "/processed_data"
    os.mkdir(temp_progress_storage)

    # batch process api result pages
    total_pages = list(range(1, int(api_results["pageCount"]) + 1))
    n = 100 if len(total_pages) > 5000 else 50  # pages per batch
    batches = [total_pages[i : i + n] for i in range(0, len(total_pages), n)]
    for batch in range(0, len(batches)):
        print("\nProcessing batch {} of {}".format(batch + 1, len(batches)))
        page_results = set()
        for page in tqdm(batches[batch]):
            content = gets_json_results_from_api_call(ont_source + "?page={page}".format(page=page), api_key)
            page_results |= processes_api_page_results(content, source2)
        writes_data_to_file(file_out + "_{batch_num}".format(batch_num=batch + 1) + ".txt", page_results)
        time.sleep(random.randint(60, 90))  # ease rate limiting by sleeping for random intervals

    return None
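

# Minimal usage sketch (illustrative only): the ontology acronyms and output path below are assumptions, not values
# taken from the original gist.
if __name__ == "__main__":
    # retrieve MESH-to-CHEBI mappings and write one batch file at a time next to the chosen output prefix
    extracts_mapping_data(API_KEY, source1="MESH", source2="CHEBI", file_out="./mesh_chebi_mappings")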