Last active
June 9, 2020 11:42
-
-
Save d0choa/67489a44636d557a5f53a3fbfe208d28 to your computer and use it in GitHub Desktop.
EFO-OTAR mappings prototype
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import rdflib | |
import pandas as pd | |
## Location of the EFO OTAR-slim ontology; may be a local path or a remote URL
owlpath = "https://github.com/EBISPOT/efo/releases/download/v3.18.0/efo_otar_slim.owl"

## OTAR project -> EFO mapping sheet, downloaded as CSV from
## https://docs.google.com/spreadsheets/d/1CV_shXJy1ACM09HZBB_-3Nl6l_dfkrA26elMAF0ttHs/edit
mappingFile = "/Users/ochoa/Downloads/OTAR project EFO mappings for disease profile pages - Sheet1-2.csv"

## Prefix that is concatenated with each OTAR code to build its "reference" URL
intraneturl = "http://home.opentargets.org/"

## Destination of the generated JSON file
outputFile = "/Users/ochoa/Desktop/output.json"

## Load the ontology into an in-memory graph.
## Graph.load() was deprecated in rdflib 5 and removed in rdflib 6, so the
## supported Graph.parse() is used instead. load() defaulted to format="xml";
## the format is passed explicitly so the OWL file is still read as RDF/XML.
g = rdflib.Graph()
g.parse(owlpath, format="xml")
## SPARQL query listing, for every owl:Class that has a label, each term
## reachable through zero or more rdfs:subClassOf steps that also has a label.
## STR() reduces the label literals to their plain lexical form.
sparql_query = """
prefix oio: <http://www.geneontology.org/formats/oboInOwl#>
prefix owl: <http://www.w3.org/2002/07/owl#>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?cls ?label ?ancestor ?ancestorlabel WHERE {
?cls a owl:Class .
?cls rdfs:label ?label_all .
?cls rdfs:subClassOf* ?ancestor .
?ancestor rdfs:label ?ancestorlabel_all
BIND (STR(?ancestorlabel_all) AS ?ancestorlabel)
BIND (STR(?label_all) AS ?label)
}
"""

## Run the query against the loaded ontology graph
qres = g.query(sparql_query)
## Convert the SPARQL result into a pandas DataFrame with one row per
## (class, ancestor) binding. Every value is cast to str before it is stored.
## Note: there might be better ways to serialise this.
## The rows are read from `qres`, the result already computed above. Calling
## g.query(sparql_query) again here would evaluate the transitive
## subClassOf query a second time for no benefit.
efo_df = pd.DataFrame(
    [
        (str(cls), str(label), str(ancestor), str(ancestorlabel))
        for cls, label, ancestor, ancestorlabel in qres
    ],
    columns=["cls", "label", "ancestor", "ancestorlabel"],
)
## Create short ids from the URIs by removing everything up to and including
## the last "/" (the greedy ".+" makes the match run to the final slash).
## regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
## pattern as a literal string by default, which would leave the URIs untouched
## and make the later join on "cls_id" match nothing.
efo_df["cls_id"] = efo_df["cls"].str.replace(r"(.+)/", "", regex=True)
efo_df["ancestor_id"] = efo_df["ancestor"].str.replace(r"(.+)/", "", regex=True)
## Load the table of OTAR project -> EFO mappings
mapdf = pd.read_csv(mappingFile)

## Inner-join the mappings with the ontology table: a mapping row is kept once
## for every efo_df row whose "cls_id" equals its "EFO Disease ID"
df = mapdf.merge(
    efo_df,
    how="inner",
    left_on="EFO Disease ID",
    right_on="cls_id",
)
## JSON formatting and output

## Normalise the column headers: trim, lower-case, spaces to underscores,
## parentheses removed. regex=False keeps "(" and ")" literal; read as regular
## expressions they would be unbalanced groups.
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
)

## Reference URL of each project: intranet prefix followed by the OTAR code
df["reference"] = intraneturl + df["otar_code"]

## Fields emitted for every project entry
project_fields = ["otar_code", "project_name", "status", "reference"]

## Write one record per ancestor term ("disease_id") holding the list of
## projects mapped to any class that has that ancestor.
## - drop_duplicates(): the joined table can hold the same project several
##   times under one ancestor (e.g. a project mapped to two terms that share
##   the ancestor), and it should be listed only once.
## - to_dict("records"): the short alias "r" was removed in pandas 2.0.
(
    df.rename(columns={"ancestor_id": "disease_id"})
    .groupby("disease_id")
    .apply(lambda grp: grp[project_fields].drop_duplicates().to_dict("records"))
    .reset_index()
    .rename(columns={0: "projects"})
    .to_json(outputFile, orient="records")
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[{
    "disease_id": "Orphanet_68367",
    "projects": [
        {
            "otar_code": "OTAR2073",
            "project_name": "NeuroFlux",
            "status": "Active",
            "reference": "http://home.opentargets.org/OTAR2073"
        }
    ]
},
{
    "disease_id": "Orphanet_71859",
    "projects": [
        {
            "otar_code": "OTAR2072",
            "project_name": "DNA Damage",
            "status": "Active",
            "reference": "http://home.opentargets.org/OTAR2072"
        }
    ]
}]
@dochoa, here is the code to do this properly using Spark: opentargets/platform-etl-backend#45
The generated output looks like this:
{
"efo_id": "MONDO_0021193",
"projects": [
{
"otar_code": "OTAR041",
"status": "Active",
"project_name": "CELLector",
"reference": "http://home.opentargets.org/OTAR041"
},
{
"otar_code": "OTAR015",
"status": "Active",
"project_name": "CRISPR-Cas9 Target ID",
"reference": "http://home.opentargets.org/OTAR015"
},
{
"otar_code": "OTAR016",
"status": "Closed",
"project_name": "Cancer Functional Genomics",
"reference": "http://home.opentargets.org/OTAR016"
},
{
"otar_code": "OTAR2055",
"status": "Active",
"project_name": "Prediction of Oncology Targets and in Silico Drug Prescriptions",
"reference": "http://home.opentargets.org/OTAR2055"
}
]
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Pending issue: while focusing on the ontology expansion, I forgot to add the direct EFO-OTAR project link to the final JSON file. The output contains all the ancestors of each mapped EFO term, but not the mapped EFO term itself.