Skip to content

Instantly share code, notes, and snippets.

@Orbifold
Created February 9, 2022 11:00
Show Gist options
  • Save Orbifold/68b3add8ccfffb52d41bff2f2aecb3fd to your computer and use it in GitHub Desktop.
Save Orbifold/68b3add8ccfffb52d41bff2f2aecb3fd to your computer and use it in GitHub Desktop.
DRKG Schema and Data Import script.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "5ecd5de6",
"metadata": {},
"source": [
"# DRKG TigerGraph Schema and Import"
]
},
{
"cell_type": "markdown",
"id": "33ee741e",
"metadata": {},
"source": [
"## Connecting\n",
"\n",
"Make sure you install the package first via `pip install pyTigerGraph`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "149cc636",
"metadata": {},
"outputs": [],
"source": [
"import pyTigerGraph as tg\n",
"\n",
"host = 'https://your-company.i.tgcloud.io'\n",
"secret = \"your-secret\"\n",
"graph_name = \"drkg\"\n",
"user_name = \"tigergraph\"\n",
"password = \"your-password\"\n",
"\n",
"# A first connection is only used to request a token; the connection that the\n",
"# rest of the notebook works with is then built with that token.\n",
"# The lifetime (in seconds) has to be passed by keyword: the second positional\n",
"# parameter of getToken() is `setToken`, so a positional value never reaches\n",
"# `lifetime`.\n",
"token_lifetime = 1000000\n",
"bootstrap = tg.TigerGraphConnection(host=host, graphname=graph_name, username=user_name, password=password)\n",
"token = bootstrap.getToken(secret, lifetime=token_lifetime)[0]\n",
"conn = tg.TigerGraphConnection(host=host, graphname=graph_name, username=user_name, password=password, apiToken=token)"
]
},
{
"cell_type": "markdown",
"id": "4d362d15",
"metadata": {},
"source": [
"Check the connection is hot:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "74b653e1",
"metadata": {},
"outputs": [],
"source": [
"conn.echo()"
]
},
{
"cell_type": "markdown",
"id": "107f1ff4",
"metadata": {},
"source": [
"## Schema Creation\n",
"\n",
"Make sure you have Pandas installed (`pip install pandas`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a993d6e0",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"drkg_file = './drkg.tsv'\n",
"# drkg.tsv has no header row: without header=None pandas would use the first\n",
"# triple as column names and silently drop it from the data.\n",
"df = pd.read_csv(drkg_file, sep=\"\\t\", header=None)\n",
"# one [head, relation, tail] list per row of the file\n",
"triplets = df.values.tolist()"
]
},
{
"cell_type": "markdown",
"id": "9c06929a",
"metadata": {},
"source": [
"This should give about 5.8 million triples:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "275b86ce",
"metadata": {},
"outputs": [],
"source": [
"len(triplets)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "716c9f9b",
"metadata": {},
"outputs": [],
"source": [
"rtypes = dict()  # edge names per \"<head type>::<tail type>\" couple\n",
"entity_dic = {}  # raw entity strings per type, keyed by entity id\n",
"for h, r, t in triplets:\n",
"    # an entity is split on \"::\" into its type (spaces removed) and its id\n",
"    h_type = h.split(\"::\")[0].replace(\" \", \"\")\n",
"    h_id = str(h.split(\"::\")[1])\n",
"    t_type = t.split(\"::\")[0].replace(\" \", \"\")\n",
"    t_id = str(t.split(\"::\")[1])\n",
"\n",
"    # add the types if not present (head first, so first-seen order is kept)\n",
"    head_entities = entity_dic.setdefault(h_type, {})\n",
"    tail_entities = entity_dic.setdefault(t_type, {})\n",
"\n",
"    # add the edge type per type couple, each distinct name only once\n",
"    edge_names = rtypes.setdefault(f\"{h_type}::{t_type}\", [])\n",
"    # drop spaces and the : + > - characters from the relation to get the edge name\n",
"    r = r.replace(\" \", \"\").replace(\":\", \"\").replace(\"+\", \"\").replace(\">\", \"\").replace(\"-\", \"\")\n",
"    if r not in edge_names:\n",
"        edge_names.append(r)\n",
"\n",
"    # spread entities; both ends are looked up by id, which is the dict key\n",
"    if h_id not in head_entities:\n",
"        head_entities[h_id] = h\n",
"    if t_id not in tail_entities:\n",
"        tail_entities[t_id] = t\n",
"\n",
"# one CREATE VERTEX per entity type, then one CREATE DIRECTED EDGE per edge name\n",
"schema = \"\"\n",
"for entity_type in entity_dic:\n",
"    schema += f\"CREATE VERTEX {entity_type} (PRIMARY_ID Id STRING) With primary_id_as_attribute=\\\"true\\\"\\n\"\n",
"for endpoints, edge_names in rtypes.items():\n",
"    source_name, target_name = endpoints.split(\"::\")\n",
"    for edge_name in edge_names:\n",
"        schema += f\"CREATE DIRECTED EDGE {edge_name} (FROM {source_name}, TO {target_name})\\n\"\n",
"print(schema)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "98c83bfc",
"metadata": {},
"outputs": [],
"source": [
"# Send the statements held in `schema` rather than a pasted copy of them, so the\n",
"# types created on the server always match what was derived from the data.\n",
"# `use global` comes first, as it did in front of the pasted statements.\n",
"print(conn.gsql(\"use global\\n\" + schema))\n"
]
},
{
"cell_type": "markdown",
"id": "7d78bc61",
"metadata": {},
"source": [
"## Data Import"
]
},
{
"cell_type": "markdown",
"id": "c47ecb2e",
"metadata": {},
"source": [
"If 5.8M triples are too many for your system, you can import a random sample instead (requires `pip install numpy`):\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f64c4fc2",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np  # needed for the random sampling below\n",
"\n",
"sample_size = 5000\n",
"triple_count = len(triplets)\n",
"# pick distinct row positions, and never more than there are rows\n",
"sample = np.random.choice(triple_count, min(sample_size, triple_count), replace=False)\n",
"for i in sample:\n",
"    h, r, t = triplets[i]\n",
"    # an entity is split on \"::\" into its type (spaces removed) and its id\n",
"    h_type = h.split(\"::\")[0].replace(\" \", \"\")\n",
"    h_id = str(h.split(\"::\")[1])\n",
"    t_type = t.split(\"::\")[0].replace(\" \", \"\")\n",
"    t_id = str(t.split(\"::\")[1])\n",
"    # drop spaces and the : + > - characters from the relation to get the edge name\n",
"    r = r.replace(\" \", \"\").replace(\":\", \"\").replace(\"+\", \"\").replace(\">\", \"\").replace(\"-\", \"\")\n",
"\n",
"    conn.upsertEdge(h_type, h_id, r, t_type, t_id)"
]
},
{
"cell_type": "markdown",
"id": "6dd8056f",
"metadata": {},
"source": [
"Note that upserting the edge also upserts the nodes.\n"
]
},
{
"cell_type": "markdown",
"id": "5cf51798",
"metadata": {},
"source": [
"If you want to go full scale, use this:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7ce8529",
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): this makes one upsertEdge call per triple, so a run over the\n",
"# whole file is likely to be slow.\n",
"for head, relation, tail in triplets:\n",
"    head_parts = head.split(\"::\")\n",
"    tail_parts = tail.split(\"::\")\n",
"    head_type = head_parts[0].replace(\" \", \"\")\n",
"    head_id = str(head_parts[1])\n",
"    tail_type = tail_parts[0].replace(\" \", \"\")\n",
"    tail_id = str(tail_parts[1])\n",
"\n",
"    # the edge name is the relation without spaces and without : + > -\n",
"    edge_name = relation\n",
"    for unwanted in (\" \", \":\", \"+\", \">\", \"-\"):\n",
"        edge_name = edge_name.replace(unwanted, \"\")\n",
"\n",
"    conn.upsertEdge(head_type, head_id, edge_name, tail_type, tail_id)"
]
},
{
"cell_type": "markdown",
"id": "57c23d56",
"metadata": {},
"source": [
"## Fetching some data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b2ba76d",
"metadata": {},
"outputs": [],
"source": [
"# Kept under its own name so the triples DataFrame `df` read from the TSV is not overwritten.\n",
"genes_df = conn.getVertexDataframe(\"Gene\", limit=10)\n",
"genes_df.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment