Created
July 31, 2023 13:40
-
-
Save DSuveges/05c8d63cfd6be2af5331fa81d9fe9a51 to your computer and use it in GitHub Desktop.
GCS/Optimizing_datamodel.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-07-31T13:33:19.324955Z", | |
"end_time": "2023-07-31T13:33:20.892011Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "from pyspark.sql import SparkSession\nfrom pyspark.sql import functions as f\nfrom pyspark.sql import types as t\n\n# Reuse the active Spark session, or start one if none exists:\nspark = SparkSession.builder.getOrCreate()\n\n# Release to read, and the root folder of its parquet outputs:\nlast_release = '23.06'\nrelease_url = f'gs://open-targets-data-releases/{last_release}/output/etl/parquet'\n\n# Load the datasets of that release (one parquet folder each):\ntargets = spark.read.parquet(f'{release_url}/targets')\nmolecules = spark.read.parquet(f'{release_url}/molecule')\nmoa = spark.read.parquet(f'{release_url}/mechanismOfAction')\ndisease = spark.read.parquet(f'{release_url}/diseases')\nassociations = spark.read.parquet(f'{release_url}/associationByOverallDirect')", | |
"execution_count": 22, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-07-31T13:33:20.895813Z", | |
"end_time": "2023-07-31T13:33:22.442116Z" | |
}, | |
"trusted": true, | |
"scrolled": false | |
}, | |
"cell_type": "code", | |
"source": "# For every dataset: print a label, dump the schema, then show the first\n# record untruncated and in vertical (one field per line) layout.\n\n# targets:\nprint('targets:')\ntargets.printSchema()\ntargets.show(n=1, truncate=False, vertical=True)\n\n# molecules:\nprint('molecules:')\nmolecules.printSchema()\nmolecules.show(n=1, truncate=False, vertical=True)\n\n# moa:\nprint('moa:')\nmoa.printSchema()\nmoa.show(n=1, truncate=False, vertical=True)\n\n# disease:\nprint('disease:')\ndisease.printSchema()\ndisease.show(n=1, truncate=False, vertical=True)\n\n# associations:\nprint('associations:')\nassociations.printSchema()\nassociations.show(n=1, truncate=False, vertical=True)", | |
"execution_count": 23, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "targets:\nroot\n |-- id: string (nullable = true)\n |-- approvedSymbol: string (nullable = true)\n |-- biotype: string (nullable = true)\n |-- transcriptIds: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- canonicalTranscript: struct (nullable = true)\n | |-- id: string (nullable = true)\n | |-- chromosome: string (nullable = true)\n | |-- start: long (nullable = true)\n | |-- end: long (nullable = true)\n | |-- strand: string (nullable = true)\n |-- canonicalExons: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- genomicLocation: struct (nullable = true)\n | |-- chromosome: string (nullable = true)\n | |-- start: long (nullable = true)\n | |-- end: long (nullable = true)\n | |-- strand: integer (nullable = true)\n |-- alternativeGenes: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- approvedName: string (nullable = true)\n |-- go: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- id: string (nullable = true)\n | | |-- source: string (nullable = true)\n | | |-- evidence: string (nullable = true)\n | | |-- aspect: string (nullable = true)\n | | |-- geneProduct: string (nullable = true)\n | | |-- ecoId: string (nullable = true)\n |-- hallmarks: struct (nullable = true)\n | |-- attributes: array (nullable = true)\n | | |-- element: struct (containsNull = true)\n | | | |-- pmid: long (nullable = true)\n | | | |-- description: string (nullable = true)\n | | | |-- attribute_name: string (nullable = true)\n | |-- cancerHallmarks: array (nullable = true)\n | | |-- element: struct (containsNull = true)\n | | | |-- pmid: long (nullable = true)\n | | | |-- description: string (nullable = true)\n | | | |-- impact: string (nullable = true)\n | | | |-- label: string (nullable = true)\n |-- synonyms: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- label: string (nullable = true)\n | | |-- source: string (nullable = true)\n |-- 
symbolSynonyms: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- label: string (nullable = true)\n | | |-- source: string (nullable = true)\n |-- nameSynonyms: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- label: string (nullable = true)\n | | |-- source: string (nullable = true)\n |-- functionDescriptions: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- subcellularLocations: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- location: string (nullable = true)\n | | |-- source: string (nullable = true)\n | | |-- termSL: string (nullable = true)\n | | |-- labelSL: string (nullable = true)\n |-- targetClass: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- id: long (nullable = true)\n | | |-- label: string (nullable = true)\n | | |-- level: string (nullable = true)\n |-- obsoleteSymbols: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- label: string (nullable = true)\n | | |-- source: string (nullable = true)\n |-- obsoleteNames: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- label: string (nullable = true)\n | | |-- source: string (nullable = true)\n |-- constraint: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- constraintType: string (nullable = true)\n | | |-- score: float (nullable = true)\n | | |-- exp: float (nullable = true)\n | | |-- obs: integer (nullable = true)\n | | |-- oe: float (nullable = true)\n | | |-- oeLower: float (nullable = true)\n | | |-- oeUpper: float (nullable = true)\n | | |-- upperRank: integer (nullable = true)\n | | |-- upperBin: integer (nullable = true)\n | | |-- upperBin6: integer (nullable = true)\n |-- tep: struct (nullable = true)\n | |-- targetFromSourceId: string (nullable = true)\n | |-- description: string (nullable = true)\n | |-- therapeuticArea: string (nullable = true)\n | |-- 
url: string (nullable = true)\n |-- proteinIds: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- id: string (nullable = true)\n | | |-- source: string (nullable = true)\n |-- dbXrefs: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- id: string (nullable = true)\n | | |-- source: string (nullable = true)\n |-- chemicalProbes: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- drugId: string (nullable = true)\n | | |-- id: string (nullable = true)\n | | |-- isHighQuality: boolean (nullable = true)\n | | |-- mechanismOfAction: array (nullable = true)\n | | | |-- element: string (containsNull = true)\n | | |-- origin: array (nullable = true)\n | | | |-- element: string (containsNull = true)\n | | |-- probeMinerScore: double (nullable = true)\n | | |-- probesDrugsScore: double (nullable = true)\n | | |-- scoreInCells: double (nullable = true)\n | | |-- scoreInOrganisms: double (nullable = true)\n | | |-- targetFromSourceId: string (nullable = true)\n | | |-- urls: array (nullable = true)\n | | | |-- element: struct (containsNull = true)\n | | | | |-- niceName: string (nullable = true)\n | | | | |-- url: string (nullable = true)\n |-- homologues: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- speciesId: string (nullable = true)\n | | |-- speciesName: string (nullable = true)\n | | |-- homologyType: string (nullable = true)\n | | |-- targetGeneId: string (nullable = true)\n | | |-- isHighConfidence: string (nullable = true)\n | | |-- targetGeneSymbol: string (nullable = true)\n | | |-- queryPercentageIdentity: double (nullable = true)\n | | |-- targetPercentageIdentity: double (nullable = true)\n | | |-- priority: integer (nullable = true)\n |-- tractability: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- modality: string (nullable = true)\n | | |-- id: string (nullable = true)\n | | |-- value: boolean (nullable = 
true)\n |-- safetyLiabilities: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- event: string (nullable = true)\n | | |-- eventId: string (nullable = true)\n | | |-- effects: array (nullable = true)\n | | | |-- element: struct (containsNull = true)\n | | | | |-- direction: string (nullable = true)\n | | | | |-- dosing: string (nullable = true)\n | | |-- biosamples: array (nullable = true)\n | | | |-- element: struct (containsNull = true)\n | | | | |-- cellFormat: string (nullable = true)\n | | | | |-- cellLabel: string (nullable = true)\n | | | | |-- tissueId: string (nullable = true)\n | | | | |-- tissueLabel: string (nullable = true)\n | | |-- isHumanApplicable: boolean (nullable = true)\n | | |-- datasource: string (nullable = true)\n | | |-- literature: string (nullable = true)\n | | |-- url: string (nullable = true)\n | | |-- studies: array (nullable = true)\n | | | |-- element: struct (containsNull = true)\n | | | | |-- description: string (nullable = true)\n | | | | |-- name: string (nullable = true)\n | | | | |-- type: string (nullable = true)\n |-- pathways: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- pathwayId: string (nullable = true)\n | | |-- pathway: string (nullable = true)\n | | |-- topLevelTerm: string (nullable = true)\n\n-RECORD 
0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n id | ENSG00000002586 \n approvedSymbol | CD99 \n biotype | protein_coding \n transcriptIds | [ENST00000482405, ENST00000624481, ENST00000645950, ENST00000381187, ENST00000449611, ENST00000611428, ENST00000646103, ENST00000647297, ENST00000381192, ENST00000381180, ENST00000482293, ENST00000381184, ENST00000381177, ENST00000623253, ENST00000497752] \n canonicalTranscript | {ENST00000381192, X, 2691295, 2741309, +} \n canonicalExons | [2738200, 2738256, 2740779, 2741309, 2691295, 2691427, 2717605, 2717652, 2714422, 2714454, 2722627, 2722674, 2720356, 2720424, 2719661, 2719705, 2723314, 2723364, 2726260, 2726373] \n genomicLocation | {X, 2691187, 2741309, 1} \n alternativeGenes | null \n approvedName | CD99 molecule (Xg blood group) \n go | [{GO:0005515, PMID:20374249, IPI, F, P14209, ECO:0000353}, {GO:0005886, Reactome:R-HSA-8862084, TAS, C, P14209, ECO:0000304}, {GO:0034109, PMID:21873635, IBA, P, P14209, ECO:0000318}, {GO:2000391, PMID:21873635, IBA, P, P14209, ECO:0000318}, {GO:0005925, PMID:21423176, HDA, C, P14209, ECO:0007005}, {GO:0072683, PMID:21873635, IBA, P, 
P14209, ECO:0000318}, {GO:0005737, PMID:2479542, TAS, C, P14209, ECO:0000304}] \n hallmarks | null \n synonyms | [{CD99 antigen, uniprot}, {CD99, uniprot}, {MIC2, uniprot}, {MIC2X, uniprot}, {MIC2Y, uniprot}, {12E7, uniprot}, {E2 antigen, uniprot}, {Protein MIC2, uniprot}, {T-cell surface glycoprotein E2, uniprot}, {HBA71, NCBI_entrez}, {MIC2, NCBI_entrez}, {MIC2X, NCBI_entrez}, {MIC2Y, NCBI_entrez}, {MSK5X, NCBI_entrez}, {CD99 antigen, NCBI_entrez}, {E2 antigen, NCBI_entrez}, {MIC2 (monoclonal antibody 12E7), NCBI_entrez}, {T-cell surface glycoprotein E2, NCBI_entrez}, {antigen identified by monoclonal 12E7, Y homolog, NCBI_entrez}, {antigen identified by monoclonal antibodies 12E7, F21 and O13, NCBI_entrez}, {cell surface antigen 12E7, NCBI_entrez}, {cell surface antigen HBA-71, NCBI_entrez}, {cell surface antigen O13, NCBI_entrez}, {surface antigen MIC2, NCBI_entrez}, {CD99, NCBI_entrez}] \n symbolSynonyms | [{CD99, uniprot}, {MIC2, uniprot}, {MIC2X, uniprot}, {MIC2Y, uniprot}, {HBA71, NCBI_entrez}, {MIC2, NCBI_entrez}, {MIC2X, NCBI_entrez}, {MIC2Y, NCBI_entrez}, {MSK5X, NCBI_entrez}, {CD99, NCBI_entrez}] \n nameSynonyms | [{CD99 antigen, uniprot}, {12E7, uniprot}, {E2 antigen, uniprot}, {Protein MIC2, uniprot}, {T-cell surface glycoprotein E2, uniprot}, {CD99 antigen, NCBI_entrez}, {E2 antigen, NCBI_entrez}, {MIC2 (monoclonal antibody 12E7), NCBI_entrez}, {T-cell surface glycoprotein E2, NCBI_entrez}, {antigen identified by monoclonal 12E7, Y homolog, NCBI_entrez}, {antigen identified by monoclonal antibodies 12E7, F21 and O13, NCBI_entrez}, {cell surface antigen 12E7, NCBI_entrez}, {cell surface antigen HBA-71, NCBI_entrez}, {cell surface antigen O13, NCBI_entrez}, {surface antigen MIC2, NCBI_entrez}] \n functionDescriptions | [Involved in T-cell adhesion processes and in spontaneous rosette formation with erythrocytes. Plays a role in a late step of leukocyte extravasation helping leukocytes to overcome the endothelial basement membrane. 
Acts at the same site as, but independently of, PECAM1. Involved in T-cell adhesion processes (By similarity). .] \n subcellularLocations | [{Membrane, uniprot, SL-0162, Cellular component}, {Golgi apparatus, HPA_main, SL-0132, Golgi apparatus}] \n targetClass | null \n obsoleteSymbols | [{MIC2, HGNC}, {MIC2X, HGNC}, {MIC2Y, HGNC}] \n obsoleteNames | [{antigen identified by monoclonal antibodies 12E7, F21 and O13, HGNC}, {CD99 antigen, HGNC}, {CD99 molecule, HGNC}] \n constraint | [{syn, -0.16373, 42.64, 44, 1.0319, 0.81, 1.327, null, null, null}, {mis, -0.52942, 113.15, 129, 1.1401, 0.987, 1.32, null, null, null}, {lof, 0.0026127, 12.271, 6, 0.48897, 0.267, 0.965, 10262, 5, 3}] \n tep | null \n proteinIds | [{P14209, uniprot_swissprot}, {A0A096LP69, uniprot_trembl}, {A6NGF6, uniprot_trembl}, {A6NJT9, uniprot_trembl}, {A8MQT7, uniprot_trembl}, {A6NIW1, uniprot_obsolete}, {O00518, uniprot_obsolete}, {Q6ICV7, uniprot_obsolete}] \n dbXrefs | [{7082, HGNC}, {7SFX, PDB}, {R-HSA-198933, Reactome}, {R-HSA-202733, Reactome}, {IPR022078, InterPro}, {SignalP-noTM, signalP}] \n chemicalProbes | null \n homologues | [{9606, Human, within_species_paralog, ENSG00000102181, NULL, CD99L2, 30.8108, 21.7557, 0}, {9598, Chimpanzee, ortholog_one2one, ENSPTRG00000028324, 0, ENSPTRG00000028324, 75.1351, 69.5, 1}, {9544, Macaque, ortholog_one2many, ENSMMUG00000018687, 1, ENSMMUG00000018687, 90.2703, 70.7627, 2}, {9544, Macaque, ortholog_one2many, ENSMMUG00000055976, 0, ENSMMUG00000055976, 55.6757, 55.9783, 2}, {9615, Dog, ortholog_one2many, ENSCAFG00845022471, 0, ENSCAFG00845022471, 65.4054, 48.0159, 7}, {9615, Dog, ortholog_one2many, ENSCAFG00845025501, 0, ENSCAFG00845025501, 67.5676, 38.8199, 7}, {9823, Pig, ortholog_one2one, ENSSSCG00000055885, 0, ENSSSCG00000055885, 64.3243, 63.9785, 8}, {8364, Tropical clawed frog, ortholog_one2one, ENSXETG00000039100, 1, cd99, 40.0, 37.3737, 9}, {7955, Zebrafish, ortholog_one2one, ENSDARG00000051975, 0, cd99, 34.0541, 28.6364, 10}] \n tractability 
| [{SM, Approved Drug, false}, {SM, Advanced Clinical, false}, {SM, Phase 1 Clinical, false}, {SM, Structure with Ligand, false}, {SM, High-Quality Ligand, false}, {SM, High-Quality Pocket, false}, {SM, Med-Quality Pocket, false}, {SM, Druggable Family, false}, {AB, Approved Drug, false}, {AB, Advanced Clinical, false}, {AB, Phase 1 Clinical, false}, {AB, UniProt loc high conf, false}, {AB, GO CC high conf, true}, {AB, UniProt loc med conf, false}, {AB, UniProt SigP or TMHMM, true}, {AB, GO CC med conf, false}, {AB, Human Protein Atlas loc, false}, {PR, Approved Drug, false}, {PR, Advanced Clinical, false}, {PR, Phase 1 Clinical, false}, {PR, Literature, false}, {PR, UniProt Ubiquitination, false}, {PR, Database Ubiquitination, true}, {PR, Half-life Data, true}, {PR, Small Molecule Binder, false}, {OC, Approved Drug, false}, {OC, Advanced Clinical, false}, {OC, Phase 1 Clinical, false}] \n safetyLiabilities | null \n pathways | [{R-HSA-198933, Immunoregulatory interactions between a Lymphoid and a non-Lymphoid cell, Immune System}, {R-HSA-202733, Cell surface interactions at the vascular wall, Hemostasis}] \nonly showing top 1 row\n\nmolecules:\nroot\n |-- id: string (nullable = true)\n |-- canonicalSmiles: string (nullable = true)\n |-- inchiKey: string (nullable = true)\n |-- drugType: string (nullable = true)\n |-- blackBoxWarning: boolean (nullable = true)\n |-- name: string (nullable = true)\n |-- yearOfFirstApproval: long (nullable = true)\n |-- maximumClinicalTrialPhase: double (nullable = true)\n |-- parentId: string (nullable = true)\n |-- hasBeenWithdrawn: boolean (nullable = true)\n |-- isApproved: boolean (nullable = true)\n |-- tradeNames: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- synonyms: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- crossReferences: map (nullable = true)\n | |-- key: string\n | |-- value: array (valueContainsNull = true)\n | | |-- element: string (containsNull = true)\n 
|-- childChemblIds: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- linkedDiseases: struct (nullable = true)\n | |-- rows: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- count: integer (nullable = true)\n |-- linkedTargets: struct (nullable = true)\n | |-- rows: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- count: integer (nullable = true)\n |-- description: string (nullable = true)\n\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n id | CHEMBL1173055 \n canonicalSmiles | CNCc1ccc(-c2[nH]c3cc(F)cc4c3c2CCNC4=O)cc1 \n inchiKey | HMABYWSNWIZPAG-UHFFFAOYSA-N \n drugType | Small molecule \n blackBoxWarning | false \n name | RUCAPARIB \n yearOfFirstApproval | 2016 \n maximumClinicalTrialPhase | 4.0 \n parentId | null \n hasBeenWithdrawn | false \n isApproved | true \n tradeNames | [] \n synonyms | [AG-014699, AG-14447, Rucaparib] \n crossReferences | {DailyMed -> [rucaparib+camsylate], PubChem -> [103905261, 137276017, 174006354], drugbank -> [DB12332]} \n childChemblIds | [CHEMBL2105733, CHEMBL3833368] \n linkedDiseases | {[EFO_0003060, EFO_0002517, MONDO_0002087, MONDO_0002158, MONDO_0004992, MONDO_0008170, EFO_0000616, MONDO_0007254, MONDO_0008315, MONDO_0008170, EFO_0000702, MONDO_0001056, MONDO_0003060, EFO_0000616, EFO_0000616, Orphanet_145, MONDO_0002974, EFO_0002618, EFO_1000613, EFO_0000673, EFO_0000564, MONDO_0011962, EFO_0001075, EFO_0000588], 24} \n linkedTargets | {[ENSG00000143799, ENSG00000041880, ENSG00000129484], 3} \n description | Small molecule drug with a maximum clinical trial phase of IV (across all indications) that was first approved in 2016 and has 4 approved and 19 investigational indications. 
\nonly showing top 1 row\n\nmoa:\nroot\n |-- actionType: string (nullable = true)\n |-- mechanismOfAction: string (nullable = true)\n |-- chemblIds: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- targetName: string (nullable = true)\n |-- targetType: string (nullable = true)\n |-- targets: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- references: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- source: string (nullable = true)\n | | |-- ids: array (nullable = true)\n | | | |-- element: string (containsNull = true)\n | | |-- urls: array (nullable = true)\n | | | |-- element: string (containsNull = true)\n\n-RECORD 0---------------------------------------------------------------------------------\n actionType | INHIBITOR \n mechanismOfAction | Inhibin beta A chain inhibitor \n chemblIds | [CHEMBL1743073] \n targetName | Inhibin beta A chain \n targetType | single protein \n targets | [ENSG00000122641] \n references | [{PubMed, [19049340], [http://europepmc.org/abstract/MED/19049340]}] \nonly showing top 1 row\n\ndisease:\nroot\n |-- id: string (nullable = true)\n |-- code: string (nullable = true)\n |-- dbXRefs: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- description: string (nullable = true)\n |-- name: string (nullable = true)\n |-- directLocationIds: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- obsoleteTerms: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- parents: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- synonyms: struct (nullable = true)\n | |-- hasBroadSynonym: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- hasExactSynonym: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- hasNarrowSynonym: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- hasRelatedSynonym: 
array (nullable = true)\n | | |-- element: string (containsNull = true)\n |-- ancestors: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- descendants: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- children: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- therapeuticAreas: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- indirectLocationIds: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- ontology: struct (nullable = true)\n | |-- isTherapeuticArea: boolean (nullable = true)\n | |-- leaf: boolean (nullable = true)\n | |-- sources: struct (nullable = true)\n | | |-- url: string (nullable = true)\n | | |-- name: string (nullable = true)\n\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n id | EFO_0000255 \n code | http://www.ebi.ac.uk/efo/EFO_0000255 \n dbXRefs | [NCIT:C7528, ICDO:9767/1, MESH:D007119, MONDO:0004977, GARD:11973, UMLS:C0020981, Orphanet:86886, ONCOTREE:AITL, ICDO:9705/3, EFO:0000255, GARD:0011973, ICD9:202.70, ICD10CM:C86.5, ICD10:C86.5, DOID:0111147, MedDRA:10002449, SCTID:413537009] \n description | A mature T-cell non-Hodgkin lymphoma, characterized by systemic disease and a polymorphous infiltrate involving lymph nodes and extranodal sites. The clinical course is typically aggressive. \n name | angioimmunoblastic T-cell lymphoma \n directLocationIds | null \n obsoleteTerms | null \n parents | [MONDO_0000430] \n synonyms | {null, [angioimmunoblastic lymphadenopathy, angioimmunoblastic lymphadenopathy with Dysproteinemia, lymphogranulomatosis X, AILD, T-cell lymphoma, AILD type, angioimmunoblastic T-cell lymphoma, angioimmunoblastic lymphadenopathy type T-cell lymphoma, immunoblastic lymphadenopathy, AILT], null, [AITL]} \n ancestors | [MONDO_0000430, OTAR_0000018, MONDO_0002334, MONDO_0023370, MONDO_0044881, MONDO_0045024, Orphanet_322126, EFO_0002426, Orphanet_68336, EFO_0001642, EFO_0000574, MONDO_0015757, MONDO_0024615, EFO_0005952, EFO_0000508, MONDO_0019044, EFO_0005803, MONDO_0015760, EFO_0000616] \n descendants | [] \n children | [] \n therapeuticAreas | [OTAR_0000018, MONDO_0045024, EFO_0005803] \n indirectLocationIds | null \n ontology | {false, true, {http://www.ebi.ac.uk/efo/EFO_0000255, EFO_0000255}} \nonly showing top 1 row\n\nassociations:\nroot\n |-- diseaseId: string (nullable = true)\n |-- targetId: string (nullable = true)\n |-- score: double (nullable = true)\n |-- evidenceCount: 
long (nullable = true)\n\n-RECORD 0---------------------------\n diseaseId | EFO_0000574 \n targetId | ENSG00000000938 \n score | 0.0831554673040579 \n evidenceCount | 2 \nonly showing top 1 row\n\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-07-31T13:20:47.944835Z", | |
"end_time": "2023-07-31T13:20:56.050323Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Resolve the children of each disease into (childName, childId) structs.\n# Diseases without children are kept and get an empty `children` array.\nresolved_children = (\n    disease\n    .select(\n        f.col('id').alias('diseaseId'),\n        f.col('name').alias('diseaseName'),\n        # explode_outer keeps diseases whose `children` array is empty or null,\n        # emitting a single placeholder row with childId = null for them:\n        f.explode_outer(f.col('children')).alias('childId')\n    )\n    .join(\n        # Lookup table from disease id to disease name, used to label each child:\n        disease\n        .select(\n            f.col('id').alias('childId'),\n            f.col('name').alias('childName')\n        ),\n        on='childId',\n        how='left'\n    )\n    .groupBy('diseaseId', 'diseaseName')\n    .agg(\n        f.collect_set(\n            # A struct of two nulls is itself non-null, so the placeholder rows would be\n            # collected as {null, null}. Mapping them to null lets collect_set skip them:\n            f.when(\n                f.col('childId').isNotNull(),\n                f.struct(\n                    f.col('childName'),\n                    f.col('childId')\n                )\n            )\n        ).alias('children')\n    )\n)\n\nresolved_children.show()\nresolved_children.printSchema()", | |
"execution_count": 20, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "[Stage 22:==================================================> (7 + 1) / 8]\r", | |
"name": "stderr" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "+------------+--------------------+--------------------+\n| diseaseId| diseaseName| children|\n+------------+--------------------+--------------------+\n|DOID_0050890| synucleinopathy|[{Lewy body demen...|\n| DOID_10113| trypanosomiasis|[{Chagas disease,...|\n| DOID_10718| giardiasis| [{null, null}]|\n| DOID_1947| trichomoniasis|[{Trichomonas vag...|\n| DOID_7551| gonorrhea| [{null, null}]|\n| EFO_0000094|B-cell acute lymp...|[{B-cell acute ly...|\n| EFO_0000095|chronic lymphocyt...|[{hairy cell leuk...|\n| EFO_0000096|neoplasm of matur...|[{plasma cell neo...|\n| EFO_0000182|hepatocellular ca...|[{Liver Diffuse L...|\n| EFO_0000183| Hodgkins lymphoma|[{classic Hodgkin...|\n| EFO_0000196|metastatic prosta...| [{null, null}]|\n| EFO_0000197| mucinous carcinoma|[{Endometrial Muc...|\n| EFO_0000200|plasma cell neoplasm|[{Heavy Chain Dis...|\n| EFO_0000203|monoclonal gammop...|[{benign monoclon...|\n| EFO_0000216|acinar cell carci...|[{Parotid Gland A...|\n| EFO_0000217| gastritis|[{gastric mucosal...|\n| EFO_0000228| adenocarcinoma|[{gastroesophagea...|\n| EFO_0000232| adenoma|[{parathyroid ade...|\n| EFO_0000233|adenosquamous lun...| [{null, null}]|\n| EFO_0000239|adrenal gland phe...|[{Malignant Adren...|\n+------------+--------------------+--------------------+\nonly showing top 20 rows\n\nroot\n |-- diseaseId: string (nullable = true)\n |-- diseaseName: string (nullable = true)\n |-- children: array (nullable = false)\n | |-- element: struct (containsNull = false)\n | | |-- childName: string (nullable = true)\n | | |-- childId: string (nullable = true)\n\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "\r \r", | |
"name": "stderr" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3", | |
"language": "python" | |
}, | |
"language_info": { | |
"name": "python", | |
"version": "3.10.8", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
}, | |
"gist": { | |
"id": "", | |
"data": { | |
"description": "GCS/Optimizing_datamodel.ipynb", | |
"public": true | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment