Skip to content

Instantly share code, notes, and snippets.

@DSuveges
Created July 31, 2023 13:40
Show Gist options
  • Save DSuveges/05c8d63cfd6be2af5331fa81d9fe9a51 to your computer and use it in GitHub Desktop.
Save DSuveges/05c8d63cfd6be2af5331fa81d9fe9a51 to your computer and use it in GitHub Desktop.
GCS/Optimizing_datamodel.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"ExecuteTime": {
"start_time": "2023-07-31T13:33:19.324955Z",
"end_time": "2023-07-31T13:33:20.892011Z"
},
"trusted": true
},
"cell_type": "code",
"source": "from pyspark.sql import functions as f, types as t, SparkSession\n\nspark = SparkSession.builder.getOrCreate()\n\n# Last release:\nlast_release = '23.06'\nrelease_url = f'gs://open-targets-data-releases/{last_release}/output/etl/parquet'\n\n# datasets:\ntargets = spark.read.parquet(f'{release_url}/targets')\nmolecules = spark.read.parquet(f'{release_url}/molecule')\nmoa = spark.read.parquet(f'{release_url}/mechanismOfAction')\ndisease = spark.read.parquet(f'{release_url}/diseases')\nassociations = spark.read.parquet(f'{release_url}/associationByOverallDirect')",
"execution_count": 22,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2023-07-31T13:33:20.895813Z",
"end_time": "2023-07-31T13:33:22.442116Z"
},
"trusted": true,
"scrolled": false
},
"cell_type": "code",
"source": "# targets:\nprint('targets:')\ntargets.printSchema()\ntargets.show(1, False, True)\n\n# molecules:\nprint('molecules:')\nmolecules.printSchema()\nmolecules.show(1, False, True)\n\n# moa:\nprint('moa:')\nmoa.printSchema()\nmoa.show(1, False, True)\n\n# disease:\nprint('disease:')\ndisease.printSchema()\ndisease.show(1, False, True)\n\n# Associations:\nprint('associations:')\nassociations.printSchema()\nassociations.show(1,False, True)",
"execution_count": 23,
"outputs": [
{
"output_type": "stream",
"text": "targets:\nroot\n |-- id: string (nullable = true)\n |-- approvedSymbol: string (nullable = true)\n |-- biotype: string (nullable = true)\n |-- transcriptIds: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- canonicalTranscript: struct (nullable = true)\n | |-- id: string (nullable = true)\n | |-- chromosome: string (nullable = true)\n | |-- start: long (nullable = true)\n | |-- end: long (nullable = true)\n | |-- strand: string (nullable = true)\n |-- canonicalExons: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- genomicLocation: struct (nullable = true)\n | |-- chromosome: string (nullable = true)\n | |-- start: long (nullable = true)\n | |-- end: long (nullable = true)\n | |-- strand: integer (nullable = true)\n |-- alternativeGenes: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- approvedName: string (nullable = true)\n |-- go: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- id: string (nullable = true)\n | | |-- source: string (nullable = true)\n | | |-- evidence: string (nullable = true)\n | | |-- aspect: string (nullable = true)\n | | |-- geneProduct: string (nullable = true)\n | | |-- ecoId: string (nullable = true)\n |-- hallmarks: struct (nullable = true)\n | |-- attributes: array (nullable = true)\n | | |-- element: struct (containsNull = true)\n | | | |-- pmid: long (nullable = true)\n | | | |-- description: string (nullable = true)\n | | | |-- attribute_name: string (nullable = true)\n | |-- cancerHallmarks: array (nullable = true)\n | | |-- element: struct (containsNull = true)\n | | | |-- pmid: long (nullable = true)\n | | | |-- description: string (nullable = true)\n | | | |-- impact: string (nullable = true)\n | | | |-- label: string (nullable = true)\n |-- synonyms: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- label: string (nullable = true)\n | | |-- source: string (nullable = true)\n |-- symbolSynonyms: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- label: string (nullable = true)\n | | |-- source: string (nullable = true)\n |-- nameSynonyms: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- label: string (nullable = true)\n | | |-- source: string (nullable = true)\n |-- functionDescriptions: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- subcellularLocations: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- location: string (nullable = true)\n | | |-- source: string (nullable = true)\n | | |-- termSL: string (nullable = true)\n | | |-- labelSL: string (nullable = true)\n |-- targetClass: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- id: long (nullable = true)\n | | |-- label: string (nullable = true)\n | | |-- level: string (nullable = true)\n |-- obsoleteSymbols: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- label: string (nullable = true)\n | | |-- source: string (nullable = true)\n |-- obsoleteNames: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- label: string (nullable = true)\n | | |-- source: string (nullable = true)\n |-- constraint: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- constraintType: string (nullable = true)\n | | |-- score: float (nullable = true)\n | | |-- exp: float (nullable = true)\n | | |-- obs: integer (nullable = true)\n | | |-- oe: float (nullable = true)\n | | |-- oeLower: float (nullable = true)\n | | |-- oeUpper: float (nullable = true)\n | | |-- upperRank: integer (nullable = true)\n | | |-- upperBin: integer (nullable = true)\n | | |-- upperBin6: integer (nullable = true)\n |-- tep: struct (nullable = true)\n | |-- targetFromSourceId: string (nullable = true)\n | |-- description: string (nullable = true)\n | |-- therapeuticArea: string (nullable = true)\n | |-- url: string (nullable = true)\n |-- proteinIds: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- id: string (nullable = true)\n | | |-- source: string (nullable = true)\n |-- dbXrefs: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- id: string (nullable = true)\n | | |-- source: string (nullable = true)\n |-- chemicalProbes: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- drugId: string (nullable = true)\n | | |-- id: string (nullable = true)\n | | |-- isHighQuality: boolean (nullable = true)\n | | |-- mechanismOfAction: array (nullable = true)\n | | | |-- element: string (containsNull = true)\n | | |-- origin: array (nullable = true)\n | | | |-- element: string (containsNull = true)\n | | |-- probeMinerScore: double (nullable = true)\n | | |-- probesDrugsScore: double (nullable = true)\n | | |-- scoreInCells: double (nullable = true)\n | | |-- scoreInOrganisms: double (nullable = true)\n | | |-- targetFromSourceId: string (nullable = true)\n | | |-- urls: array (nullable = true)\n | | | |-- element: struct (containsNull = true)\n | | | | |-- niceName: string (nullable = true)\n | | | | |-- url: string (nullable = true)\n |-- homologues: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- speciesId: string (nullable = true)\n | | |-- speciesName: string (nullable = true)\n | | |-- homologyType: string (nullable = true)\n | | |-- targetGeneId: string (nullable = true)\n | | |-- isHighConfidence: string (nullable = true)\n | | |-- targetGeneSymbol: string (nullable = true)\n | | |-- queryPercentageIdentity: double (nullable = true)\n | | |-- targetPercentageIdentity: double (nullable = true)\n | | |-- priority: integer (nullable = true)\n |-- tractability: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- modality: string (nullable = true)\n | | |-- id: string (nullable = true)\n | | |-- value: boolean (nullable = true)\n |-- safetyLiabilities: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- event: string (nullable = true)\n | | |-- eventId: string (nullable = true)\n | | |-- effects: array (nullable = true)\n | | | |-- element: struct (containsNull = true)\n | | | | |-- direction: string (nullable = true)\n | | | | |-- dosing: string (nullable = true)\n | | |-- biosamples: array (nullable = true)\n | | | |-- element: struct (containsNull = true)\n | | | | |-- cellFormat: string (nullable = true)\n | | | | |-- cellLabel: string (nullable = true)\n | | | | |-- tissueId: string (nullable = true)\n | | | | |-- tissueLabel: string (nullable = true)\n | | |-- isHumanApplicable: boolean (nullable = true)\n | | |-- datasource: string (nullable = true)\n | | |-- literature: string (nullable = true)\n | | |-- url: string (nullable = true)\n | | |-- studies: array (nullable = true)\n | | | |-- element: struct (containsNull = true)\n | | | | |-- description: string (nullable = true)\n | | | | |-- name: string (nullable = true)\n | | | | |-- type: string (nullable = true)\n |-- pathways: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- pathwayId: string (nullable = true)\n | | |-- pathway: string (nullable = true)\n | | |-- topLevelTerm: string (nullable = true)\n\n-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n id | ENSG00000002586 \n approvedSymbol | CD99 \n biotype | protein_coding \n transcriptIds | [ENST00000482405, ENST00000624481, ENST00000645950, ENST00000381187, ENST00000449611, ENST00000611428, ENST00000646103, ENST00000647297, ENST00000381192, ENST00000381180, ENST00000482293, ENST00000381184, ENST00000381177, ENST00000623253, ENST00000497752] \n canonicalTranscript | {ENST00000381192, X, 2691295, 2741309, +} \n canonicalExons | [2738200, 2738256, 2740779, 2741309, 2691295, 2691427, 2717605, 2717652, 2714422, 2714454, 2722627, 2722674, 2720356, 2720424, 2719661, 2719705, 2723314, 2723364, 2726260, 2726373] \n genomicLocation | {X, 2691187, 2741309, 1} \n alternativeGenes | null \n approvedName | CD99 molecule (Xg blood group) \n go | [{GO:0005515, PMID:20374249, IPI, F, P14209, ECO:0000353}, {GO:0005886, Reactome:R-HSA-8862084, TAS, C, P14209, ECO:0000304}, {GO:0034109, PMID:21873635, IBA, P, P14209, ECO:0000318}, {GO:2000391, PMID:21873635, IBA, P, P14209, ECO:0000318}, {GO:0005925, PMID:21423176, HDA, C, P14209, ECO:0007005}, {GO:0072683, PMID:21873635, IBA, P, P14209, ECO:0000318}, {GO:0005737, PMID:2479542, TAS, C, P14209, ECO:0000304}] \n hallmarks | null \n synonyms | [{CD99 antigen, uniprot}, {CD99, uniprot}, {MIC2, uniprot}, {MIC2X, uniprot}, {MIC2Y, uniprot}, {12E7, uniprot}, {E2 antigen, uniprot}, {Protein MIC2, uniprot}, {T-cell surface glycoprotein E2, uniprot}, {HBA71, NCBI_entrez}, {MIC2, NCBI_entrez}, {MIC2X, NCBI_entrez}, {MIC2Y, NCBI_entrez}, {MSK5X, NCBI_entrez}, {CD99 antigen, NCBI_entrez}, {E2 antigen, NCBI_entrez}, {MIC2 (monoclonal antibody 12E7), NCBI_entrez}, {T-cell surface glycoprotein E2, NCBI_entrez}, {antigen identified by monoclonal 12E7, Y homolog, NCBI_entrez}, {antigen identified by monoclonal antibodies 12E7, F21 and O13, NCBI_entrez}, {cell surface antigen 12E7, NCBI_entrez}, {cell surface antigen HBA-71, NCBI_entrez}, {cell surface antigen O13, NCBI_entrez}, {surface antigen MIC2, NCBI_entrez}, {CD99, NCBI_entrez}] \n symbolSynonyms | [{CD99, uniprot}, {MIC2, uniprot}, {MIC2X, uniprot}, {MIC2Y, uniprot}, {HBA71, NCBI_entrez}, {MIC2, NCBI_entrez}, {MIC2X, NCBI_entrez}, {MIC2Y, NCBI_entrez}, {MSK5X, NCBI_entrez}, {CD99, NCBI_entrez}] \n nameSynonyms | [{CD99 antigen, uniprot}, {12E7, uniprot}, {E2 antigen, uniprot}, {Protein MIC2, uniprot}, {T-cell surface glycoprotein E2, uniprot}, {CD99 antigen, NCBI_entrez}, {E2 antigen, NCBI_entrez}, {MIC2 (monoclonal antibody 12E7), NCBI_entrez}, {T-cell surface glycoprotein E2, NCBI_entrez}, {antigen identified by monoclonal 12E7, Y homolog, NCBI_entrez}, {antigen identified by monoclonal antibodies 12E7, F21 and O13, NCBI_entrez}, {cell surface antigen 12E7, NCBI_entrez}, {cell surface antigen HBA-71, NCBI_entrez}, {cell surface antigen O13, NCBI_entrez}, {surface antigen MIC2, NCBI_entrez}] \n functionDescriptions | [Involved in T-cell adhesion processes and in spontaneous rosette formation with erythrocytes. Plays a role in a late step of leukocyte extravasation helping leukocytes to overcome the endothelial basement membrane. Acts at the same site as, but independently of, PECAM1. Involved in T-cell adhesion processes (By similarity). .] \n subcellularLocations | [{Membrane, uniprot, SL-0162, Cellular component}, {Golgi apparatus, HPA_main, SL-0132, Golgi apparatus}] \n targetClass | null \n obsoleteSymbols | [{MIC2, HGNC}, {MIC2X, HGNC}, {MIC2Y, HGNC}] \n obsoleteNames | [{antigen identified by monoclonal antibodies 12E7, F21 and O13, HGNC}, {CD99 antigen, HGNC}, {CD99 molecule, HGNC}] \n constraint | [{syn, -0.16373, 42.64, 44, 1.0319, 0.81, 1.327, null, null, null}, {mis, -0.52942, 113.15, 129, 1.1401, 0.987, 1.32, null, null, null}, {lof, 0.0026127, 12.271, 6, 0.48897, 0.267, 0.965, 10262, 5, 3}] \n tep | null \n proteinIds | [{P14209, uniprot_swissprot}, {A0A096LP69, uniprot_trembl}, {A6NGF6, uniprot_trembl}, {A6NJT9, uniprot_trembl}, {A8MQT7, uniprot_trembl}, {A6NIW1, uniprot_obsolete}, {O00518, uniprot_obsolete}, {Q6ICV7, uniprot_obsolete}] \n dbXrefs | [{7082, HGNC}, {7SFX, PDB}, {R-HSA-198933, Reactome}, {R-HSA-202733, Reactome}, {IPR022078, InterPro}, {SignalP-noTM, signalP}] \n chemicalProbes | null \n homologues | [{9606, Human, within_species_paralog, ENSG00000102181, NULL, CD99L2, 30.8108, 21.7557, 0}, {9598, Chimpanzee, ortholog_one2one, ENSPTRG00000028324, 0, ENSPTRG00000028324, 75.1351, 69.5, 1}, {9544, Macaque, ortholog_one2many, ENSMMUG00000018687, 1, ENSMMUG00000018687, 90.2703, 70.7627, 2}, {9544, Macaque, ortholog_one2many, ENSMMUG00000055976, 0, ENSMMUG00000055976, 55.6757, 55.9783, 2}, {9615, Dog, ortholog_one2many, ENSCAFG00845022471, 0, ENSCAFG00845022471, 65.4054, 48.0159, 7}, {9615, Dog, ortholog_one2many, ENSCAFG00845025501, 0, ENSCAFG00845025501, 67.5676, 38.8199, 7}, {9823, Pig, ortholog_one2one, ENSSSCG00000055885, 0, ENSSSCG00000055885, 64.3243, 63.9785, 8}, {8364, Tropical clawed frog, ortholog_one2one, ENSXETG00000039100, 1, cd99, 40.0, 37.3737, 9}, {7955, Zebrafish, ortholog_one2one, ENSDARG00000051975, 0, cd99, 34.0541, 28.6364, 10}] \n tractability | [{SM, Approved Drug, false}, {SM, Advanced Clinical, false}, {SM, Phase 1 Clinical, false}, {SM, Structure with Ligand, false}, {SM, High-Quality Ligand, false}, {SM, High-Quality Pocket, false}, {SM, Med-Quality Pocket, false}, {SM, Druggable Family, false}, {AB, Approved Drug, false}, {AB, Advanced Clinical, false}, {AB, Phase 1 Clinical, false}, {AB, UniProt loc high conf, false}, {AB, GO CC high conf, true}, {AB, UniProt loc med conf, false}, {AB, UniProt SigP or TMHMM, true}, {AB, GO CC med conf, false}, {AB, Human Protein Atlas loc, false}, {PR, Approved Drug, false}, {PR, Advanced Clinical, false}, {PR, Phase 1 Clinical, false}, {PR, Literature, false}, {PR, UniProt Ubiquitination, false}, {PR, Database Ubiquitination, true}, {PR, Half-life Data, true}, {PR, Small Molecule Binder, false}, {OC, Approved Drug, false}, {OC, Advanced Clinical, false}, {OC, Phase 1 Clinical, false}] \n safetyLiabilities | null \n pathways | [{R-HSA-198933, Immunoregulatory interactions between a Lymphoid and a non-Lymphoid cell, Immune System}, {R-HSA-202733, Cell surface interactions at the vascular wall, Hemostasis}] \nonly showing top 1 row\n\nmolecules:\nroot\n |-- id: string (nullable = true)\n |-- canonicalSmiles: string (nullable = true)\n |-- inchiKey: string (nullable = true)\n |-- drugType: string (nullable = true)\n |-- blackBoxWarning: boolean (nullable = true)\n |-- name: string (nullable = true)\n |-- yearOfFirstApproval: long (nullable = true)\n |-- maximumClinicalTrialPhase: double (nullable = true)\n |-- parentId: string (nullable = true)\n |-- hasBeenWithdrawn: boolean (nullable = true)\n |-- isApproved: boolean (nullable = true)\n |-- tradeNames: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- synonyms: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- crossReferences: map (nullable = true)\n | |-- key: string\n | |-- value: array (valueContainsNull = true)\n | | |-- element: string (containsNull = true)\n |-- childChemblIds: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- linkedDiseases: struct (nullable = true)\n | |-- rows: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- count: integer (nullable = true)\n |-- linkedTargets: struct (nullable = true)\n | |-- rows: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- count: integer (nullable = true)\n |-- description: string (nullable = true)\n\n",
"name": "stdout"
},
{
"output_type": "stream",
"text": "-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n id | CHEMBL1173055 \n canonicalSmiles | CNCc1ccc(-c2[nH]c3cc(F)cc4c3c2CCNC4=O)cc1 \n inchiKey | HMABYWSNWIZPAG-UHFFFAOYSA-N \n drugType | Small molecule \n blackBoxWarning | false \n name | RUCAPARIB \n yearOfFirstApproval | 2016 \n maximumClinicalTrialPhase | 4.0 \n parentId | null \n hasBeenWithdrawn | false \n isApproved | true \n tradeNames | [] \n synonyms | [AG-014699, AG-14447, Rucaparib] \n crossReferences | {DailyMed -> [rucaparib+camsylate], PubChem -> [103905261, 137276017, 174006354], drugbank -> [DB12332]} \n childChemblIds | [CHEMBL2105733, CHEMBL3833368] \n linkedDiseases | {[EFO_0003060, EFO_0002517, MONDO_0002087, MONDO_0002158, MONDO_0004992, MONDO_0008170, EFO_0000616, MONDO_0007254, MONDO_0008315, MONDO_0008170, EFO_0000702, MONDO_0001056, MONDO_0003060, EFO_0000616, EFO_0000616, Orphanet_145, MONDO_0002974, EFO_0002618, EFO_1000613, EFO_0000673, EFO_0000564, MONDO_0011962, EFO_0001075, EFO_0000588], 24} \n linkedTargets | {[ENSG00000143799, ENSG00000041880, ENSG00000129484], 3} \n description | Small molecule drug with a maximum clinical trial phase of IV (across all indications) that was first approved in 2016 and has 4 approved and 19 investigational indications. \nonly showing top 1 row\n\nmoa:\nroot\n |-- actionType: string (nullable = true)\n |-- mechanismOfAction: string (nullable = true)\n |-- chemblIds: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- targetName: string (nullable = true)\n |-- targetType: string (nullable = true)\n |-- targets: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- references: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- source: string (nullable = true)\n | | |-- ids: array (nullable = true)\n | | | |-- element: string (containsNull = true)\n | | |-- urls: array (nullable = true)\n | | | |-- element: string (containsNull = true)\n\n-RECORD 0---------------------------------------------------------------------------------\n actionType | INHIBITOR \n mechanismOfAction | Inhibin beta A chain inhibitor \n chemblIds | [CHEMBL1743073] \n targetName | Inhibin beta A chain \n targetType | single protein \n targets | [ENSG00000122641] \n references | [{PubMed, [19049340], [http://europepmc.org/abstract/MED/19049340]}] \nonly showing top 1 row\n\ndisease:\nroot\n |-- id: string (nullable = true)\n |-- code: string (nullable = true)\n |-- dbXRefs: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- description: string (nullable = true)\n |-- name: string (nullable = true)\n |-- directLocationIds: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- obsoleteTerms: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- parents: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- synonyms: struct (nullable = true)\n | |-- hasBroadSynonym: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- hasExactSynonym: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- hasNarrowSynonym: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- hasRelatedSynonym: array (nullable = true)\n | | |-- element: string (containsNull = true)\n |-- ancestors: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- descendants: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- children: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- therapeuticAreas: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- indirectLocationIds: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- ontology: struct (nullable = true)\n | |-- isTherapeuticArea: boolean (nullable = true)\n | |-- leaf: boolean (nullable = true)\n | |-- sources: struct (nullable = true)\n | | |-- url: string (nullable = true)\n | | |-- name: string (nullable = true)\n\n",
"name": "stdout"
},
{
"output_type": "stream",
"text": "-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n id | EFO_0000255 \n code | http://www.ebi.ac.uk/efo/EFO_0000255 \n dbXRefs | [NCIT:C7528, ICDO:9767/1, MESH:D007119, MONDO:0004977, GARD:11973, UMLS:C0020981, Orphanet:86886, ONCOTREE:AITL, ICDO:9705/3, EFO:0000255, GARD:0011973, ICD9:202.70, ICD10CM:C86.5, ICD10:C86.5, DOID:0111147, MedDRA:10002449, SCTID:413537009] \n description | A mature T-cell non-Hodgkin lymphoma, characterized by systemic disease and a polymorphous infiltrate involving lymph nodes and extranodal sites. The clinical course is typically aggressive. \n name | angioimmunoblastic T-cell lymphoma \n directLocationIds | null \n obsoleteTerms | null \n parents | [MONDO_0000430] \n synonyms | {null, [angioimmunoblastic lymphadenopathy, angioimmunoblastic lymphadenopathy with Dysproteinemia, lymphogranulomatosis X, AILD, T-cell lymphoma, AILD type, angioimmunoblastic T-cell lymphoma, angioimmunoblastic lymphadenopathy type T-cell lymphoma, immunoblastic lymphadenopathy, AILT], null, [AITL]} \n ancestors | [MONDO_0000430, OTAR_0000018, MONDO_0002334, MONDO_0023370, MONDO_0044881, MONDO_0045024, Orphanet_322126, EFO_0002426, Orphanet_68336, EFO_0001642, EFO_0000574, MONDO_0015757, MONDO_0024615, EFO_0005952, EFO_0000508, MONDO_0019044, EFO_0005803, MONDO_0015760, EFO_0000616] \n descendants | [] \n children | [] \n therapeuticAreas | [OTAR_0000018, MONDO_0045024, EFO_0005803] \n indirectLocationIds | null \n ontology | {false, true, {http://www.ebi.ac.uk/efo/EFO_0000255, EFO_0000255}} \nonly showing top 1 row\n\nassociations:\nroot\n |-- diseaseId: string (nullable = true)\n |-- targetId: string (nullable = true)\n |-- score: double (nullable = true)\n |-- evidenceCount: long (nullable = true)\n\n-RECORD 0---------------------------\n diseaseId | EFO_0000574 \n targetId | ENSG00000000938 \n score | 0.0831554673040579 \n evidenceCount | 2 \nonly showing top 1 row\n\n",
"name": "stdout"
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2023-07-31T13:20:47.944835Z",
"end_time": "2023-07-31T13:20:56.050323Z"
},
"trusted": true
},
"cell_type": "code",
"source": "# Let's get disease data:\nresolved_children = (\n disease\n .select(\n f.col('id').alias('diseaseId'),\n f.col('name').alias('diseaseName'),\n f.explode_outer(f.col('children')).alias('childId')\n )\n .join(\n disease\n .select(\n f.col('id').alias('childId'),\n f.col('name').alias('childName')\n ),\n on='childId',\n how='left'\n )\n .groupBy('diseaseId', 'diseaseName')\n .agg(\n f.collect_set(\n f.struct(\n f.col('childName'),\n f.col('childId')\n )\n ).alias('children')\n )\n)\n\nresolved_children.show()\nresolved_children.printSchema()",
"execution_count": 20,
"outputs": [
{
"output_type": "stream",
"text": "[Stage 22:==================================================> (7 + 1) / 8]\r",
"name": "stderr"
},
{
"output_type": "stream",
"text": "+------------+--------------------+--------------------+\n| diseaseId| diseaseName| children|\n+------------+--------------------+--------------------+\n|DOID_0050890| synucleinopathy|[{Lewy body demen...|\n| DOID_10113| trypanosomiasis|[{Chagas disease,...|\n| DOID_10718| giardiasis| [{null, null}]|\n| DOID_1947| trichomoniasis|[{Trichomonas vag...|\n| DOID_7551| gonorrhea| [{null, null}]|\n| EFO_0000094|B-cell acute lymp...|[{B-cell acute ly...|\n| EFO_0000095|chronic lymphocyt...|[{hairy cell leuk...|\n| EFO_0000096|neoplasm of matur...|[{plasma cell neo...|\n| EFO_0000182|hepatocellular ca...|[{Liver Diffuse L...|\n| EFO_0000183| Hodgkins lymphoma|[{classic Hodgkin...|\n| EFO_0000196|metastatic prosta...| [{null, null}]|\n| EFO_0000197| mucinous carcinoma|[{Endometrial Muc...|\n| EFO_0000200|plasma cell neoplasm|[{Heavy Chain Dis...|\n| EFO_0000203|monoclonal gammop...|[{benign monoclon...|\n| EFO_0000216|acinar cell carci...|[{Parotid Gland A...|\n| EFO_0000217| gastritis|[{gastric mucosal...|\n| EFO_0000228| adenocarcinoma|[{gastroesophagea...|\n| EFO_0000232| adenoma|[{parathyroid ade...|\n| EFO_0000233|adenosquamous lun...| [{null, null}]|\n| EFO_0000239|adrenal gland phe...|[{Malignant Adren...|\n+------------+--------------------+--------------------+\nonly showing top 20 rows\n\nroot\n |-- diseaseId: string (nullable = true)\n |-- diseaseName: string (nullable = true)\n |-- children: array (nullable = false)\n | |-- element: struct (containsNull = false)\n | | |-- childName: string (nullable = true)\n | | |-- childId: string (nullable = true)\n\n",
"name": "stdout"
},
{
"output_type": "stream",
"text": "\r \r",
"name": "stderr"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.10.8",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"gist": {
"id": "",
"data": {
"description": "GCS/Optimizing_datamodel.ipynb",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment