Last active
October 17, 2023 13:32
-
-
Save DSuveges/1d18e00fadbf521beaae14c3900eff44 to your computer and use it in GitHub Desktop.
ChEMBL-ChEBI mapping.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T10:55:39.094774Z", | |
"end_time": "2023-10-17T10:55:42.109137Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "%%bash\n# Stop at the first failing command so a failed download is never uploaded.\nset -euo pipefail\n\n# UniChem whole-source mapping file between source 1 and source 7.\nMAPPING_FILE=src1src7.txt.gz\n\n# -O overwrites an earlier download. Without it wget stores the fresh copy as\n# 'src1src7.txt.gz.1' and the stale 'src1src7.txt.gz' is what gets uploaded.\nwget -O \"${MAPPING_FILE}\" \"https://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/wholeSourceMapping/src_id1/${MAPPING_FILE}\"\n\ngsutil cp \"${MAPPING_FILE}\" gs://ot-team/dsuveges/", | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "--2023-10-17 10:55:39-- https://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/wholeSourceMapping/src_id1/src1src7.txt.gz\nResolving ftp.ebi.ac.uk (ftp.ebi.ac.uk)... 193.62.193.165\nConnecting to ftp.ebi.ac.uk (ftp.ebi.ac.uk)|193.62.193.165|:443... connected.\nHTTP request sent, awaiting response... 200 OK\nLength: 258211 (252K) [application/x-gzip]\nSaving to: ‘src1src7.txt.gz.1’\n\n 0K .......... .......... .......... .......... .......... 19% 2.20M 0s\n 50K .......... .......... .......... .......... .......... 39% 4.38M 0s\n 100K .......... .......... .......... .......... .......... 59% 136M 0s\n 150K .......... .......... .......... .......... .......... 79% 4.40M 0s\n 200K .......... .......... .......... .......... .......... 99% 4.44M 0s\n 250K .. 100% 4.02T=0.06s\n\n2023-10-17 10:55:39 (4.41 MB/s) - ‘src1src7.txt.gz.1’ saved [258211/258211]\n\nCopying file://src1src7.txt.gz [Content-Type=text/plain]...\n/ [0 files][ 0.0 B/252.2 KiB] \r/ [1 files][252.2 KiB/252.2 KiB] \r\nOperation completed over 1 objects/252.2 KiB. \n", | |
"name": "stderr" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T10:59:47.528009Z", | |
"end_time": "2023-10-17T10:59:48.002900Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "from pyspark.sql import SparkSession, functions as f, types as t\n\nspark = SparkSession.builder.getOrCreate()\n\n# UniChem source mapping: tab-separated text with a header row.\nunichem_raw = spark.read.csv(\n    'gs://ot-team/dsuveges/src1src7.txt.gz', sep='\\t', header=True\n)\n\n# The source 1 column is kept as chemblId; the source 7 value gets a 'CHEBI_'\n# prefix and becomes drugId.\nchembl_id = f.col(\"From src:'1'\").alias('chemblId')\nchebi_drug_id = f.concat(f.lit('CHEBI_'), f.col(\"To src:'7'\")).alias('drugId')\n\nmapping = unichem_raw.select(chembl_id, chebi_drug_id).persist()\nmapping.show()", | |
"execution_count": 9, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "+-------------+------------+\n| chemblId| drugId|\n+-------------+------------+\n| CHEMBL506398| CHEBI_59016|\n|CHEMBL1382530|CHEBI_111182|\n| CHEMBL1387| CHEBI_34895|\n|CHEMBL4588490|CHEBI_144256|\n| CHEMBL230391| CHEBI_78278|\n|CHEMBL1610669|CHEBI_172330|\n|CHEMBL2357920|CHEBI_100683|\n| CHEMBL206452| CHEBI_23053|\n|CHEMBL2179724|CHEBI_146244|\n|CHEMBL1450327| CHEBI_34595|\n|CHEMBL3559807|CHEBI_130500|\n|CHEMBL4748482| CHEBI_37720|\n|CHEMBL1728353| CHEBI_92315|\n|CHEMBL3559794|CHEBI_110714|\n|CHEMBL1511287|CHEBI_105530|\n|CHEMBL1339502|CHEBI_114453|\n| CHEMBL86443|CHEBI_173991|\n| CHEMBL39221| CHEBI_75204|\n| CHEMBL39221| CHEBI_29474|\n|CHEMBL1988902|CHEBI_194885|\n+-------------+------------+\nonly showing top 20 rows\n\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "23/10/17 10:59:47 WARN CacheManager: Asked to cache already cached data.\n", | |
"name": "stderr" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T11:00:37.456755Z", | |
"end_time": "2023-10-17T11:00:39.135526Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Annotate the evidence with chemblId via drugId. The left join keeps evidence\n# rows that have no match in the mapping; chemblId is null on those rows.\npgkb_evidence = spark.read.json('gs://otar012-eva/pharmacogenomics/cttv012-2023-10-12_pgkb.json.gz')\nmapped_chembl = pgkb_evidence.join(mapping, on='drugId', how='left').persist()\n\n# One record, untruncated, in vertical layout.\nmapped_chembl.show(n=1, truncate=False, vertical=True)", | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "\r[Stage 20:> (0 + 1) / 1]\r\r \r", | |
"name": "stderr" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n drugId | CHEBI_66901 \n datasourceId | pharmgkb \n datasourceVersion | 2023-10-05 \n datatypeId | clinical_annotation \n drugFromSource | ivacaftor \n evidenceLevel | 1A \n genotype | AA \n genotypeAnnotationText | Patients with the rs75527207 AA genotype (two copies of the CFTR G551D variant) and cystic fibrosis may respond to ivacaftor treatment. FDA-approved drug labeling information and CPIC guidelines indicate use of ivacaftor in cystic fibrosis patients with at least one copy of a list of 33 CFTR genetic variants, including G551D. Other genetic and clinical factors may also influence response to ivacaftor. \n genotypeId | 7_117587806_G_A,A \n literature | [21083385, 22047557, 23590265, 23757361, 23891399, 24066763, 27745802, 27773592, 25682022, 28651844, 28711222, 25311995, 28611235, 26135562, 25171465, 25755212, 26568242, 25473543, 25145599, 23628510, 25049054, 22942289, 19846789, 22293084, 23757359, 23313410, 24461666, 27158673] \n pgxCategory | efficacy \n phenotypeFromSourceId | null \n phenotypeText | Cystic Fibrosis \n studyId | 981755803 \n targetFromSourceId | ENSG00000001626 \n variantFunctionalConsequenceId | SO_0001583 \n variantRsId | rs75527207 \n chemblId | CHEMBL2010601 \nonly showing top 1 row\n\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T11:01:37.318690Z", | |
"end_time": "2023-10-17T11:01:37.557935Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Rows of the left join that found no ChEMBL id.\nunmapped = mapped_chembl.filter(f.col(\"chemblId\").isNull())\n\nprint(f'All entries: {mapped_chembl.count()}')\nprint(f'Unmapped entries: {unmapped.count()}')\n# Rows without a drugId cannot match any mapping row, so they are reported\n# separately from drugIds that are merely absent from the mapping.\nprint(f'Unmapped entries without drugId: {unmapped.filter(f.col(\"drugId\").isNull()).count()}')", | |
"execution_count": 14, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "All entires: 24641\nUnmapped entires: 7643\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T11:02:06.605746Z", | |
"end_time": "2023-10-17T11:02:07.079703Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# How many distinct drugId values ended up without a ChEMBL id.\n# A null drugId, when present, counts as one distinct value.\nunmapped_by_unichem = mapped_chembl.where(f.col('chemblId').isNull())\nunmapped_by_unichem.select('drugId').dropDuplicates().count()", | |
"execution_count": 15, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 15, | |
"data": { | |
"text/plain": "52" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T11:02:13.712774Z", | |
"end_time": "2023-10-17T11:02:13.891787Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# List the distinct drugId values that have no ChEMBL id (first 20 rows).\nunmapped_drug_ids = (\n    mapped_chembl\n    .where(f.col('chemblId').isNull())\n    .select('drugId')\n    .dropDuplicates()\n)\nunmapped_drug_ids.show()", | |
"execution_count": 16, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "+------------+\n| drugId|\n+------------+\n| CHEBI_7596|\n| CHEBI_33234|\n| CHEBI_27899|\n| CHEBI_31899|\n| CHEBI_31859|\n| CHEBI_6807|\n|CHEBI_145221|\n| CHEBI_9648|\n| CHEBI_8656|\n| null|\n| CHEBI_87715|\n| CHEBI_48432|\n| CHEBI_2679|\n| CHEBI_31941|\n| CHEBI_9011|\n| CHEBI_37988|\n| CHEBI_3723|\n| CHEBI_6887|\n| CHEBI_91749|\n| CHEBI_22198|\n+------------+\nonly showing top 20 rows\n\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T11:05:09.017533Z", | |
"end_time": "2023-10-17T11:05:10.032703Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Evidence rows that have no drugId at all.\npgkb_evidence = spark.read.json('gs://otar012-eva/pharmacogenomics/cttv012-2023-10-12_pgkb.json.gz')\npgkb_evidence.where(f.col('drugId').isNull()).count()", | |
"execution_count": 17, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 17, | |
"data": { | |
"text/plain": "4190" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T11:14:03.944148Z", | |
"end_time": "2023-10-17T11:14:04.209041Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Number of chemblId values that occur on more than one mapping row.\nrows_per_chembl = mapping.groupBy('chemblId').agg(f.count('*').alias('count'))\nrows_per_chembl.where(f.col('count') > 1).count()\n", | |
"execution_count": 20, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 20, | |
"data": { | |
"text/plain": "369" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T11:22:10.661443Z", | |
"end_time": "2023-10-17T11:22:10.960496Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "%%bash\n# Fail the cell when any stage of the pipeline fails.\nset -euo pipefail\n\nCHEBI_ID=87715\n\n# --fail turns an HTTP error into a non-zero exit instead of feeding the error\n# body to jq; --silent --show-error drops the progress meter but keeps errors.\n# NOTE(review): this reads the first listed source and assumes it is the\n# ChEMBL one - confirm against the UniChem response before relying on it.\ncurl --silent --show-error --fail -X POST \"https://www.ebi.ac.uk/unichem/api/v1/compounds\" \\\n    -H \"accept: application/json\" \\\n    -H \"Content-Type: application/json\" \\\n    -d \"{ \\\"compound\\\": \\\"${CHEBI_ID}\\\", \\\"sourceID\\\": 7, \\\"type\\\": \\\"sourceID\\\"}\" |\n    jq '.compounds[0].sources[0].compoundId'", | |
"execution_count": 23, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "\"CHEMBL2010601\"\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": " % Total % Received % Xferd Average Speed Time Time Time Current\n Dload Upload Total Spent Left Speed\n\r 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\r100 6784 100 6727 100 57 23690 200 --:--:-- --:--:-- --:--:-- 23887\r100 6784 100 6727 100 57 23681 200 --:--:-- --:--:-- --:--:-- 23887\n", | |
"name": "stderr" | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## Explore drug index crossrefs" | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T13:15:02.058121Z", | |
"end_time": "2023-10-17T13:15:11.070959Z" | |
}, | |
"trusted": true, | |
"scrolled": false | |
}, | |
"cell_type": "code", | |
"source": "# Molecule parquet of the 23.09 pre-release output, persisted after loading.\nmolecule_path = 'gs://open-targets-pre-data-releases/23.09/output/etl/parquet/molecule/'\ndrugs = spark.read.parquet(molecule_path).persist()\n\n# First 20 rows, then the column layout.\ndrugs.show()\ndrugs.printSchema()", | |
"execution_count": 24, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "[Stage 56:> (0 + 1) / 1]\r", | |
"name": "stderr" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "+-------------+--------------------+--------------------+--------------+---------------+--------------------+-------------------+-------------------------+-------------+----------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+\n| id| canonicalSmiles| inchiKey| drugType|blackBoxWarning| name|yearOfFirstApproval|maximumClinicalTrialPhase| parentId|hasBeenWithdrawn|isApproved| tradeNames| synonyms| crossReferences| childChemblIds| linkedDiseases| linkedTargets| description|\n+-------------+--------------------+--------------------+--------------+---------------+--------------------+-------------------+-------------------------+-------------+----------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+\n|CHEMBL1086582|Cc1cc(CN2CCN(c3c(...|UUGWPYPNRZQDFO-UH...|Small molecule| false| CHEMBL1086582| null| null| null| false| null| []| []| null| null| null| null|Small molecule drug.|\n|CHEMBL1173055|CNCc1ccc(-c2[nH]c...|HMABYWSNWIZPAG-UH...|Small molecule| false| RUCAPARIB| 2016| 4.0| null| false| true| []|[AG-014699, AG-14...|{DailyMed -> [ruc...|[CHEMBL2105733, C...|{[EFO_0003060, EF...|{[ENSG00000143799...|Small molecule dr...|\n|CHEMBL1200910|CC(=O)N(c1onc(C)c...|JFNWFXVFBDDWCX-UH...|Small molecule| false|SULFISOXAZOLE ACETYL| 1953| 4.0| null| false| true|[Gantrisin, Gantr...|[Acetyl sulfafura...|{DailyMed -> [sul...| null| null| {[], 0}|Small molecule dr...|\n|CHEMBL1201248|COc1ccc(C[C@@H]2c...|YXSLJKQTIDHPOT-LJ...|Small molecule| false| CISATRACURIUM| 1995| 4.0| null| false| true| [Nimbex]|[Cisatracurium, C...|{DailyMed -> [cis...| [CHEMBL1200641]| {[EFO_1000637], 1}|{[ENSG00000138435...|Small molecule dr...|\n|CHEMBL1201468| null| null|Small molecule| true|ESTROGENS, ESTERI...| 1977| 4.0| null| false| true|[Amnestrogen, 
Est...|[Esterified estro...|{DailyMed -> [est...| null|{[EFO_1000096, EF...|{[ENSG00000140009...|Small molecule dr...|\n|CHEMBL1201718| null| null| Enzyme| false|HYALURONIDASE (HU...| 2005| 4.0| null| false| true|[Cumulase, Hylene...|[Chemophase, Enha...|{DailyMed -> [hya...| null|{[EFO_1000668, MO...| null|Enzyme drug with ...|\n|CHEMBL1201772|CC(=O)Oc1cc2c(s1)...|DTGLZDAWLRGWQN-UH...|Small molecule| true| PRASUGREL| 2009| 4.0| null| false| true| [Efient]|[LY-640315, NSC-7...|{DailyMed -> [pra...| [CHEMBL1201773]|{[EFO_0003777, EF...|{[ENSG00000169313...|Small molecule dr...|\n|CHEMBL1234354|COc1ccc(-c2cc3c(C...|XDLYKKIQACFMJG-WK...|Small molecule| false| PF-04691502| null| 2.0| null| false| false| []|[PF-04691502, PF-...|{drugbank -> [DB1...| null|{[EFO_0003869, MO...|{[ENSG00000145675...|Small molecule dr...|\n|CHEMBL1408759|Cc1ccc(C(C)C)cc2c...|FWKQNCXZGNBPFD-UH...|Small molecule| false| GUAIAZULEN| null| -1.0| null| false| false| []|[Guaiazulen, Guai...|{PubChem -> [1704...| null| {[EFO_0003966], 1}| null|Small molecule dr...|\n| CHEMBL1423|O=c1[nH]c2ccccc2n...|YVUQSNJEYSNKRX-UH...|Small molecule| false| PIMOZIDE| 1984| 4.0| null| false| true| [Orap, Pimozide]|[MCN-JR-6238, NSC...|{DailyMed -> [pim...| null|{[MONDO_0005090, ...|{[ENSG00000149295...|Small molecule dr...|\n|CHEMBL1476500|CC[C@H]1OC(=O)[C@...|RXZBMPWDPOLZGW-HI...|Small molecule| false| CHEMBL1476500| null| null| null| false| null| []| []|{PubChem -> [1442...| null| null| null|Small molecule drug.|\n| CHEMBL1515| Cn1ccnc1S|PMRYVIKBURPHAH-UH...|Small molecule| false| METHIMAZOLE| 1950| 4.0| null| false| true|[Favistan, Methim...|[Mercaptizole, Me...|{DailyMed -> [met...| null|{[HP_0000820, EFO...|{[ENSG00000115705...|Small molecule dr...|\n| CHEMBL15844| c1ccc2[nH]ccc2c1|SIKJAQJRHWYJAI-UH...|Small molecule| false| INDOLE| null| null| null| false| null| []| [Indole]|{PubChem -> [1442...| null| null| null|Small molecule drug.|\n|CHEMBL1591365|COc1cccc(N(C)C(=S...|VPHPQNGOVQYUMG-UH...|Small molecule| 
false| LIRANAFTATE| null| -1.0| null| false| false| []|[Liranaftate, Pir...|{PubChem -> [1442...| null| null| {[], 0}|Small molecule drug.|\n| CHEMBL16081| CC(N)=O|DLFVBJFMPXGRIB-UH...|Small molecule| false| ACETAMIDE| null| null| null| false| null| []| [Acetamide]|{PubChem -> [1111...| null| null| null|Small molecule drug.|\n| CHEMBL1622|Nc1nc(=O)c2nc(CNc...|OVBPIULPVIDEAO-LB...|Small molecule| false| FOLIC ACID| null| 4.0| null| false| true|[Bio science, Fol...|[Acidum folicum, ...|{DailyMed -> [fol...| [CHEMBL2107429]|{[EFO_0003060, EF...| null|Small molecule dr...|\n|CHEMBL1683544|C=C1C[C@@H]2CC[C@...|QAMYWGZHLCQOOJ-WR...|Small molecule| false| ERIBULIN MESYLATE| 2010| 4.0|CHEMBL1683590| false| true| [Halaven]|[E-7389, E7389, E...|{DailyMed -> [eri...| null|{[EFO_0003060, EF...|{[ENSG00000188229...|Small molecule dr...|\n| CHEMBL1729|COc1cc(N)c(Cl)cc1...|DCSUBABJRXZOMT-UH...|Small molecule| true| CISAPRIDE| 1993| 4.0| null| true| true|[Alimix, Prepulsi...|[Cisapride, Cisap...|{PubChem -> [1442...| [CHEMBL1200788]| {[EFO_0010282], 1}|{[ENSG00000164270...|Small molecule dr...|\n|CHEMBL1743065| null| null| Antibody| false| ROLEDUMAB| null| 2.0| null| false| false| []| [Roledumab]| null| null| null|{[ENSG00000187010...|Antibody drug wit...|\n|CHEMBL1789941|N#CC[C@H](C1CCCC1...|HFNKQEVNSGCOJV-OA...|Small molecule| true| RUXOLITINIB| 2011| 4.0| null| false| true| [Jakavi]|[INC-424, INC424,...|{DailyMed -> [rux...|[CHEMBL1795071, C...|{[EFO_0003086, EF...|{[ENSG00000162434...|Small molecule dr...|\n+-------------+--------------------+--------------------+--------------+---------------+--------------------+-------------------+-------------------------+-------------+----------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+\nonly showing top 20 rows\n\nroot\n |-- id: string (nullable = true)\n |-- canonicalSmiles: string (nullable = true)\n |-- inchiKey: 
string (nullable = true)\n |-- drugType: string (nullable = true)\n |-- blackBoxWarning: boolean (nullable = true)\n |-- name: string (nullable = true)\n |-- yearOfFirstApproval: long (nullable = true)\n |-- maximumClinicalTrialPhase: double (nullable = true)\n |-- parentId: string (nullable = true)\n |-- hasBeenWithdrawn: boolean (nullable = true)\n |-- isApproved: boolean (nullable = true)\n |-- tradeNames: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- synonyms: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- crossReferences: map (nullable = true)\n | |-- key: string\n | |-- value: array (valueContainsNull = true)\n | | |-- element: string (containsNull = true)\n |-- childChemblIds: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- linkedDiseases: struct (nullable = true)\n | |-- rows: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- count: integer (nullable = true)\n |-- linkedTargets: struct (nullable = true)\n | |-- rows: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- count: integer (nullable = true)\n |-- description: string (nullable = true)\n\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "\r \r", | |
"name": "stderr" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T13:18:20.307928Z", | |
"end_time": "2023-10-17T13:18:31.672192Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# One row per (drug, cross-reference entry); the map explodes into key/value.\ncrossref_entries = drugs.select('id', f.explode_outer('crossReferences'))\nchebi_entries = crossref_entries.where(f.col('key') == 'chEBI')\n\n# Show drugs whose 'chEBI' entry lists more than one identifier.\n(\n    chebi_entries\n    .withColumn('crossrefCount', f.size('value'))\n    .where(f.col('crossrefCount') > 1)\n    .show()\n)", | |
"execution_count": 27, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "[Stage 61:===================> (1 + 2) / 3]\r", | |
"name": "stderr" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "+---+---+-----+-------------+\n| id|key|value|crossrefCount|\n+---+---+-----+-------------+\n+---+---+-----+-------------+\n\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "\r[Stage 61:======================================> (2 + 1) / 3]\r\r \r", | |
"name": "stderr" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T13:27:57.926782Z", | |
"end_time": "2023-10-17T13:27:58.007444Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# (id, chebiId) pairs taken from the 'chEBI' entry of crossReferences:\n# explode the map first, then explode the identifier array of that entry.\ncrossref_entries = drugs.select('id', f.explode_outer('crossReferences'))\nchebi_ids = (\n    crossref_entries\n    .where(f.col('key') == 'chEBI')\n    .select('id', f.explode_outer('value').alias('chebiId'))\n)\n\n# Prefix the identifier with 'CHEBI_' to build drugId.\nmapping_from_drugs = chebi_ids.select(\n    'id',\n    f.concat(f.lit('CHEBI_'), f.col('chebiId')).alias('drugId'),\n).persist()\n\nmapping_from_drugs.show()", | |
"execution_count": 33, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "+-------------+-----------+\n| id| drugId|\n+-------------+-----------+\n|CHEMBL1201772|CHEBI_87723|\n|CHEMBL1408759| CHEBI_5550|\n| CHEMBL1423| CHEBI_8212|\n|CHEMBL1476500|CHEBI_48844|\n| CHEMBL1515|CHEBI_50673|\n| CHEMBL15844|CHEBI_16881|\n| CHEMBL16081|CHEBI_27856|\n| CHEMBL1622|CHEBI_27470|\n|CHEMBL1683544|CHEBI_70710|\n|CHEMBL1789941|CHEBI_66919|\n|CHEMBL1908324|CHEBI_52060|\n|CHEMBL1972860|CHEBI_79369|\n|CHEMBL2107333|CHEBI_76004|\n| CHEMBL265502|CHEBI_45906|\n| CHEMBL280998|CHEBI_18135|\n| CHEMBL292303|CHEBI_16296|\n| CHEMBL316966|CHEBI_17489|\n| CHEMBL321357|CHEBI_48950|\n| CHEMBL444814|CHEBI_16349|\n| CHEMBL458049|CHEBI_44235|\n+-------------+-----------+\nonly showing top 20 rows\n\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "23/10/17 13:27:57 WARN CacheManager: Asked to cache already cached data.\n", | |
"name": "stderr" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T13:28:09.362841Z", | |
"end_time": "2023-10-17T13:28:10.843615Z" | |
}, | |
"trusted": true, | |
"scrolled": false | |
}, | |
"cell_type": "code", | |
"source": "# Annotate the evidence through mapping_from_drugs; the matched identifier\n# arrives in the id column and is null where the left join found no match.\npgkb_evidence = spark.read.json('gs://otar012-eva/pharmacogenomics/cttv012-2023-10-12_pgkb.json.gz')\nmapped_chembl = pgkb_evidence.join(mapping_from_drugs, on='drugId', how='left').persist()\n\n# One record, untruncated, in vertical layout.\nmapped_chembl.show(n=1, truncate=False, vertical=True)", | |
"execution_count": 34, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n drugId | CHEBI_66901 \n datasourceId | pharmgkb \n datasourceVersion | 2023-10-05 \n datatypeId | clinical_annotation \n drugFromSource | ivacaftor \n evidenceLevel | 1A \n genotype | AA \n genotypeAnnotationText | Patients with the rs75527207 AA genotype (two copies of the CFTR G551D variant) and cystic fibrosis may respond to ivacaftor treatment. FDA-approved drug labeling information and CPIC guidelines indicate use of ivacaftor in cystic fibrosis patients with at least one copy of a list of 33 CFTR genetic variants, including G551D. Other genetic and clinical factors may also influence response to ivacaftor. \n genotypeId | 7_117587806_G_A,A \n literature | [21083385, 22047557, 23590265, 23757361, 23891399, 24066763, 27745802, 27773592, 25682022, 28651844, 28711222, 25311995, 28611235, 26135562, 25171465, 25755212, 26568242, 25473543, 25145599, 23628510, 25049054, 22942289, 19846789, 22293084, 23757359, 23313410, 24461666, 27158673] \n pgxCategory | efficacy \n phenotypeFromSourceId | null \n phenotypeText | Cystic Fibrosis \n studyId | 981755803 \n targetFromSourceId | ENSG00000001626 \n variantFunctionalConsequenceId | SO_0001583 \n variantRsId | rs75527207 \n id | CHEMBL2010601 \nonly showing top 1 row\n\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T13:29:01.133518Z", | |
"end_time": "2023-10-17T13:29:01.244384Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Evidence rows whose id column is null, i.e. the join found no match.\nunmapped_by_drug_index = mapped_chembl.where(f.col('id').isNull())\nunmapped_by_drug_index.count()", | |
"execution_count": 35, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 35, | |
"data": { | |
"text/plain": "7346" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T13:30:03.311693Z", | |
"end_time": "2023-10-17T13:30:03.520482Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Distinct drugId values on rows with a null id (a null drugId counts as one).\nunmapped_ids_by_drug_index = (\n    mapped_chembl\n    .where(f.col('id').isNull())\n    .select('drugId')\n    .dropDuplicates()\n)\nunmapped_ids_by_drug_index.count()", | |
"execution_count": 36, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 36, | |
"data": { | |
"text/plain": "91" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T13:30:32.974585Z", | |
"end_time": "2023-10-17T13:30:33.023141Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Peek at the first 20 rows of the mapping DataFrame.\nmapping.show(n=20)", | |
"execution_count": 37, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "+-------------+------------+\n| chemblId| drugId|\n+-------------+------------+\n| CHEMBL506398| CHEBI_59016|\n|CHEMBL1382530|CHEBI_111182|\n| CHEMBL1387| CHEBI_34895|\n|CHEMBL4588490|CHEBI_144256|\n| CHEMBL230391| CHEBI_78278|\n|CHEMBL1610669|CHEBI_172330|\n|CHEMBL2357920|CHEBI_100683|\n| CHEMBL206452| CHEBI_23053|\n|CHEMBL2179724|CHEBI_146244|\n|CHEMBL1450327| CHEBI_34595|\n|CHEMBL3559807|CHEBI_130500|\n|CHEMBL4748482| CHEBI_37720|\n|CHEMBL1728353| CHEBI_92315|\n|CHEMBL3559794|CHEBI_110714|\n|CHEMBL1511287|CHEBI_105530|\n|CHEMBL1339502|CHEBI_114453|\n| CHEMBL86443|CHEBI_173991|\n| CHEMBL39221| CHEBI_75204|\n| CHEMBL39221| CHEBI_29474|\n|CHEMBL1988902|CHEBI_194885|\n+-------------+------------+\nonly showing top 20 rows\n\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T13:31:28.572096Z", | |
"end_time": "2023-10-17T13:31:28.637327Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Bring mapping to the (id, drugId) layout of mapping_from_drugs, stack the\n# two by column name and drop repeated pairs.\nunichem_pairs = mapping.select(f.col('chemblId').alias('id'), 'drugId')\nmerged_mapping = mapping_from_drugs.unionByName(unichem_pairs).dropDuplicates().persist()", | |
"execution_count": 38, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T13:32:21.519157Z", | |
"end_time": "2023-10-17T13:32:24.021169Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Annotate the evidence through merged_mapping; id is null where the left\n# join found no match.\npgkb_evidence = spark.read.json('gs://otar012-eva/pharmacogenomics/cttv012-2023-10-12_pgkb.json.gz')\nmapped_chembl = pgkb_evidence.join(merged_mapping, on='drugId', how='left').persist()\n\nunmapped_merged = mapped_chembl.where(f.col('id').isNull())\n\n# First the number of unmatched rows, then the number of distinct drugId\n# values among them.\nprint(unmapped_merged.count())\nprint(unmapped_merged.select('drugId').dropDuplicates().count())", | |
"execution_count": 40, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "\r[Stage 92:> (0 + 1) / 1]\r\r \r", | |
"name": "stderr" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "6462\n45\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3", | |
"language": "python" | |
}, | |
"language_info": { | |
"name": "python", | |
"version": "3.10.8", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
}, | |
"gist": { | |
"id": "1d18e00fadbf521beaae14c3900eff44", | |
"data": { | |
"description": "ChEMBL-ChEBI mapping.ipynb", | |
"public": true | |
} | |
}, | |
"_draft": { | |
"nbviewer_url": "https://gist.github.com/DSuveges/1d18e00fadbf521beaae14c3900eff44" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment