Last active
October 17, 2023 14:55
-
-
Save DSuveges/fcb041fe0a0d64217338201503fc3863 to your computer and use it in GitHub Desktop.
EuroPMC cooccurrence export.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T14:49:10.103370Z", | |
"end_time": "2023-10-17T14:49:14.570132Z" | |
}, | |
"trusted": true | |
}, | |
"id": "106875a7", | |
"cell_type": "code", | |
"source": "from pyspark.sql import SparkSession, functions as f, types as t\nfrom pyspark.sql import Column, DataFrame\n\nspark = SparkSession.builder.getOrCreate()\n\ncooccurrences = spark.read.parquet('gs://open-targets-pre-data-releases/23.09/output/etl/parquet/literature/cooccurrences/part-00027-8a645131-7912-4f17-addd-ea629e13989b-c000.snappy.parquet')\ncooccurrences.show(1, False, True)", | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "23/10/17 14:49:11 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n[Stage 4:============================================> (3 + 1) / 4]\r", | |
"name": "stderr" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "-RECORD 0---------------------------------------------------------------------------------------------------------------------\n pmid | 33182051 \n pmcid | null \n pubDate | 2020-09-22 \n date | 2020-09-22 \n year | 2020 \n month | 9 \n day | 22 \n organisms | [mice] \n section | abstract \n text | This paper revealed the mechanism of ganoderic acid B (BB) on lipopolysaccharide-induced pneumonia in mice. \n trace_source | \n end1 | 98 \n end2 | 53 \n evidence_score | 2.0 \n label1 | pneumonia \n labelN1 | pneumonia \n keywordId1 | EFO_0003106 \n label2 | ganoderic acid B \n labelN2 | ganodericacidb \n keywordId2 | CHEMBL1087180 \n start1 | 89 \n start2 | 37 \n type | DS-CD \n type1 | DS \n type2 | CD \n isMapped | true \nonly showing top 1 row\n\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "\r \r", | |
"name": "stderr" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T14:49:19.021139Z", | |
"end_time": "2023-10-17T14:49:25.476294Z" | |
}, | |
"trusted": true | |
}, | |
"id": "03bffb14", | |
"cell_type": "code", | |
"source": "def generate_uri(keyword_id: Column) -> Column:\n return f.when(\n # Generating URL for gene:\n keyword_id.startswith('ENSG'),\n f.concat(f.lit('https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g='), keyword_id)\n ).when(\n # Generating URL for molecule:\n keyword_id.startswith('CHEMBL'),\n f.concat(f.lit('https://www.ebi.ac.uk/chembl/compound_report_card/'), keyword_id)\n ).otherwise(\n # Generating URL for disease:\n f.concat(f.lit('https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form='), keyword_id)\n )\n\ndef map_cooccurrence_type(c_type: Column) -> Column:\n return f.when(\n c_type == 'DS-CD',\n f.lit('Disease Drug Relationship')\n ).when(\n c_type == 'GP-CD',\n f.lit('Gene Drug Relationship')\n ).when(\n c_type == 'GP-DS',\n f.lit('Gene Disease Relationship')\n )\n\nepmc_cooccurrence = (\n cooccurrences\n .limit(1000)\n .select(\n # Selecting literature source:\n f.when(\n f.col('pmcid').isNotNull(), \n f.lit('PMC')\n ).otherwise(f.lit('MED')).alias('src'), \n # Selecting literature id:\n f.when(\n f.col('pmcid').isNotNull(), \n f.col('pmcid')\n ).otherwise(f.col('pmid')).alias('id'),\n\n # Extracting cooccurrence fields:\n map_cooccurrence_type(f.col('type')).alias('type'),\n f.col('text').alias('exact'), # Sentence\n f.col('section').alias('section'),\n f.array(\n f.struct(\n f.col('label1').alias('name'),\n generate_uri(f.col('keywordId1')).alias('uri')\n ),\n f.struct(\n f.col('label2').alias('name'),\n generate_uri(f.col('keywordId2')).alias('uri')\n ),\n ).alias('tags')\n )\n .groupBy('src', 'id')\n .agg(\n f.collect_set(\n f.struct(\n f.col('type'),\n f.col('exact'),\n f.col('section'),\n f.col('tags')\n )\n ).alias('anns')\n )\n # Adding provider:\n .withColumn('provider', f.lit('OpenTargets'))\n .repartition(1)\n .persist()\n)\n\nepmc_cooccurrence.show(1, False, True)", | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "[Stage 6:> (0 + 1) / 1]\r", | |
"name": "stderr" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n src | MED \n id | 10065878 \n anns | [{Gene Drug Relationship, Central administration (icv) of GLP-1 (7-36) amide produces a marked reduction in food and water intake, and the colocalization of the GLP-1 receptor, GLUT-2, and glucokinase mRNAs in hypothalamic neurons involved in glucose sensing suggests that these cells may be involved in the transduction of signals needed to produce a state of fullness., abstract, [{GLUT-2, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000163581}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}]}, {Gene Drug Relationship, Central administration (icv) of GLP-1 (7-36) amide produces a marked reduction in food and water intake, and the colocalization of the GLP-1 receptor, GLUT-2, and glucokinase mRNAs in hypothalamic neurons involved in glucose sensing suggests that these cells may be involved in the transduction of signals needed to produce a state of fullness., abstract, [{GLP-1, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000112164}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}]}, {Gene Drug Relationship, Central administration (icv) of GLP-1 (7-36) amide produces a marked reduction in food and water intake, and the colocalization of the GLP-1 
receptor, GLUT-2, and glucokinase mRNAs in hypothalamic neurons involved in glucose sensing suggests that these cells may be involved in the transduction of signals needed to produce a state of fullness., abstract, [{GLP-1 receptor, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000112164}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}]}, {Gene Drug Relationship, Central administration (icv) of GLP-1 (7-36) amide produces a marked reduction in food and water intake, and the colocalization of the GLP-1 receptor, GLUT-2, and glucokinase mRNAs in hypothalamic neurons involved in glucose sensing suggests that these cells may be involved in the transduction of signals needed to produce a state of fullness., abstract, [{glucokinase, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000106633}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}]}, {Gene Drug Relationship, Although earlier studies indicated that GLP-1 (7-36) amide was an intestinal peptide with a potent effect on glucose-dependent insulin secretion, later on it was found that several biological effects of this peptide occur in the brain, rather than in peripheral tissues., abstract, [{GLP-1, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000112164}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}]}] \n provider | OpenTargets \nonly showing top 1 row\n\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "\r \r", | |
"name": "stderr" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-09-28T15:49:41.729918Z", | |
"start_time": "2023-09-28T15:49:41.587668Z" | |
}, | |
"trusted": false | |
}, | |
"id": "7181d0f8", | |
"cell_type": "code", | |
"source": "epmc_cooccurrence.groupBy('src').count().show()", | |
"execution_count": 24, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": "+---+------+\n|src| count|\n+---+------+\n|MED|161570|\n|PMC|292600|\n+---+------+\n\n" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T14:49:33.400263Z", | |
"end_time": "2023-10-17T14:49:35.174649Z" | |
}, | |
"trusted": true | |
}, | |
"id": "bff19572", | |
"cell_type": "code", | |
"source": "(\n epmc_cooccurrence\n .write.mode('overwrite')\n .json('gs://ot-team/dsuveges/epmc_cooccurrences', compression=\"gzip\")\n)", | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "23/10/17 14:49:34 WARN GhfsStorageStatistics: Detected potential high latency for operation op_delete. latencyMs=157; previousMaxLatencyMs=0; operationCount=1; context=gs://ot-team/dsuveges/epmc_cooccurrences/_temporary\n23/10/17 14:49:35 WARN GhfsStorageStatistics: Detected potential high latency for operation stream_write_close_operations. latencyMs=139; previousMaxLatencyMs=0; operationCount=1; context=gs://ot-team/dsuveges/epmc_cooccurrences/_SUCCESS\n", | |
"name": "stderr" | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## Ricardo's first implementation - QC\n\n- The schema looks good. \n- Everything makes sense: the data and its shape are as expected. " | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T14:36:51.224824Z", | |
"end_time": "2023-10-17T14:41:40.040811Z" | |
}, | |
"trusted": true, | |
"scrolled": false | |
}, | |
"cell_type": "code", | |
"source": "df = spark.read.json('gs://open-targets-pre-data-releases/ricardo/localrun/epmc/cooccurrences')\n\ndf.printSchema()", | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "[Stage 1:> (0 + 1) / 1]\r", | |
"name": "stderr" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "root\n |-- anns: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- exact: string (nullable = true)\n | | |-- section: string (nullable = true)\n | | |-- tags: array (nullable = true)\n | | | |-- element: struct (containsNull = true)\n | | | | |-- name: string (nullable = true)\n | | | | |-- uri: string (nullable = true)\n | | |-- type: string (nullable = true)\n |-- id: string (nullable = true)\n |-- provider: string (nullable = true)\n |-- src: string (nullable = true)\n\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "\r \r", | |
"name": "stderr" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T14:50:49.404848Z", | |
"end_time": "2023-10-17T14:50:49.623564Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "\ndf.show(1, False, True)", | |
"execution_count": 11, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n anns | [{The constitutive interaction between Gads and SLP-76 was mediated by the carboxy-terminal SH3 domain of Gads and a 20 amino-acid proline-rich region in SLP-76., abstract, [{Gads, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000100351}, {amino-acid, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1201498}], Gene Drug Relationship}, {The constitutive interaction between Gads and SLP-76 was mediated by the carboxy-terminal SH3 domain of Gads and a 20 amino-acid proline-rich region in SLP-76., abstract, [{SLP-76, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000043462}, {proline, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL54922}], Gene Drug Relationship}, {The constitutive interaction between Gads and SLP-76 was mediated by the carboxy-terminal SH3 domain of Gads and a 20 amino-acid proline-rich region in SLP-76., abstract, [{Gads, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000128683}, {amino-acid, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1201498}], Gene Drug Relationship}, {The constitutive interaction between Gads and SLP-76 was mediated by the carboxy-terminal SH3 domain of Gads and a 20 amino-acid proline-rich region in SLP-76., abstract, [{SLP-76, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000043462}, {amino-acid, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1201498}], Gene Drug Relationship}, {The constitutive interaction between Gads and SLP-76 was mediated by the carboxy-terminal SH3 domain of Gads and a 20 amino-acid proline-rich region in SLP-76., abstract, [{Gads, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000128683}, {proline, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL54922}], Gene Drug 
Relationship}, {The constitutive interaction between Gads and SLP-76 was mediated by the carboxy-terminal SH3 domain of Gads and a 20 amino-acid proline-rich region in SLP-76., abstract, [{Gads, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000100351}, {proline, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL54922}], Gene Drug Relationship}] \n id | 10021361 \n provider | OpenTargets \n src | MED \nonly showing top 1 row\n\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2023-10-17T14:50:14.542710Z", | |
"end_time": "2023-10-17T14:50:15.239869Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "(\n spark.read.json('gs://ot-team/dsuveges/epmc_cooccurrences')\n .show(1, False, True)\n)", | |
"execution_count": 10, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n anns | [{Central administration (icv) of GLP-1 (7-36) amide produces a marked reduction in food and water intake, and the colocalization of the GLP-1 receptor, GLUT-2, and glucokinase mRNAs in hypothalamic neurons involved in glucose sensing suggests that these cells may be involved in the transduction of signals needed to produce a state of fullness., abstract, [{GLUT-2, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000163581}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}], Gene Drug Relationship}, {Central administration (icv) of GLP-1 (7-36) amide produces a marked reduction in food and water intake, and the colocalization of the GLP-1 receptor, GLUT-2, and glucokinase mRNAs in hypothalamic neurons involved in glucose sensing suggests that these cells may be involved in the transduction of signals needed to produce a state of fullness., abstract, [{GLP-1, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000112164}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}], Gene Drug Relationship}, {Central administration (icv) of GLP-1 (7-36) amide produces a marked reduction in food and water intake, and the colocalization of the GLP-1 receptor, GLUT-2, and glucokinase mRNAs in hypothalamic 
neurons involved in glucose sensing suggests that these cells may be involved in the transduction of signals needed to produce a state of fullness., abstract, [{GLP-1 receptor, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000112164}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}], Gene Drug Relationship}, {Central administration (icv) of GLP-1 (7-36) amide produces a marked reduction in food and water intake, and the colocalization of the GLP-1 receptor, GLUT-2, and glucokinase mRNAs in hypothalamic neurons involved in glucose sensing suggests that these cells may be involved in the transduction of signals needed to produce a state of fullness., abstract, [{glucokinase, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000106633}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}], Gene Drug Relationship}, {Although earlier studies indicated that GLP-1 (7-36) amide was an intestinal peptide with a potent effect on glucose-dependent insulin secretion, later on it was found that several biological effects of this peptide occur in the brain, rather than in peripheral tissues., abstract, [{GLP-1, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000112164}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}], Gene Drug Relationship}] \n id | 10065878 \n provider | OpenTargets \n src | MED \nonly showing top 1 row\n\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"_draft": { | |
"nbviewer_url": "https://gist.github.com/DSuveges/fcb041fe0a0d64217338201503fc3863" | |
}, | |
"gist": { | |
"id": "fcb041fe0a0d64217338201503fc3863", | |
"data": { | |
"description": "EuroPMC cooccurrence export.ipynb", | |
"public": false | |
} | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3", | |
"language": "python" | |
}, | |
"language_info": { | |
"name": "python", | |
"version": "3.10.8", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment