Skip to content

Instantly share code, notes, and snippets.

@DSuveges
Last active October 17, 2023 14:55
Show Gist options
  • Save DSuveges/fcb041fe0a0d64217338201503fc3863 to your computer and use it in GitHub Desktop.
Save DSuveges/fcb041fe0a0d64217338201503fc3863 to your computer and use it in GitHub Desktop.
EuroPMC cooccurrence export.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"ExecuteTime": {
"start_time": "2023-10-17T14:49:10.103370Z",
"end_time": "2023-10-17T14:49:14.570132Z"
},
"trusted": true
},
"id": "106875a7",
"cell_type": "code",
"source": "from pyspark.sql import SparkSession, functions as f, types as t\nfrom pyspark.sql import Column, DataFrame\n\nspark = SparkSession.builder.getOrCreate()\n\ncooccurrences = spark.read.parquet('gs://open-targets-pre-data-releases/23.09/output/etl/parquet/literature/cooccurrences/part-00027-8a645131-7912-4f17-addd-ea629e13989b-c000.snappy.parquet')\ncooccurrences.show(1, False, True)",
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"text": "23/10/17 14:49:11 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n[Stage 4:============================================> (3 + 1) / 4]\r",
"name": "stderr"
},
{
"output_type": "stream",
"text": "-RECORD 0---------------------------------------------------------------------------------------------------------------------\n pmid | 33182051 \n pmcid | null \n pubDate | 2020-09-22 \n date | 2020-09-22 \n year | 2020 \n month | 9 \n day | 22 \n organisms | [mice] \n section | abstract \n text | This paper revealed the mechanism of ganoderic acid B (BB) on lipopolysaccharide-induced pneumonia in mice. \n trace_source | \n end1 | 98 \n end2 | 53 \n evidence_score | 2.0 \n label1 | pneumonia \n labelN1 | pneumonia \n keywordId1 | EFO_0003106 \n label2 | ganoderic acid B \n labelN2 | ganodericacidb \n keywordId2 | CHEMBL1087180 \n start1 | 89 \n start2 | 37 \n type | DS-CD \n type1 | DS \n type2 | CD \n isMapped | true \nonly showing top 1 row\n\n",
"name": "stdout"
},
{
"output_type": "stream",
"text": "\r \r",
"name": "stderr"
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2023-10-17T14:49:19.021139Z",
"end_time": "2023-10-17T14:49:25.476294Z"
},
"trusted": true
},
"id": "03bffb14",
"cell_type": "code",
"source": "def generate_uri(keyword_id: Column) -> Column:\n return f.when(\n # Genertain URL for gene:\n keyword_id.startswith('ENSG'),\n f.concat(f.lit('https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g='), keyword_id)\n ).when(\n # Generating URL for molecule:\n keyword_id.startswith('CHEMBL'),\n f.concat(f.lit('https://www.ebi.ac.uk/chembl/compound_report_card/'), keyword_id)\n ).otherwise(\n # Genertating URL for disease:\n f.concat(f.lit('https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form='), keyword_id)\n )\n\ndef map_cooccurrence_type(c_type: Column) -> Column:\n return f.when(\n c_type == 'DS-CD',\n f.lit('Disease Drug Relationship')\n ).when(\n c_type == 'GP-CD',\n f.lit('Gene Drug Relationship')\n ).when(\n c_type == 'GP-DS',\n f.lit('Gene Disease Relationship')\n )\n\nepmc_cooccurrence = (\n cooccurrences\n .limit(1000)\n .select(\n # Selecting literature source:\n f.when(\n f.col('pmcid').isNotNull(), \n f.lit('PMC')\n ).otherwise(f.lit('MED')).alias('src'), \n # Selecting literature id:\n f.when(\n f.col('pmcid').isNotNull(), \n f.col('pmcid')\n ).otherwise(f.col('pmid')).alias('id'),\n\n # Extracting cooccurrence fields:\n map_cooccurrence_type(f.col('type')).alias('type'),\n f.col('text').alias('exact'), # Sentence\n f.col('section').alias('section'),\n f.array(\n f.struct(\n f.col('label1').alias('name'),\n generate_uri(f.col('keywordId1')).alias('uri')\n ),\n f.struct(\n f.col('label2').alias('name'),\n generate_uri(f.col('keywordId2')).alias('uri')\n ),\n ).alias('tags')\n )\n .groupBy('src', 'id')\n .agg(\n f.collect_set(\n f.struct(\n f.col('type'),\n f.col('exact'),\n f.col('section'),\n f.col('tags')\n )\n ).alias('anns')\n )\n # Adding provider:\n .withColumn('provider', f.lit('OpenTargets'))\n .repartition(1)\n .persist()\n)\n\nepmc_cooccurrence.show(1, False, True)",
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"text": "[Stage 6:> (0 + 1) / 1]\r",
"name": "stderr"
},
{
"output_type": "stream",
"text": "-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n src | MED \n id | 10065878 \n anns | [{Gene Drug Relationship, Central administration (icv) of GLP-1 (7-36) amide produces a marked reduction in food and water intake, and the colocalization of the GLP-1 receptor, GLUT-2, and glucokinase mRNAs in hypothalamic neurons involved in glucose sensing suggests that these cells may be involved in the transduction of signals needed to produce a state of fullness., abstract, [{GLUT-2, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000163581}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}]}, {Gene Drug Relationship, Central administration (icv) of GLP-1 (7-36) amide produces a marked reduction in food and water intake, and the colocalization of the GLP-1 receptor, GLUT-2, and glucokinase mRNAs in hypothalamic neurons involved in glucose sensing suggests that these cells may be involved in the transduction of signals needed to produce a state of fullness., abstract, [{GLP-1, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000112164}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}]}, {Gene Drug Relationship, Central administration (icv) of GLP-1 (7-36) amide produces a marked reduction in food and water intake, and the colocalization of the GLP-1 receptor, GLUT-2, and glucokinase mRNAs in hypothalamic neurons involved in glucose sensing suggests that these cells may be involved in the transduction of signals needed to produce a state of fullness., abstract, [{GLP-1 receptor, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000112164}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}]}, {Gene Drug Relationship, Central administration (icv) of GLP-1 (7-36) amide produces a marked reduction in food and water intake, and the colocalization of the GLP-1 receptor, GLUT-2, and glucokinase mRNAs in hypothalamic neurons involved in glucose sensing suggests that these cells may be involved in the transduction of signals needed to produce a state of fullness., abstract, [{glucokinase, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000106633}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}]}, {Gene Drug Relationship, Although earlier studies indicated that GLP-1 (7-36) amide was an intestinal peptide with a potent effect on glucose-dependent insulin secretion, later on it was found that several biological effects of this peptide occur in the brain, rather than in peripheral tissues., abstract, [{GLP-1, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000112164}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}]}] \n provider | OpenTargets \nonly showing top 1 row\n\n",
"name": "stdout"
},
{
"output_type": "stream",
"text": "\r \r",
"name": "stderr"
}
]
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-28T15:49:41.729918Z",
"start_time": "2023-09-28T15:49:41.587668Z"
},
"trusted": false
},
"id": "7181d0f8",
"cell_type": "code",
"source": "epmc_cooccurrence.groupBy('src').count().show()",
"execution_count": 24,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "+---+------+\n|src| count|\n+---+------+\n|MED|161570|\n|PMC|292600|\n+---+------+\n\n"
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2023-10-17T14:49:33.400263Z",
"end_time": "2023-10-17T14:49:35.174649Z"
},
"trusted": true
},
"id": "bff19572",
"cell_type": "code",
"source": "(\n epmc_cooccurrence\n .write.mode('overwrite')\n .json('gs://ot-team/dsuveges/epmc_cooccurrences', compression=\"gzip\")\n)",
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"text": "23/10/17 14:49:34 WARN GhfsStorageStatistics: Detected potential high latency for operation op_delete. latencyMs=157; previousMaxLatencyMs=0; operationCount=1; context=gs://ot-team/dsuveges/epmc_cooccurrences/_temporary\n23/10/17 14:49:35 WARN GhfsStorageStatistics: Detected potential high latency for operation stream_write_close_operations. latencyMs=139; previousMaxLatencyMs=0; operationCount=1; context=gs://ot-team/dsuveges/epmc_cooccurrences/_SUCCESS\n",
"name": "stderr"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Ricardo's first implementation - QC\n\n- The schema looks good. \n- All makes sense. The data and the shape as expected. "
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2023-10-17T14:36:51.224824Z",
"end_time": "2023-10-17T14:41:40.040811Z"
},
"trusted": true,
"scrolled": false
},
"cell_type": "code",
"source": "df = spark.read.json('gs://open-targets-pre-data-releases/ricardo/localrun/epmc/cooccurrences')\n\ndf.printSchema()",
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"text": "[Stage 1:> (0 + 1) / 1]\r",
"name": "stderr"
},
{
"output_type": "stream",
"text": "root\n |-- anns: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- exact: string (nullable = true)\n | | |-- section: string (nullable = true)\n | | |-- tags: array (nullable = true)\n | | | |-- element: struct (containsNull = true)\n | | | | |-- name: string (nullable = true)\n | | | | |-- uri: string (nullable = true)\n | | |-- type: string (nullable = true)\n |-- id: string (nullable = true)\n |-- provider: string (nullable = true)\n |-- src: string (nullable = true)\n\n",
"name": "stdout"
},
{
"output_type": "stream",
"text": "\r \r",
"name": "stderr"
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2023-10-17T14:50:49.404848Z",
"end_time": "2023-10-17T14:50:49.623564Z"
},
"trusted": true
},
"cell_type": "code",
"source": "\ndf.show(1, False, True)",
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"text": "-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n anns | [{The constitutive interaction between Gads and SLP-76 was mediated by the carboxy-terminal SH3 domain of Gads and a 20 amino-acid proline-rich region in SLP-76., abstract, [{Gads, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000100351}, {amino-acid, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1201498}], Gene Drug Relationship}, {The constitutive interaction between Gads and SLP-76 was mediated by the carboxy-terminal SH3 domain of Gads and a 20 amino-acid proline-rich region in SLP-76., abstract, [{SLP-76, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000043462}, {proline, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL54922}], Gene Drug Relationship}, {The constitutive interaction between Gads and SLP-76 was mediated by the carboxy-terminal SH3 domain of Gads and a 20 amino-acid proline-rich region in SLP-76., abstract, [{Gads, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000128683}, {amino-acid, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1201498}], Gene Drug Relationship}, {The constitutive interaction between Gads and SLP-76 was mediated by the carboxy-terminal SH3 domain of Gads and a 20 amino-acid proline-rich region in SLP-76., abstract, [{SLP-76, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000043462}, {amino-acid, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1201498}], Gene Drug Relationship}, {The constitutive interaction between Gads and SLP-76 was mediated by the carboxy-terminal SH3 domain of Gads and a 20 amino-acid proline-rich region in SLP-76., abstract, [{Gads, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000128683}, {proline, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL54922}], Gene Drug Relationship}, {The constitutive interaction between Gads and SLP-76 was mediated by the carboxy-terminal SH3 domain of Gads and a 20 amino-acid proline-rich region in SLP-76., abstract, [{Gads, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000100351}, {proline, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL54922}], Gene Drug Relationship}] \n id | 10021361 \n provider | OpenTargets \n src | MED \nonly showing top 1 row\n\n",
"name": "stdout"
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2023-10-17T14:50:14.542710Z",
"end_time": "2023-10-17T14:50:15.239869Z"
},
"trusted": true
},
"cell_type": "code",
"source": "(\n spark.read.json('gs://ot-team/dsuveges/epmc_cooccurrences')\n .show(1, False, True)\n)",
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"text": "-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n anns | [{Central administration (icv) of GLP-1 (7-36) amide produces a marked reduction in food and water intake, and the colocalization of the GLP-1 receptor, GLUT-2, and glucokinase mRNAs in hypothalamic neurons involved in glucose sensing suggests that these cells may be involved in the transduction of signals needed to produce a state of fullness., abstract, [{GLUT-2, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000163581}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}], Gene Drug Relationship}, {Central administration (icv) of GLP-1 (7-36) amide produces a marked reduction in food and water intake, and the colocalization of the GLP-1 receptor, GLUT-2, and glucokinase mRNAs in hypothalamic neurons involved in glucose sensing suggests that these cells may be involved in the transduction of signals needed to produce a state of fullness., abstract, [{GLP-1, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000112164}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}], Gene Drug Relationship}, {Central administration (icv) of GLP-1 (7-36) amide produces a marked reduction in food and water intake, and the colocalization of the GLP-1 receptor, GLUT-2, and glucokinase mRNAs in hypothalamic neurons involved in glucose sensing suggests that these cells may be involved in the transduction of signals needed to produce a state of fullness., abstract, [{GLP-1 receptor, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000112164}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}], Gene Drug Relationship}, {Central administration (icv) of GLP-1 (7-36) amide produces a marked reduction in food and water intake, and the colocalization of the GLP-1 receptor, GLUT-2, and glucokinase mRNAs in hypothalamic neurons involved in glucose sensing suggests that these cells may be involved in the transduction of signals needed to produce a state of fullness., abstract, [{glucokinase, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000106633}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}], Gene Drug Relationship}, {Although earlier studies indicated that GLP-1 (7-36) amide was an intestinal peptide with a potent effect on glucose-dependent insulin secretion, later on it was found that several biological effects of this peptide occur in the brain, rather than in peripheral tissues., abstract, [{GLP-1, https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000112164}, {glucose, https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1222250}], Gene Drug Relationship}] \n id | 10065878 \n provider | OpenTargets \n src | MED \nonly showing top 1 row\n\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"_draft": {
"nbviewer_url": "https://gist.github.com/DSuveges/fcb041fe0a0d64217338201503fc3863"
},
"gist": {
"id": "fcb041fe0a0d64217338201503fc3863",
"data": {
"description": "EuroPMC cooccurrence export.ipynb",
"public": false
}
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.10.8",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment