Skip to content

Instantly share code, notes, and snippets.

@d0choa
Last active June 21, 2021 15:48
Show Gist options
  • Save d0choa/02f06d7cb4a359712d5cf72237f82114 to your computer and use it in GitHub Desktop.
Save d0choa/02f06d7cb4a359712d5cf72237f82114 to your computer and use it in GitHub Desktop.
Interesting (validated by drug development) L2G signals found in FinnGen but not available in previous sources. DISCLAIMER: No ontology expansion.
import pyspark.sql.functions as F
from pyspark import SparkConf
from pyspark.sql import SparkSession
# Spark session for this script: default configuration, local master.
# getOrCreate() hands back an already-running session when there is one.
sparkConf = SparkConf()
# config() is applied before master() so the explicit master setting is the
# last one written to the builder.
sessionBuilder = SparkSession.builder.config(conf=sparkConf).master('local[*]')
spark = sessionBuilder.getOrCreate()
# Input locations.
# All Platform datasets are read from the same release; the release is named
# once here so that switching to another release is a single edit.
PLATFORM_RELEASE = "21.04"
PLATFORM_ETL_ROOT = (
    "gs://open-targets-data-releases/" + PLATFORM_RELEASE + "/output/etl/parquet/"
)
# Genetics Portal evidence dump that is compared against the Platform release.
GENETICS_EVIDENCE_PATH = (
    "gs://otar000-evidence_input/Genetics_portal/json/genetics_portal-2021-06-17"
)

# New genetics portal platform evidence (marked for caching with persist())
new = spark.read.json(GENETICS_EVIDENCE_PATH).persist()
# Platform target dataset
target = spark.read.parquet(PLATFORM_ETL_ROOT + "targets")
# Platform disease dataset
disease = spark.read.parquet(PLATFORM_ETL_ROOT + "diseases")
# Platform evidence data
evidence = spark.read.parquet(PLATFORM_ETL_ROOT + "evidence")
# Platform drug info
drug = spark.read.parquet(PLATFORM_ETL_ROOT + "molecule")
# Platform moa info
moa = spark.read.parquet(PLATFORM_ETL_ROOT + "mechanismOfAction")
## drug information from Platform ChEMBL evidence

# Lookup of drug id -> drug name taken from the molecule dataset.
drugLabels = drug.select("id", "name").toDF("drugId", "drugName")

# One row per (drugId, targetId, actionType). The two array columns of the
# mechanism-of-action dataset are flattened in separate selects because a
# single select accepts only one explode().
drugTargetActions = (
    moa
    .select(F.explode("chemblIds").alias("drugId"), "targets", "actionType")
    .select("drugId", F.explode("targets").alias("targetId"), "actionType")
    .dropDuplicates()
)

# Evidence rows whose sourceId is "chembl", reduced to the clinical columns.
# "urls.url" selects the nested `url` field of the `urls` column.
chemblEvidence = (
    evidence
    .where(F.col("sourceId") == "chembl")
    .select(
        "targetId",
        "diseaseId",
        "drugId",
        "clinicalPhase",
        "clinicalStatus",
        "urls.url",
    )
)

# Drug names are required (inner join); the action type is optional (left
# join), so evidence without a matching mechanism of action is kept.
drugInfo = (
    chemblEvidence
    .join(drugLabels, on="drugId", how="inner")
    .join(drugTargetActions, on=["drugId", "targetId"], how="left")
    .dropDuplicates()
)
# Interesting new L2G results from finngen not available in previous sources in
# Genetics Portal

# Human-readable names for the target and disease identifiers.
targetLabels = target.select("id", "approvedSymbol").toDF("targetId", "targetName")
diseaseLabels = disease.select("id", "name").toDF("diseaseId", "diseaseName")

# Genetics evidence restricted to target/disease pairs that also appear in
# drugInfo (inner join), then decorated with names (left joins, so pairs
# without a name are still kept).
supportedEvidence = (
    new
    .withColumnRenamed("diseaseFromSourceMappedId", "diseaseId")
    .withColumnRenamed("targetFromSourceId", "targetId")
    .join(drugInfo, on=["targetId", "diseaseId"], how="inner")
    .join(targetLabels, on="targetId", how="left")
    .join(diseaseLabels, on="diseaseId", how="left")
)

# A study is labelled "FINNGEN" when its id contains "FIN", "Other" otherwise.
isFinngenStudy = F.col("studyId").contains("FIN")
sourceLabel = F.when(isFinngenStudy, F.lit("FINNGEN")).otherwise(F.lit("Other"))

# One row per target/disease pair with the distinct drugs, sources, variants
# and studies behind it, plus the highest resourceScore and clinicalPhase.
pairSummary = (
    supportedEvidence
    .withColumn("source", sourceLabel)
    .groupBy("targetId", "targetName", "diseaseId", "diseaseName")
    .agg(
        F.collect_set("drugName").alias("drugNames"),
        F.collect_set("source").alias("sources"),
        F.collect_set("variantId").alias("variantIds"),
        F.collect_set("studyId").alias("studyIds"),
        F.max("resourceScore").alias("maxL2G"),
        F.max("clinicalPhase").alias("maxPhase"),
    )
)

# Keep pairs whose only source label is "FINNGEN" and whose best score
# exceeds 0.5.
finngenOnly = (
    F.array_contains(F.col("sources"), "FINNGEN")
    & (F.size(F.col("sources")) == 1)
)
strongSignal = F.col("maxL2G") > 0.5

result = (
    pairSummary
    .sort(F.col("maxL2G").desc())
    .filter(finngenOnly & strongSignal)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment