Skip to content

Instantly share code, notes, and snippets.

@d0choa
Last active August 27, 2021 15:32
Show Gist options
  • Save d0choa/ecf9b252d95d6e3f2f4c663332fdf94c to your computer and use it in GitHub Desktop.
Save d0choa/ecf9b252d95d6e3f2f4c663332fdf94c to your computer and use it in GitHub Desktop.
Pathogenic or L2G significant variants in L2G
import pyspark.sql.functions as F
from pyspark import SparkConf
from pyspark.sql import SparkSession
# Spark configuration: turn on GCS requester-pays access ('AUTO' mode) and
# name the project id to use for it.
sparkConf = (
    SparkConf()
    .set('spark.hadoop.fs.gs.requester.pays.mode', 'AUTO')
    .set('spark.hadoop.fs.gs.requester.pays.project.id', 'open-targets-eu-dev')
)

# Create (or reuse) a session with the 'local[*]' master and the config above.
spark = SparkSession.builder.config(conf=sparkConf).master('local[*]').getOrCreate()
# Platform evidence and disease tables (21.06 release, parquet)
evidence = spark.read.format("parquet").load(
    "gs://open-targets-data-releases/21.06/output/etl/parquet/evidence"
)
disease = spark.read.format("parquet").load(
    "gs://open-targets-data-releases/21.06/output/etl/parquet/diseases"
)

# NOD2 aa variants; the first CSV row is read as the header
aaVariants = spark.read.option("header", True).csv("gs://ot-team/dochoa/nod2aa.csv")
# Variant-level evidence for target ENSG00000167207, annotated with the
# amino-acid variant table and the disease name, sorted by variant id.
out = (
    evidence
    # Restrict to the target of interest and to evidence that carries a variant.
    .filter(F.col("targetId") == "ENSG00000167207")
    .filter(F.col("variantId").isNotNull())
    # Keep only the two functional consequences of interest. Both ids are
    # written in the underscore form; the second one used to be spelled
    # "SO:0001587", which is inconsistent with the first id and, if the column
    # uses the underscore form throughout, could never match any row.
    .filter(
        F.col("variantFunctionalConsequenceId").isin("SO_0001583", "SO_0001587")
    )
    # Keep genetics-portal evidence, or evidence whose clinicalSignificances
    # array contains "pathogenic" (array_contains avoids exploding the array).
    .filter(
        (F.col("datasourceId") == "ot_genetics_portal")
        | F.array_contains(F.col("clinicalSignificances"), "pathogenic")
    )
    # Cache the filtered evidence before the joins.
    .persist()
    # Left joins: evidence rows without a match in either table are kept.
    # NOTE(review): "residue" and "aaMutation" are presumably supplied by
    # aaVariants - confirm against the CSV header.
    .join(aaVariants, on="variantRsId", how="left")
    .join(
        disease.select(
            F.col("id").alias("diseaseId"),
            F.col("name").alias("diseaseName"),
        ),
        on="diseaseId",
        how="left",
    )
    .select(
        "datasourceId", "diseaseId", "diseaseName", "variantId",
        "variantRsId", "residue", "aaMutation",
        "variantFunctionalConsequenceId", "clinicalSignificances",
        "literature", "pValueExponent", "pValueMantissa", "studySampleSize",
    )
    .sort("variantId")
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment