Skip to content

Instantly share code, notes, and snippets.

@ireneisdoomed
Last active January 18, 2023 18:16
Show Gist options
  • Save ireneisdoomed/cba9eca7af0b179e7ef1d1bf1557dba4 to your computer and use it in GitHub Desktop.
Save ireneisdoomed/cba9eca7af0b179e7ef1d1bf1557dba4 to your computer and use it in GitHub Desktop.
Brief analysis on the degree of concordance of the L2G predictions for study/locus pairs that colocalise
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as f
spark = SparkSession.builder.getOrCreate()
### PREPARE DATA ###
# Read data
coloc = (
spark.read.parquet("gs://genetics-portal-dev-data/22.09.0/outputs/v2d_coloc")
.select(
f.concat_ws("_", "left_chrom", "left_pos", "left_ref", "left_alt").alias(
"left_locus_id"
),
f.concat_ws("_", "right_chrom", "right_pos", "right_ref", "right_alt").alias(
"right_locus_id"
),
f.col("left_study").alias("left_study_id"),
f.col("right_study").alias("right_study_id"),
"coloc_h4",
)
# Stringent filter of evidence of colocalization - filters down to ~400k records
# .filter(f.col("coloc_h4") > 0.75)
# Get highest coloc result
.withColumn(
"max_coloc_h4",
f.max("coloc_h4").over(
Window.partitionBy(
"left_locus_id",
"right_locus_id",
"left_study_id",
"right_study_id",
)
),
)
.filter(f.col("coloc_h4") == f.col("max_coloc_h4"))
.drop("coloc_h4")
.persist()
)
l2g = (
spark.read.parquet("gs://genetics-portal-dev-data/22.09.0/outputs/l2g")
.select(
"study_id",
f.concat_ws("_", "chrom", "pos", "ref", "alt").alias("locus_id"),
"gene_id",
f.col("y_proba_full_model").alias("l2g_score"),
)
# Same threshold as for the Platform evidence - filters down to ~750k study/locus/gene predictions
.filter(f.col("l2g_score") > 0.05)
# Get top scoring gene per study/locus
.withColumn(
"max_l2g_score",
f.max("l2g_score").over(Window.partitionBy("study_id", "locus_id")),
)
.filter(f.col("l2g_score") == f.col("max_l2g_score"))
.drop("l2g_score")
.persist()
)
# Join coloc and l2g
joining_cols = ["study_id", "locus_id"]
left_coloc_causal_gene = coloc.join(
l2g.selectExpr(*[f"{col} as left_{col}" for col in l2g.columns]),
on=[f"left_{col}" for col in joining_cols],
how="inner",
)
right_coloc_causal_gene = coloc.join(
l2g.selectExpr(*[f"{col} as right_{col}" for col in l2g.columns]),
on=[f"right_{col}" for col in joining_cols],
how="inner",
)
# Union coloc_causal_gene to have l2g results for both left and right
coloc_causal_gene = (
left_coloc_causal_gene.join(right_coloc_causal_gene, on=coloc.columns, how="inner")
.distinct()
.persist()
)
# coloc_causal_gene.coalesce(400).write.mode("overwrite").parquet("gs://ot-team/irene/coloc_causal_gene")
### ANALYSIS ###
# Total count: 3_142_762
# Total count of good coloc signals (h4 > 0.75): 2_802_416
# How many coloc results share the same causal gene? --> 2_442_949 all results / 2_275_102 high confidence ones (~80%)
coloc_causal_gene.filter((f.col("left_gene_id") == f.col("right_gene_id"))).count()
# Among the coloc hits that share the same causal gene, what is the mean L2G score?
# -> Both left and right have a mean score of 0.6.
# This is much higher than the mean of the OTG evidence (0.32)
coloc_causal_gene.filter(
(f.col("max_coloc_h4") > 0.75) & (f.col("left_gene_id") == f.col("right_gene_id"))
).select("left_max_l2g_score", "right_max_l2g_score").describe().show()
"""
+-------+------------------+-------------------+
|summary|left_max_l2g_score|right_max_l2g_score|
+-------+------------------+-------------------+
| count| 2275102| 2275102|
| mean|0.6071462076689128| 0.6073011888346445|
| stddev|0.1865276516386861|0.18648900428512663|
| min|0.0500185452401638| 0.0500185452401638|
| max|0.9444519281387329| 0.9444519281387329|
+-------+------------------+-------------------+
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment