ireneisdoomed/Coloc - L2G prediction concordance.py

## Coloc - L2G prediction concordance.py
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()

### PREPARE DATA ###

# Read data
coloc = (
    spark.read.parquet("gs://genetics-portal-dev-data/22.09.0/outputs/v2d_coloc")
    .select(
        f.concat_ws("_", "left_chrom", "left_pos", "left_ref", "left_alt").alias(
            "left_locus_id"
        ),
        f.concat_ws("_", "right_chrom", "right_pos", "right_ref", "right_alt").alias(
            "right_locus_id"
        ),
        f.col("left_study").alias("left_study_id"),
        f.col("right_study").alias("right_study_id"),
        "coloc_h4",
    )
    # Stringent filter of evidence of colocalization - filters down to ~400k records
    # .filter(f.col("coloc_h4") > 0.75)
    # Get highest coloc result
    .withColumn(
        "max_coloc_h4",
        f.max("coloc_h4").over(
            Window.partitionBy(
                "left_locus_id",
                "right_locus_id",
                "left_study_id",
                "right_study_id",
            )
        ),
    )
    .filter(f.col("coloc_h4") == f.col("max_coloc_h4"))
    .drop("coloc_h4")
    .persist()
)

l2g = (
    spark.read.parquet("gs://genetics-portal-dev-data/22.09.0/outputs/l2g")
    .select(
        "study_id",
        f.concat_ws("_", "chrom", "pos", "ref", "alt").alias("locus_id"),
        "gene_id",
        f.col("y_proba_full_model").alias("l2g_score"),
    )
    # Same threshold as for the Platform evidence - filters down to ~750k study/locus/gene predictions
    .filter(f.col("l2g_score") > 0.05)
    # Get top scoring gene per study/locus
    .withColumn(
        "max_l2g_score",
        f.max("l2g_score").over(Window.partitionBy("study_id", "locus_id")),
    )
    .filter(f.col("l2g_score") == f.col("max_l2g_score"))
    .drop("l2g_score")
    .persist()
)

# Join coloc and l2g
joining_cols = ["study_id", "locus_id"]
left_coloc_causal_gene = coloc.join(
    l2g.selectExpr(*[f"{col} as left_{col}" for col in l2g.columns]),
    on=[f"left_{col}" for col in joining_cols],
    how="inner",
)
right_coloc_causal_gene = coloc.join(
    l2g.selectExpr(*[f"{col} as right_{col}" for col in l2g.columns]),
    on=[f"right_{col}" for col in joining_cols],
    how="inner",
)

# Union coloc_causal_gene to have l2g results for both left and right
coloc_causal_gene = (
    left_coloc_causal_gene.join(right_coloc_causal_gene, on=coloc.columns, how="inner")
    .distinct()
    .persist()
)

# coloc_causal_gene.coalesce(400).write.mode("overwrite").parquet("gs://ot-team/irene/coloc_causal_gene")

### ANALYSIS ###

# Total count: 3_142_762
# Total count of good coloc signals (h4 > 0.75): 2_802_416

# How many coloc results share the same causal gene? --> 2_442_949 all results / 2_275_102 high confidence ones (~80%)
coloc_causal_gene.filter((f.col("left_gene_id") == f.col("right_gene_id"))).count()

# Among the coloc hits that share the same causal gene, what is the mean L2G score?
# -> Both left and right have a mean score of 0.6.
# This is much higher than the mean of the OTG evidence (0.32)
coloc_causal_gene.filter(
    (f.col("max_coloc_h4") > 0.75) & (f.col("left_gene_id") == f.col("right_gene_id"))
).select("left_max_l2g_score", "right_max_l2g_score").describe().show()
"""
+-------+------------------+-------------------+
|summary|left_max_l2g_score|right_max_l2g_score|
+-------+------------------+-------------------+
|  count|           2275102|            2275102|
|   mean|0.6071462076689128| 0.6073011888346445|
| stddev|0.1865276516386861|0.18648900428512663|
|    min|0.0500185452401638| 0.0500185452401638|
|    max|0.9444519281387329| 0.9444519281387329|
+-------+------------------+-------------------+
"""
	from pyspark.sql import SparkSession, Window
	import pyspark.sql.functions as f

	spark = SparkSession.builder.getOrCreate()

	### PREPARE DATA ###

	# Read data
	coloc = (
	spark.read.parquet("gs://genetics-portal-dev-data/22.09.0/outputs/v2d_coloc")
	.select(
	f.concat_ws("_", "left_chrom", "left_pos", "left_ref", "left_alt").alias(
	"left_locus_id"
	),
	f.concat_ws("_", "right_chrom", "right_pos", "right_ref", "right_alt").alias(
	"right_locus_id"
	),
	f.col("left_study").alias("left_study_id"),
	f.col("right_study").alias("right_study_id"),
	"coloc_h4",
	)
	# Stringent filter of evidence of colocalization - filters down to ~400k records
	# .filter(f.col("coloc_h4") > 0.75)
	# Get highest coloc result
	.withColumn(
	"max_coloc_h4",
	f.max("coloc_h4").over(
	Window.partitionBy(
	"left_locus_id",
	"right_locus_id",
	"left_study_id",
	"right_study_id",
	)
	),
	)
	.filter(f.col("coloc_h4") == f.col("max_coloc_h4"))
	.drop("coloc_h4")
	.persist()
	)

	l2g = (
	spark.read.parquet("gs://genetics-portal-dev-data/22.09.0/outputs/l2g")
	.select(
	"study_id",
	f.concat_ws("_", "chrom", "pos", "ref", "alt").alias("locus_id"),
	"gene_id",
	f.col("y_proba_full_model").alias("l2g_score"),
	)
	# Same threshold as for the Platform evidence - filters down to ~750k study/locus/gene predictions
	.filter(f.col("l2g_score") > 0.05)
	# Get top scoring gene per study/locus
	.withColumn(
	"max_l2g_score",
	f.max("l2g_score").over(Window.partitionBy("study_id", "locus_id")),
	)
	.filter(f.col("l2g_score") == f.col("max_l2g_score"))
	.drop("l2g_score")
	.persist()
	)

	# Join coloc and l2g
	joining_cols = ["study_id", "locus_id"]
	left_coloc_causal_gene = coloc.join(
	l2g.selectExpr(*[f"{col} as left_{col}" for col in l2g.columns]),
	on=[f"left_{col}" for col in joining_cols],
	how="inner",
	)
	right_coloc_causal_gene = coloc.join(
	l2g.selectExpr(*[f"{col} as right_{col}" for col in l2g.columns]),
	on=[f"right_{col}" for col in joining_cols],
	how="inner",
	)

	# Union coloc_causal_gene to have l2g results for both left and right
	coloc_causal_gene = (
	left_coloc_causal_gene.join(right_coloc_causal_gene, on=coloc.columns, how="inner")
	.distinct()
	.persist()
	)

	# coloc_causal_gene.coalesce(400).write.mode("overwrite").parquet("gs://ot-team/irene/coloc_causal_gene")

	### ANALYSIS ###

	# Total count: 3_142_762
	# Total count of good coloc signals (h4 > 0.75): 2_802_416

	# How many coloc results share the same causal gene? --> 2_442_949 all results / 2_275_102 high confidence ones (~80%)
	coloc_causal_gene.filter((f.col("left_gene_id") == f.col("right_gene_id"))).count()

	# Among the coloc hits that share the same causal gene, what is the mean L2G score?
	# -> Both left and right have a mean score of 0.6.
	# This is much higher than the mean of the OTG evidence (0.32)
	coloc_causal_gene.filter(
	(f.col("max_coloc_h4") > 0.75) & (f.col("left_gene_id") == f.col("right_gene_id"))
	).select("left_max_l2g_score", "right_max_l2g_score").describe().show()
	"""
	+-------+------------------+-------------------+
	\|summary\|left_max_l2g_score\|right_max_l2g_score\|
	+-------+------------------+-------------------+
	\| count\| 2275102\| 2275102\|
	\| mean\|0.6071462076689128\| 0.6073011888346445\|
	\| stddev\|0.1865276516386861\|0.18648900428512663\|
	\| min\|0.0500185452401638\| 0.0500185452401638\|
	\| max\|0.9444519281387329\| 0.9444519281387329\|
	+-------+------------------+-------------------+
	"""