Last active
January 18, 2023 18:16
-
-
Save ireneisdoomed/cba9eca7af0b179e7ef1d1bf1557dba4 to your computer and use it in GitHub Desktop.
Brief analysis on the degree of concordance of the L2G predictions for study/locus pairs that colocalise
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql import SparkSession, Window | |
import pyspark.sql.functions as f | |
spark = SparkSession.builder.getOrCreate() | |
### PREPARE DATA ### | |
# Read data | |
coloc = ( | |
spark.read.parquet("gs://genetics-portal-dev-data/22.09.0/outputs/v2d_coloc") | |
.select( | |
f.concat_ws("_", "left_chrom", "left_pos", "left_ref", "left_alt").alias( | |
"left_locus_id" | |
), | |
f.concat_ws("_", "right_chrom", "right_pos", "right_ref", "right_alt").alias( | |
"right_locus_id" | |
), | |
f.col("left_study").alias("left_study_id"), | |
f.col("right_study").alias("right_study_id"), | |
"coloc_h4", | |
) | |
# Stringent filter of evidence of colocalization - filters down to ~400k records | |
# .filter(f.col("coloc_h4") > 0.75) | |
# Get highest coloc result | |
.withColumn( | |
"max_coloc_h4", | |
f.max("coloc_h4").over( | |
Window.partitionBy( | |
"left_locus_id", | |
"right_locus_id", | |
"left_study_id", | |
"right_study_id", | |
) | |
), | |
) | |
.filter(f.col("coloc_h4") == f.col("max_coloc_h4")) | |
.drop("coloc_h4") | |
.persist() | |
) | |
l2g = ( | |
spark.read.parquet("gs://genetics-portal-dev-data/22.09.0/outputs/l2g") | |
.select( | |
"study_id", | |
f.concat_ws("_", "chrom", "pos", "ref", "alt").alias("locus_id"), | |
"gene_id", | |
f.col("y_proba_full_model").alias("l2g_score"), | |
) | |
# Same threshold as for the Platform evidence - filters down to ~750k study/locus/gene predictions | |
.filter(f.col("l2g_score") > 0.05) | |
# Get top scoring gene per study/locus | |
.withColumn( | |
"max_l2g_score", | |
f.max("l2g_score").over(Window.partitionBy("study_id", "locus_id")), | |
) | |
.filter(f.col("l2g_score") == f.col("max_l2g_score")) | |
.drop("l2g_score") | |
.persist() | |
) | |
# Join coloc and l2g | |
joining_cols = ["study_id", "locus_id"] | |
left_coloc_causal_gene = coloc.join( | |
l2g.selectExpr(*[f"{col} as left_{col}" for col in l2g.columns]), | |
on=[f"left_{col}" for col in joining_cols], | |
how="inner", | |
) | |
right_coloc_causal_gene = coloc.join( | |
l2g.selectExpr(*[f"{col} as right_{col}" for col in l2g.columns]), | |
on=[f"right_{col}" for col in joining_cols], | |
how="inner", | |
) | |
# Union coloc_causal_gene to have l2g results for both left and right | |
coloc_causal_gene = ( | |
left_coloc_causal_gene.join(right_coloc_causal_gene, on=coloc.columns, how="inner") | |
.distinct() | |
.persist() | |
) | |
# coloc_causal_gene.coalesce(400).write.mode("overwrite").parquet("gs://ot-team/irene/coloc_causal_gene") | |
### ANALYSIS ### | |
# Total count: 3_142_762 | |
# Total count of good coloc signals (h4 > 0.75): 2_802_416 | |
# How many coloc results share the same causal gene? --> 2_442_949 all results / 2_275_102 high confidence ones (~80%) | |
coloc_causal_gene.filter((f.col("left_gene_id") == f.col("right_gene_id"))).count() | |
# Among the coloc hits that share the same causal gene, what is the mean L2G score? | |
# -> Both left and right have a mean score of 0.6. | |
# This is much higher than the mean of the OTG evidence (0.32) | |
coloc_causal_gene.filter( | |
(f.col("max_coloc_h4") > 0.75) & (f.col("left_gene_id") == f.col("right_gene_id")) | |
).select("left_max_l2g_score", "right_max_l2g_score").describe().show() | |
""" | |
+-------+------------------+-------------------+ | |
|summary|left_max_l2g_score|right_max_l2g_score| | |
+-------+------------------+-------------------+ | |
| count| 2275102| 2275102| | |
| mean|0.6071462076689128| 0.6073011888346445| | |
| stddev|0.1865276516386861|0.18648900428512663| | |
| min|0.0500185452401638| 0.0500185452401638| | |
| max|0.9444519281387329| 0.9444519281387329| | |
+-------+------------------+-------------------+ | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment