ireneisdoomed/eqtl_credible_set_comparison.py

## eqtl_credible_set_comparison.py
import pyspark.sql.functions as f

from gentropy.common.session import Session
from gentropy.datasource.eqtl_catalogue.finemapping import EqtlCatalogueFinemapping

session = Session("yarn")

susie_studies_path = "gs://eqtl_catalog_data/study_index_0103"
susie_credible_sets_path = "gs://eqtl_catalog_data/credible_set_datasets/susie_0103"
pics_credible_sets_path = (
    "gs://genetics_etl_python_playground/releases/24.01/credible_set/eqtl_catalogue"
)

normalise_studyid_col = f.lower(f.regexp_replace(f.col("studyId"), r"V8_", ""))

pics = (
    session.spark.read.parquet(pics_credible_sets_path)
    .withColumn("studyId", normalise_studyid_col)
    .filter(f.size("qualityControls") == 0)
).persist()
susie = (
    session.spark.read.parquet(susie_credible_sets_path)
    .withColumn("studyId", normalise_studyid_col)
    .persist()
)
susie_studies = session.spark.read.parquet(susie_studies_path).withColumn(
    "studyId", normalise_studyid_col
)

missing_studies = pics.join(susie, on="studyId", how="left_anti").persist()

# 45% of the studies in the PICS credible sets are not in the SuSIE credible sets.
susie.select("studyId").distinct().count()
>>> 317911
missing_studies.select("studyId").distinct().count()
>>> 144443

# 80% of the PICS credible sets are not in the SuSIE credible sets
# pics.select("studyId", "variantId").distinct().count()
# 235114
# pics.join(susie, on=["studyId", "variantId"], how="left_anti").select("studyId", "variantId").distinct().count()
# 194608

# One of the examples of a very significant association in the PICS credible sets that is not in the SuSIE credible sets

missing_study_example = "gtex_artery_tibial_ensg00000237651"
missing_lead_example = "2_61145163_C_G"
# missing_studies.filter(f.col("studyId") == missing_study_example).filter(f.col("variantId") == missing_lead_example).drop("ldSet", "locus").show(1, False, True)
#  studyId                          | gtex_artery_tibial_ensg00000237651
#  studyLocusId                     | -1609360246168945094
#  variantId                        | 2_61145163_C_G
#  chromosome                       | 2
#  position                         | 61145163
#  beta                             | 1.07003
#  oddsRatio                        | null
#  oddsRatioConfidenceIntervalLower | null
#  oddsRatioConfidenceIntervalUpper | null
#  betaConfidenceIntervalLower      | 1.0537764
#  betaConfidenceIntervalUpper      | 1.0862836
#  pValueMantissa                   | 1.251
#  pValueExponent                   | -252
#  effectAlleleFrequencyFromSource  | 0.44863
#  standardError                    | 0.0162536
#  subStudyDescription              | null
#  qualityControls                  | []
#  finemappingMethod                | null

# Look for the association in the finemapping results
corresponding_study_ids = [
    "QTD000141",  # ge
    "QTD000142",  # exon
    "QTD000143",  # tx
    "QTD000144",  # txrev
    "QTD000145",  # leafcutter
]
gs_bucket = "gs://eqtl_catalog_data/tmp_susie_decompressed"
raw_susie_credible_sets = session.spark.read.csv(
    [f"{gs_bucket}/{qtd_id}.credible_sets.tsv" for qtd_id in corresponding_study_ids],
    sep="\t",
    header=True,
).withColumn(
    "dataset_id",
    EqtlCatalogueFinemapping._extract_dataset_id_from_file_path(f.input_file_name()),
)

# We have credible sets from other quantification methods, but not from the gene expression
# (raw_susie_credible_sets.filter(f.col("variant").contains(missing_lead_example)).show())
# -RECORD 0-------------------------------------------------------
#  molecular_trait_id | ENSG00000115464.15_2_61189454_61189697
#  gene_id            | ENSG00000115464
#  cs_id              | ENSG00000115464.15_2_61189454_61189697_L3
#  variant            | chr2_61145163_C_G
#  rsid               | rs3213944
#  cs_size            | 32
#  pip                | 0.00483149909469316
#  pvalue             | 0.827647
#  beta               | -0.00882123
#  se                 | 0.0404976
#  z                  | -0.219423623791845
#  cs_min_r2          | 0.645227986566961
#  region             | chr2:60189575-62189575
#  dataset_id         | QTD000142                    <--- exon expression
# -RECORD 1-------------------------------------------------------
#  molecular_trait_id | ENST00000498268
#  gene_id            | ENSG00000115464
#  cs_id              | ENST00000498268_L1
#  variant            | chr2_61145163_C_G
#  rsid               | rs3213944
#  cs_size            | 68
#  pip                | 0.0182487218166061
#  pvalue             | 4.98004e-09
#  beta               | 0.271302
#  se                 | 0.0456777
#  z                  | 5.98042337599837
#  cs_min_r2          | 0.710026045233212
#  region             | chr2:60471087-62471087
#  dataset_id         | QTD000143                      <--- tx
	import pyspark.sql.functions as f

	from gentropy.common.session import Session
	from gentropy.datasource.eqtl_catalogue.finemapping import EqtlCatalogueFinemapping

	session = Session("yarn")

	susie_studies_path = "gs://eqtl_catalog_data/study_index_0103"
	susie_credible_sets_path = "gs://eqtl_catalog_data/credible_set_datasets/susie_0103"
	pics_credible_sets_path = (
	"gs://genetics_etl_python_playground/releases/24.01/credible_set/eqtl_catalogue"
	)

	normalise_studyid_col = f.lower(f.regexp_replace(f.col("studyId"), r"V8_", ""))

	pics = (
	session.spark.read.parquet(pics_credible_sets_path)
	.withColumn("studyId", normalise_studyid_col)
	.filter(f.size("qualityControls") == 0)
	).persist()
	susie = (
	session.spark.read.parquet(susie_credible_sets_path)
	.withColumn("studyId", normalise_studyid_col)
	.persist()
	)
	susie_studies = session.spark.read.parquet(susie_studies_path).withColumn(
	"studyId", normalise_studyid_col
	)

	missing_studies = pics.join(susie, on="studyId", how="left_anti").persist()

	# 45% of the studies in the PICS credible sets are not in the SuSIE credible sets.
	susie.select("studyId").distinct().count()
	>>> 317911
	missing_studies.select("studyId").distinct().count()
	>>> 144443

	# 80% of the PICS credible sets are not in the SuSIE credible sets
	# pics.select("studyId", "variantId").distinct().count()
	# 235114
	# pics.join(susie, on=["studyId", "variantId"], how="left_anti").select("studyId", "variantId").distinct().count()
	# 194608

	# One of the examples of a very significant association in the PICS credible sets that is not in the SuSIE credible sets

	missing_study_example = "gtex_artery_tibial_ensg00000237651"
	missing_lead_example = "2_61145163_C_G"
	# missing_studies.filter(f.col("studyId") == missing_study_example).filter(f.col("variantId") == missing_lead_example).drop("ldSet", "locus").show(1, False, True)
	# studyId \| gtex_artery_tibial_ensg00000237651
	# studyLocusId \| -1609360246168945094
	# variantId \| 2_61145163_C_G
	# chromosome \| 2
	# position \| 61145163
	# beta \| 1.07003
	# oddsRatio \| null
	# oddsRatioConfidenceIntervalLower \| null
	# oddsRatioConfidenceIntervalUpper \| null
	# betaConfidenceIntervalLower \| 1.0537764
	# betaConfidenceIntervalUpper \| 1.0862836
	# pValueMantissa \| 1.251
	# pValueExponent \| -252
	# effectAlleleFrequencyFromSource \| 0.44863
	# standardError \| 0.0162536
	# subStudyDescription \| null
	# qualityControls \| []
	# finemappingMethod \| null

	# Look for the association in the finemapping results
	corresponding_study_ids = [
	"QTD000141", # ge
	"QTD000142", # exon
	"QTD000143", # tx
	"QTD000144", # txrev
	"QTD000145", # leafcutter
	]
	gs_bucket = "gs://eqtl_catalog_data/tmp_susie_decompressed"
	raw_susie_credible_sets = session.spark.read.csv(
	[f"{gs_bucket}/{qtd_id}.credible_sets.tsv" for qtd_id in corresponding_study_ids],
	sep="\t",
	header=True,
	).withColumn(
	"dataset_id",
	EqtlCatalogueFinemapping._extract_dataset_id_from_file_path(f.input_file_name()),
	)

	# We have credible sets from other quantification methods, but not from the gene expression
	# (raw_susie_credible_sets.filter(f.col("variant").contains(missing_lead_example)).show())
	# -RECORD 0-------------------------------------------------------
	# molecular_trait_id \| ENSG00000115464.15_2_61189454_61189697
	# gene_id \| ENSG00000115464
	# cs_id \| ENSG00000115464.15_2_61189454_61189697_L3
	# variant \| chr2_61145163_C_G
	# rsid \| rs3213944
	# cs_size \| 32
	# pip \| 0.00483149909469316
	# pvalue \| 0.827647
	# beta \| -0.00882123
	# se \| 0.0404976
	# z \| -0.219423623791845
	# cs_min_r2 \| 0.645227986566961
	# region \| chr2:60189575-62189575
	# dataset_id \| QTD000142 <--- exon expression
	# -RECORD 1-------------------------------------------------------
	# molecular_trait_id \| ENST00000498268
	# gene_id \| ENSG00000115464
	# cs_id \| ENST00000498268_L1
	# variant \| chr2_61145163_C_G
	# rsid \| rs3213944
	# cs_size \| 68
	# pip \| 0.0182487218166061
	# pvalue \| 4.98004e-09
	# beta \| 0.271302
	# se \| 0.0456777
	# z \| 5.98042337599837
	# cs_min_r2 \| 0.710026045233212
	# region \| chr2:60471087-62471087
	# dataset_id \| QTD000143 <--- tx