Skip to content

Instantly share code, notes, and snippets.

@ireneisdoomed
Created March 6, 2024 17:42
Show Gist options
  • Save ireneisdoomed/35c9a1e8f266442bdfbf2eba95c25eca to your computer and use it in GitHub Desktop.
Save ireneisdoomed/35c9a1e8f266442bdfbf2eba95c25eca to your computer and use it in GitHub Desktop.
Comparison between eQTL Catalogue credible sets
import pyspark.sql.functions as f
from gentropy.common.session import Session
from gentropy.datasource.eqtl_catalogue.finemapping import EqtlCatalogueFinemapping
session = Session("yarn")
susie_studies_path = "gs://eqtl_catalog_data/study_index_0103"
susie_credible_sets_path = "gs://eqtl_catalog_data/credible_set_datasets/susie_0103"
pics_credible_sets_path = (
"gs://genetics_etl_python_playground/releases/24.01/credible_set/eqtl_catalogue"
)
normalise_studyid_col = f.lower(f.regexp_replace(f.col("studyId"), r"V8_", ""))
pics = (
session.spark.read.parquet(pics_credible_sets_path)
.withColumn("studyId", normalise_studyid_col)
.filter(f.size("qualityControls") == 0)
).persist()
susie = (
session.spark.read.parquet(susie_credible_sets_path)
.withColumn("studyId", normalise_studyid_col)
.persist()
)
susie_studies = session.spark.read.parquet(susie_studies_path).withColumn(
"studyId", normalise_studyid_col
)
missing_studies = pics.join(susie, on="studyId", how="left_anti").persist()
# 45% of the studies in the PICS credible sets are not in the SuSIE credible sets.
susie.select("studyId").distinct().count()
>>> 317911
missing_studies.select("studyId").distinct().count()
>>> 144443
# 80% of the PICS credible sets are not in the SuSIE credible sets
# pics.select("studyId", "variantId").distinct().count()
# 235114
# pics.join(susie, on=["studyId", "variantId"], how="left_anti").select("studyId", "variantId").distinct().count()
# 194608
# One of the examples of a very significant association in the PICS credible sets that is not in the SuSIE credible sets
missing_study_example = "gtex_artery_tibial_ensg00000237651"
missing_lead_example = "2_61145163_C_G"
# missing_studies.filter(f.col("studyId") == missing_study_example).filter(f.col("variantId") == missing_lead_example).drop("ldSet", "locus").show(1, False, True)
# studyId | gtex_artery_tibial_ensg00000237651
# studyLocusId | -1609360246168945094
# variantId | 2_61145163_C_G
# chromosome | 2
# position | 61145163
# beta | 1.07003
# oddsRatio | null
# oddsRatioConfidenceIntervalLower | null
# oddsRatioConfidenceIntervalUpper | null
# betaConfidenceIntervalLower | 1.0537764
# betaConfidenceIntervalUpper | 1.0862836
# pValueMantissa | 1.251
# pValueExponent | -252
# effectAlleleFrequencyFromSource | 0.44863
# standardError | 0.0162536
# subStudyDescription | null
# qualityControls | []
# finemappingMethod | null
# Look for the association in the finemapping results
corresponding_study_ids = [
"QTD000141", # ge
"QTD000142", # exon
"QTD000143", # tx
"QTD000144", # txrev
"QTD000145", # leafcutter
]
gs_bucket = "gs://eqtl_catalog_data/tmp_susie_decompressed"
raw_susie_credible_sets = session.spark.read.csv(
[f"{gs_bucket}/{qtd_id}.credible_sets.tsv" for qtd_id in corresponding_study_ids],
sep="\t",
header=True,
).withColumn(
"dataset_id",
EqtlCatalogueFinemapping._extract_dataset_id_from_file_path(f.input_file_name()),
)
# We have credible sets from other quantification methods, but not from the gene expression
# (raw_susie_credible_sets.filter(f.col("variant").contains(missing_lead_example)).show())
# -RECORD 0-------------------------------------------------------
# molecular_trait_id | ENSG00000115464.15_2_61189454_61189697
# gene_id | ENSG00000115464
# cs_id | ENSG00000115464.15_2_61189454_61189697_L3
# variant | chr2_61145163_C_G
# rsid | rs3213944
# cs_size | 32
# pip | 0.00483149909469316
# pvalue | 0.827647
# beta | -0.00882123
# se | 0.0404976
# z | -0.219423623791845
# cs_min_r2 | 0.645227986566961
# region | chr2:60189575-62189575
# dataset_id | QTD000142 <--- exon expression
# -RECORD 1-------------------------------------------------------
# molecular_trait_id | ENST00000498268
# gene_id | ENSG00000115464
# cs_id | ENST00000498268_L1
# variant | chr2_61145163_C_G
# rsid | rs3213944
# cs_size | 68
# pip | 0.0182487218166061
# pvalue | 4.98004e-09
# beta | 0.271302
# se | 0.0456777
# z | 5.98042337599837
# cs_min_r2 | 0.710026045233212
# region | chr2:60471087-62471087
# dataset_id | QTD000143 <--- tx
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment