Skip to content

Instantly share code, notes, and snippets.

@ireneisdoomed
Created November 16, 2023 10:54
Show Gist options
  • Save ireneisdoomed/0fae75382e4541663838a05efd4b0412 to your computer and use it in GitHub Desktop.
Save ireneisdoomed/0fae75382e4541663838a05efd4b0412 to your computer and use it in GitHub Desktop.
L2G Gold standard QC
## RAW GOLD STANDARD
# Number of high-quality (High or Medium confidence) associations, counted as distinct studyLocusId: 1201
# Nr of distinct genes: 451
(
    # Restrict the curation to rows whose highest confidence is High or Medium.
    gs_curation.filter(
        f.col("gold_standard_info.highest_confidence").isin("High", "Medium")
    )
    .select(
        f.col("association_info.otg_id").alias("studyId"),
        # variantId is assembled as chromosome_position_reference_alternative.
        # NOTE(review): concat_ws skips null inputs, so a missing component
        # would yield a shortened ID - confirm these fields are never null.
        f.concat_ws(
            "_",
            f.col("sentinel_variant.locus_GRCh38.chromosome"),
            f.col("sentinel_variant.locus_GRCh38.position"),
            f.col("sentinel_variant.alleles.reference"),
            f.col("sentinel_variant.alleles.alternative"),
        ).alias("variantId"),
    )
    # Derive the locus identifier from the study and variant identifiers.
    .withColumn(
        "studyLocusId",
        StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
    )
    # Number of distinct study loci among the retained rows.
    .select("studyLocusId")
    .distinct()
    .count()
)
# Distribution of gold standard (GS) provenance: number of (studyLocusId, studyId, variantId, geneId) groups per set of sources
+----------------------------------------------+-----+
|sources |count|
+----------------------------------------------+-----+
|[T2D Knowledge Portal ] |286 |
|[ProGeM] |224 |
|[ChEMBL_III] |223 |
|[ChEMBL_IV] |203 |
|[ChEMBL_IV, ChEMBL_III] |149 |
|[Eric Fauman Twitter] |94 |
|[otg_curated_191108] |46 |
|[ChEMBL_IV, T2D Knowledge Portal , ChEMBL_III]|11 |
|[ProGeM, Eric Fauman Twitter] |3 |
|[ChEMBL_IV, Eric Fauman Twitter] |2 |
|[ChEMBL_IV, Eric Fauman Twitter, ChEMBL_III] |1 |
|[ChEMBL_IV, otg_curated_191108, ChEMBL_III] |1 |
+----------------------------------------------+-----+
(
    # Restrict the curation to rows whose highest confidence is High or Medium.
    gs_curation.filter(
        f.col("gold_standard_info.highest_confidence").isin("High", "Medium")
    )
    .select(
        f.col("association_info.otg_id").alias("studyId"),
        f.col("gold_standard_info.gene_id").alias("geneId"),
        # variantId is assembled as chromosome_position_reference_alternative.
        f.concat_ws(
            "_",
            f.col("sentinel_variant.locus_GRCh38.chromosome"),
            f.col("sentinel_variant.locus_GRCh38.position"),
            f.col("sentinel_variant.alleles.reference"),
            f.col("sentinel_variant.alleles.alternative"),
        ).alias("variantId"),
        f.col("metadata.set_label").alias("source"),
    )
    # Derive the locus identifier from the study and variant identifiers.
    .withColumn(
        "studyLocusId",
        StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
    )
    # Gather the distinct source labels reported for each locus/gene group.
    .groupBy("studyLocusId", "studyId", "variantId", "geneId")
    .agg(f.collect_set("source").alias("sources"))
    # Tally how many groups share each combination of sources, largest first.
    .groupBy("sources")
    .count()
    .orderBy(f.col("count").desc())
    # Print full cell contents so long source lists are not cut off.
    .show(truncate=False)
)
## PARSED GOLD STANDARD
# Number of high-quality associations (distinct studyLocusId): 1120
# Nr of distinct genes: 403
# Count the distinct studyLocusId values present in the parsed gold standard.
(
    gold_standards.df
    .select("studyLocusId")
    .distinct()
    .count()
)
# Distribution of positive/negative GS status:
+---------------+-------+
|goldStandardSet| count|
+---------------+-------+
| positive|2990893|
+---------------+-------+
# Show how many rows fall under each goldStandardSet value.
(
    gold_standards.df
    .groupBy("goldStandardSet")
    .count()
    .show()
)
## ANNOTATED GOLD STANDARD TO BE USED FOR TRAINING
# Number of distinct studyLocusId values after joining with feature matrix based on StudyLocus: 14
# Nr of distinct genes: 13
# Count the distinct studyLocusId values present in data.df.
(
    data.df
    .select("studyLocusId")
    .distinct()
    .count()
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment