Created
November 16, 2023 10:54
-
-
Save ireneisdoomed/0fae75382e4541663838a05efd4b0412 to your computer and use it in GitHub Desktop.
L2G Gold standard QC
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## RAW GOLD STANDARD
# Nr of high-quality associations: 1201
# Nr of distinct genes: 451
# Count the distinct studyLocusId values among the curated records whose
# highest confidence is "High" or "Medium".
(
    gs_curation.filter(
        f.col("gold_standard_info.highest_confidence").isin("High", "Medium")
    )
    .select(
        f.col("association_info.otg_id").alias("studyId"),
        # variantId is chromosome, position, reference and alternative
        # joined with underscores.
        f.concat_ws(
            "_",
            *[
                f.col(field_path)
                for field_path in (
                    "sentinel_variant.locus_GRCh38.chromosome",
                    "sentinel_variant.locus_GRCh38.position",
                    "sentinel_variant.alleles.reference",
                    "sentinel_variant.alleles.alternative",
                )
            ],
        ).alias("variantId"),
    )
    .withColumn(
        "studyLocusId",
        StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
    )
    .select("studyLocusId")
    .distinct()
    .count()
)
# Distribution of GS provenance
+----------------------------------------------+-----+
|sources                                       |count|
+----------------------------------------------+-----+
|[T2D Knowledge Portal ]                       |286  |
|[ProGeM]                                      |224  |
|[ChEMBL_III]                                  |223  |
|[ChEMBL_IV]                                   |203  |
|[ChEMBL_IV, ChEMBL_III]                       |149  |
|[Eric Fauman Twitter]                         |94   |
|[otg_curated_191108]                          |46   |
|[ChEMBL_IV, T2D Knowledge Portal , ChEMBL_III]|11   |
|[ProGeM, Eric Fauman Twitter]                 |3    |
|[ChEMBL_IV, Eric Fauman Twitter]              |2    |
|[ChEMBL_IV, Eric Fauman Twitter, ChEMBL_III]  |1    |
|[ChEMBL_IV, otg_curated_191108, ChEMBL_III]   |1    |
+----------------------------------------------+-----+
# For curated records whose highest confidence is "High" or "Medium", collect
# the set of source labels per (studyLocusId, studyId, variantId, geneId)
# group, then print how many groups share each set of sources, most frequent
# first.
(
    gs_curation.filter(
        f.col("gold_standard_info.highest_confidence").isin("High", "Medium")
    )
    .select(
        f.col("association_info.otg_id").alias("studyId"),
        f.col("gold_standard_info.gene_id").alias("geneId"),
        # variantId is chromosome, position, reference and alternative
        # joined with underscores.
        f.concat_ws(
            "_",
            *[
                f.col(field_path)
                for field_path in (
                    "sentinel_variant.locus_GRCh38.chromosome",
                    "sentinel_variant.locus_GRCh38.position",
                    "sentinel_variant.alleles.reference",
                    "sentinel_variant.alleles.alternative",
                )
            ],
        ).alias("variantId"),
        f.col("metadata.set_label").alias("source"),
    )
    .withColumn(
        "studyLocusId",
        StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
    )
    .groupBy("studyLocusId", "studyId", "variantId", "geneId")
    .agg(f.collect_set("source").alias("sources"))
    .groupBy("sources")
    .count()
    .orderBy(f.col("count").desc())
    .show(truncate=False)
)
## PARSED GOLD STANDARD
# Nr of high-quality associations: 1120
# Nr of distinct genes: 403
# Count the distinct studyLocusId values in the parsed gold standard.
(
    gold_standards.df
    .select("studyLocusId")
    .distinct()
    .count()
)
# Distribution of positive/negative GS status:
+---------------+-------+
|goldStandardSet|  count|
+---------------+-------+
|       positive|2990893|
+---------------+-------+
# Print the number of rows per goldStandardSet value.
(
    gold_standards.df
    .groupBy("goldStandardSet")
    .count()
    .show()
)
## ANNOTATED GOLD STANDARD TO BE USED FOR TRAINING
# Nr of studyLocusId after joining with feature matrix based on StudyLocus: 14
# Nr of distinct genes: 13
# Count the distinct studyLocusId values in the annotated dataset.
(
    data.df
    .select("studyLocusId")
    .distinct()
    .count()
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment