ireneisdoomed/l2g_gs_qc.py

## l2g_gs_qc.py
## RAW GOLD STANDARD

# Nr of high-quality associations: 1201
# Nr of distinct genes: 451
# Count the distinct studyLocusId values among curated gold-standard rows
# whose highest confidence is "High" or "Medium".
(
    gs_curation.filter(
        f.col("gold_standard_info.highest_confidence").isin("High", "Medium")
    )
    .select(
        f.col("association_info.otg_id").alias("studyId"),
        # variantId is the "_"-joined chromosome, position, ref and alt.
        f.concat_ws(
            "_",
            *map(
                f.col,
                (
                    "sentinel_variant.locus_GRCh38.chromosome",
                    "sentinel_variant.locus_GRCh38.position",
                    "sentinel_variant.alleles.reference",
                    "sentinel_variant.alleles.alternative",
                ),
            ),
        ).alias("variantId"),
    )
    .withColumn(
        "studyLocusId",
        StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
    )
    .select("studyLocusId")
    .distinct()
    .count()
)

# Distribution of GS provenance
# +----------------------------------------------+-----+
# |sources                                       |count|
# +----------------------------------------------+-----+
# |[T2D Knowledge Portal ]                       |286  |
# |[ProGeM]                                      |224  |
# |[ChEMBL_III]                                  |223  |
# |[ChEMBL_IV]                                   |203  |
# |[ChEMBL_IV, ChEMBL_III]                       |149  |
# |[Eric Fauman Twitter]                         |94   |
# |[otg_curated_191108]                          |46   |
# |[ChEMBL_IV, T2D Knowledge Portal , ChEMBL_III]|11   |
# |[ProGeM, Eric Fauman Twitter]                 |3    |
# |[ChEMBL_IV, Eric Fauman Twitter]              |2    |
# |[ChEMBL_IV, Eric Fauman Twitter, ChEMBL_III]  |1    |
# |[ChEMBL_IV, otg_curated_191108, ChEMBL_III]   |1    |
# +----------------------------------------------+-----+
# For each (studyLocusId, studyId, variantId, geneId) among the High/Medium
# confidence rows, collect the set of source labels, then count how many
# groups share each set of sources, most frequent first.
(
    gs_curation.filter(
        f.col("gold_standard_info.highest_confidence").isin("High", "Medium")
    )
    .select(
        f.col("association_info.otg_id").alias("studyId"),
        f.col("gold_standard_info.gene_id").alias("geneId"),
        # variantId is the "_"-joined chromosome, position, ref and alt.
        f.concat_ws(
            "_",
            *map(
                f.col,
                (
                    "sentinel_variant.locus_GRCh38.chromosome",
                    "sentinel_variant.locus_GRCh38.position",
                    "sentinel_variant.alleles.reference",
                    "sentinel_variant.alleles.alternative",
                ),
            ),
        ).alias("variantId"),
        f.col("metadata.set_label").alias("source"),
    )
    .withColumn(
        "studyLocusId",
        StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
    )
    .groupBy("studyLocusId", "studyId", "variantId", "geneId")
    .agg(f.collect_set("source").alias("sources"))
    .groupBy("sources")
    .count()
    .orderBy(f.desc("count"))
    .show(truncate=False)
)

## PARSED GOLD STANDARD

# Nr of high-quality associations: 1120
# Nr of distinct genes: 403
# Count the distinct studyLocusId values in the parsed gold standard.
(
    gold_standards.df
    .select("studyLocusId")
    .distinct()
    .count()
)

# Distribution of positive/negative GS status:
# +---------------+-------+
# |goldStandardSet|  count|
# +---------------+-------+
# |       positive|2990893|
# +---------------+-------+
# Show the number of rows per goldStandardSet value.
(
    gold_standards.df
    .groupBy("goldStandardSet")
    .count()
    .show()
)

## ANNOTATED GOLD STANDARD TO BE USED FOR TRAINING

# Nr of studyLocusId after joining with feature matrix based on StudyLocus: 14
# Nr of distinct genes: 13
# Count the distinct studyLocusId values in the annotated training data.
(
    data.df
    .select("studyLocusId")
    .distinct()
    .count()
)
	## RAW GOLD STANDARD

	# Nr of high-quality associations: 1201
	# Nr of distinct genes: 451
	# Count the distinct studyLocusId values among curated gold-standard rows
	# whose highest confidence is "High" or "Medium".
	(
	    gs_curation.filter(
	        f.col("gold_standard_info.highest_confidence").isin("High", "Medium")
	    )
	    .select(
	        f.col("association_info.otg_id").alias("studyId"),
	        # variantId is the "_"-joined chromosome, position, ref and alt.
	        f.concat_ws(
	            "_",
	            *map(
	                f.col,
	                (
	                    "sentinel_variant.locus_GRCh38.chromosome",
	                    "sentinel_variant.locus_GRCh38.position",
	                    "sentinel_variant.alleles.reference",
	                    "sentinel_variant.alleles.alternative",
	                ),
	            ),
	        ).alias("variantId"),
	    )
	    .withColumn(
	        "studyLocusId",
	        StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
	    )
	    .select("studyLocusId")
	    .distinct()
	    .count()
	)

	# Distribution of GS provenance
	# +----------------------------------------------+-----+
	# |sources                                       |count|
	# +----------------------------------------------+-----+
	# |[T2D Knowledge Portal ]                       |286  |
	# |[ProGeM]                                      |224  |
	# |[ChEMBL_III]                                  |223  |
	# |[ChEMBL_IV]                                   |203  |
	# |[ChEMBL_IV, ChEMBL_III]                       |149  |
	# |[Eric Fauman Twitter]                         |94   |
	# |[otg_curated_191108]                          |46   |
	# |[ChEMBL_IV, T2D Knowledge Portal , ChEMBL_III]|11   |
	# |[ProGeM, Eric Fauman Twitter]                 |3    |
	# |[ChEMBL_IV, Eric Fauman Twitter]              |2    |
	# |[ChEMBL_IV, Eric Fauman Twitter, ChEMBL_III]  |1    |
	# |[ChEMBL_IV, otg_curated_191108, ChEMBL_III]   |1    |
	# +----------------------------------------------+-----+
	# For each (studyLocusId, studyId, variantId, geneId) among the High/Medium
	# confidence rows, collect the set of source labels, then count how many
	# groups share each set of sources, most frequent first.
	(
	    gs_curation.filter(
	        f.col("gold_standard_info.highest_confidence").isin("High", "Medium")
	    )
	    .select(
	        f.col("association_info.otg_id").alias("studyId"),
	        f.col("gold_standard_info.gene_id").alias("geneId"),
	        # variantId is the "_"-joined chromosome, position, ref and alt.
	        f.concat_ws(
	            "_",
	            *map(
	                f.col,
	                (
	                    "sentinel_variant.locus_GRCh38.chromosome",
	                    "sentinel_variant.locus_GRCh38.position",
	                    "sentinel_variant.alleles.reference",
	                    "sentinel_variant.alleles.alternative",
	                ),
	            ),
	        ).alias("variantId"),
	        f.col("metadata.set_label").alias("source"),
	    )
	    .withColumn(
	        "studyLocusId",
	        StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
	    )
	    .groupBy("studyLocusId", "studyId", "variantId", "geneId")
	    .agg(f.collect_set("source").alias("sources"))
	    .groupBy("sources")
	    .count()
	    .orderBy(f.desc("count"))
	    .show(truncate=False)
	)

	## PARSED GOLD STANDARD

	# Nr of high-quality associations: 1120
	# Nr of distinct genes: 403
	# Count the distinct studyLocusId values in the parsed gold standard.
	(
	    gold_standards.df
	    .select("studyLocusId")
	    .distinct()
	    .count()
	)

	# Distribution of positive/negative GS status:
	# +---------------+-------+
	# |goldStandardSet|  count|
	# +---------------+-------+
	# |       positive|2990893|
	# +---------------+-------+
	# Show the number of rows per goldStandardSet value.
	(
	    gold_standards.df
	    .groupBy("goldStandardSet")
	    .count()
	    .show()
	)

	## ANNOTATED GOLD STANDARD TO BE USED FOR TRAINING

	# Nr of studyLocusId after joining with feature matrix based on StudyLocus: 14
	# Nr of distinct genes: 13
	# Count the distinct studyLocusId values in the annotated training data.
	(
	    data.df
	    .select("studyLocusId")
	    .distinct()
	    .count()
	)