Last active
December 9, 2022 11:38
-
-
Save ireneisdoomed/da6e7ba344a50651ebbe860224907243 to your computer and use it in GitHub Desktop.
Mock-up of a studies dataset that takes the current study index and adds the QTL studies so that they are treated the same way as GWAS studies
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Input datasets. `spark` is expected to already be in scope when this runs.

# Merged credible sets; the QTL study records further down are derived from it.
credset = spark.read.format("parquet").load(
    "gs://genetics_etl_python_playground/input/220224_merged_credset"
)

# Current study index lookup table.
studies = spark.read.format("parquet").load(
    "gs://genetics-portal-dev-data/22.09.0/outputs/lut/study-index"
)

# Tab-separated phenotype id -> gene id lookup with a header row. Only the two
# columns needed downstream are kept, renamed so that `traitFromSource` can be
# used directly as a join key.
phenotype_id_gene = (
    spark.read.option("sep", "\t")
    .option("header", True)
    .csv("gs://genetics_etl_python_playground/input/phenotype_id_gene_luts")
    .selectExpr(
        "phenotype_id AS traitFromSource",
        "gene_id AS geneFromPhenotypeId",
    )
)
# Build study-like records from the non-GWAS rows of the credible sets, with
# the same column names as the reshaped GWAS study index so that both can be
# combined with `unionByName`. Fields that have no source yet are filled with
# placeholder values ("toBePopulated" / 0).
qtl_studies = (
    credset.filter(f.col("type") != "gwas")
    .select(
        f.col("study_id").alias("source"),
        f.col("phenotype_id").alias("traitFromSource"),
        f.col("bio_feature").alias("biofeature"),
        # Only biofeatures starting with "UBERON_" get an id; there is no
        # `otherwise`, so every other row is left null.
        f.when(
            f.col("bio_feature").startswith("UBERON_"),
            f.regexp_extract("bio_feature", r"^(UBERON_\d+)", 1),
        ).alias("biofeatureId"),
        "type",
    )
    .join(f.broadcast(phenotype_id_gene), on="traitFromSource", how="left")
    .withColumn(
        # the mapped ensembl id might come either from traitFromSource or from
        # the phenotype/gene lut
        "geneFromPhenotypeId",
        f.when(
            f.col("traitFromSource").contains("ENSG"),
            f.regexp_extract("traitFromSource", r"(ENSG\d+)", 1),
        ).otherwise(f.col("geneFromPhenotypeId")),
    )
    # Collapse repeated rows so each remaining combination is hashed once.
    .distinct()
    .select(
        "*",
        # xxhash64 returns a long, while the GWAS study index uses string ids.
        # Cast explicitly so the union below does not depend on Spark's
        # implicit string/long coercion, which differs under ANSI mode.
        f.xxhash64("type", "source", "traitFromSource", "biofeature")
        .cast("string")
        .alias("id"),
        f.array(f.lit("toBePopulated")).alias("ancestryInitial"),
        f.array(f.lit("toBePopulated")).alias("ancestryReplication"),
        f.lit(0).alias("nCases"),
        f.lit(0).alias("nSamples"),
        f.lit(0).alias("nControls"),
        f.lit("toBePopulated").alias("pubmedId"),
        f.lit("toBePopulated").alias("publicationFirstAuthor"),
        f.lit("toBePopulated").alias("publicationDate"),
        f.lit("toBePopulated").alias("publicationJournal"),
        f.lit("toBePopulated").alias("publicationTitle"),
        f.lit(True).alias("hasSummaryStats"),
        f.array(f.lit("toBePopulated")).alias("traitFromSourceMappedIds"),
        f.lit("toBePopulated").alias("traitCategory"),
    )
)
# Reshape the GWAS study index to the new camelCase layout, tag every row as
# "gwas", add placeholder columns, and append the QTL-derived studies.
studies_t = (
    # Keep only the part of `pmid` that follows the "PMID:" prefix.
    # NOTE(review): with default (non-ANSI) settings a `pmid` without that
    # prefix yields null here; confirm this is the desired handling.
    studies.withColumn("pubmedId", f.split("pmid", "PMID:")[1])
    .selectExpr(
        "study_id AS id",
        "ancestry_initial AS ancestryInitial",
        "ancestry_replication as ancestryReplication",
        "n_cases AS nCases",
        "n_initial AS nSamples",
        "n_initial - n_cases AS nControls",
        # Select the derived column. Aliasing the raw `pmid` to `pubmedId`
        # here would discard the prefix stripping done above.
        "pubmedId",
        "pub_author AS publicationFirstAuthor",
        "pub_date AS publicationDate",
        "pub_journal AS publicationJournal",
        "pub_title AS publicationTitle",
        "trait_reported AS traitFromSource",
        "has_sumstats AS hasSummaryStats",
        "source",
        "trait_efos AS traitFromSourceMappedIds",
        "trait_category AS traitCategory",
    )
    .select(
        "*",
        f.lit("gwas").alias("type"),
        f.lit("toBePopulated").alias("backgroundTraitFromSourceMappedIds"),
        f.lit("toBePopulated").alias("initialSampleSize"),
        # A location placeholder is only set for studies flagged as having
        # summary statistics; all others get null.
        f.when(f.col("hasSummaryStats"), "toBePopulated")
        .otherwise(f.lit(None))
        .alias("summaryStatsLocation"),
    )
    # Columns present on only one side (e.g. biofeature, biofeatureId and
    # geneFromPhenotypeId from the QTL side) are filled with nulls on the other.
    .unionByName(qtl_studies, allowMissingColumns=True)
)
""" | |
Output schema: | |
root | |
|-- id: string (nullable = true) | |
|-- ancestryInitial: array (nullable = true) | |
| |-- element: string (containsNull = true) | |
|-- ancestryReplication: array (nullable = true) | |
| |-- element: string (containsNull = true) | |
|-- nCases: long (nullable = true) | |
|-- nSamples: long (nullable = true) | |
|-- nControls: long (nullable = true) | |
|-- pubmedId: string (nullable = true) | |
|-- publicationFirstAuthor: string (nullable = true) | |
|-- publicationDate: string (nullable = true) | |
|-- publicationJournal: string (nullable = true) | |
|-- publicationTitle: string (nullable = true) | |
|-- traitFromSource: string (nullable = true) | |
|-- hasSummaryStats: boolean (nullable = true) | |
|-- source: string (nullable = true) | |
|-- traitFromSourceMappedIds: array (nullable = true) | |
| |-- element: string (containsNull = true) | |
|-- traitCategory: string (nullable = true) | |
|-- type: string (nullable = true) | |
|-- backgroundTraitFromSourceMappedIds: string (nullable = true) | |
|-- initialSampleSize: string (nullable = true) | |
|-- summaryStatsLocation: string (nullable = true) | |
|-- biofeature: string (nullable = true) | |
|-- biofeatureId: string (nullable = true) | |
|-- geneFromPhenotypeId: string (nullable = true) | |
Output location: gs://genetics_etl_python_playground/input/study_index
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment