Skip to content

Instantly share code, notes, and snippets.

@ireneisdoomed
Last active December 9, 2022 11:38
Show Gist options
  • Save ireneisdoomed/da6e7ba344a50651ebbe860224907243 to your computer and use it in GitHub Desktop.
Save ireneisdoomed/da6e7ba344a50651ebbe860224907243 to your computer and use it in GitHub Desktop.
Mock-up of the studies dataset: takes the current study index and adds the QTL studies so that they are treated the same way as GWAS studies
# Input datasets. NOTE(review): relies on a `spark` session already being in scope.
# Merged credible sets; the non-GWAS rows are used further down to derive QTL studies.
credset = spark.read.parquet("gs://genetics_etl_python_playground/input/220224_merged_credset")
# Current study index lookup table.
studies = spark.read.parquet("gs://genetics-portal-dev-data/22.09.0/outputs/lut/study-index")
# Phenotype id -> gene id lookup, read from a tab-separated file with a header row.
# Columns are renamed so the table can be joined on `traitFromSource`.
phenotype_id_gene = spark.read.csv(
    "gs://genetics_etl_python_playground/input/phenotype_id_gene_luts",
    header=True,
    sep="\t",
).selectExpr(
    "phenotype_id AS traitFromSource",
    "gene_id AS geneFromPhenotypeId",
)
# Placeholder expressions for study fields that are not yet available for QTL studies.
_qtl_placeholder = f.lit("toBePopulated")
_qtl_placeholder_array = f.array(f.lit("toBePopulated"))
_qtl_zero = f.lit(0)

# UBERON id parsed from the bio feature; null when the value is not UBERON-prefixed.
_qtl_biofeature_id = f.when(
    f.col("bio_feature").startswith("UBERON_"),
    f.regexp_extract("bio_feature", r"^(UBERON_\d+)", 1),
)

# The mapped Ensembl id comes from traitFromSource when it embeds one,
# otherwise from the phenotype/gene lookup table.
_qtl_gene_id = f.when(
    f.col("traitFromSource").contains("ENSG"),
    f.regexp_extract("traitFromSource", r"(ENSG\d+)", 1),
).otherwise(f.col("geneFromPhenotypeId"))

# One row per distinct non-GWAS (QTL) study found in the credible sets,
# shaped like the study index with placeholder values for unknown fields.
qtl_studies = (
    credset.where(f.col("type") != "gwas")
    .select(
        f.col("study_id").alias("source"),
        f.col("phenotype_id").alias("traitFromSource"),
        f.col("bio_feature").alias("biofeature"),
        _qtl_biofeature_id.alias("biofeatureId"),
        "type",
    )
    .join(f.broadcast(phenotype_id_gene), on="traitFromSource", how="left")
    .withColumn("geneFromPhenotypeId", _qtl_gene_id)
    .distinct()
    .select(
        "*",
        # Study id derived by hashing the fields that identify a QTL study.
        f.xxhash64("type", "source", "traitFromSource", "biofeature").alias("id"),
        _qtl_placeholder_array.alias("ancestryInitial"),
        _qtl_placeholder_array.alias("ancestryReplication"),
        _qtl_zero.alias("nCases"),
        _qtl_zero.alias("nSamples"),
        _qtl_zero.alias("nControls"),
        _qtl_placeholder.alias("pubmedId"),
        _qtl_placeholder.alias("publicationFirstAuthor"),
        _qtl_placeholder.alias("publicationDate"),
        _qtl_placeholder.alias("publicationJournal"),
        _qtl_placeholder.alias("publicationTitle"),
        f.lit(True).alias("hasSummaryStats"),
        _qtl_placeholder_array.alias("traitFromSourceMappedIds"),
        _qtl_placeholder.alias("traitCategory"),
    )
)
# Reshape the current (GWAS) study index into the new schema and append the QTL studies.
studies_t = (
    # Strip the leading "PMID:" prefix from the publication id. An anchored replace
    # keeps values that lack the prefix instead of turning them into nulls.
    studies.withColumn("pubmedId", f.regexp_replace("pmid", r"^PMID:", ""))
    .selectExpr(
        "study_id AS id",
        "ancestry_initial AS ancestryInitial",
        "ancestry_replication as ancestryReplication",
        "n_cases AS nCases",
        "n_initial AS nSamples",
        "n_initial - n_cases AS nControls",
        # Select the cleaned column computed above. Previously `pmid AS pubmedId`
        # was selected here, which silently discarded the prefix stripping.
        "pubmedId",
        "pub_author AS publicationFirstAuthor",
        "pub_date AS publicationDate",
        "pub_journal AS publicationJournal",
        "pub_title AS publicationTitle",
        "trait_reported AS traitFromSource",
        "has_sumstats AS hasSummaryStats",
        "source",
        "trait_efos AS traitFromSourceMappedIds",
        "trait_category AS traitCategory",
    )
    .select(
        "*",
        # Every row of the existing study index is tagged as a GWAS study.
        f.lit("gwas").alias("type"),
        f.lit("toBePopulated").alias("backgroundTraitFromSourceMappedIds"),
        f.lit("toBePopulated").alias("initialSampleSize"),
        # Only studies with summary statistics get a (placeholder) location.
        f.when(f.col("hasSummaryStats"), "toBePopulated")
        .otherwise(f.lit(None))
        .alias("summaryStatsLocation"),
    )
    # QTL-only columns (biofeature, biofeatureId, geneFromPhenotypeId) are missing on
    # the GWAS side, and GWAS-only columns on the QTL side; both are filled with nulls.
    .unionByName(qtl_studies, allowMissingColumns=True)
)
"""
Output schema:
root
|-- id: string (nullable = true)
|-- ancestryInitial: array (nullable = true)
| |-- element: string (containsNull = true)
|-- ancestryReplication: array (nullable = true)
| |-- element: string (containsNull = true)
|-- nCases: long (nullable = true)
|-- nSamples: long (nullable = true)
|-- nControls: long (nullable = true)
|-- pubmedId: string (nullable = true)
|-- publicationFirstAuthor: string (nullable = true)
|-- publicationDate: string (nullable = true)
|-- publicationJournal: string (nullable = true)
|-- publicationTitle: string (nullable = true)
|-- traitFromSource: string (nullable = true)
|-- hasSummaryStats: boolean (nullable = true)
|-- source: string (nullable = true)
|-- traitFromSourceMappedIds: array (nullable = true)
| |-- element: string (containsNull = true)
|-- traitCategory: string (nullable = true)
|-- type: string (nullable = true)
|-- backgroundTraitFromSourceMappedIds: string (nullable = true)
|-- initialSampleSize: string (nullable = true)
|-- summaryStatsLocation: string (nullable = true)
|-- biofeature: string (nullable = true)
|-- biofeatureId: string (nullable = true)
|-- geneFromPhenotypeId: string (nullable = true)
Output location: gs://genetics_etl_python_playground/input/study_index
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment