This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pyspark.sql.functions as f | |
from gentropy.common.session import Session | |
from gentropy.datasource.eqtl_catalogue.finemapping import EqtlCatalogueFinemapping | |
# Start a gentropy Session on a YARN cluster manager.
session = Session("yarn") | |
# GCS input paths for the eQTL Catalogue SuSiE release ("0103" snapshot —
# presumably a Jan 03 data cut; confirm against the ingestion pipeline).
susie_studies_path = "gs://eqtl_catalog_data/study_index_0103" | |
susie_credible_sets_path = "gs://eqtl_catalog_data/credible_set_datasets/susie_0103" | |
pics_credible_sets_path = ( |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
mock_l2g_gs_df = session.spark.createDataFrame( | |
[ | |
(1, "variant1", "gene1", "positive"), | |
( | |
2, | |
"variant2", | |
"gene1", | |
"negative", | |
), # in the same locus as 1 and pointing to same gene, has to be dropped | |
( |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## RAW GOLD STANDARD | |
# Nr of high-quality associations: 1201 | |
# Nr of distinct genes: 451 | |
( | |
gs_curation | |
.filter( | |
f.col("gold_standard_info.highest_confidence").isin( | |
["High", "Medium"] | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql import SparkSession | |
import pyspark.sql.functions as f | |
import tempfile | |
import os | |
# Get (or create) a Spark session for the PharmGKB (PGKB) processing job.
spark = SparkSession.builder.appName("PGKB").getOrCreate() | |
# Raw PharmGKB evidence export (gzipped JSON-lines).
# NOTE(review): path is relative — confirm the expected working directory.
data = spark.read.json("cttv012-2023-10-12_pgkb.json.gz") | |
# Molecule/drug index in Parquet — presumably used to resolve drug
# identifiers downstream; verify against later cells of this script.
drugs = spark.read.parquet("molecule/") | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Iterable, Optional | |
from pyspark.sql import DataFrame, SparkSession | |
import pyspark.sql.functions as f | |
# Get (or create) a Spark session with default settings.
spark = SparkSession.builder.getOrCreate() | |
# Raw L2G feature matrix (221107 run) from the dev staging bucket.
fm = spark.read.parquet("gs://genetics-portal-dev-staging/l2g/221107/features/output/features.raw.221107.parquet") | |
# Print the Parquet schema to inspect available feature columns.
fm.printSchema() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def resolve_graph(df: DataFrame) -> DataFrame: | |
"""Graph resolver for clumping. | |
It takes a dataframe with a list of variants and their explained variants, and returns a dataframe | |
with a list of variants and their resolved roots | |
Args: | |
df (DataFrame): DataFrame | |
Returns: | |
A dataframe with the resolved roots. | |
""" | |
# Convert to vertices: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## TRAINING | |
import logging | |
from datasets import load_dataset, DatasetDict, Dataset | |
import tensorflow as tf | |
from transformers import AutoTokenizer, DefaultDataCollator, TFAutoModelForSequenceClassification | |
def tokenize_function(dataset_split):
    """Tokenize the ``text`` column of a dataset split.

    Relies on the module-level ``tokenizer`` (a Hugging Face tokenizer).
    Every example is padded to the model's maximum length and longer
    sequences are truncated, so all encodings share one fixed length.

    Args:
        dataset_split: Mapping-like split exposing a ``text`` column.

    Returns:
        The tokenizer's batch encoding for the split's texts.
    """
    texts = dataset_split["text"]
    return tokenizer(texts, padding="max_length", truncation=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def json_extract(obj, key): | |
"""Recursively fetch values from nested JSON.""" | |
arr = [] | |
def extract(obj, arr, key): | |
"""Recursively search for values of key in JSON tree.""" | |
if isinstance(obj, dict): | |
for k, v in obj.items(): | |
if isinstance(v, (dict, list)): | |
extract(v, arr, key) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql import SparkSession, Window | |
import pyspark.sql.functions as f | |
spark = SparkSession.builder.getOrCreate() | |
# 1. Read data | |
coloc = ( | |
spark.read.parquet("gs://genetics-portal-dev-data/22.09.0/outputs/v2d_coloc") | |
.filter(f.col("right_type") != "gwas") | |
.select( |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql import SparkSession, Window | |
import pyspark.sql.functions as f | |
spark = SparkSession.builder.getOrCreate() | |
### PREPARE DATA ### | |
# Read data | |
coloc = ( | |
spark.read.parquet("gs://genetics-portal-dev-data/22.09.0/outputs/v2d_coloc") |
Newer | Older