This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from typing import List, Optional | |
from pydantic import BaseModel, Extra, Field | |
from pydantic.schema import schema | |
class Biosample(BaseModel): | |
""" | |
Anatomical structures referenced in resource. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from typing import List, Optional | |
from pydantic import BaseModel, Extra, Field | |
class Url(BaseModel): | |
niceName: str | |
url: Optional[str] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
v2d_credset = spark.read.parquet( | |
"gs://genetics-portal-dev-data/22.09.0/outputs/v2d_credset" | |
) | |
v2d = spark.read.parquet("gs://genetics-portal-dev-data/22.09.0/outputs/v2d") | |
pics = spark.read.parquet("gs://genetics-portal-dev-staging/v2d/220401/ld.parquet") | |
def create_study_id_hash( | |
df: DataFrame, study_col: str, phenotype_col: str, biofeature_col: str | |
) -> DataFrame: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
credset = spark.read.parquet( | |
"gs://genetics_etl_python_playground/input/220224_merged_credset" | |
) | |
studies = spark.read.parquet( | |
"gs://genetics-portal-dev-data/22.09.0/outputs/lut/study-index" | |
) | |
phenotype_id_gene = spark.read.csv( | |
"gs://genetics_etl_python_playground/input/phenotype_id_gene_luts", | |
sep="\t", | |
header=True, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Gist to test inheritance.""" | |
from __future__ import annotations | |
from typing import TYPE_CHECKING, Type, Optional | |
from typing import List | |
import importlib.resources as pkg_resources | |
import json | |
from dataclasses import dataclass | |
from pathlib import Path |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
!pip3 install dbldatagen | |
import dbldatagen as dg | |
from pyspark.sql import SparkSession | |
import pyspark.sql.types as t | |
spark = SparkSession.builder.master("local[*]").appName("spark").getOrCreate() | |
# Generate a mock eCaviar coloc dataframe |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql import SparkSession, Window | |
import pyspark.sql.functions as f | |
spark = SparkSession.builder.getOrCreate() | |
### PREPARE DATA ### | |
# Read data | |
coloc = ( | |
spark.read.parquet("gs://genetics-portal-dev-data/22.09.0/outputs/v2d_coloc") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql import SparkSession, Window | |
import pyspark.sql.functions as f | |
spark = SparkSession.builder.getOrCreate() | |
# 1. Read data | |
coloc = ( | |
spark.read.parquet("gs://genetics-portal-dev-data/22.09.0/outputs/v2d_coloc") | |
.filter(f.col("right_type") != "gwas") | |
.select( |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def json_extract(obj, key): | |
"""Recursively fetch values from nested JSON.""" | |
arr = [] | |
def extract(obj, arr, key): | |
"""Recursively search for values of key in JSON tree.""" | |
if isinstance(obj, dict): | |
for k, v in obj.items(): | |
if isinstance(v, (dict, list)): | |
extract(v, arr, key) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## TRAINING | |
import logging | |
from datasets import load_dataset, DatasetDict, Dataset | |
import tensorflow as tf | |
from transformers import AutoTokenizer, DefaultDataCollator, TFAutoModelForSequenceClassification | |
def tokenize_function(dataset_split): | |
return tokenizer(dataset_split["text"], padding="max_length", truncation=True) |
OlderNewer