Skip to content

Instantly share code, notes, and snippets.

View ireneisdoomed's full-sized avatar
🙃
send memes

Irene López Santiago ireneisdoomed

🙃
send memes
  • Open Targets | EMBL-EBI
  • Cambridge, UK
View GitHub Profile
@ireneisdoomed
ireneisdoomed / safety_model.py
Created January 18, 2022 00:04
Target Safety Modelling and Validation with Pydantic
import json
from typing import List, Optional
from pydantic import BaseModel, Extra, Field
from pydantic.schema import schema
class Biosample(BaseModel):
"""
Anatomical structures referenced in resource.
@ireneisdoomed
ireneisdoomed / probes_model.py
Last active May 23, 2022 12:11
Chemical Probes Modelling and Validation with Pydantic
import json
from typing import List, Optional
from pydantic import BaseModel, Extra, Field
class Url(BaseModel):
    """A labelled external link — presumably a chemical-probe data source (per the gist title)."""

    # Human-readable display name for the link.
    niceName: str
    # Link target; optional, so some sources evidently provide no URL.
    url: Optional[str]
# NOTE(review): relies on an active SparkSession `spark` created earlier — not visible here.
# Credible-set data for variant-to-disease (V2D) associations, 22.09.0 release
# (dataset roles inferred from bucket paths — confirm schemas against the portal docs).
v2d_credset = spark.read.parquet(
    "gs://genetics-portal-dev-data/22.09.0/outputs/v2d_credset"
)
# Main V2D association table from the same release.
v2d = spark.read.parquet("gs://genetics-portal-dev-data/22.09.0/outputs/v2d")
# Staging LD/fine-mapping output — presumably PICS results; verify contents.
pics = spark.read.parquet("gs://genetics-portal-dev-staging/v2d/220401/ld.parquet")
def create_study_id_hash(
df: DataFrame, study_col: str, phenotype_col: str, biofeature_col: str
) -> DataFrame:
@ireneisdoomed
ireneisdoomed / studies.py
Last active December 9, 2022 11:38
Mock up studies dataset that takes the current one and adds the QTLs to be considered equally as GWAS
# NOTE(review): assumes an active SparkSession `spark` defined earlier in the gist.
# Merged credible sets — name taken from the bucket path; confirm schema.
credset = spark.read.parquet(
    "gs://genetics_etl_python_playground/input/220224_merged_credset"
)
# Study index lookup table from the 22.09.0 Genetics Portal release.
studies = spark.read.parquet(
    "gs://genetics-portal-dev-data/22.09.0/outputs/lut/study-index"
)
phenotype_id_gene = spark.read.csv(
"gs://genetics_etl_python_playground/input/phenotype_id_gene_luts",
sep="\t",
header=True,
"""Gist to test inheritance."""
from __future__ import annotations
from typing import TYPE_CHECKING, Type, Optional
from typing import List
import importlib.resources as pkg_resources
import json
from dataclasses import dataclass
from pathlib import Path
# IPython/notebook shell magic — installs dbldatagen into the kernel environment.
# NOTE(review): not valid in a plain .py file; this gist was clearly run as a notebook.
!pip3 install dbldatagen
import dbldatagen as dg
from pyspark.sql import SparkSession
import pyspark.sql.types as t

# Local Spark session using all available cores.
spark = SparkSession.builder.master("local[*]").appName("spark").getOrCreate()
# Generate a mock eCaviar coloc dataframe
@ireneisdoomed
ireneisdoomed / Coloc - L2G prediction concordance.py
Last active January 18, 2023 18:16
Brief analysis on the degree of concordance of the L2G predictions for study/locus pairs that colocalise
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as f

# Reuse (or lazily create) the active Spark session.
spark = SparkSession.builder.getOrCreate()

### PREPARE DATA ###
# Read data
coloc = (
spark.read.parquet("gs://genetics-portal-dev-data/22.09.0/outputs/v2d_coloc")
@ireneisdoomed
ireneisdoomed / Coloc - Directionality assessment.py
Last active January 31, 2023 10:38
Effect sizes from colocated QTLs are derived to inform the mechanism of action of a Genetics Portal association
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as f

# Reuse (or lazily create) the active Spark session.
spark = SparkSession.builder.getOrCreate()

# 1. Read data
coloc = (
spark.read.parquet("gs://genetics-portal-dev-data/22.09.0/outputs/v2d_coloc")
.filter(f.col("right_type") != "gwas")
.select(
@ireneisdoomed
ireneisdoomed / json_extract.py
Created February 1, 2023 14:13
Pretty print JSON Schema
def json_extract(obj, key):
"""Recursively fetch values from nested JSON."""
arr = []
def extract(obj, arr, key):
"""Recursively search for values of key in JSON tree."""
if isinstance(obj, dict):
for k, v in obj.items():
if isinstance(v, (dict, list)):
extract(v, arr, key)
## TRAINING
import logging
from datasets import load_dataset, DatasetDict, Dataset
import tensorflow as tf
from transformers import AutoTokenizer, DefaultDataCollator, TFAutoModelForSequenceClassification
def tokenize_function(dataset_split):
    """Tokenize the ``text`` column of a dataset split.

    Uses the module-level ``tokenizer``, padding every example to the
    model's maximum length and truncating anything longer.
    """
    texts = dataset_split["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded