Irene López Santiago (ireneisdoomed)

🙃
send memes
  • Open Targets | EMBL-EBI
  • Cambridge, UK
@ireneisdoomed
ireneisdoomed / eqtl_credible_set_comparison.py
Created March 6, 2024 17:42
Comparison between eQTL Catalogue credible sets
import pyspark.sql.functions as f
from gentropy.common.session import Session
from gentropy.datasource.eqtl_catalogue.finemapping import EqtlCatalogueFinemapping
session = Session("yarn")
susie_studies_path = "gs://eqtl_catalog_data/study_index_0103"
susie_credible_sets_path = "gs://eqtl_catalog_data/credible_set_datasets/susie_0103"
pics_credible_sets_path = (
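A minimal sketch (not from the gist) of one way the two credible set datasets could be compared once loaded; it assumes both are StudyLocus-like tables exposing studyId and variantId (lead variant) columns, and that the truncated assignment above completes with a path.
susie_cs = session.spark.read.parquet(susie_credible_sets_path)
pics_cs = session.spark.read.parquet(pics_credible_sets_path)  # assumes the assignment above resolves to the PICS path
shared_leads = (
    susie_cs.select("studyId", "variantId")
    .join(pics_cs.select("studyId", "variantId"), ["studyId", "variantId"], "inner")
    .distinct()
)
print(f"Credible sets sharing a lead variant across SuSiE and PICS: {shared_leads.count()}")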
@ireneisdoomed
ireneisdoomed / l2g_gs_unique_associations.py
Created November 20, 2023 16:46
l2g_gs_unique_associations.py
mock_l2g_gs_df = session.spark.createDataFrame(
    [
        (1, "variant1", "gene1", "positive"),
        (
            2,
            "variant2",
            "gene1",
            "negative",
        ),  # in the same locus as 1 and pointing to same gene, has to be dropped
        (
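A minimal sketch (not from the gist) of the dropping step the comment above describes, once the mock DataFrame is fully built: keep one row per locus/gene, preferring positives. The column names locusId, geneId and goldStandardSet are assumptions for illustration.
from pyspark.sql import Window
import pyspark.sql.functions as f

label_priority = f.when(f.col("goldStandardSet") == "positive", 0).otherwise(1)
w = Window.partitionBy("locusId", "geneId").orderBy(label_priority)
unique_associations = (
    mock_l2g_gs_df
    .withColumn("row", f.row_number().over(w))
    .filter(f.col("row") == 1)
    .drop("row")
)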
@ireneisdoomed
ireneisdoomed / l2g_gs_qc.py
Created November 16, 2023 10:54
L2G Gold standard QC
## RAW GOLD STANDARD
# Nr of high-quality associations: 1201
# Nr of distinct genes: 451
(
    gs_curation
    .filter(
        f.col("gold_standard_info.highest_confidence").isin(
            ["High", "Medium"]
        )
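A minimal sketch (not from the gist) of how the counts quoted in the comments could be produced; the gene_id field inside gold_standard_info is an assumption.
high_quality = gs_curation.filter(
    f.col("gold_standard_info.highest_confidence").isin(["High", "Medium"])
)
print("Nr of high-quality associations:", high_quality.count())
print("Nr of distinct genes:", high_quality.select("gold_standard_info.gene_id").distinct().count())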
@ireneisdoomed
ireneisdoomed / transform_pharmgkb.py
Last active November 22, 2023 14:31
PharmGKB - 3128
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import tempfile
import os
spark = SparkSession.builder.appName("PGKB").getOrCreate()
data = spark.read.json("cttv012-2023-10-12_pgkb.json.gz")
drugs = spark.read.parquet("molecule/")
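A minimal sketch (not from the gist) of why the molecule index is likely loaded alongside the evidence: mapping drug names to ChEMBL identifiers. The evidence field drugFromSource and the molecule fields id and name are assumptions for illustration.
evidence_with_drug_id = data.join(
    drugs.select(f.col("id").alias("drugId"), f.lower(f.col("name")).alias("drugName")),
    on=f.lower(f.col("drugFromSource")) == f.col("drugName"),
    how="left",
)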
@ireneisdoomed
ireneisdoomed / l2g_feature_matrix_translation.py
Last active March 28, 2023 07:11
Translate feature matrix to a long format dataset of type L2GFeature
from typing import Iterable, Optional
from pyspark.sql import DataFrame, SparkSession
import pyspark.sql.functions as f
spark = SparkSession.builder.getOrCreate()
fm = spark.read.parquet("gs://genetics-portal-dev-staging/l2g/221107/features/output/features.raw.221107.parquet")
fm.printSchema()
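A minimal sketch (not from the gist) of one way to melt the wide feature matrix into the long L2GFeature-style layout using Spark's stack expression; the identifier column names are assumptions.
id_cols = ["study_locus_id", "gene_id"]  # assumed identifier columns
feature_cols = [c for c in fm.columns if c not in id_cols]
# stack() assumes all feature columns share a compatible (numeric) type.
pairs = ", ".join(f"'{c}', `{c}`" for c in feature_cols)
fm_long = fm.select(
    *id_cols,
    f.expr(f"stack({len(feature_cols)}, {pairs}) as (featureName, featureValue)"),
)
fm_long.show(5)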
@ireneisdoomed
ireneisdoomed / graph_based_clumping.py
Last active February 20, 2023 15:25
Function to perform clumping in Spark using GraphFrames. Implementation by @DSuveges, extracted from this commit: https://github.com/opentargets/genetics_etl_python/commit/31cab8de2d0aa206211e578aa0fb701dd5e064b2
def resolve_graph(df: DataFrame) -> DataFrame:
    """Graph resolver for clumping.

    Takes a dataframe with a list of variants and the variants they explain, and returns a dataframe
    mapping each variant to its resolved root (lead) variant.

    Args:
        df (DataFrame): variants together with the variants they explain.

    Returns:
        DataFrame: the input variants with their resolved roots.
    """
    # Convert to vertices:
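A minimal standalone sketch of the general GraphFrames pattern this function builds on (not the gist's exact implementation); the input column names variantId and explained, and the checkpoint directory, are assumptions.
import pyspark.sql.functions as f
from graphframes import GraphFrame
from pyspark.sql import DataFrame, SparkSession

spark = SparkSession.builder.getOrCreate()
# connectedComponents() requires a checkpoint directory on the SparkContext.
spark.sparkContext.setCheckpointDir("/tmp/graphframes-checkpoints")

def clump_by_connected_components(df: DataFrame) -> DataFrame:
    """Assign each variant to a clump using GraphFrames connected components."""
    # Vertices: every variant, whether it explains others or is explained.
    vertices = (
        df.select(f.col("variantId").alias("id"))
        .union(df.select(f.explode("explained").alias("id")))
        .distinct()
    )
    # Edges: one edge from a variant to each variant it explains.
    edges = df.select(
        f.col("variantId").alias("src"), f.explode("explained").alias("dst")
    ).distinct()
    # Connected components label every variant with a shared clump identifier.
    components = GraphFrame(vertices, edges).connectedComponents()
    return df.join(
        components.select(f.col("id").alias("variantId"), "component"),
        on="variantId",
        how="left",
    )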
## TRAINING
import logging
from datasets import load_dataset, DatasetDict, Dataset
import tensorflow as tf
from transformers import AutoTokenizer, DefaultDataCollator, TFAutoModelForSequenceClassification
def tokenize_function(dataset_split):
    return tokenizer(dataset_split["text"], padding="max_length", truncation=True)
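A minimal sketch (not from the gist) of how these pieces are usually wired together for TensorFlow fine-tuning; the checkpoint name and the placeholder dataset are assumptions.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # assumed checkpoint
dataset = load_dataset("imdb")  # placeholder dataset with "text" and "label" columns
tokenized = dataset.map(tokenize_function, batched=True)

collator = DefaultDataCollator(return_tensors="tf")
model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

tf_train = tokenized["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols=["label"],
    shuffle=True,
    batch_size=16,
    collate_fn=collator,
)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model.fit(tf_train, epochs=1)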
@ireneisdoomed
ireneisdoomed / json_extract.py
Created February 1, 2023 14:13
Pretty print JSON Schema
def json_extract(obj, key):
    """Recursively fetch values from nested JSON."""
    arr = []

    def extract(obj, arr, key):
        """Recursively search for values of key in JSON tree."""
        if isinstance(obj, dict):
            for k, v in obj.items():
                if isinstance(v, (dict, list)):
                    extract(v, arr, key)
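                # A plausible completion of the recursive walker (the preview stops
                # above); it follows the common json_extract pattern and is not
                # necessarily the gist's exact code.
                elif k == key:
                    arr.append(v)
        elif isinstance(obj, list):
            for item in obj:
                extract(item, arr, key)
        return arr

    values = extract(obj, arr, key)
    return values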
@ireneisdoomed
ireneisdoomed / Coloc - Directionality assessment.py
Last active January 31, 2023 10:38
Effect sizes from colocated QTLs are derived to inform the mechanism of action of a Genetics Portal association
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as f
spark = SparkSession.builder.getOrCreate()
# 1. Read data
coloc = (
    spark.read.parquet("gs://genetics-portal-dev-data/22.09.0/outputs/v2d_coloc")
    .filter(f.col("right_type") != "gwas")
    .select(
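A minimal sketch (not from the gist) of the directionality check itself, applied to the coloc table once the select above is completed: compare the signs of the GWAS and QTL effect sizes for each colocalising pair. The beta column names are assumptions for illustration.
directionality = coloc.withColumn(
    "concordant_direction",
    f.signum(f.col("left_var_beta")) == f.signum(f.col("left_var_right_study_beta")),
)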
@ireneisdoomed
ireneisdoomed / Coloc - L2G prediction concordance.py
Last active January 18, 2023 18:16
Brief analysis of the degree of concordance of the L2G predictions for study/locus pairs that colocalise
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as f
spark = SparkSession.builder.getOrCreate()
### PREPARE DATA ###
# Read data
coloc = (
    spark.read.parquet("gs://genetics-portal-dev-data/22.09.0/outputs/v2d_coloc")
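A minimal sketch (not from the gist) of the concordance setup: keep the top-scoring L2G gene per study/locus so it can later be compared across the two sides of each colocalising pair. The L2G path and the column names are assumptions for illustration.
l2g = spark.read.parquet("gs://placeholder-bucket/l2g/predictions")  # placeholder path
w = Window.partitionBy("study_id", "variant_id").orderBy(f.col("y_proba").desc())
top_l2g_gene = (
    l2g.withColumn("rank", f.row_number().over(w))
    .filter(f.col("rank") == 1)
    .select("study_id", "variant_id", f.col("gene_id").alias("top_gene"))
)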