d0choa/2021_approvals.R

## 2021_approvals.csv

          
            Drug (brand name)
            Sponsor
            Properties
            Indication
            DrugId
            DiseaseId
            TA
            Manual disease mapping
            ChemblCheck

            
              Vericiguat (Verquvo)
              Merck & Co./Bayer
              sGC stimulator
              Chronic heart failure
              CHEMBL4066936
              EFO_0001645
              Cardiovascular
              fuzzy

            
              Cabotegravir; rilpivirine (Cabenuva Kit)
              ViiV
              INSTI and an NNRTI
              HIV-1 infection
              CHEMBL2403238
              EFO_0000180
              Infectious
              exact

            
              Voclosporin (Lupkynis)
              Aurinia
              Calcineurin inhibitor
              Lupus nephritis
              CHEMBL2218919
              EFO_0002690
              Nephrology
              exact

            
              Tepotinib (Tepmetko)
              EMD Serono
              MET kinase inhibitor
              NSCLC
              CHEMBL3402762
              EFO_0003060
              Oncology
              exact

            
              Umbralisib (Ukoniq)
              TG Therapeutics
              PI3Kδ and CK1ε inhibitor
              MZL, follicular lymphoma
              CHEMBL3948730
              EFO_1000630
              Oncology
              exact

            
              Evinacumab (Evkeeza)
              Regeneron
              ANGPTL3-targeted mAb
              HoFH
              CHEMBL3545191
              Orphanet_391665
              Metabolic
              exact

            
              Trilaciclib (Cosela)
              G1 Therapeutics
              CDK4 and CDK6 kinase inhibitor
              Chemotherapy-induced myelosuppression
              CHEMBL3894860
              EFO_0000702
              Oncology
              NA

            
              Casimersen (Amondys 45)
              Sarepta
              Exon 45-skipping ASO
              DMD
              CHEMBL4297566
              Orphanet_98896
              Other
              exact

            
              Fosdenopterin (Nulibry)
              BridgeBio
              cPMP
              MoCD type A
              CHEMBL2338675
              Orphanet_308386
              Other
              exact

            
              Melphalan flufenamide (Pepaxto)
              Oncopeptides
              Peptide-conjugated alkylating drug
              Multiple myeloma
              CHEMBL4303060
              EFO_0001378
              Oncology
              exact

            
              Dexmethylphenidate; serdexmethylphenidate (Azstarys)
              Commave Therapeutics
              CNS stimulant
              ADHD
              CHEMBL827
              EFO_0003888
              Psychiatric
              exact

            
              Tivozanib (Fotivda)
              Aveo
              VEGFR kinase inhibitor
              Renal cell carcinoma
              CHEMBL1289494
              EFO_0000681
              Oncology
              exact

            
              Ponesimod (Ponvory)
              J&J
              S1P receptor modulator
              Relapsing multiple sclerosis
              CHEMBL1096146
              EFO_0003885
              Other
              fuzzy

            
              Dasiglucagon (Zegalogue)
              Zealand Pharma
              Glucagon receptor agonist
              Severe hypoglycaemia
              CHEMBL4297741
              EFO_0001360
              Metabolic
              exact

            
              Viloxazine (Qelbree)
              Supernus
              SNRI
              ADHD
              CHEMBL306700
              EFO_0003888
              Psychiatric
              exact

            
              Drospirenone; estetrol (Nextstellis)
              Mayne Pharma
              Spironolactone and oestrogen analogues
              To prevent pregnancy
              CHEMBL1509
              NA
              Reproductive
              NA

            
              Dostarlimab (Jemperli)
              GlaxoSmithKline
              PD1-targeted mAb
              Endometrial cancer
              CHEMBL4298124
              MONDO_0011962
              Oncology
              exact

            
              Loncastuximab tesirine (Zynlonta)
              ADC Therapeutics
              CD19-targeted ADC
              B-cell lymphoma
              CHEMBL4297778
              EFO_0000403
              Oncology
              exact

            
              Pegcetacoplan (Empaveli)
              Apellis
              Complement protein C3 inhibitor
              PNH
              CHEMBL4298211
              Orphanet_447
              Other
              exact

            
              Amivantamab (Rybrevant)
              J&J
              EGFR×METR bispecific antibody
              EGFR exon 20-mutated NSCLC
              CHEMBL4297774
              EFO_0003060
              Oncology
              fuzzy

            
              Piflufolastat F-18 (Pylarify)
              Progenics
              Radiolabelled PSMA imaging agent
              Prostate cancer imaging
              NA
              NA
              Imaging
              NA

            
              Infigratinib (Truseltiq)
              BridgeBio
              FGFR2 kinase inhibitor
              FGFR2-mutated bile duct cancer
              CHEMBL1852688
              EFO_0005540
              Oncology
              fuzzy

            
              Sotorasib (Lumakras)
              Amgen
              KRAS-G12C inhibitor
              KRASG12C-mutated NSCLC
              CHEMBL4535757
              EFO_0003060
              Oncology
              fuzzy
              TRUE

            
              Olanzapine; samidorphan (Lybalvi)
              Alkermes
              Atypical antipsychotic and opioid antagonist
              Schizophrenia and bipolar I disorder
              CHEMBL715
              EFO_0000692
              Psychiatric
              exact

            
              Ibrexafungerp (Brexafemme)
              Scynexis
              Triterpenoid antifungal
              Vulvovaginal candidiasis
              CHEMBL4297513
              EFO_0007543
              Infectious
              exact

            
              Aducanumab (Aduhelm)
              Biogen/Eisai
              Amyloid-β-targeted mAb
              Alzheimer’s disease
              CHEMBL3039540
              EFO_0000249
              Other
              exact

            
              Asparaginase erwinia chrysanthemi (Rylaze)
              Jazz
              Recombinant asparagine-specific enzyme
              ALL and LBL, in patients allergic to E. coli-derived products
              CHEMBL1863514
              EFO_0000220
              Oncology
              fuzzy

            
              Finerenone (Kerendia)
              Bayer
              Non-steroidal MR antagonist
              CKD with type 2 diabetes
              CHEMBL2181927
              EFO_0000401
              Other
              exact

            
              Fexinidazole (Fexinidazole)
              Sanofi/DNDi
              Nitroimidazole antimicrobial
              Sleeping sickness
              CHEMBL1631694
              DOID_10113
              Infectious
              exact

            
              Belumosudil (Rezurock)
              Kadmon
              ROCK2 kinase inhibitor
              Chronic GVHD
              CHEMBL4594302
              MONDO_0013730
              Other
              exact

            
              Odevixibat (Bylvay)
              Albireo
              IBAT inhibitor
              Pruritus in PFIC
              CHEMBL4297588
              Orphanet_172
              Other
              exact
              TRUE

            
              Anifrolumab (Saphnelo)
              AstraZeneca
              IFNAR-targeted mAb
              SLE
              CHEMBL2364653
              EFO_0002690
              Other
              exact

            
              Avalglucosidase alfa (Nexviazyme)
              Sanofi
              Recombinant α-glucosidase
              Pompe disease
              CHEMBL4594320
              Orphanet_365
              Other
              fuzzy

            
              Belzutifan (Welireg)
              Merck & Co.
              HIF-2α inhibitor
              von Hippel-Lindau disease
              CHEMBL4585668
              Orphanet_892
              Oncology
              exact
              TRUE

            
              Difelikefalin (Korsuva)
              Cara Therapeutics
              κ-Opioid receptor agonist
              Pruritus associated with CKD
              CHEMBL3989915
              EFO_0003884
              Other
              fuzzy

            
              Lonapegsomatropin (Skytrofa)
              Ascendis Pharma
              PEGylated human growth hormone
              Growth failure due to GHD
              CHEMBL4298185
              HP_0001510
              Other
              NA
              TRUE

            
              Mobocertinib (Exkivity)
              Takeda
              EGFR kinase inhibitor
              EGFR exon 20-mutated NSCLC
              CHEMBL4650319
              EFO_0003060
              Oncology
              fuzzy

            
              Tisotumab vedotin (Tivdak)
              Seagen/Genmab
              Tissue-factor-directed ADC
              Cervical cancer
              CHEMBL4297841
              MONDO_0002974
              Oncology
              exact

            
              Atogepant (Qulipta)
              AbbVie
              CGRP receptor antagonist
              Episodic migraine
              CHEMBL3991065
              EFO_0003821
              Other
              fuzzy

            
              Maralixibat (Livmarli)
              Mirum
              IBAT inhibitor
              Pruritus in Alagille syndrome
              CHEMBL363392
              Orphanet_52
              Other
              fuzzy

            
              Avacopan (Tavneos)
              ChemoCentryx
              Complement 5a receptor antagonist
              ANCA-associated vasculitis
              CHEMBL3989871
              EFO_0004826
              Cardiovascular
              exact

            
              Asciminib (Scemblix)
              Novartis
              ABL/BCR–ABL1 kinase inhibitor
              Ph+ CML
              CHEMBL4208229
              EFO_0000339
              Oncology
              fuzzy

            
              Ropeginterferon alfa-2b (Besremi)
              Pharmaessentia
              PEGylated interferon α-2b
              Polycythaemia vera
              CHEMBL4297819
              EFO_0002429
              Oncology
              exact

            
              Vosoritide (Voxzogo)
              Biomarin
              CNP analogue
              Achondroplasia
              CHEMBL3707276
              Orphanet_15
              Other
              exact

            
              Maribavir (Livtencity)
              Takeda
              CMV pUL97 kinase inhibitor
              Post-transplant CMV infection
              CHEMBL515408
              EFO_0001062
              Infectious
              fuzzy

            
              Pafolacianine (Cytalux)
              On Target Labs
              Fluorescent FR imaging agent
              Ovarian cancer imaging
              CHEMBL4297412
              MONDO_0008170
              Imaging
              exact

            
              Efgartigimod alfa (Vyvgart)
              Argenx
              FcRn-binding Fc fragment
              Myasthenia gravis
              CHEMBL4297551
              EFO_0004991
              Other
              exact

            
              Tezepelumab (Tezspire)
              Astrazeneca/Amgen
              TSLP-targeted mAb
              Severe asthma
              CHEMBL3707229
              EFO_0000270
              Respiratory
              exact

            
              Inclisiran (Leqvio)
              Novartis/Alnylam
              PCSK9-targeted siRNA
              HeFH or ASCVD
              CHEMBL3990033
              MONDO_0021661
              Cardiovascular
              fuzzy

            
              Tralokinumab (Adbry)
              LEO Pharma
              IL-13-targeted mAb
              Atopic dermatitis
              CHEMBL1743081
              EFO_0000274
              Dermatology
              fuzzy

## 2021_approvals.R
library("tidyverse")
library("sparklyr")
library("sparklyr.nested")
library("cowplot")
library("ggsci")

# Spark session setup ----
# Start from sparklyr's default configuration.
config <- spark_config()

# GCS connector "requester pays" options; the project id below is presumably
# the one charged for reads from requester-pays buckets (confirm if changed).
config[["spark.hadoop.fs.gs.requester.pays.mode"]] <- "AUTO"
config[["spark.hadoop.fs.gs.requester.pays.project.id"]] <-
    "open-targets-eu-dev"

# Connection handle used by every Spark read/copy in this script.
sc <- spark_connect(config = config, master = "yarn")

# Approvals as reported in the NRDD article.
# Kept as a Spark table (not cached); the joins below refer to its columns
# as Drug_brand_name, DrugId, DiseaseId and TA.
gs_approvals <- "gs://ot-team/dochoa/2021_approvals.csv"
approvals <- spark_read_csv(sc, path = gs_approvals, memory = FALSE)

# Datasource metadata, pulled into a local data frame. The plotting code
# uses its datasourceId, datasourceName and datasourceType columns.
ds_names <- collect(
    spark_read_csv(
        sc,
        path = "gs://ot-team/dochoa/datasourceMetadata.csv",
        memory = FALSE
    )
)

# Read Platform data
# Every dataset of a release lives under
#   <gs_path><data_release>/output/etl/parquet/<dataset>/
# so the shared prefix is built once and each dataset path is derived
# from it with paste0().
gs_path <- "gs://open-targets-data-releases/"
data_release <- "21.11"
etl_parquet_root <- paste0(gs_path, data_release, "/output/etl/parquet/")

all_evidence_path <- paste0(etl_parquet_root, "evidence/")
moa_path <- paste0(etl_parquet_root, "mechanismOfAction/")
ass_indirectby_ds_path <- paste0(
    etl_parquet_root,
    "associationByDatasourceIndirect/"
)
disease_path <- paste0(etl_parquet_root, "diseases/")
interaction_path <- paste0(etl_parquet_root, "interaction/")
disease2phenotype_path <- paste0(etl_parquet_root, "diseaseToPhenotype/")

# Mechanisms of action
# Hand-curated drug (ChEMBL id) -> target (Ensembl gene id) pairs used to
# fill gaps in the Platform MoA data. A drug with several targets appears
# once per target, so names in this list are intentionally repeated.
ammend_moas <- list(
    "CHEMBL4594302" = "ENSG00000134318",
    "CHEMBL4297741" = "ENSG00000215644",
    "CHEMBL4297774" = "ENSG00000146648",
    "CHEMBL4297774" = "ENSG00000105976",
    "CHEMBL4298185" = "ENSG00000112964", # chembl missing in platform
    "CHEMBL4650319" = "ENSG00000146648",
    "CHEMBL1863514" = "ENSG00000166183",
    "CHEMBL4594320" = "ENSG00000171298"
)

# Two-column local frame (one row per curated pair), then copied to Spark
# so it can be appended to the Platform table.
new_moas <- data.frame(
    chemblIds = names(ammend_moas),
    targetId = unlist(ammend_moas)
)
new_moas <- sdf_copy_to(sc, new_moas, overwrite = TRUE)

# Platform MoAs flattened to distinct (chemblIds, targetId) rows, followed
# by the curated additions.
moa <- sdf_bind_rows(
    spark_read_parquet(sc, moa_path, memory = FALSE) %>%
        select(chemblIds, targets) %>%
        sdf_explode(chemblIds) %>%
        sdf_explode(targets) %>%
        rename(targetId = targets) %>%
        sdf_distinct(),
    new_moas
)

# Platform associations, indirect (by datasource)
ass_indirectby_ds <- spark_read_parquet(sc, ass_indirectby_ds_path)

# Joining associations information:
# approval -> MoA target(s) -> per-datasource association for the approved
# indication. Both joins are left joins, so approvals without a target or
# without any association are kept (with NA in the joined columns). The
# result is collected into a local data frame.
ass <- collect(
    approvals %>%
        rename(diseaseId = DiseaseId) %>%
        left_join(moa, by = c(DrugId = "chemblIds")) %>%
        left_join(ass_indirectby_ds, by = c("diseaseId", "targetId"))
)

# Data about molecular interactions
# IntAct rows with both partners present and a score above 0.42, reduced
# to distinct (targetA, targetB) pairs.
interactions <- spark_read_parquet(sc, interaction_path, memory = FALSE) %>%
    filter(
        sourceDatabase == "intact",
        !is.na(targetA),
        !is.na(targetB),
        scoring > 0.42
    ) %>%
    select(targetA, targetB) %>%
    sdf_distinct()

# (datasource, drug) pairs for which a protein interacting with the drug
# target is associated with the approved indication. Collected locally and
# flagged with interactionAssociation = TRUE.
interactors_ass <- approvals %>%
    rename(diseaseId = DiseaseId) %>%
    inner_join(moa, by = c(DrugId = "chemblIds")) %>%
    inner_join(interactions, by = c(targetId = "targetA")) %>%
    inner_join(
        ass_indirectby_ds,
        by = c("diseaseId", targetB = "targetId")
    ) %>%
    select(datasourceId, Drug_brand_name) %>%
    sdf_distinct() %>%
    collect() %>%
    mutate(interactionAssociation = TRUE)

# Additional phenotype curation
# Hand-curated indication -> related phenotype/trait pairs, appended to the
# Platform disease-to-phenotype table below. An indication with several
# related terms appears once per term, so names are intentionally repeated.
ammend_phenotypes <- list(
    # Microalbuminuria (biomarker of CKD)
    "EFO_0000401" = "HP_0012594",
    # glycodeoxycholate sulfate (one of the bile acids that cause pruritus)
    "Orphanet_172" = "EFO_0005653",
    "Orphanet_52" = "EFO_0005653",
    # achondroplasia -> body height
    "Orphanet_15" = "EFO_0004339",
    "Orphanet_15" = "Orphanet_329191",
    # von Hippel-Lindau -> renal carcinoma
    "Orphanet_892" = "EFO_0000681",
    "EFO_0001360" = "MONDO_0018582",
    # growth delay -> height
    "HP_0001510" = "EFO_0004339",
    # CAD -> myocardial infarction
    "EFO_0001645" = "EFO_0000612"
)

# Two-column local frame (one row per curated pair), then copied to Spark.
new_phenotypes <- data.frame(
    diseaseId = names(ammend_phenotypes),
    phenotype = unlist(ammend_phenotypes)
)
new_phenotypes <- sdf_copy_to(sc, new_phenotypes, overwrite = TRUE)

# Platform disease to phenotype data, as distinct (diseaseId, phenotype)
# rows.
disease2phenotype <- sdf_distinct(
    select(
        spark_read_parquet(sc, disease2phenotype_path, memory = FALSE),
        diseaseId = disease,
        phenotype
    )
)

# Associations through indirect phenotypes:
# (datasource, drug) pairs for which the drug target is associated with a
# phenotype linked (by the Platform or by the curation above) to the
# approved indication. Collected locally and flagged with
# phenotypeAssociation = TRUE.
phenotype_ass <- approvals %>%
    rename(diseaseId = DiseaseId) %>%
    inner_join(moa, by = c(DrugId = "chemblIds")) %>%
    inner_join(
        sdf_bind_rows(disease2phenotype, new_phenotypes),
        by = "diseaseId"
    ) %>%
    inner_join(
        ass_indirectby_ds,
        by = c(phenotype = "diseaseId", "targetId")
    ) %>%
    select(datasourceId, Drug_brand_name) %>%
    sdf_distinct() %>%
    collect() %>%
    mutate(phenotypeAssociation = TRUE)

# Data to plot
# One row per (datasource, drug) with the best direct association score and
# flags for support through interacting proteins / related phenotypes.
data2plot <- ass %>%
    select(datasourceId, Drug_brand_name, score) %>%
    # full datasource x drug grid; combinations without a score get 0
    complete(datasourceId, Drug_brand_name) %>%
    mutate(score = replace_na(score, 0)) %>%
    filter(!is.na(datasourceId)) %>%
    # TA
    left_join(
        distinct(select(ass, Drug_brand_name, TA)),
        by = "Drug_brand_name"
    ) %>%
    # targets: noTarget marks drugs for which no target id was joined
    left_join(
        ass %>%
            mutate(noTarget = is.na(targetId)) %>%
            select(Drug_brand_name, noTarget) %>%
            distinct(),
        by = "Drug_brand_name"
    ) %>%
    # interactions
    left_join(interactors_ass, by = c("datasourceId", "Drug_brand_name")) %>%
    # related phenotypes
    left_join(phenotype_ass, by = c("datasourceId", "Drug_brand_name")) %>%
    # The steps inside one mutate() run top to bottom, so the two flags are
    # derived from the score before it is blanked for drugs without target.
    mutate(
        # a direct score also counts as interaction / phenotype support
        interactionAssociation = ifelse(
            score > 0, TRUE, interactionAssociation
        ),
        phenotypeAssociation = ifelse(
            score > 0, TRUE, phenotypeAssociation
        ),
        score = ifelse(noTarget, NA, score),
        TA = ifelse(noTarget, "No human target", TA),
        TA = fct_other(
            TA,
            keep = c("Oncology", "No human target"),
            other_level = "Other indication"
        ),
        TA = fct_relevel(TA, c(
            "Oncology",
            "Other indication",
            "No human target"
        ))
    ) %>%
    # mutate(datasourceId = fct_relevel(datasourceId, names(ds_name_list))) %>%
    # datasources left out of the figure
    filter(!(datasourceId %in% c(
        "chembl", "expression_atlas", "sysbio", "europepmc",
        "phenodigm", "reactome", "phewas_catalog"
    ))) %>%
    # drug score for the purpose of reordering them: the direct score plus
    # a small bonus per kind of indirect support
    mutate(
        rankscore = replace_na(score, 0),
        rankscore = ifelse(
            !is.na(interactionAssociation), rankscore + 0.01, rankscore
        ),
        rankscore = ifelse(
            !is.na(phenotypeAssociation), rankscore + 0.03, rankscore
        ),
        # levels sorted by descending mean rankscore, then reversed
        Drug_brand_name = fct_rev(fct_reorder(
            Drug_brand_name, rankscore, mean,
            na.rm = TRUE, .desc = TRUE
        ))
    ) %>%
    group_by(
        datasourceId,
        Drug_brand_name,
        TA,
        noTarget,
        interactionAssociation,
        phenotypeAssociation
    ) %>%
    # Best score per group. A group holding only NA yields -Inf (with a
    # warning, silenced here); the next step turns that back into NA.
    summarise(score = suppressWarnings(max(score, na.rm = TRUE))) %>%
    mutate(score = ifelse(score < 0, NA, score)) %>%
    left_join(ds_names, by = "datasourceId") %>%
    # fixed display order for datasource names and types
    mutate(
        datasourceName = factor(
            datasourceName,
            levels = ds_names$datasourceName
        ),
        datasourceType = factor(
            datasourceType,
            levels = c(
                "Somatic",
                "Functional genomics (cancer)",
                "Rare mendelian",
                "Common disease"
            )
        )
    )


# symbols to overlay in the plot
# Long table with one row per (tile, kind of support) for every tile whose
# interaction / phenotype flag is set. `overlay` is "interaction" or
# "phenotype"; overlaySize and overlaySymbol carry the point size and
# symbol code chosen for each kind.
overlay_data <- data2plot %>%
    ungroup() %>%
    select(
        datasourceName,
        datasourceType,
        Drug_brand_name,
        TA,
        interactionAssociation,
        phenotypeAssociation
    ) %>%
    # pivot_longer() replaces the superseded gather(); values_drop_na drops
    # the unset flags, which the original filtered out in a separate step.
    pivot_longer(
        cols = c(interactionAssociation, phenotypeAssociation),
        names_to = "overlay",
        values_to = "value",
        values_drop_na = TRUE
    ) %>%
    mutate(
        overlay = str_remove_all(overlay, "Association"),
        overlaySize = ifelse(overlay == "phenotype", 3, 1),
        overlaySymbol = as.character(ifelse(overlay == "phenotype", 1, 16))
    )

# plotting
# Heatmap of direct association scores (drug x datasource), faceted by
# therapeutic area and datasource type, with point symbols marking support
# through a related phenotype or an interacting protein.
output <- data2plot %>%
    ggplot(aes(
        x = datasourceName,
        y = Drug_brand_name)) +
    geom_tile(aes(fill = score), color = "white") +
    geom_point(data = overlay_data,
        aes(shape = overlay, size = overlaySize)) +
    scale_fill_material("blue",
        na.value = "grey90",
        name = "Direct association"
    ) +
    scale_shape_manual(
        breaks = c("phenotype", "interaction"),
        labels = c("Direct or related phenotype", "Direct or interacting protein"),
        # Named values tie each symbol to its overlay key explicitly.
        # Unnamed values are matched by position, and whether that position
        # follows `breaks` or the alphabetical limits depends on the
        # ggplot2 version, which could silently swap the two symbols.
        values = c(phenotype = 1, interaction = 16),
        name = "Supported by:") +
    # overlaySize already holds the point sizes to draw
    scale_size_identity() +
    facet_grid(TA ~ datasourceType, scales = "free", space = "free") +
    theme_cowplot(font_size = 12) +
    # labs(
    #     title = "Supporting evidence on 2021 FDA drug approvals",
    #     subtitle = "Target-Disease evidence from Open Targets"
    #     # caption =
    #     #     "Source: Nat Reviews Drug Discovery 10.1038/d41573-022-00001-9"
    # ) +
    theme(
        plot.background = element_rect(fill = "white"),
        strip.background = element_blank(),
        legend.direction = "horizontal",
        legend.box = "vertical",
        # NOTE(review): numeric legend.position is deprecated in recent
        # ggplot2 releases (legend.position.inside); kept as is so the
        # script still runs on the version it was written for.
        legend.position = c(-0.7, -0.16),
        legend.justification = c(0, 0),
        axis.ticks = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title = element_blank(),
        axis.line = element_blank(),
        text = element_text(family = "sans")
    ) +
    guides(
        fill = guide_colourbar(
            title.position = "top",
            title.hjust = 0.5,
            barwidth = 8,
            frame.colour = "black",
            ticks.colour = "black",
            order = 2
            ),
        shape = guide_legend(
            title.position = "top",
            direction = "vertical",
            order = 1
        )
    )
# Write the figure to disk (absolute, user-specific path).
ggsave(
    "/home/ochoa/2021_approvals.pdf",
    plot = output,
    width = 9,
    height = 11
)

## 2021_approvals_brief.r
library("tidyverse")
library("sparklyr")
library("sparklyr.nested")
library("cowplot")
library("ggsci")

# Spark session setup ----
# Start from sparklyr's default configuration.
config <- spark_config()

# GCS connector "requester pays" options; the project id below is presumably
# the one charged for reads from requester-pays buckets (confirm if changed).
config[["spark.hadoop.fs.gs.requester.pays.mode"]] <- "AUTO"
config[["spark.hadoop.fs.gs.requester.pays.project.id"]] <-
    "open-targets-eu-dev"

# Connection handle (local master) used by every Spark read/copy below.
sc <- spark_connect(config = config, master = "local")

# Approvals as reported in the NRDD article.
# Kept as a Spark table (not cached); the joins below refer to its columns
# as Drug_brand_name, DrugId, DiseaseId and TA.
gs_approvals <- "gs://ot-team/dochoa/2021_approvals.csv"
approvals <- spark_read_csv(sc, path = gs_approvals, memory = FALSE)

# Datasource metadata, pulled into a local data frame. The code below uses
# its datasourceId, datasourceName and datasourceType columns.
ds_names <- collect(
    spark_read_csv(
        sc,
        path = "gs://ot-team/dochoa/datasourceMetadata.csv",
        memory = FALSE
    )
)

# Read Platform data
# Every dataset of a release lives under
#   <gs_path><data_release>/output/etl/parquet/<dataset>/
# so the shared prefix is built once and each dataset path is derived
# from it with paste0().
gs_path <- "gs://open-targets-data-releases/"
data_release <- "21.11"
etl_parquet_root <- paste0(gs_path, data_release, "/output/etl/parquet/")

all_evidence_path <- paste0(etl_parquet_root, "evidence/")
moa_path <- paste0(etl_parquet_root, "mechanismOfAction/")
ass_indirectby_ds_path <- paste0(
    etl_parquet_root,
    "associationByDatasourceIndirect/"
)
disease_path <- paste0(etl_parquet_root, "diseases/")
interaction_path <- paste0(etl_parquet_root, "interaction/")
disease2phenotype_path <- paste0(etl_parquet_root, "diseaseToPhenotype/")

# Mechanisms of action
# Hand-curated drug (ChEMBL id) -> target (Ensembl gene id) pairs used to
# fill gaps in the Platform MoA data. A drug with several targets appears
# once per target, so names in this list are intentionally repeated.
ammend_moas <- list(
    "CHEMBL4594302" = "ENSG00000134318",
    "CHEMBL4297741" = "ENSG00000215644",
    "CHEMBL4297774" = "ENSG00000146648",
    "CHEMBL4297774" = "ENSG00000105976",
    "CHEMBL4298185" = "ENSG00000112964", # chembl missing in platform
    "CHEMBL4650319" = "ENSG00000146648",
    "CHEMBL1863514" = "ENSG00000166183",
    "CHEMBL4594320" = "ENSG00000171298"
)

# Two-column local frame (one row per curated pair), then copied to Spark
# so it can be appended to the Platform table.
new_moas <- data.frame(
    chemblIds = names(ammend_moas),
    targetId = unlist(ammend_moas)
)
new_moas <- sdf_copy_to(sc, new_moas, overwrite = TRUE)

# Platform MoAs flattened to distinct (chemblIds, targetId) rows, followed
# by the curated additions.
moa <- sdf_bind_rows(
    spark_read_parquet(sc, moa_path, memory = FALSE) %>%
        select(chemblIds, targets) %>%
        sdf_explode(chemblIds) %>%
        sdf_explode(targets) %>%
        rename(targetId = targets) %>%
        sdf_distinct(),
    new_moas
)

# Platform associations, indirect (by datasource)
ass_indirectby_ds <- spark_read_parquet(sc, ass_indirectby_ds_path)

# Joining associations information:
# approval -> MoA target(s) -> per-datasource association for the approved
# indication. Both joins are left joins, so approvals without a target or
# without any association are kept (with NA in the joined columns). The
# result is collected into a local data frame.
ass <- collect(
    approvals %>%
        rename(diseaseId = DiseaseId) %>%
        left_join(moa, by = c(DrugId = "chemblIds")) %>%
        left_join(ass_indirectby_ds, by = c("diseaseId", "targetId"))
)

# Data about molecular interactions
# IntAct rows with both partners present and a score above 0.42, reduced
# to distinct (targetA, targetB) pairs.
interactions <- spark_read_parquet(sc, interaction_path, memory = FALSE) %>%
    filter(
        sourceDatabase == "intact",
        !is.na(targetA),
        !is.na(targetB),
        scoring > 0.42
    ) %>%
    select(targetA, targetB) %>%
    sdf_distinct()

# (datasource, drug) pairs for which a protein interacting with the drug
# target is associated with the approved indication. Collected locally and
# flagged with interactionAssociation = TRUE.
interactors_ass <- approvals %>%
    rename(diseaseId = DiseaseId) %>%
    inner_join(moa, by = c(DrugId = "chemblIds")) %>%
    inner_join(interactions, by = c(targetId = "targetA")) %>%
    inner_join(
        ass_indirectby_ds,
        by = c("diseaseId", targetB = "targetId")
    ) %>%
    select(datasourceId, Drug_brand_name) %>%
    sdf_distinct() %>%
    collect() %>%
    mutate(interactionAssociation = TRUE)

# Additional phenotype curation
# Hand-curated indication -> related phenotype/trait pairs, appended to the
# Platform disease-to-phenotype table below. An indication with several
# related terms appears once per term, so names are intentionally repeated.
ammend_phenotypes <- list(
    # Microalbuminuria (biomarker of CKD)
    "EFO_0000401" = "HP_0012594",
    # glycodeoxycholate sulfate (one of the bile acids that cause pruritus)
    "Orphanet_172" = "EFO_0005653",
    "Orphanet_52" = "EFO_0005653",
    # achondroplasia -> body height
    "Orphanet_15" = "EFO_0004339",
    "Orphanet_15" = "Orphanet_329191",
    # von Hippel-Lindau -> renal carcinoma
    "Orphanet_892" = "EFO_0000681",
    "EFO_0001360" = "MONDO_0018582",
    # growth delay -> height
    "HP_0001510" = "EFO_0004339",
    # CAD -> myocardial infarction
    "EFO_0001645" = "EFO_0000612"
)

# Two-column local frame (one row per curated pair), then copied to Spark.
new_phenotypes <- data.frame(
    diseaseId = names(ammend_phenotypes),
    phenotype = unlist(ammend_phenotypes)
)
new_phenotypes <- sdf_copy_to(sc, new_phenotypes, overwrite = TRUE)

# Platform disease to phenotype data, as distinct (diseaseId, phenotype)
# rows.
disease2phenotype <- sdf_distinct(
    select(
        spark_read_parquet(sc, disease2phenotype_path, memory = FALSE),
        diseaseId = disease,
        phenotype
    )
)

# Associations through indirect phenotypes:
# (datasource, drug) pairs for which the drug target is associated with a
# phenotype linked (by the Platform or by the curation above) to the
# approved indication. Collected locally and flagged with
# phenotypeAssociation = TRUE.
phenotype_ass <- approvals %>%
    rename(diseaseId = DiseaseId) %>%
    inner_join(moa, by = c(DrugId = "chemblIds")) %>%
    inner_join(
        sdf_bind_rows(disease2phenotype, new_phenotypes),
        by = "diseaseId"
    ) %>%
    inner_join(
        ass_indirectby_ds,
        by = c(phenotype = "diseaseId", "targetId")
    ) %>%
    select(datasourceId, Drug_brand_name) %>%
    sdf_distinct() %>%
    collect() %>%
    mutate(phenotypeAssociation = TRUE)

# Data to plot
# One row per (datasource, drug) with the best direct association score and
# flags for support through interacting proteins / related phenotypes.
data2plot <- ass %>%
    select(datasourceId, Drug_brand_name, score) %>%
    # full datasource x drug grid; combinations without a score get 0
    complete(datasourceId, Drug_brand_name) %>%
    mutate(score = replace_na(score, 0)) %>%
    filter(!is.na(datasourceId)) %>%
    # TA
    left_join(
        distinct(select(ass, Drug_brand_name, TA)),
        by = "Drug_brand_name"
    ) %>%
    # targets: noTarget marks drugs for which no target id was joined
    left_join(
        ass %>%
            mutate(noTarget = is.na(targetId)) %>%
            select(Drug_brand_name, noTarget) %>%
            distinct(),
        by = "Drug_brand_name"
    ) %>%
    # interactions
    left_join(interactors_ass, by = c("datasourceId", "Drug_brand_name")) %>%
    # related phenotypes
    left_join(phenotype_ass, by = c("datasourceId", "Drug_brand_name")) %>%
    # The steps inside one mutate() run top to bottom, so the two flags are
    # derived from the score before it is blanked for drugs without target.
    mutate(
        # a direct score also counts as interaction / phenotype support
        interactionAssociation = ifelse(
            score > 0, TRUE, interactionAssociation
        ),
        phenotypeAssociation = ifelse(
            score > 0, TRUE, phenotypeAssociation
        ),
        score = ifelse(noTarget, NA, score),
        TA = ifelse(noTarget, "No human target", TA),
        TA = fct_other(
            TA,
            keep = c("Oncology", "No human target"),
            other_level = "Other indication"
        ),
        TA = fct_relevel(TA, c(
            "Oncology",
            "Other indication",
            "No human target"
        ))
    ) %>%
    # mutate(datasourceId = fct_relevel(datasourceId, names(ds_name_list))) %>%
    # datasources left out of the figure
    filter(!(datasourceId %in% c(
        "chembl", "expression_atlas", "sysbio", "europepmc",
        "phenodigm", "reactome", "phewas_catalog"
    ))) %>%
    # drug score for the purpose of reordering them: the direct score plus
    # a small bonus per kind of indirect support
    mutate(
        rankscore = replace_na(score, 0),
        rankscore = ifelse(
            !is.na(interactionAssociation), rankscore + 0.01, rankscore
        ),
        rankscore = ifelse(
            !is.na(phenotypeAssociation), rankscore + 0.03, rankscore
        ),
        # levels sorted by descending mean rankscore, then reversed
        Drug_brand_name = fct_rev(fct_reorder(
            Drug_brand_name, rankscore, mean,
            na.rm = TRUE, .desc = TRUE
        ))
    ) %>%
    group_by(
        datasourceId,
        Drug_brand_name,
        TA,
        noTarget,
        interactionAssociation,
        phenotypeAssociation
    ) %>%
    # Best score per group. A group holding only NA yields -Inf (with a
    # warning, silenced here); the next step turns that back into NA.
    summarise(score = suppressWarnings(max(score, na.rm = TRUE))) %>%
    mutate(score = ifelse(score < 0, NA, score)) %>%
    left_join(ds_names, by = "datasourceId") %>%
    # fixed display order for datasource names and types
    mutate(
        datasourceName = factor(
            datasourceName,
            levels = ds_names$datasourceName
        ),
        datasourceType = factor(
            datasourceType,
            levels = c(
                "Somatic",
                "Functional genomics (cancer)",
                "Rare mendelian",
                "Common disease"
            )
        )
    )

# Collapse the per-datasource plot table into one evidence label per group.
#
# plot_data         Table shaped like data2plot (columns score, noTarget,
#                   interactionAssociation, phenotypeAssociation plus the
#                   grouping columns).
# ...               Grouping columns, passed on to group_by().
# datasource_label  Optional constant stored in a datasourceType column
#                   when datasourceType is not one of the grouping columns.
#
# Returns a long table (Drug_brand_name, TA, datasourceType, evidence,
# value) holding only the rows whose flag is TRUE. Within a group the
# kinds of support are made mutually exclusive with the priority
#   direct score > related phenotype > interacting protein,
# and noEvidence is set when none of them (nor noTarget) applies.
# summarise() keeps its default grouping behaviour and gather() is kept
# on purpose, so grouping and row order of the result match what the
# downstream plotting code has always received.
summarise_support <- function(plot_data, ..., datasource_label = NULL) {
    support <- plot_data %>%
        mutate(score = replace_na(score, 0)) %>%
        group_by(...) %>%
        summarise(
            noTarget = any(noTarget),
            interactionAssociation = any(interactionAssociation),
            phenotypeAssociation = any(phenotypeAssociation),
            # scalar per group, so a plain comparison replaces ifelse()
            score = max(score, na.rm = TRUE) > 0
        )
    if (!is.null(datasource_label)) {
        support <- mutate(support, datasourceType = datasource_label)
    }
    support %>%
        # steps inside one mutate() are applied top to bottom
        mutate(
            noTarget = replace_na(noTarget, FALSE),
            phenotypeAssociation = replace_na(phenotypeAssociation, FALSE),
            phenotypeAssociation = ifelse(score, FALSE, phenotypeAssociation),
            interactionAssociation = replace_na(
                interactionAssociation, FALSE
            ),
            interactionAssociation = ifelse(
                score, FALSE, interactionAssociation
            ),
            interactionAssociation = ifelse(
                phenotypeAssociation, FALSE, interactionAssociation
            ),
            noEvidence = !(
                interactionAssociation | phenotypeAssociation |
                    score | noTarget
            )
        ) %>%
        gather(
            "evidence", "value",
            -Drug_brand_name, -TA, -datasourceType
        ) %>%
        filter(value)
}

# Values per data source
briefplotdata <- summarise_support(
    data2plot,
    Drug_brand_name, TA, datasourceType
)

# Values any data source
briefplotdataAny <- summarise_support(
    data2plot,
    Drug_brand_name, TA,
    datasource_label = "Any"
)


# Condensed figure: one tile per drug and data source type, coloured by the
# evidence category and faceted by therapeutic area.
# Note that the pipeline ends in ggplot(), so `output` holds the plot object
# itself, not the data frame that feeds it.
output <- bind_rows(briefplotdataAny, briefplotdata) %>%
# Put the pooled "Any" row first, followed by the individual source types
mutate(datasourceType = fct_relevel(datasourceType, levels = c("Any", "Somatic", "Functional genomics (cancer)", "Rare mendelian", "Common disease"))) %>%
# Order the evidence categories before giving them display labels
mutate(evidence = fct_relevel(evidence,
    "score",
    "phenotypeAssociation",
    "interactionAssociation",
    "noTarget",
    "noEvidence")) %>%
# Replace the internal column names with the labels shown in the legend
mutate(evidence = fct_recode(evidence,
    "Direct" = "score",
    "Close phenotype" = "phenotypeAssociation",
    "Interacting protein" = "interactionAssociation",
    "No human target" = "noTarget",
    "Not available" = "noEvidence"
)) %>%
# Sort by therapeutic area and evidence, then number the rows within each
# data source type; `rn` is the x position of the tile
arrange(TA, desc(evidence)) %>%
group_by(datasourceType) %>%
mutate(rn = row_number()) %>%
# "Not available" becomes NA so it is drawn with the scale's `na.value`
mutate(evidence = replace(evidence, evidence == "Not available", NA)) %>%
ggplot(aes(x = rn, y = fct_rev(datasourceType), fill = fct_rev(evidence))) +
geom_tile(color = "white", height = .8, size = 0.5) +
# One panel per therapeutic area, with panel width following its contents
facet_grid(
    ~TA,
    scales = "free",
    space = "free"
) +
# scale_fill_npg(name = "Genetic support", na.value = "grey90") +
# Fixed colour per evidence category; NA tiles are light grey
scale_fill_manual(
    name = "Genetic support",
    values = c("#3C5488FF", "#00A087FF", "#4DBBD5FF", "grey60"),
    breaks = c("Direct", "Close phenotype", "Interacting protein", "No human target"),
    na.value = "grey90") +
# Wrap long data source type names on the y axis
scale_y_discrete(name = "Genetic data source", labels = function(x) str_wrap(x, width = 12)) +
theme_cowplot(font_size = 11) +
# Strip x axis text, ticks and axis lines; legend goes below the panels
theme(
    plot.background = element_rect(fill = "white"),
    strip.background = element_blank(),
    axis.ticks = element_blank(),
    legend.position = "bottom",
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_text(margin = margin(t = 0, r = 15, b = 0, l = 0)),
    axis.line = element_blank(),
    text = element_text(family = "sans"),
    panel.spacing = unit(-0.5, "lines")
)

# Write the condensed figure to a PDF file.
# The argument list ends without a trailing comma, so no empty argument is
# forwarded through `...` to the graphics device.
ggsave(
    "/home/ochoa/2021_approvals_brief.pdf",
    plot = output,
    width = 6.5,
    height = 3.5,
    dpi = 400
)

## datasourceMetadata.csv

          
            datasourceId
            datasourceName
            datasourceType

            
              cancer_gene_census
              CGC (COSMIC)
              Somatic

            
              intogen
              IntOgen
              Somatic

            
              cancer_biomarkers
              Cancer Biomarkers (CGI)
              Somatic

            
              crispr
              Project Score
              Functional genomics (cancer)

            
              slapenrich
              SlapEnrich
              Functional genomics (cancer)

            
              progeny
              Progeny
              Functional genomics (cancer)

            
              eva_somatic
              ClinVar (Somatic)
              Somatic

            
              ot_genetics_portal
              OT Genetics Portal
              Common disease

            
              phewas_catalog
              Phewas Catalog
              Common disease

            
              eva
              ClinVar
              Rare mendelian

            
              clingen
              Clingen
              Rare mendelian

            
              genomics_england
              GEL PanelApp
              Rare mendelian

            
              orphanet
              Orphanet
              Rare mendelian

            
              gene2phenotype
              gene2phenotype
              Rare mendelian

            
              uniprot_literature
              Uniprot (gene-disease)
              Rare mendelian

            
              uniprot_variants
              Uniprot (variants)
              Rare mendelian

            
              reactome
              Reactome
              Functional genomics (cancer)

            
              phenodigm
              Mouse model (phenodigm)
              Mouse model

            
              europepmc
              Literature (EPMC)
              Literature

            
              expression_atlas
              ExpressionAtlas (Diff expression)
              Differential Expression

            
              chembl
              drugs
              Drugs

## export_data.r


# Data sources behind each drug's rows in `ass`, as one ";"-separated string
# per drug. Excluded sources are dropped and "eva" is relabelled "clinvar".
directSources <- ass %>%
    filter(
        !is.na(datasourceId),
        !(datasourceId %in% c(
            "chembl", "expression_atlas", "sysbio", "europepmc",
            "phenodigm", "reactome", "phewas_catalog"
        ))
    ) %>%
    mutate(datasourceId = str_replace(datasourceId, "eva", "clinvar")) %>%
    group_by(Drug_brand_name) %>%
    summarise(directSources = paste(unique(datasourceId), collapse = ";"))

# Overall ("Any" data source) evidence category of each drug.
# The plotting code assigns a ggplot object to `output`, and dplyr verbs have
# no method for plot objects, so the data frame stored in the plot is used
# instead; a plain data frame is still accepted unchanged.
# NOTE(review): this expects the condensed plot (the one with an "Any" row and
# an `evidence` column) to be the current `output` — confirm the run order.
summaryResults <- (if (inherits(output, "ggplot")) output$data else output) %>%
    # drop any grouping so select() returns exactly the two requested columns
    ungroup() %>%
    filter(datasourceType == "Any") %>%
    select(Drug_brand_name, evidence)

# Close phenotypes per drug: ids, names and the data sources behind them,
# each collapsed into one ";"-separated string.
# NOTE(review): the join happens before collect(), so `phenotype_ass` is
# presumably still a Spark table with a `phenotype` column here — confirm
# against the script that builds it.
closePhenotypes <- phenotype_ass %>%
    select(Drug_brand_name, datasourceId, phenotype) %>%
    left_join(
        spark_read_parquet(sc, disease_path) %>%
            select(phenotype = id, phenotypeName = name),
        by = "phenotype"
    ) %>%
    collect() %>%
    filter(!(datasourceId %in% c(
        "chembl", "expression_atlas", "sysbio", "europepmc",
        "phenodigm", "reactome", "phewas_catalog"
    ))) %>%
    mutate(datasourceId = str_replace(datasourceId, "eva", "clinvar")) %>%
    distinct() %>%
    group_by(Drug_brand_name) %>%
    summarise(
        closePhenotypeIds = paste(unique(phenotype), collapse = ";"),
        closePhenotypeNames = paste(unique(phenotypeName), collapse = ";"),
        closePhenotypeDataSources = paste(unique(datasourceId), collapse = ";")
    )

# Location of the target parquet dataset for the selected data release
target_path <- paste0(gs_path, data_release, "/output/etl/parquet/target/")

# Interacting proteins per drug: drug target -> interactor (`targetB`) ->
# association of the interactor with the drug's disease. Ids, symbols and
# data sources are each collapsed into one ";"-separated string.
intDf <- approvals %>%
    rename(diseaseId = DiseaseId) %>%
    inner_join(moa, by = c("DrugId" = "chemblIds")) %>%
    inner_join(interactions, by = c("targetId" = "targetA")) %>%
    inner_join(
        ass_indirectby_ds,
        by = c("diseaseId", "targetB" = "targetId")
    ) %>%
    # attach the approved symbol of the interacting target
    left_join(
        spark_read_parquet(sc, target_path) %>%
            select(targetB = id, approvedSymbol),
        by = "targetB"
    ) %>%
    select(Drug_brand_name, targetB, datasourceId, approvedSymbol) %>%
    collect() %>%
    filter(!(datasourceId %in% c(
        "chembl", "expression_atlas", "sysbio", "europepmc",
        "phenodigm", "reactome", "phewas_catalog"
    ))) %>%
    mutate(datasourceId = str_replace(datasourceId, "eva", "clinvar")) %>%
    distinct() %>%
    group_by(Drug_brand_name) %>%
    summarise(
        interactingIds = paste(unique(targetB), collapse = ";"),
        interactingSymbols = paste(unique(approvedSymbol), collapse = ";"),
        interactingDataSources = paste(unique(datasourceId), collapse = ";")
    )

# Final per-drug table: drug metadata and target ids, joined with the overall
# evidence category, the direct data sources, the close phenotypes and the
# interacting proteins.
out <- ass %>%
    group_by(Drug_brand_name, Sponsor, DrugId, Indication, diseaseId, Properties) %>%
    # `ass` can carry the same target on several rows (it also has a
    # `datasourceId` column), so list each target id only once, as the other
    # per-drug summaries do
    summarise(targetIds = paste(unique(targetId), collapse = ";")) %>%
    ungroup() %>%
    left_join(summaryResults, by = "Drug_brand_name") %>%
    left_join(directSources, by = "Drug_brand_name") %>%
    left_join(closePhenotypes, by = "Drug_brand_name") %>%
    left_join(intDf, by = "Drug_brand_name")

# Export the table
out %>% write_csv("/home/ochoa/2021_approvals_output.csv")
Drug (brand name)	Sponsor	Properties	Indication	DrugId	DiseaseId	TA	Manual disease mapping	ChemblCheck
Vericiguat (Verquvo)	Merck & Co./Bayer	sGC stimulator	Chronic heart failure	CHEMBL4066936	EFO_0001645	Cardiovascular	fuzzy
Cabotegravir; rilpivirine (Cabenuva Kit)	ViiV	INSTI and an NNRTI	HIV-1 infection	CHEMBL2403238	EFO_0000180	Infectious	exact
Voclosporin (Lupkynis)	Aurinia	Calcineurin inhibitor	Lupus nephritis	CHEMBL2218919	EFO_0002690	Nephrology	exact
Tepotinib (Tepmetko)	EMD Serono	MET kinase inhibitor	NSCLC	CHEMBL3402762	EFO_0003060	Oncology	exact
Umbralisib (Ukoniq)	TG Therapeutics	PI3Kδ and CK1ε inhibitor	MZL, follicular lymphoma	CHEMBL3948730	EFO_1000630	Oncology	exact
Evinacumab (Evkeeza)	Regeneron	ANGPTL3-targeted mAb	HoFH	CHEMBL3545191	Orphanet_391665	Metabolic	exact
Trilaciclib (Cosela)	G1 Therapeutics	CDK4 and CDK6 kinase inhibitor	Chemotherapy-induced myelosuppression	CHEMBL3894860	EFO_0000702	Oncology	NA
Casimersen (Amondys 45)	Sarepta	Exon 45-skipping ASO	DMD	CHEMBL4297566	Orphanet_98896	Other	exact
Fosdenopterin (Nulibry)	BridgeBio	cPMP	MoCD type A	CHEMBL2338675	Orphanet_308386	Other	exact
Melphalan flufenamide (Pepaxto)	Oncopeptides	Peptide-conjugated alkylating drug	Multiple myeloma	CHEMBL4303060	EFO_0001378	Oncology	exact
Dexmethylphenidate; serdexmethylphenidate (Azstarys)	Commave Therapeutics	CNS stimulant	ADHD	CHEMBL827	EFO_0003888	Psychiatric	exact
Tivozanib (Fotivda)	Aveo	VEGFR kinase inhibitor	Renal cell carcinoma	CHEMBL1289494	EFO_0000681	Oncology	exact
Ponesimod (Ponvory)	J&J	S1P receptor modulator	Relapsing multiple sclerosis	CHEMBL1096146	EFO_0003885	Other	fuzzy
Dasiglucagon (Zegalogue)	Zealand Pharma	Glucagon receptor agonist	Severe hypoglycaemia	CHEMBL4297741	EFO_0001360	Metabolic	exact
Viloxazine (Qelbree)	Supernus	SNRI	ADHD	CHEMBL306700	EFO_0003888	Psychiatric	exact
Drospirenone; estetrol (Nextstellis)	Mayne Pharma	Spironolactone and oestrogen analogues	To prevent pregnancy	CHEMBL1509	NA	Reproductive	NA
Dostarlimab (Jemperli)	GlaxoSmithKline	PD1-targeted mAb	Endometrial cancer	CHEMBL4298124	MONDO_0011962	Oncology	exact
Loncastuximab tesirine (Zynlonta)	ADC Therapeutics	CD19-targeted ADC	B-cell lymphoma	CHEMBL4297778	EFO_0000403	Oncology	exact
Pegcetacoplan (Empaveli)	Apellis	Complement protein C3 inhibitor	PNH	CHEMBL4298211	Orphanet_447	Other	exact
Amivantamab (Rybrevant)	J&J	EGFR×METR bispecific antibody	EGFR exon 20-mutated NSCLC	CHEMBL4297774	EFO_0003060	Oncology	fuzzy
Piflufolastat F-18 (Pylarify)	Progenics	Radiolabelled PSMA imaging agent	Prostate cancer imaging	NA	NA	Imaging	NA
Infigratinib (Truseltiq)	BridgeBio	FGFR2 kinase inhibitor	FGFR2-mutated bile duct cancer	CHEMBL1852688	EFO_0005540	Oncology	fuzzy
Sotorasib (Lumakras)	Amgen	KRAS-G12C inhibitor	KRASG12C-mutated NSCLC	CHEMBL4535757	EFO_0003060	Oncology	fuzzy	TRUE
Olanzapine; samidorphan (Lybalvi)	Alkermes	Atypical antipsychotic and opioid antagonist	Schizophrenia and bipolar I disorder	CHEMBL715	EFO_0000692	Psychiatric	exact
Ibrexafungerp (Brexafemme)	Scynexis	Triterpenoid antifungal	Vulvovaginal candidiasis	CHEMBL4297513	EFO_0007543	Infectious	exact
Aducanumab (Aduhelm)	Biogen/Eisai	Amyloid-β-targeted mAb	Alzheimer’s disease	CHEMBL3039540	EFO_0000249	Other	exact
Asparaginase erwinia chrysanthemi (Rylaze)	Jazz	Recombinant asparagine-specific enzyme	ALL and LBL, in patients allergic to E. coli-derived products	CHEMBL1863514	EFO_0000220	Oncology	fuzzy
Finerenone (Kerendia)	Bayer	Non-steroidal MR antagonist	CKD with type 2 diabetes	CHEMBL2181927	EFO_0000401	Other	exact
Fexinidazole (Fexinidazole)	Sanofi/DNDi	Nitroimidazole antimicrobial	Sleeping sickness	CHEMBL1631694	DOID_10113	Infectious	exact
Belumosudil (Rezurock)	Kadmon	ROCK2 kinase inhibitor	Chronic GVHD	CHEMBL4594302	MONDO_0013730	Other	exact
Odevixibat (Bylvay)	Albireo	IBAT inhibitor	Pruritus in PFIC	CHEMBL4297588	Orphanet_172	Other	exact	TRUE
Anifrolumab (Saphnelo)	AstraZeneca	IFNAR-targeted mAb	SLE	CHEMBL2364653	EFO_0002690	Other	exact
Avalglucosidase alfa (Nexviazyme)	Sanofi	Recombinant α-glucosidase	Pompe disease	CHEMBL4594320	Orphanet_365	Other	fuzzy
Belzutifan (Welireg)	Merck & Co.	HIF-2α inhibitor	von Hippel-Lindau disease	CHEMBL4585668	Orphanet_892	Oncology	exact	TRUE
Difelikefalin (Korsuva)	Cara Therapeutics	κ-Opioid receptor agonist	Pruritus associated with CKD	CHEMBL3989915	EFO_0003884	Other	fuzzy
Lonapegsomatropin (Skytrofa)	Ascendis Pharma	PEGylated human growth hormone	Growth failure due to GHD	CHEMBL4298185	HP_0001510	Other	NA	TRUE
Mobocertinib (Exkivity)	Takeda	EGFR kinase inhibitor	EGFR exon 20-mutated NSCLC	CHEMBL4650319	EFO_0003060	Oncology	fuzzy
Tisotumab vedotin (Tivdak)	Seagen/Genmab	Tissue-factor-directed ADC	Cervical cancer	CHEMBL4297841	MONDO_0002974	Oncology	exact
Atogepant (Qulipta)	AbbVie	CGRP receptor antagonist	Episodic migraine	CHEMBL3991065	EFO_0003821	Other	fuzzy
Maralixibat (Livmarli)	Mirum	IBAT inhibitor	Pruritus in Alagille syndrome	CHEMBL363392	Orphanet_52	Other	fuzzy
Avacopan (Tavneos)	ChemoCentryx	Complement 5a receptor antagonist	ANCA-associated vasculitis	CHEMBL3989871	EFO_0004826	Cardiovascular	exact
Asciminib (Scemblix)	Novartis	ABL/BCR–ABL1 kinase inhibitor	Ph+ CML	CHEMBL4208229	EFO_0000339	Oncology	fuzzy
Ropeginterferon alfa-2b (Besremi)	Pharmaessentia	PEGylated interferon α-2b	Polycythaemia vera	CHEMBL4297819	EFO_0002429	Oncology	exact
Vosoritide (Voxzogo)	Biomarin	CNP analogue	Achondroplasia	CHEMBL3707276	Orphanet_15	Other	exact
Maribavir (Livtencity)	Takeda	CMV pUL97 kinase inhibitor	Post-transplant CMV infection	CHEMBL515408	EFO_0001062	Infectious	fuzzy
Pafolacianine (Cytalux)	On Target Labs	Fluorescent FR imaging agent	Ovarian cancer imaging	CHEMBL4297412	MONDO_0008170	Imaging	exact
Efgartigimod alfa (Vyvgart)	Argenx	FcRn-binding Fc fragment	Myasthenia gravis	CHEMBL4297551	EFO_0004991	Other	exact
Tezepelumab (Tezspire)	Astrazeneca/Amgen	TSLP-targeted mAb	Severe asthma	CHEMBL3707229	EFO_0000270	Respiratory	exact
Inclisiran (Leqvio)	Novartis/Alnylam	PCSK9-targeted siRNA	HeFH or ASCVD	CHEMBL3990033	MONDO_0021661	Cardiovascular	fuzzy
Tralokinumab (Adbry)	LEO Pharma	IL-13-targeted mAb	Atopic dermatitis	CHEMBL1743081	EFO_0000274	Dermatology	fuzzy
	library("tidyverse")
	library("sparklyr")
	library("sparklyr.nested")
	library("cowplot")
	library("ggsci")

	# Spark configuration
	config <- spark_config()

	# Requester-pays settings for reading the datasets hosted on GCS
	config[["spark.hadoop.fs.gs.requester.pays.mode"]] <- "AUTO"
	config[["spark.hadoop.fs.gs.requester.pays.project.id"]] <- "open-targets-eu-dev"

	# Open the Spark connection used by all reads below
	sc <- spark_connect(master = "yarn", config = config)

	# Approvals as reported in the NRDD article; kept as a Spark table
	gs_approvals <- "gs://ot-team/dochoa/2021_approvals.csv"
	approvals <- spark_read_csv(sc, path = gs_approvals, memory = FALSE)

	# Data source metadata (id, display name, type), pulled into local memory
	ds_names <- collect(
	    spark_read_csv(
	        sc,
	        path = "gs://ot-team/dochoa/datasourceMetadata.csv",
	        memory = FALSE
	    )
	)

	# Locations of the Platform parquet datasets for the chosen release
	gs_path <- "gs://open-targets-data-releases/"
	data_release <- "21.11"
	all_evidence_path <- paste0(
	    gs_path, data_release, "/output/etl/parquet/evidence/"
	)
	moa_path <- paste0(
	    gs_path, data_release, "/output/etl/parquet/mechanismOfAction/"
	)
	ass_indirectby_ds_path <- paste0(
	    gs_path, data_release,
	    "/output/etl/parquet/associationByDatasourceIndirect/"
	)
	disease_path <- paste0(
	    gs_path, data_release, "/output/etl/parquet/diseases/"
	)
	interaction_path <- paste0(
	    gs_path, data_release, "/output/etl/parquet/interaction/"
	)
	disease2phenotype_path <- paste0(
	    gs_path, data_release, "/output/etl/parquet/diseaseToPhenotype/"
	)

	# Mechanisms of action
	# Hand-curated drug -> target pairs that fill gaps in the Platform data
	ammend_moas <- list(
	    CHEMBL4594302 = "ENSG00000134318",
	    CHEMBL4297741 = "ENSG00000215644",
	    CHEMBL4297774 = "ENSG00000146648",
	    CHEMBL4297774 = "ENSG00000105976",
	    CHEMBL4298185 = "ENSG00000112964", # chembl missing in platform
	    CHEMBL4650319 = "ENSG00000146648",
	    CHEMBL1863514 = "ENSG00000166183",
	    CHEMBL4594320 = "ENSG00000171298"
	)
	new_moas <- data.frame(
	    chemblIds = names(ammend_moas),
	    targetId = unlist(ammend_moas)
	)
	# Copy the curated pairs to Spark so they can be appended to the table below
	new_moas <- sdf_copy_to(sc, new_moas, overwrite = TRUE)

	# Platform MoAs, one row per drug/target pair, plus the curated additions
	moa <- spark_read_parquet(sc, moa_path, memory = FALSE) %>%
	    select(chemblIds, targets) %>%
	    sdf_explode(chemblIds) %>%
	    sdf_explode(targets) %>%
	    rename(targetId = targets) %>%
	    sdf_distinct() %>%
	    sdf_bind_rows(new_moas)

	# Platform indirect associations, broken down by data source
	ass_indirectby_ds <- spark_read_parquet(sc, ass_indirectby_ds_path)

	# Approvals joined to their targets and to the target-disease associations.
	# Left joins keep approvals that have no target or no association.
	ass <- approvals %>%
	    rename(diseaseId = DiseaseId) %>%
	    left_join(moa, by = c("DrugId" = "chemblIds")) %>%
	    left_join(ass_indirectby_ds, by = c("diseaseId", "targetId")) %>%
	    collect()

	# Molecular interactions: IntAct pairs with both partners known and a
	# score above 0.42, reduced to distinct (targetA, targetB) pairs
	interactions <- spark_read_parquet(sc, interaction_path, memory = FALSE) %>%
	    filter(
	        sourceDatabase == "intact",
	        !is.na(targetA),
	        !is.na(targetB),
	        scoring > 0.42
	    ) %>%
	    select(targetA, targetB) %>%
	    sdf_distinct()

	# Drug / data source pairs where an interactor of the drug target is
	# associated with the drug's disease
	interactors_ass <- approvals %>%
	    rename(diseaseId = DiseaseId) %>%
	    inner_join(moa, by = c("DrugId" = "chemblIds")) %>%
	    inner_join(interactions, by = c("targetId" = "targetA")) %>%
	    inner_join(
	        ass_indirectby_ds,
	        by = c("diseaseId", "targetB" = "targetId")
	    ) %>%
	    select(datasourceId, Drug_brand_name) %>%
	    sdf_distinct() %>%
	    collect() %>%
	    mutate(interactionAssociation = TRUE)

	# Additional phenotype curation: hand-picked disease -> phenotype links
	ammend_phenotypes <- list(
	    # Microalbuminuria (biomarker of CKD)
	    EFO_0000401 = "HP_0012594",
	    # glycodeoxycholate sulfate (one of the bile acids that cause pruritus)
	    Orphanet_172 = "EFO_0005653",
	    Orphanet_52 = "EFO_0005653",
	    # achondroplasia -> body height
	    Orphanet_15 = "EFO_0004339",
	    Orphanet_15 = "Orphanet_329191",
	    # von Hippel-Lindau -> renal carcinoma
	    Orphanet_892 = "EFO_0000681",
	    EFO_0001360 = "MONDO_0018582",
	    # growth delay -> height
	    HP_0001510 = "EFO_0004339",
	    # CAD -> myocardial infarction
	    EFO_0001645 = "EFO_0000612"
	)
	new_phenotypes <- data.frame(
	    diseaseId = names(ammend_phenotypes),
	    phenotype = unlist(ammend_phenotypes)
	)
	# Copy the curated links to Spark so they can be appended below
	new_phenotypes <- sdf_copy_to(sc, new_phenotypes, overwrite = TRUE)

	# Platform disease -> phenotype links, one distinct pair per row
	disease2phenotype <- spark_read_parquet(
	    sc,
	    disease2phenotype_path,
	    memory = FALSE
	) %>%
	    select(diseaseId = disease, phenotype) %>%
	    sdf_distinct()

	# Drug / data source pairs where the drug target is associated with a
	# phenotype linked (by the Platform or by curation) to the drug's disease
	phenotype_ass <- approvals %>%
	    rename(diseaseId = DiseaseId) %>%
	    inner_join(moa, by = c("DrugId" = "chemblIds")) %>%
	    inner_join(
	        disease2phenotype %>%
	            sdf_bind_rows(new_phenotypes),
	        by = c("diseaseId")
	    ) %>%
	    inner_join(
	        ass_indirectby_ds,
	        by = c("phenotype" = "diseaseId", "targetId")
	    ) %>%
	    select(datasourceId, Drug_brand_name) %>%
	    sdf_distinct() %>%
	    collect() %>%
	    mutate(phenotypeAssociation = TRUE)

	# Data to plot: one row per drug and data source with the best direct
	# score plus the interaction / phenotype support flags
	data2plot <- ass %>%
	    select(datasourceId, Drug_brand_name, score) %>%
	    # every drug x data source combination, missing scores set to 0
	    complete(datasourceId, Drug_brand_name) %>%
	    mutate(score = replace_na(score, 0)) %>%
	    filter(!is.na(datasourceId)) %>%
	    # therapeutic area of each drug
	    left_join(
	        distinct(ass, Drug_brand_name, TA),
	        by = "Drug_brand_name"
	    ) %>%
	    # flag rows whose target id is missing
	    left_join(
	        ass %>%
	            mutate(noTarget = is.na(targetId)) %>%
	            distinct(Drug_brand_name, noTarget),
	        by = "Drug_brand_name"
	    ) %>%
	    # support through interacting proteins
	    left_join(
	        interactors_ass,
	        by = c("datasourceId", "Drug_brand_name")
	    ) %>%
	    # support through related phenotypes
	    left_join(
	        phenotype_ass,
	        by = c("datasourceId", "Drug_brand_name")
	    ) %>%
	    mutate(
	        # a positive direct score also counts as both kinds of support
	        interactionAssociation = ifelse(score > 0, TRUE, interactionAssociation),
	        phenotypeAssociation = ifelse(score > 0, TRUE, phenotypeAssociation),
	        # rows without a target get no score and their own facet
	        score = ifelse(noTarget, NA, score),
	        TA = ifelse(noTarget, "No human target", TA),
	        TA = fct_other(
	            TA,
	            keep = c("Oncology", "No human target"),
	            other_level = "Other indication"
	        ),
	        TA = fct_relevel(TA, c(
	            "Oncology",
	            "Other indication",
	            "No human target"
	        ))
	    ) %>%
	    # mutate(datasourceId = fct_relevel(datasourceId, names(ds_name_list))) %>%
	    filter(!(datasourceId %in% c(
	        "chembl", "expression_atlas", "sysbio", "europepmc",
	        "phenodigm", "reactome", "phewas_catalog"
	    ))) %>%
	    # per-row ranking score, used only to order the drugs on the axis
	    mutate(
	        rankscore = replace_na(score, 0),
	        rankscore = ifelse(!is.na(interactionAssociation), rankscore + 0.01, rankscore),
	        rankscore = ifelse(!is.na(phenotypeAssociation), rankscore + 0.03, rankscore),
	        Drug_brand_name = fct_rev(fct_reorder(
	            Drug_brand_name, rankscore, mean,
	            na.rm = TRUE, .desc = TRUE
	        ))
	    ) %>%
	    group_by(
	        datasourceId,
	        Drug_brand_name,
	        TA,
	        noTarget,
	        interactionAssociation,
	        phenotypeAssociation
	    ) %>%
	    # max() of an all-NA group gives -Inf (warning silenced) ...
	    summarise(score = suppressWarnings(max(score, na.rm = TRUE))) %>%
	    # ... which is mapped back to NA here
	    mutate(score = ifelse(score < 0, NA, score)) %>%
	    left_join(ds_names, by = "datasourceId") %>%
	    mutate(
	        datasourceName = factor(datasourceName, levels = ds_names$datasourceName),
	        datasourceType = factor(datasourceType, levels = c("Somatic", "Functional genomics (cancer)", "Rare mendelian", "Common disease"))
	    )


	# Symbols to overlay on the tiles: one row per drug / data source and
	# support kind ("interaction" or "phenotype") that is not missing
	overlay_data <- data2plot %>%
	    ungroup() %>%
	    select(
	        datasourceName,
	        datasourceType,
	        Drug_brand_name,
	        TA,
	        interactionAssociation,
	        phenotypeAssociation
	    ) %>%
	    gather(
	        key = "overlay", value = "value",
	        -datasourceName, -datasourceType, -Drug_brand_name, -TA
	    ) %>%
	    filter(!is.na(value)) %>%
	    mutate(
	        # "interactionAssociation" -> "interaction", and likewise "phenotype"
	        overlay = str_replace_all(overlay, "Association", ""),
	        overlaySize = ifelse(overlay == "phenotype", 3, 1),
	        overlaySymbol = as.character(ifelse(overlay == "phenotype", 1, 16))
	    )

	# Heatmap of direct scores (drug x data source) with the support symbols
	# drawn on top, faceted by therapeutic area and data source type
	output <- data2plot %>%
	    ggplot(aes(x = datasourceName, y = Drug_brand_name)) +
	    geom_tile(aes(fill = score), color = "white") +
	    geom_point(
	        data = overlay_data,
	        aes(shape = overlay, size = overlaySize)
	    ) +
	    scale_fill_material(
	        "blue",
	        na.value = "grey90",
	        name = "Direct association"
	    ) +
	    scale_shape_manual(
	        breaks = c("phenotype", "interaction"),
	        labels = c("Direct or related phenotype", "Direct or interacting protein"),
	        values = c(1, 16),
	        name = "Supported by:"
	    ) +
	    # point sizes are taken as-is from `overlaySize`
	    scale_size_identity() +
	    facet_grid(TA ~ datasourceType, scales = "free", space = "free") +
	    theme_cowplot(font_size = 12) +
	    # labs(
	    # title = "Supporting evidence on 2021 FDA drug approvals",
	    # subtitle = "Target-Disease evidence from Open Targets"
	    # # caption =
	    # # "Source: Nat Reviews Drug Discovery 10.1038/d41573-022-00001-9"
	    # ) +
	    theme(
	        plot.background = element_rect(fill = "white"),
	        strip.background = element_blank(),
	        legend.direction = "horizontal",
	        legend.box = "vertical",
	        legend.position = c(-0.7, -0.16),
	        legend.justification = c(0, 0),
	        axis.ticks = element_blank(),
	        axis.text.x = element_text(angle = 45, hjust = 1),
	        axis.title = element_blank(),
	        axis.line = element_blank(),
	        text = element_text(family = "sans")
	    ) +
	    # shape legend first, colour bar second
	    guides(
	        fill = guide_colourbar(
	            title.position = "top",
	            title.hjust = 0.5,
	            barwidth = 8,
	            frame.colour = "black",
	            ticks.colour = "black",
	            order = 2
	        ),
	        shape = guide_legend(
	            title.position = "top",
	            direction = "vertical",
	            order = 1
	        )
	    )
	# Write the full heatmap to a PDF file
	ggsave(
	    filename = "/home/ochoa/2021_approvals.pdf",
	    plot = output,
	    width = 9,
	    height = 11
	)
datasourceId	datasourceName	datasourceType
cancer_gene_census	CGC (COSMIC)	Somatic
intogen	IntOgen	Somatic
cancer_biomarkers	Cancer Biomarkers (CGI)	Somatic
crispr	Project Score	Functional genomics (cancer)
slapenrich	SlapEnrich	Functional genomics (cancer)
progeny	Progeny	Functional genomics (cancer)
eva_somatic	ClinVar (Somatic)	Somatic
ot_genetics_portal	OT Genetics Portal	Common disease
phewas_catalog	Phewas Catalog	Common disease
eva	ClinVar	Rare mendelian
clingen	Clingen	Rare mendelian
genomics_england	GEL PanelApp	Rare mendelian
orphanet	Orphanet	Rare mendelian
gene2phenotype	gene2phenotype	Rare mendelian
uniprot_literature	Uniprot (gene-disease)	Rare mendelian
uniprot_variants	Uniprot (variants)	Rare mendelian
reactome	Reactome	Functional genomics (cancer)
phenodigm	Mouse model (phenodigm)	Mouse model
europepmc	Literature (EPMC)	Literature
expression_atlas	ExpressionAtlas (Diff expression)	Differential Expression
chembl	drugs	Drugs


	# Data sources behind each drug's rows in `ass`, as one ";"-separated
	# string per drug. Excluded sources are dropped and "eva" is relabelled
	# "clinvar".
	directSources <- ass %>%
	    filter(
	        !is.na(datasourceId),
	        !(datasourceId %in% c(
	            "chembl", "expression_atlas", "sysbio", "europepmc",
	            "phenodigm", "reactome", "phewas_catalog"
	        ))
	    ) %>%
	    mutate(datasourceId = str_replace(datasourceId, "eva", "clinvar")) %>%
	    group_by(Drug_brand_name) %>%
	    summarise(directSources = paste(unique(datasourceId), collapse = ";"))

	# Overall ("Any" data source) evidence category of each drug.
	# The plotting code assigns a ggplot object to `output`, and dplyr verbs
	# have no method for plot objects, so the data frame stored in the plot is
	# used instead; a plain data frame is still accepted unchanged.
	# NOTE(review): this expects the condensed plot (the one with an "Any" row
	# and an `evidence` column) to be the current `output` — confirm run order.
	summaryResults <- (if (inherits(output, "ggplot")) output$data else output) %>%
	    # drop any grouping so select() returns exactly the two columns
	    ungroup() %>%
	    filter(datasourceType == "Any") %>%
	    select(Drug_brand_name, evidence)

	# Close phenotypes per drug: ids, names and the data sources behind them,
	# each collapsed into one ";"-separated string.
	# NOTE(review): the join happens before collect(), so `phenotype_ass` is
	# presumably still a Spark table with a `phenotype` column here — the
	# collected two-column `phenotype_ass` built earlier would not fit; confirm.
	closePhenotypes <- phenotype_ass %>%
	    select(Drug_brand_name, datasourceId, phenotype) %>%
	    left_join(
	        spark_read_parquet(sc, disease_path) %>%
	            select(phenotype = id, phenotypeName = name),
	        by = "phenotype"
	    ) %>%
	    collect() %>%
	    filter(!(datasourceId %in% c(
	        "chembl", "expression_atlas", "sysbio", "europepmc",
	        "phenodigm", "reactome", "phewas_catalog"
	    ))) %>%
	    mutate(datasourceId = str_replace(datasourceId, "eva", "clinvar")) %>%
	    distinct() %>%
	    group_by(Drug_brand_name) %>%
	    summarise(
	        closePhenotypeIds = paste(unique(phenotype), collapse = ";"),
	        closePhenotypeNames = paste(unique(phenotypeName), collapse = ";"),
	        closePhenotypeDataSources = paste(unique(datasourceId), collapse = ";")
	    )

	# Location of the target parquet dataset for the selected data release
	target_path <- paste0(gs_path, data_release, "/output/etl/parquet/target/")

	# Interacting proteins per drug: drug target -> interactor (`targetB`) ->
	# association of the interactor with the drug's disease. Ids, symbols and
	# data sources are each collapsed into one ";"-separated string.
	intDf <- approvals %>%
	    rename(diseaseId = DiseaseId) %>%
	    inner_join(moa, by = c("DrugId" = "chemblIds")) %>%
	    inner_join(interactions, by = c("targetId" = "targetA")) %>%
	    inner_join(
	        ass_indirectby_ds,
	        by = c("diseaseId", "targetB" = "targetId")
	    ) %>%
	    # attach the approved symbol of the interacting target
	    left_join(
	        spark_read_parquet(sc, target_path) %>%
	            select(targetB = id, approvedSymbol),
	        by = "targetB"
	    ) %>%
	    select(Drug_brand_name, targetB, datasourceId, approvedSymbol) %>%
	    collect() %>%
	    filter(!(datasourceId %in% c(
	        "chembl", "expression_atlas", "sysbio", "europepmc",
	        "phenodigm", "reactome", "phewas_catalog"
	    ))) %>%
	    mutate(datasourceId = str_replace(datasourceId, "eva", "clinvar")) %>%
	    distinct() %>%
	    group_by(Drug_brand_name) %>%
	    summarise(
	        interactingIds = paste(unique(targetB), collapse = ";"),
	        interactingSymbols = paste(unique(approvedSymbol), collapse = ";"),
	        interactingDataSources = paste(unique(datasourceId), collapse = ";")
	    )

	# Final per-drug table: drug metadata and target ids, joined with the
	# overall evidence category, the direct data sources, the close phenotypes
	# and the interacting proteins.
	out <- ass %>%
	    group_by(Drug_brand_name, Sponsor, DrugId, Indication, diseaseId, Properties) %>%
	    # `ass` is joined to the per-data-source associations, so one target
	    # can appear on several rows; list each target id only once, as the
	    # other per-drug summaries do
	    summarise(targetIds = paste(unique(targetId), collapse = ";")) %>%
	    ungroup() %>%
	    left_join(summaryResults, by = "Drug_brand_name") %>%
	    left_join(directSources, by = "Drug_brand_name") %>%
	    left_join(closePhenotypes, by = "Drug_brand_name") %>%
	    left_join(intDf, by = "Drug_brand_name")

	# Export the table
	out %>% write_csv("/home/ochoa/2021_approvals_output.csv")