Created
September 8, 2023 17:55
-
-
Save nturaga/6426ddb4c23809918884efeb0a166444 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --cc R/tbl_patient_record.R | |
index 43600b0,b236cb4..0000000 | |
--- a/R/tbl_patient_record.R | |
+++ b/R/tbl_patient_record.R | |
@@@ -1,132 -1,154 +1,198 @@@ | |
-#' Table of patient records with a curated diagnosis | |
+#' Return a Table of Patient Records with a Curated Diagnosis | |
#' | |
-#' Returns relevant metadata regarding the patient records and the | |
-#' depth of associated data for each record. Excludes records that are | |
-#' delivery restricted, problematic, or otherwise flagged. | |
+#' Return a Table of Patient Records with a Curated Diagnosis by joining the | |
+#' `patient`, `patient_duplicate_boolean_ie`, `combined_diagnosis`, and | |
+#' `medications_enhanced` tables. The table is filtered to exclude records that | |
+#' are either delivery restricted or are identifiable, and some columns are | |
+#' renamed for clarity. | |
#' | |
-#' @param connection BigQueryConnection object to expert_deid. If not | |
-#' provided, link[trutils]{connect_bq} is used. | |
-#' | |
-#' @details | |
-#' | |
-#' \item Only keeps patient IDs that are NOT delivery or analysis | |
-#' (referred as `is_deidentified`) restricted. | |
-#' | |
-#' \item Any patient ID with is flagged as `has_tempus_sequencing` TRUE if: | |
-#' 1) QC passing DNA analysis_id in `molecular_inventory_analysis` where | |
++#' @details | |
++#' - Any patient ID with is flagged as `has_tempus_sequencing` TRUE if: | |
++#' - QC passing DNA analysis_id in `molecular_inventory_analysis` where | |
+ #' results are also available in `molecular_master_file` | |
+ #' OR | |
-#' 1) QC passing RNA analysis_id in `molecular_inventory_analysis` where | |
++#' - QC passing RNA analysis_id in `molecular_inventory_analysis` where | |
+ #' results are also available in `normalized_rna` | |
+ #' | |
-#' \item Only keeps patients that are flagged "CURATED", `is_primary` | |
++#' - Only keeps patients that are flagged "CURATED", `is_primary` | |
+ #' == TRUE and has a diagnosis date in `combined_diagnosis` | |
+ #' table. | |
+ #' | |
-#' \item Any patient ID present in `medications_enhanced` as | |
++#' - Any patient ID present in `medications_enhanced` as | |
+ #' "CURATED", at least one non-null treatment start date, and at | |
+ #' least one non-null drug/drug_class field is flagged as | |
+ #' `has_treatment` = TRUE | |
+ #' | |
+#' @param connection a `BigQueryConnection` object, [connect_bq()] by default | |
+#' | |
+#' @return a lazy [tibble][tibble::tibble] of patient records with the following | |
+#' columns: `patient_id`, `gender`, `birth_date`, `birth_year`, `race`, | |
+#' `ethnicity`, `has_treatment`, and `has_sequencing`. | |
+#' | |
+#' @examples | |
+#' \dontrun{ | |
+#' .tbl_patient_record() | |
+#' } | |
+#' | |
- #' @noRd | |
+ #' @return lazy query | |
#' | |
- #' @importFrom trutils connect_bq | |
+ #' @keywords internal | |
+ #' | |
-#' @import dplyr | |
+ #' @importFrom trutils connect_bq is_bq_connection | |
#' @importFrom assertthat assert_that | |
- #' @importFrom trutils is_bq_connection | |
-#' | |
-.tbl_patient_record <- function(connection) { | |
- ## Patient table for join purposes. Remove delivery restricted and | |
- ## partner_purge records which are handled by the !is_deidentified | |
- ## flag. Completely unclear why that flag is called | |
- ## "is_deidentified" | |
- | |
- if (missing(connection)) { | |
- connection <- trutils::connect_bq() | |
- } | |
- | |
+#' @importFrom dplyr tbl | |
+#' @importFrom dplyr filter | |
+#' @importFrom rlang .data | |
+#' @importFrom dplyr rename | |
+#' @importFrom dplyr distinct | |
+#' @importFrom dplyr mutate | |
+#' @importFrom dplyr inner_join | |
+#' @importFrom dplyr left_join | |
+#' @importFrom dplyr relocate | |
+#' @importFrom dplyr case_when | |
+.tbl_patient_record <- function(connection = connect_bq()) { | |
assert_that( | |
is_bq_connection(connection) | |
) | |
- patient <- connection |> | |
- tbl("patient") |> | |
+ # The patient table is filtered to exclude records that are delivery | |
+ # restricted and records that are identifiable; some columns are renamed. | |
+ patient <- | |
+ tbl(connection, "patient") |> | |
filter( | |
- !is_delivery_restricted, | |
- !is_deidentified | |
+ !.data[["is_delivery_restricted"]], | |
+ !.data[["is_deidentified"]] | |
) |> | |
- select(patient_id, | |
- gender, | |
- birth_date = birth_date_indexed, | |
- birth_year = birth_date_year_indexed, | |
- race = race_concept_canonical_name, | |
- ethnicity = ethnicity_canonical_name | |
+ rename( | |
+ birth_date = "birth_date_indexed", | |
+ birth_year = "birth_date_year_indexed", | |
+ race = "race_concept_canonical_name", | |
+ ethnicity = "ethnicity_canonical_name" | |
) |> | |
- distinct() | |
+ distinct( | |
+ .data[["patient_id"]], | |
+ .data[["gender"]], | |
+ .data[["birth_date"]], | |
+ .data[["birth_year"]], | |
+ .data[["race"]], | |
+ .data[["ethnicity"]] | |
+ ) | |
++<<<<<<< HEAD | |
+ # The patient_duplicate_boolean_ie table is filtered to include records that | |
+ # are in the vw_patient table and exclude records with a one-to-many mapping | |
+ # between sample_family_id or analysis_id and patient_id. The has_sequencing | |
+ # column is created by renaming the in_molecular_inventory_analysis column. | |
+ patient_duplicate_boolean_ie <- | |
+ tbl(connection, "patient_duplicate_boolean_ie") |> | |
++======= | |
+ ## Use flags for QA/QC to remove patients with multiple assigned SFID or | |
+ # analysis_id and uses an existing flag to determine whether the patient | |
+ # has tempus sequencing data. | |
+ patient_inclusion_exclusion <- connection |> | |
+ tbl("patient_duplicate_boolean_ie") |> | |
+ filter(!has_multiple_patients, | |
+ in_vw_patient) | |
+ | |
+ ## Uses the presence of QC passed DNA data in `molecular_master_file` OR QC | |
+ # passed RNA data in `normalized_rna` to determine whether a sample has | |
+ # Tempus sequencing | |
+ tempus_dna_seq <- connection |> | |
+ .tbl_dna_analysis() |> | |
+ filter(in_mmf) |> | |
+ select(patient_id) |> | |
+ distinct() | |
+ | |
+ tempus_rna_seq <- connection |> | |
+ .tbl_rna_analysis() |> | |
+ filter(in_normalized_rna) |> | |
+ select(patient_id) |> | |
+ distinct() | |
+ | |
+ tempus_seq <- union_all( | |
+ tempus_dna_seq, | |
+ tempus_rna_seq | |
+ ) |> | |
+ mutate(has_tempus_sequencing = TRUE) |> | |
+ distinct() | |
+ | |
+ ## Inner join to patient_ie to exclude duplicate patient_id, left_join to | |
+ # DNA seq data to ensure Tempus sequencing | |
+ patient <- inner_join( | |
+ patient, | |
+ patient_inclusion_exclusion | |
+ ) |> | |
+ left_join( | |
+ tempus_seq | |
+ ) | |
+ | |
+ ## Does this records have "curated" data? Defined as curated | |
+ ## diagnosis from having a "CURATED", "primary", non-null | |
+ ## diagnosis date in the combined_diagnosis table | |
+ curated <- connection |> | |
+ tbl("combined_diagnosis") |> | |
++>>>>>>> origin/snv_indel_mmf_refactor | |
+ filter( | |
+ .data[["in_vw_patient"]], | |
+ !.data[["has_multiple_patients"]] | |
+ ) |> | |
+ rename( | |
+ has_sequencing = "in_molecular_inventory_analysis" | |
+ ) |> | |
+ distinct( | |
+ .data[["patient_id"]], | |
+ .data[["has_sequencing"]] | |
+ ) | |
+ | |
+ # The combined_diagnosis table is filtered to include only curated records; | |
+ # such records must have a value of "CURATED" for diagnosis_source, | |
+ # is_primary must be true, and combined_diagnosis_date_indexed cannot be NA. | |
+ combined_diagnosis <- | |
+ tbl(connection, "combined_diagnosis") |> | |
filter( | |
- diagnosis_source == "CURATED", | |
- is_primary, | |
- !is.na(combined_diagnosis_date_indexed) | |
+ .data[["diagnosis_source"]] %in% "CURATED", | |
+ .data[["is_primary"]], | |
+ !is.na(.data[["combined_diagnosis_date_indexed"]]) | |
) |> | |
- distinct(patient_id) | |
+ distinct(.data[["patient_id"]]) | |
- ## Does this record have any curated medication / treatment data? | |
- ## Filters defined by Lens. We query `medications_enhanced` here | |
- ## because it returns the same results and is an order of | |
- ## magnitude smaller than `medications` | |
- medication <- connection |> | |
- tbl("medications_enhanced") |> | |
+ # The medications_enhanced table is used to determine if a record has any | |
+ # curated medication or treatment data using the same filters as LENS. Here, | |
+ # the medications_enhanced table is used because it is an order of magnitude | |
+ # smaller than the medications table. | |
+ medications_enhanced <- | |
+ tbl(connection, "medications_enhanced") |> | |
filter( | |
- !is.na(effective_date_start_indexed), | |
- effective_date_start_precision != "YYYY", | |
- !is.na(patient_id), | |
- !is.na(drug_class_group_name), | |
- !is.na(drug_class_name), | |
- source_system_canonical_name == "CURATED" | |
+ !is.na(.data[["patient_id"]]), | |
+ !is.na(.data[["effective_date_start_indexed"]]), | |
+ !(.data[["effective_date_start_precision"]] %in% "YYYY"), | |
+ !is.na(.data[["drug_class_name"]]), | |
+ !is.na(.data[["drug_class_group_name"]]), | |
+ .data[["source_system_canonical_name"]] %in% "CURATED" | |
) |> | |
- distinct(patient_id) |> | |
+ distinct(.data[["patient_id"]]) |> | |
mutate(has_treatment = TRUE) | |
- ## Join together to get the patient record table. Include logic to | |
- ## understand what data modalities are available | |
- patient |> | |
- inner_join(curated, by = "patient_id") |> | |
- left_join(medication, by = "patient_id") |> | |
+ # The patient_record tibble to be returned is created by joining the tables | |
+ # created above by patient_id. The two inner joins come first for SQL | |
+ # efficiency, which requires has_treatment to be relocated. Also, the left | |
+ # join introduces some NA values, which are mutated to FALSE. | |
+ patient_record <- | |
+ inner_join(patient, patient_duplicate_boolean_ie, by = "patient_id") |> | |
+ inner_join(combined_diagnosis, by = "patient_id") |> | |
+ left_join(medications_enhanced, by = "patient_id") |> | |
+ relocate("has_treatment", .before = "has_sequencing") |> | |
mutate( | |
has_treatment = case_when( | |
- is.na(has_treatment) ~ FALSE, | |
- TRUE ~ TRUE | |
+ .data[["has_treatment"]] ~ .data[["has_treatment"]], | |
+ is.na(.data[["has_treatment"]]) ~ FALSE | |
), | |
- has_tempus_sequencing = case_when( | |
- is.na(has_tempus_sequencing) ~ FALSE, | |
- TRUE ~ TRUE | |
+ has_sequencing = case_when( | |
+ .data[["has_sequencing"]] ~ .data[["has_sequencing"]], | |
+ is.na(.data[["has_sequencing"]]) ~ FALSE | |
) | |
) |> | |
- relocate(has_tempus_sequencing, .after = has_treatment) |> | |
distinct() | |
+ | |
+ patient_record | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment