Skip to content

Instantly share code, notes, and snippets.

@nturaga
Created September 8, 2023 17:55
Show Gist options
  • Save nturaga/6426ddb4c23809918884efeb0a166444 to your computer and use it in GitHub Desktop.
Save nturaga/6426ddb4c23809918884efeb0a166444 to your computer and use it in GitHub Desktop.
diff --cc R/tbl_patient_record.R
index 43600b0,b236cb4..0000000
--- a/R/tbl_patient_record.R
+++ b/R/tbl_patient_record.R
@@@ -1,132 -1,154 +1,198 @@@
-#' Table of patient records with a curated diagnosis
+#' Return a Table of Patient Records with a Curated Diagnosis
#'
-#' Returns relevant metadata regarding the patient records and the
-#' depth of associated data for each record. Excludes records that are
-#' delivery restricted, problematic, or otherwise flagged.
+#' Return a Table of Patient Records with a Curated Diagnosis by joining the
+#' `patient`, `patient_duplicate_boolean_ie`, `combined_diagnosis`, and
+#' `medications_enhanced` tables. The table is filtered to exclude records that
+#' are either delivery restricted or are identifiable, and some columns are
+#' renamed for clarity.
#'
-#' @param connection BigQueryConnection object to expert_deid. If not
-#' provided, link[trutils]{connect_bq} is used.
-#'
-#' @details
-#'
-#' \item Only keeps patient IDs that are NOT delivery or analysis
-#' (referred as `is_deidentified`) restricted.
-#'
-#' \item Any patient ID with is flagged as `has_tempus_sequencing` TRUE if:
-#' 1) QC passing DNA analysis_id in `molecular_inventory_analysis` where
++#' @details
++#' - Any patient ID with is flagged as `has_tempus_sequencing` TRUE if:
++#' - QC passing DNA analysis_id in `molecular_inventory_analysis` where
+ #' results are also available in `molecular_master_file`
+ #' OR
-#' 1) QC passing RNA analysis_id in `molecular_inventory_analysis` where
++#' - QC passing RNA analysis_id in `molecular_inventory_analysis` where
+ #' results are also available in `normalized_rna`
+ #'
-#' \item Only keeps patients that are flagged "CURATED", `is_primary`
++#' - Only keeps patients that are flagged "CURATED", `is_primary`
+ #' == TRUE and has a diagnosis date in `combined_diagnosis`
+ #' table.
+ #'
-#' \item Any patient ID present in `medications_enhanced` as
++#' - Any patient ID present in `medications_enhanced` as
+ #' "CURATED", at least one non-null treatment start date, and at
+ #' least one non-null drug/drug_class field is flagged as
+ #' `has_treatment` = TRUE
+ #'
+#' @param connection a `BigQueryConnection` object, [connect_bq()] by default
+#'
+#' @return a lazy [tibble][tibble::tibble] of patient records with the following
+#' columns: `patient_id`, `gender`, `birth_date`, `birth_year`, `race`,
+#' `ethnicity`, `has_treatment`, and `has_sequencing`.
+#'
+#' @examples
+#' \dontrun{
+#' .tbl_patient_record()
+#' }
+#'
- #' @noRd
+ #' @return lazy query
#'
- #' @importFrom trutils connect_bq
+ #' @keywords internal
+ #'
-#' @import dplyr
+ #' @importFrom trutils connect_bq is_bq_connection
#' @importFrom assertthat assert_that
- #' @importFrom trutils is_bq_connection
-#'
-.tbl_patient_record <- function(connection) {
- ## Patient table for join purposes. Remove delivery restricted and
- ## partner_purge records which are handled by the !is_deidentified
- ## flag. Completely unclear why that flag is called
- ## "is_deidentified"
-
- if (missing(connection)) {
- connection <- trutils::connect_bq()
- }
-
+#' @importFrom dplyr tbl
+#' @importFrom dplyr filter
+#' @importFrom rlang .data
+#' @importFrom dplyr rename
+#' @importFrom dplyr distinct
+#' @importFrom dplyr mutate
+#' @importFrom dplyr inner_join
+#' @importFrom dplyr left_join
+#' @importFrom dplyr relocate
+#' @importFrom dplyr case_when
+.tbl_patient_record <- function(connection = connect_bq()) {
assert_that(
is_bq_connection(connection)
)
- patient <- connection |>
- tbl("patient") |>
+ # The patient table is filtered to exclude records that are delivery
+ # restricted and records that are identifiable; some columns are renamed.
+ patient <-
+ tbl(connection, "patient") |>
filter(
- !is_delivery_restricted,
- !is_deidentified
+ !.data[["is_delivery_restricted"]],
+ !.data[["is_deidentified"]]
) |>
- select(patient_id,
- gender,
- birth_date = birth_date_indexed,
- birth_year = birth_date_year_indexed,
- race = race_concept_canonical_name,
- ethnicity = ethnicity_canonical_name
+ rename(
+ birth_date = "birth_date_indexed",
+ birth_year = "birth_date_year_indexed",
+ race = "race_concept_canonical_name",
+ ethnicity = "ethnicity_canonical_name"
) |>
- distinct()
+ distinct(
+ .data[["patient_id"]],
+ .data[["gender"]],
+ .data[["birth_date"]],
+ .data[["birth_year"]],
+ .data[["race"]],
+ .data[["ethnicity"]]
+ )
++<<<<<<< HEAD
+ # The patient_duplicate_boolean_ie table is filtered to include records that
+ # are in the vw_patient table and exclude records with a one-to-many mapping
+ # between sample_family_id or analysis_id and patient_id. The has_sequencing
+ # column is created by renaming the in_molecular_inventory_analysis column.
+ patient_duplicate_boolean_ie <-
+ tbl(connection, "patient_duplicate_boolean_ie") |>
++=======
+ ## Use flags for QA/QC to remove patients with multiple assigned SFID or
+ # analysis_id and uses an existing flag to determine whether the patient
+ # has tempus sequencing data.
+ patient_inclusion_exclusion <- connection |>
+ tbl("patient_duplicate_boolean_ie") |>
+ filter(!has_multiple_patients,
+ in_vw_patient)
+
+ ## Uses the presence of QC passed DNA data in `molecular_master_file` OR QC
+ # passed RNA data in `normalized_rna` to determine whether a sample has
+ # Tempus sequencing
+ tempus_dna_seq <- connection |>
+ .tbl_dna_analysis() |>
+ filter(in_mmf) |>
+ select(patient_id) |>
+ distinct()
+
+ tempus_rna_seq <- connection |>
+ .tbl_rna_analysis() |>
+ filter(in_normalized_rna) |>
+ select(patient_id) |>
+ distinct()
+
+ tempus_seq <- union_all(
+ tempus_dna_seq,
+ tempus_rna_seq
+ ) |>
+ mutate(has_tempus_sequencing = TRUE) |>
+ distinct()
+
+ ## Inner join to patient_ie to exclude duplicate patient_id, left_join to
+ # DNA seq data to ensure Tempus sequencing
+ patient <- inner_join(
+ patient,
+ patient_inclusion_exclusion
+ ) |>
+ left_join(
+ tempus_seq
+ )
+
+ ## Does this records have "curated" data? Defined as curated
+ ## diagnosis from having a "CURATED", "primary", non-null
+ ## diagnosis date in the combined_diagnosis table
+ curated <- connection |>
+ tbl("combined_diagnosis") |>
++>>>>>>> origin/snv_indel_mmf_refactor
+ filter(
+ .data[["in_vw_patient"]],
+ !.data[["has_multiple_patients"]]
+ ) |>
+ rename(
+ has_sequencing = "in_molecular_inventory_analysis"
+ ) |>
+ distinct(
+ .data[["patient_id"]],
+ .data[["has_sequencing"]]
+ )
+
+ # The combined_diagnosis table is filtered to include only curated records;
+ # such records must have a value of "CURATED" for diagnosis_source,
+ # is_primary must be true, and combined_diagnosis_date_indexed cannot be NA.
+ combined_diagnosis <-
+ tbl(connection, "combined_diagnosis") |>
filter(
- diagnosis_source == "CURATED",
- is_primary,
- !is.na(combined_diagnosis_date_indexed)
+ .data[["diagnosis_source"]] %in% "CURATED",
+ .data[["is_primary"]],
+ !is.na(.data[["combined_diagnosis_date_indexed"]])
) |>
- distinct(patient_id)
+ distinct(.data[["patient_id"]])
- ## Does this record have any curated medication / treatment data?
- ## Filters defined by Lens. We query `medications_enhanced` here
- ## because it returns the same results and is an order of
- ## magnitude smaller than `medications`
- medication <- connection |>
- tbl("medications_enhanced") |>
+ # The medications_enhanced table is used to determine if a record has any
+ # curated medication or treatment data using the same filters as LENS. Here,
+ # the medications_enhanced table is used because it is an order of magnitude
+ # smaller than the medications table.
+ medications_enhanced <-
+ tbl(connection, "medications_enhanced") |>
filter(
- !is.na(effective_date_start_indexed),
- effective_date_start_precision != "YYYY",
- !is.na(patient_id),
- !is.na(drug_class_group_name),
- !is.na(drug_class_name),
- source_system_canonical_name == "CURATED"
+ !is.na(.data[["patient_id"]]),
+ !is.na(.data[["effective_date_start_indexed"]]),
+ !(.data[["effective_date_start_precision"]] %in% "YYYY"),
+ !is.na(.data[["drug_class_name"]]),
+ !is.na(.data[["drug_class_group_name"]]),
+ .data[["source_system_canonical_name"]] %in% "CURATED"
) |>
- distinct(patient_id) |>
+ distinct(.data[["patient_id"]]) |>
mutate(has_treatment = TRUE)
- ## Join together to get the patient record table. Include logic to
- ## understand what data modalities are available
- patient |>
- inner_join(curated, by = "patient_id") |>
- left_join(medication, by = "patient_id") |>
+ # The patient_record tibble to be returned is created by joining the tables
+ # created above by patient_id. The two inner joins come first for SQL
+ # efficiency, which requires has_treatment to be relocated. Also, the left
+ # join introduces some NA values, which are mutated to FALSE.
+ patient_record <-
+ inner_join(patient, patient_duplicate_boolean_ie, by = "patient_id") |>
+ inner_join(combined_diagnosis, by = "patient_id") |>
+ left_join(medications_enhanced, by = "patient_id") |>
+ relocate("has_treatment", .before = "has_sequencing") |>
mutate(
has_treatment = case_when(
- is.na(has_treatment) ~ FALSE,
- TRUE ~ TRUE
+ .data[["has_treatment"]] ~ .data[["has_treatment"]],
+ is.na(.data[["has_treatment"]]) ~ FALSE
),
- has_tempus_sequencing = case_when(
- is.na(has_tempus_sequencing) ~ FALSE,
- TRUE ~ TRUE
+ has_sequencing = case_when(
+ .data[["has_sequencing"]] ~ .data[["has_sequencing"]],
+ is.na(.data[["has_sequencing"]]) ~ FALSE
)
) |>
- relocate(has_tempus_sequencing, .after = has_treatment) |>
distinct()
+
+ patient_record
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment