nturaga/patch.patch

## patch.patch
diff --cc R/tbl_patient_record.R
index 43600b0,b236cb4..0000000
--- a/R/tbl_patient_record.R
+++ b/R/tbl_patient_record.R
@@@ -1,132 -1,154 +1,198 @@@
 -#' Table of patient records with a curated diagnosis
 +#' Return a Table of Patient Records with a Curated Diagnosis
  #'
 -#' Returns relevant metadata regarding the patient records and the
 -#' depth of associated data for each record. Excludes records that are
 -#' delivery restricted, problematic, or otherwise flagged.
 +#' Return a Table of Patient Records with a Curated Diagnosis by joining the
 +#' `patient`, `patient_duplicate_boolean_ie`, `combined_diagnosis`, and
 +#' `medications_enhanced` tables. The table is filtered to exclude records that
 +#' are either delivery restricted or are identifiable, and some columns are
 +#' renamed for clarity.
  #'
 -#' @param connection BigQueryConnection object to expert_deid. If not
 -#'     provided, link[trutils]{connect_bq} is used.
 -#'
 -#' @details
 -#'
 -#' \item Only keeps patient IDs that are NOT delivery or analysis
 -#'     (referred as `is_deidentified`) restricted.
 -#'
 -#' \item Any patient ID with is flagged as `has_tempus_sequencing` TRUE if:
 -#'      1) QC passing DNA analysis_id in `molecular_inventory_analysis` where
++#'  @details
++#' - Any patient ID with is flagged as `has_tempus_sequencing` TRUE if:
++#'      - QC passing DNA analysis_id in `molecular_inventory_analysis` where
+ #'      results are also available in `molecular_master_file`
+ #'      OR
 -#'      1) QC passing RNA analysis_id in `molecular_inventory_analysis` where
++#'      - QC passing RNA analysis_id in `molecular_inventory_analysis` where
+ #'      results are also available in `normalized_rna`
+ #'
 -#' \item Only keeps patients that are flagged "CURATED", `is_primary`
++#' - Only keeps patients that are flagged "CURATED", `is_primary`
+ #'     == TRUE and has a diagnosis date in `combined_diagnosis`
+ #'     table.
+ #'
 -#' \item Any patient ID present in `medications_enhanced` as
++#' - Any patient ID present in `medications_enhanced` as
+ #'    "CURATED", at least one non-null treatment start date, and at
+ #'    least one non-null drug/drug_class field is flagged as
+ #'    `has_treatment` = TRUE
+ #'
 +#' @param connection a `BigQueryConnection` object, [connect_bq()] by default
 +#'
 +#' @return a lazy [tibble][tibble::tibble] of patient records with the following
 +#' columns: `patient_id`, `gender`, `birth_date`, `birth_year`, `race`,
 +#' `ethnicity`, `has_treatment`, and `has_sequencing`.
 +#'
 +#' @examples
 +#' \dontrun{
 +#' .tbl_patient_record()
 +#' }
 +#'
- #' @noRd
+ #' @return lazy query
  #'
- #' @importFrom trutils connect_bq
+ #' @keywords internal
+ #'
 -#' @import dplyr
+ #' @importFrom trutils connect_bq is_bq_connection
  #' @importFrom assertthat assert_that
- #' @importFrom trutils is_bq_connection
 -#'
 -.tbl_patient_record <- function(connection) {
 -    ## Patient table for join purposes. Remove delivery restricted and
 -    ## partner_purge records which are handled by the !is_deidentified
 -    ## flag.  Completely unclear why that flag is called
 -    ## "is_deidentified"
 -
 -    if (missing(connection)) {
 -        connection <- trutils::connect_bq()
 -    }
 -
 +#' @importFrom dplyr tbl
 +#' @importFrom dplyr filter
 +#' @importFrom rlang .data
 +#' @importFrom dplyr rename
 +#' @importFrom dplyr distinct
 +#' @importFrom dplyr mutate
 +#' @importFrom dplyr inner_join
 +#' @importFrom dplyr left_join
 +#' @importFrom dplyr relocate
 +#' @importFrom dplyr case_when
 +.tbl_patient_record <- function(connection = connect_bq()) {
      assert_that(
          is_bq_connection(connection)
      )

 -    patient <- connection |>
 -        tbl("patient") |>
 +    # The patient table is filtered to exclude records that are delivery
 +    # restricted and records that are identifiable; some columns are renamed.
 +    patient <-
 +        tbl(connection, "patient") |>
          filter(
 -            !is_delivery_restricted,
 -            !is_deidentified
 +            !.data[["is_delivery_restricted"]],
 +            !.data[["is_deidentified"]]
          ) |>
 -        select(patient_id,
 -               gender,
 -               birth_date = birth_date_indexed,
 -               birth_year = birth_date_year_indexed,
 -               race = race_concept_canonical_name,
 -               ethnicity = ethnicity_canonical_name
 +        rename(
 +            birth_date = "birth_date_indexed",
 +            birth_year = "birth_date_year_indexed",
 +            race = "race_concept_canonical_name",
 +            ethnicity = "ethnicity_canonical_name"
          ) |>
 -        distinct()
 +        distinct(
 +            .data[["patient_id"]],
 +            .data[["gender"]],
 +            .data[["birth_date"]],
 +            .data[["birth_year"]],
 +            .data[["race"]],
 +            .data[["ethnicity"]]
 +        )

++<<<<<<< HEAD
 +    # The patient_duplicate_boolean_ie table is filtered to include records that
 +    # are in the vw_patient table and exclude records with a one-to-many mapping
 +    # between sample_family_id or analysis_id and patient_id. The has_sequencing
 +    # column is created by renaming the in_molecular_inventory_analysis column.
 +    patient_duplicate_boolean_ie <-
 +        tbl(connection, "patient_duplicate_boolean_ie") |>
++=======
+     ## Use flags for QA/QC to remove patients with multiple assigned SFID or
+     # analysis_id and uses an existing flag to determine whether the patient
+     # has tempus sequencing data.
+     patient_inclusion_exclusion <- connection |>
+         tbl("patient_duplicate_boolean_ie") |>
+         filter(!has_multiple_patients,
+                in_vw_patient)
+
+     ## Uses the presence of QC passed DNA data in `molecular_master_file` OR QC
+     # passed RNA data in `normalized_rna` to determine whether a sample has
+     # Tempus sequencing
+     tempus_dna_seq <- connection |>
+         .tbl_dna_analysis() |>
+         filter(in_mmf) |>
+         select(patient_id) |>
+         distinct()
+
+     tempus_rna_seq <- connection |>
+         .tbl_rna_analysis() |>
+         filter(in_normalized_rna) |>
+         select(patient_id) |>
+         distinct()
+
+     tempus_seq <- union_all(
+         tempus_dna_seq,
+         tempus_rna_seq
+     ) |>
+         mutate(has_tempus_sequencing = TRUE) |>
+         distinct()
+
+     ## Inner join to patient_ie to exclude duplicate patient_id, left_join to
+     # DNA seq data to ensure Tempus sequencing
+     patient <- inner_join(
+         patient,
+         patient_inclusion_exclusion
+     ) |>
+         left_join(
+             tempus_seq
+         )
+
+     ## Does this records have "curated" data? Defined as curated
+     ## diagnosis from having a "CURATED", "primary", non-null
+     ## diagnosis date in the combined_diagnosis table
+     curated <- connection |>
+         tbl("combined_diagnosis") |>
++>>>>>>> origin/snv_indel_mmf_refactor
 +        filter(
 +            .data[["in_vw_patient"]],
 +            !.data[["has_multiple_patients"]]
 +        ) |>
 +        rename(
 +            has_sequencing = "in_molecular_inventory_analysis"
 +        ) |>
 +        distinct(
 +            .data[["patient_id"]],
 +            .data[["has_sequencing"]]
 +        )
 +
 +    # The combined_diagnosis table is filtered to include only curated records;
 +    # such records must have a value of "CURATED" for diagnosis_source,
 +    # is_primary must be true, and combined_diagnosis_date_indexed cannot be NA.
 +    combined_diagnosis <-
 +        tbl(connection, "combined_diagnosis") |>
          filter(
 -            diagnosis_source == "CURATED",
 -            is_primary,
 -            !is.na(combined_diagnosis_date_indexed)
 +            .data[["diagnosis_source"]] %in% "CURATED",
 +            .data[["is_primary"]],
 +            !is.na(.data[["combined_diagnosis_date_indexed"]])
          ) |>
 -        distinct(patient_id)
 +        distinct(.data[["patient_id"]])

 -    ## Does this record have any curated medication / treatment data?
 -    ## Filters defined by Lens. We query `medications_enhanced` here
 -    ## because it returns the same results and is an order of
 -    ## magnitude smaller than `medications`
 -    medication <- connection |>
 -        tbl("medications_enhanced") |>
 +    # The medications_enhanced table is used to determine if a record has any
 +    # curated medication or treatment data using the same filters as LENS. Here,
 +    # the medications_enhanced table is used because it is an order of magnitude
 +    # smaller than the medications table.
 +    medications_enhanced <-
 +        tbl(connection, "medications_enhanced") |>
          filter(
 -            !is.na(effective_date_start_indexed),
 -            effective_date_start_precision != "YYYY",
 -            !is.na(patient_id),
 -            !is.na(drug_class_group_name),
 -            !is.na(drug_class_name),
 -            source_system_canonical_name == "CURATED"
 +            !is.na(.data[["patient_id"]]),
 +            !is.na(.data[["effective_date_start_indexed"]]),
 +            !(.data[["effective_date_start_precision"]] %in% "YYYY"),
 +            !is.na(.data[["drug_class_name"]]),
 +            !is.na(.data[["drug_class_group_name"]]),
 +            .data[["source_system_canonical_name"]] %in% "CURATED"
          ) |>
 -        distinct(patient_id) |>
 +        distinct(.data[["patient_id"]]) |>
          mutate(has_treatment = TRUE)

 -    ## Join together to get the patient record table. Include logic to
 -    ## understand what data modalities are available
 -    patient |>
 -        inner_join(curated, by = "patient_id") |>
 -        left_join(medication, by = "patient_id") |>
 +    # The patient_record tibble to be returned is created by joining the tables
 +    # created above by patient_id. The two inner joins come first for SQL
 +    # efficiency, which requires has_treatment to be relocated. Also, the left
 +    # join introduces some NA values, which are mutated to FALSE.
 +    patient_record <-
 +        inner_join(patient, patient_duplicate_boolean_ie, by = "patient_id") |>
 +        inner_join(combined_diagnosis, by = "patient_id") |>
 +        left_join(medications_enhanced, by = "patient_id") |>
 +        relocate("has_treatment", .before = "has_sequencing") |>
          mutate(
              has_treatment = case_when(
 -                is.na(has_treatment) ~ FALSE,
 -                TRUE ~ TRUE
 +                .data[["has_treatment"]] ~ .data[["has_treatment"]],
 +                is.na(.data[["has_treatment"]]) ~ FALSE
              ),
 -            has_tempus_sequencing = case_when(
 -                is.na(has_tempus_sequencing) ~ FALSE,
 -                TRUE ~ TRUE
 +            has_sequencing = case_when(
 +                .data[["has_sequencing"]] ~ .data[["has_sequencing"]],
 +                is.na(.data[["has_sequencing"]]) ~ FALSE
              )
          ) |>
 -        relocate(has_tempus_sequencing, .after = has_treatment) |>
          distinct()
 +
 +    patient_record
  }
	diff --cc R/tbl_patient_record.R
	index 43600b0,b236cb4..0000000
	--- a/R/tbl_patient_record.R
	+++ b/R/tbl_patient_record.R
	@@@ -1,132 -1,154 +1,198 @@@
	-#' Table of patient records with a curated diagnosis
	+#' Return a Table of Patient Records with a Curated Diagnosis
	#'
	-#' Returns relevant metadata regarding the patient records and the
	-#' depth of associated data for each record. Excludes records that are
	-#' delivery restricted, problematic, or otherwise flagged.
	+#' Return a Table of Patient Records with a Curated Diagnosis by joining the
	+#' `patient`, `patient_duplicate_boolean_ie`, `combined_diagnosis`, and
	+#' `medications_enhanced` tables. The table is filtered to exclude records that
	+#' are either delivery restricted or are identifiable, and some columns are
	+#' renamed for clarity.
	#'
	-#' @param connection BigQueryConnection object to expert_deid. If not
	-#' provided, link[trutils]{connect_bq} is used.
	-#'
	-#' @details
	-#'
	-#' \item Only keeps patient IDs that are NOT delivery or analysis
	-#' (referred as `is_deidentified`) restricted.
	-#'
	-#' \item Any patient ID with is flagged as `has_tempus_sequencing` TRUE if:
	-#' 1) QC passing DNA analysis_id in `molecular_inventory_analysis` where
	++#' @details
	++#' - Any patient ID with is flagged as `has_tempus_sequencing` TRUE if:
	++#' - QC passing DNA analysis_id in `molecular_inventory_analysis` where
	+ #' results are also available in `molecular_master_file`
	+ #' OR
	-#' 1) QC passing RNA analysis_id in `molecular_inventory_analysis` where
	++#' - QC passing RNA analysis_id in `molecular_inventory_analysis` where
	+ #' results are also available in `normalized_rna`
	+ #'
	-#' \item Only keeps patients that are flagged "CURATED", `is_primary`
	++#' - Only keeps patients that are flagged "CURATED", `is_primary`
	+ #' == TRUE and has a diagnosis date in `combined_diagnosis`
	+ #' table.
	+ #'
	-#' \item Any patient ID present in `medications_enhanced` as
	++#' - Any patient ID present in `medications_enhanced` as
	+ #' "CURATED", at least one non-null treatment start date, and at
	+ #' least one non-null drug/drug_class field is flagged as
	+ #' `has_treatment` = TRUE
	+ #'
	+#' @param connection a `BigQueryConnection` object, [connect_bq()] by default
	+#'
	+#' @return a lazy [tibble][tibble::tibble] of patient records with the following
	+#' columns: `patient_id`, `gender`, `birth_date`, `birth_year`, `race`,
	+#' `ethnicity`, `has_treatment`, and `has_sequencing`.
	+#'
	+#' @examples
	+#' \dontrun{
	+#' .tbl_patient_record()
	+#' }
	+#'
	- #' @noRd
	+ #' @return lazy query
	#'
	- #' @importFrom trutils connect_bq
	+ #' @keywords internal
	+ #'
	-#' @import dplyr
	+ #' @importFrom trutils connect_bq is_bq_connection
	#' @importFrom assertthat assert_that
	- #' @importFrom trutils is_bq_connection
	-#'
	-.tbl_patient_record <- function(connection) {
	- ## Patient table for join purposes. Remove delivery restricted and
	- ## partner_purge records which are handled by the !is_deidentified
	- ## flag. Completely unclear why that flag is called
	- ## "is_deidentified"
	-
	- if (missing(connection)) {
	- connection <- trutils::connect_bq()
	- }
	-
	+#' @importFrom dplyr tbl
	+#' @importFrom dplyr filter
	+#' @importFrom rlang .data
	+#' @importFrom dplyr rename
	+#' @importFrom dplyr distinct
	+#' @importFrom dplyr mutate
	+#' @importFrom dplyr inner_join
	+#' @importFrom dplyr left_join
	+#' @importFrom dplyr relocate
	+#' @importFrom dplyr case_when
	+.tbl_patient_record <- function(connection = connect_bq()) {
	assert_that(
	is_bq_connection(connection)
	)

	- patient <- connection \|>
	- tbl("patient") \|>
	+ # The patient table is filtered to exclude records that are delivery
	+ # restricted and records that are identifiable; some columns are renamed.
	+ patient <-
	+ tbl(connection, "patient") \|>
	filter(
	- !is_delivery_restricted,
	- !is_deidentified
	+ !.data[["is_delivery_restricted"]],
	+ !.data[["is_deidentified"]]
	) \|>
	- select(patient_id,
	- gender,
	- birth_date = birth_date_indexed,
	- birth_year = birth_date_year_indexed,
	- race = race_concept_canonical_name,
	- ethnicity = ethnicity_canonical_name
	+ rename(
	+ birth_date = "birth_date_indexed",
	+ birth_year = "birth_date_year_indexed",
	+ race = "race_concept_canonical_name",
	+ ethnicity = "ethnicity_canonical_name"
	) \|>
	- distinct()
	+ distinct(
	+ .data[["patient_id"]],
	+ .data[["gender"]],
	+ .data[["birth_date"]],
	+ .data[["birth_year"]],
	+ .data[["race"]],
	+ .data[["ethnicity"]]
	+ )

	++<<<<<<< HEAD
	+ # The patient_duplicate_boolean_ie table is filtered to include records that
	+ # are in the vw_patient table and exclude records with a one-to-many mapping
	+ # between sample_family_id or analysis_id and patient_id. The has_sequencing
	+ # column is created by renaming the in_molecular_inventory_analysis column.
	+ patient_duplicate_boolean_ie <-
	+ tbl(connection, "patient_duplicate_boolean_ie") \|>
	++=======
	+ ## Use flags for QA/QC to remove patients with multiple assigned SFID or
	+ # analysis_id and uses an existing flag to determine whether the patient
	+ # has tempus sequencing data.
	+ patient_inclusion_exclusion <- connection \|>
	+ tbl("patient_duplicate_boolean_ie") \|>
	+ filter(!has_multiple_patients,
	+ in_vw_patient)
	+
	+ ## Uses the presence of QC passed DNA data in `molecular_master_file` OR QC
	+ # passed RNA data in `normalized_rna` to determine whether a sample has
	+ # Tempus sequencing
	+ tempus_dna_seq <- connection \|>
	+ .tbl_dna_analysis() \|>
	+ filter(in_mmf) \|>
	+ select(patient_id) \|>
	+ distinct()
	+
	+ tempus_rna_seq <- connection \|>
	+ .tbl_rna_analysis() \|>
	+ filter(in_normalized_rna) \|>
	+ select(patient_id) \|>
	+ distinct()
	+
	+ tempus_seq <- union_all(
	+ tempus_dna_seq,
	+ tempus_rna_seq
	+ ) \|>
	+ mutate(has_tempus_sequencing = TRUE) \|>
	+ distinct()
	+
	+ ## Inner join to patient_ie to exclude duplicate patient_id, left_join to
	+ # DNA seq data to ensure Tempus sequencing
	+ patient <- inner_join(
	+ patient,
	+ patient_inclusion_exclusion
	+ ) \|>
	+ left_join(
	+ tempus_seq
	+ )
	+
	+ ## Does this records have "curated" data? Defined as curated
	+ ## diagnosis from having a "CURATED", "primary", non-null
	+ ## diagnosis date in the combined_diagnosis table
	+ curated <- connection \|>
	+ tbl("combined_diagnosis") \|>
	++>>>>>>> origin/snv_indel_mmf_refactor
	+ filter(
	+ .data[["in_vw_patient"]],
	+ !.data[["has_multiple_patients"]]
	+ ) \|>
	+ rename(
	+ has_sequencing = "in_molecular_inventory_analysis"
	+ ) \|>
	+ distinct(
	+ .data[["patient_id"]],
	+ .data[["has_sequencing"]]
	+ )
	+
	+ # The combined_diagnosis table is filtered to include only curated records;
	+ # such records must have a value of "CURATED" for diagnosis_source,
	+ # is_primary must be true, and combined_diagnosis_date_indexed cannot be NA.
	+ combined_diagnosis <-
	+ tbl(connection, "combined_diagnosis") \|>
	filter(
	- diagnosis_source == "CURATED",
	- is_primary,
	- !is.na(combined_diagnosis_date_indexed)
	+ .data[["diagnosis_source"]] %in% "CURATED",
	+ .data[["is_primary"]],
	+ !is.na(.data[["combined_diagnosis_date_indexed"]])
	) \|>
	- distinct(patient_id)
	+ distinct(.data[["patient_id"]])

	- ## Does this record have any curated medication / treatment data?
	- ## Filters defined by Lens. We query `medications_enhanced` here
	- ## because it returns the same results and is an order of
	- ## magnitude smaller than `medications`
	- medication <- connection \|>
	- tbl("medications_enhanced") \|>
	+ # The medications_enhanced table is used to determine if a record has any
	+ # curated medication or treatment data using the same filters as LENS. Here,
	+ # the medications_enhanced table is used because it is an order of magnitude
	+ # smaller than the medications table.
	+ medications_enhanced <-
	+ tbl(connection, "medications_enhanced") \|>
	filter(
	- !is.na(effective_date_start_indexed),
	- effective_date_start_precision != "YYYY",
	- !is.na(patient_id),
	- !is.na(drug_class_group_name),
	- !is.na(drug_class_name),
	- source_system_canonical_name == "CURATED"
	+ !is.na(.data[["patient_id"]]),
	+ !is.na(.data[["effective_date_start_indexed"]]),
	+ !(.data[["effective_date_start_precision"]] %in% "YYYY"),
	+ !is.na(.data[["drug_class_name"]]),
	+ !is.na(.data[["drug_class_group_name"]]),
	+ .data[["source_system_canonical_name"]] %in% "CURATED"
	) \|>
	- distinct(patient_id) \|>
	+ distinct(.data[["patient_id"]]) \|>
	mutate(has_treatment = TRUE)

	- ## Join together to get the patient record table. Include logic to
	- ## understand what data modalities are available
	- patient \|>
	- inner_join(curated, by = "patient_id") \|>
	- left_join(medication, by = "patient_id") \|>
	+ # The patient_record tibble to be returned is created by joining the tables
	+ # created above by patient_id. The two inner joins come first for SQL
	+ # efficiency, which requires has_treatment to be relocated. Also, the left
	+ # join introduces some NA values, which are mutated to FALSE.
	+ patient_record <-
	+ inner_join(patient, patient_duplicate_boolean_ie, by = "patient_id") \|>
	+ inner_join(combined_diagnosis, by = "patient_id") \|>
	+ left_join(medications_enhanced, by = "patient_id") \|>
	+ relocate("has_treatment", .before = "has_sequencing") \|>
	mutate(
	has_treatment = case_when(
	- is.na(has_treatment) ~ FALSE,
	- TRUE ~ TRUE
	+ .data[["has_treatment"]] ~ .data[["has_treatment"]],
	+ is.na(.data[["has_treatment"]]) ~ FALSE
	),
	- has_tempus_sequencing = case_when(
	- is.na(has_tempus_sequencing) ~ FALSE,
	- TRUE ~ TRUE
	+ has_sequencing = case_when(
	+ .data[["has_sequencing"]] ~ .data[["has_sequencing"]],
	+ is.na(.data[["has_sequencing"]]) ~ FALSE
	)
	) \|>
	- relocate(has_tempus_sequencing, .after = has_treatment) \|>
	distinct()
	+
	+ patient_record
	}