Created
October 12, 2023 15:23
-
-
Save nturaga/8739af1d7505b89a9a8ba2583bae8f7f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Write an R package vignette for a package called "Tempustables" that has the following functions with descriptions and examples as given below. The vignette needs to be fairly easy for users to follow. Provide examples and description for each function. | |
Collect Normalized RNA by Cohort | |
Description: | |
Collecting the whole transcriptome for over 100 samples from | |
'normalized_rna' in your R session will typically fail, requiring | |
you to write some sort of a loop to get all the patients. This | |
function does all that work for you and runs the query in | |
parallel. | |
Usage: | |
collect_normalized_rna_by_cohort( | |
connection, | |
cohort, | |
gene = NULL, | |
ensembl_gene = NULL, | |
add_cols = NULL | |
) | |
Arguments: | |
connection: BigQueryConnection object to expert_deid. If not | |
provided, link[tempusutils]connect_bq is used. | |
cohort: A lazy query object containing the column 'analysis_id' of | |
RNA analysis ids. | |
gene: A character vector of one or more gene symbols. If 'NULL' | |
queries the entire transcriptome. Defaults to 'NULL'. | |
ensembl_gene: A character vector of one or more Ensembl gene ids. If | |
'NULL' queries the entire transcriptome. Defaults to 'NULL'. | |
add_cols: A character vector of column names to be queried in addition | |
to default columns from the 'normalized_rna' table. Defaults | |
to 'NULL' | |
Details: | |
Collects large number of samples from 'normalized_rna' | |
Value: | |
A data frame with patient_tempus_id, analysis_id, ensembl_gene, | |
gene_tpm_cognizant_corrector, and add_cols columns from | |
'normalized_rna' | |
Examples: | |
library(dplyr) | |
con <- connect_bq() | |
cohort <- con |> pull_snv_indel_cohort(genes = "ERBB2") | |
cohort <- con |> pull_deliverable_cohort(cohort) | |
collected_cohort <- cohort |> collect_retry() | |
rna <- con |> trtools:::collect_normalized_rna_by_cohort( | |
cohort = cohort, | |
gene = c("ERBB2", "KRAS"), | |
add_cols = "gene_tpm" | |
) | |
Collect Normalized RNA by Raw Analysis IDs | |
Description: | |
Collecting the whole transcriptome for over 100 samples from | |
'vw_normalized_rna' in your R session will typically fail, | |
requiring you to write some sort of a loop to get all the | |
patients. This function does all that work for you and runs the | |
query in parallel. | |
Usage: | |
collect_normalized_rna_by_raw_analysis_ids( | |
connection, | |
raw_analysis_ids, | |
gene = NULL, | |
ensembl_gene = NULL, | |
add_cols = NULL | |
) | |
Arguments: | |
connection: BigQueryConnection object to expert_deid. If not | |
provided, link[tempusutils]connect_bq is used. | |
raw_analysis_ids: A character vector of RNA analysis ids. | |
gene: A character vector of one or more gene symbols. If 'NULL' | |
queries the entire transcriptome. Defaults to 'NULL'. | |
ensembl_gene: A character vector of one or more Ensembl gene ids. If | |
'NULL' queries the entire transcriptome. Defaults to 'NULL'. | |
add_cols: A character vector of column names to be queried in addition | |
to default columns from the 'normalized_rna' table. Defaults | |
to 'NULL' | |
Details: | |
Collects large number of samples from 'vw_normalized_rna' | |
Value: | |
A data frame with patient_tempus_id, analysis_id, ensembl_gene, | |
gene_tpm_cognizant_corrector, and add_cols columns from | |
'normalized_rna' | |
Examples: | |
library(dplyr) | |
con <- connect_bq() | |
cohort <- con |> pull_snv_indel_cohort(genes = "ERBB2") | |
cohort <- con |> pull_deliverable_cohort(cohort) | |
collected_cohort <- cohort |> collect_retry() | |
rna <- con |> trtools:::collect_normalized_rna_by_raw_analysis_ids( | |
raw_analysis_ids = collected_cohort$rna_analysis_id, | |
gene = c("ERBB2", "KRAS") | |
) | |
Collect Normalized RNA | |
Description: | |
Collect gene expression data from 'normalized_rna'. Wrapper | |
function to allow for use of lazy query objects or raw Character | |
vectors as input | |
Only use this function to collect RNA data for a single or | |
small set of genes. Please refer to `download_rna` if you | |
require the entire transcriptome. | |
Usage: | |
collect_normalized_rna( | |
connection, | |
cohort = NULL, | |
raw_analysis_ids = NULL, | |
gene = NULL, | |
ensembl_gene = NULL, | |
add_cols = NULL | |
) | |
Arguments: | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
link[tempusutils]connect_bq is used. | |
cohort: A lazy query object containing the column 'analysis_id' of | |
RNA analysis ids. | |
raw_analysis_ids: A Character vector of RNA analysis ids. | |
gene: A Character vector of one or more gene symbols. If 'NULL' | |
queries the entire transcriptome. Defaults to 'NULL'. | |
ensembl_gene: A Character vector of one or more Ensembl gene ids. If | |
'NULL' queries the entire transcriptome. Defaults to 'NULL'. | |
add_cols: A Character vector of column names to be queried in addition | |
to default columns from the 'normalized_rna' table. Defaults | |
to 'NULL' | |
Value: | |
A data frame with patient_tempus_id, analysis_id, ensembl_gene, | |
gene_tpm_cognizant_corrector, and add_cols columns from | |
'normalized_rna' | |
Examples: | |
library(dplyr) | |
con <- connect_bq() | |
cohort <- con |> pull_snv_indel_cohort(genes = "ERBB2") | |
cohort <- con |> pull_deliverable_cohort(cohort) | |
collected_cohort <- cohort |> collect_retry() | |
rna <- con |> collect_normalized_rna( | |
cohort = cohort, | |
gene = c("ERBB2", "KRAS"), | |
add_cols = "gene_tpm" | |
) | |
rna <- con |> collect_normalized_rna( | |
raw_analysis_ids = collected_cohort$rna_analysis_id, | |
gene = c("ERBB2", "KRAS") | |
) | |
CNV Table | |
Description: | |
This table provides transcript length weighted averaged copy | |
number values from the 'cnv_genes' table. | |
Usage: | |
.tbl_cnv(connection) | |
Arguments: | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
connect_bq is used. | |
Details: | |
• Gets sample ids from '.tbl_sample()' and picks the preferred | |
dna_analysis_id for use in joining to 'cnv_gene' | |
• Queries 'cnv_genes' to get all copy states for each gene for | |
each analysis_id. Drops all "genes" that are exactly "." or | |
start with rs IDs. Calculates the region size for each | |
independent copy region. | |
• Each gene + analysis_id can have multiple copy states. This | |
code weighs each copy state by the size of the region in | |
question, and returns a single weighted copy number for the | |
whole gene. This is available both as a full precision | |
'copy_number_full_precision' value or rounded to the nearest | |
integer as 'copy_number' | |
• When multiple copy states exist for a gene + analysis_id, we | |
concatenate 'cnv_start', 'cnv_end', 'cnv_length', and | |
'copy_number_mult' fields to collapse the data into a single | |
row, semicolon delimited. | |
• In some cases a small or internal gene deletion is detected | |
that may otherwise be ignored by weighting the 'copy_number'. | |
For example, one or two exons may be deleted in a large gene, | |
and even the absence of those exons isn't enough to | |
meaningfully change the 'copy_number'. The | |
'internal_deletion_detected' flag exists to identify those | |
cases where the full gene is not deleted, but there still may | |
be a relevant deletion event. | |
Value: | |
lazy query | |
Diagnosis dates per patient / cancer | |
Description: | |
This table provides dates for the earliest primary and metastatic | |
diagnosis per patient / condition | |
Usage: | |
.tbl_diagnosis_dates(connection) | |
Arguments: | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
connect_bq is used. | |
Details: | |
• From 'primary_tumor_characterization' gets the latest primary | |
diagnosis date. There are very few cases with multiple dates, | |
therefore later date logic here can be ignored. | |
• From 'primary_tumor_characterization' get the full set of | |
observations of metastatic status based on the | |
tumor_characterization_type = "Metastasis of Primary", | |
m_stage, and overall_stage_rollup fields. | |
• Also get metastatic observations and dates from the | |
'metastases' table for the patient ID and condition ID for | |
late stage tumor and w | |
• Gets the earliest metastatic observation from a union of all | |
observed metastases (caution! currently drops year) | |
Value: | |
lazy query | |
Examples: | |
con <- connect_bq() | |
diag_date <- .tbl_diagnosis_dates(con) | |
Return a Table of DNA Analysis ID Prioritization | |
Description: | |
Return a table of DNA 'analysis_id' prioritization per | |
'sample_family_id', filtered on a few important conditions. | |
Usage: | |
.tbl_dna_analysis(connection = connect_bq()) | |
Arguments: | |
connection: a 'BigQueryConnection' object, 'connect_bq()' by default | |
Details: | |
• The 'analysis_id' values come from the | |
'molecular_inventory_analysis' table | |
• Samples must have a '"tumor"' 'classification' as opposed to | |
normal-only | |
• Samples must have an 'overall_passing_qc_status_flag' value of | |
'"TRUE"' | |
• Rank is determined by: | |
1. Presence in the 'molecular_master_file_filtered' table | |
2. Presence of a matched sample, as opposed to tumor-only | |
3. The most recent run date | |
Value: | |
a 'tbl_lazy' class 'tibble' with the following columns: | |
'patient_id', 'sample_family_id', 'analysis_id', 'sample_id', | |
'match_type', 'sequencing_date_indexed', 'run_date_indexed', | |
'intent', 'procedure_type', 'assay', 'in_mmf', and | |
'dna_analysis_id_rank' | |
Last known date of curation for a patient | |
Description: | |
This table organizes curated last known data derived from many | |
potential sources. Details are available in DM2.0 documentation | |
Usage: | |
.tbl_onco_last_known_date(connection = tempusutils::connect_bq()) | |
Arguments: | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
connect_bq is used. | |
Value: | |
lazy query | |
Progression event table | |
Description: | |
This table organizes progression events experienced by a patient, | |
and includes death, recurrence, observation of metastases, and | |
other observations as progression events in addition to curated | |
observations of therapy-related outcomes. | |
• Events without associated dates are removed from the results | |
Usage: | |
.tbl_onco_progression_event(connection = tempusutils::connect_bq()) | |
Arguments: | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
connect_bq is used. | |
Value: | |
lazy query | |
Response assessment table | |
Description: | |
This table organizes curated outcomes on treatment regimens and/or | |
medications for all patient records with treatment data. | |
• Events without associated dates are removed from the results | |
Usage: | |
.tbl_onco_response_assessment(connection = tempusutils::connect_bq()) | |
Arguments: | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
connect_bq is used. | |
Value: | |
lazy query | |
Return a Table of Patient Records with a Curated Diagnosis | |
Description: | |
Return a Table of Patient Records with a Curated Diagnosis by | |
joining the 'patient', 'patient_duplicate_boolean_ie', | |
'combined_diagnosis', and 'medications_enhanced' tables. The table | |
is filtered to exclude records that are either delivery restricted | |
or are identifiable, and some columns are renamed for clarity. | |
Usage: | |
.tbl_patient_record(connection = connect_bq()) | |
Arguments: | |
connection: a 'BigQueryConnection' object, 'connect_bq()' by default | |
Details: | |
• Results exclude records that are delivery restricted or | |
identifiable | |
• Results only include records with a curated diagnosis; that | |
is, in the 'combined_diagnosis' table, records must have a | |
'diagnosis_source' value of '"CURATED"', 'is_primary' must be | |
'TRUE', and 'combined_diagnosis_date_indexed' can not be 'NA' | |
• For a given 'patient_id', 'has_treatment' will be 'TRUE' if | |
the value of 'source_system_canonical_name' is 'TRUE', | |
'effective_date_start_indexed' has at least one non-null | |
value, and 'drug_class_name' has at least one non-null value in the | |
'medications_enhanced' table | |
• For a given 'patient_id', 'has_tempus_sequencing' will be | |
'TRUE' where either of the following conditions are met: | |
1. It has a DNA 'analysis_id' that has passed quality | |
control in the 'molecular_inventory_analysis' AND | |
'molecular_master_file' tables | |
2. It has a RNA 'analysis_id' that has passed quality | |
control in the 'molecular_inventory_analysis' AND | |
'normalized_rna' tables | |
Value: | |
a 'tbl_lazy' class 'tibble' with the following columns: | |
'patient_id', 'gender', 'birth_date', 'birth_year', 'race', | |
'ethnicity', 'has_treatment', and 'has_tempus_sequencing' | |
Return a Table of RNA Analysis ID Prioritization | |
Description: | |
Return a table of RNA 'analysis_id' prioritization per | |
'sample_family_id', filtered on a few important conditions. | |
Usage: | |
.tbl_rna_analysis(connection = connect_bq()) | |
Arguments: | |
connection: a 'BigQueryConnection' object, 'connect_bq()' by default | |
Details: | |
• The 'analysis_id' values come from the | |
'molecular_inventory_analysis' table | |
• Samples must have a '"tumor"' 'classification' as opposed to | |
normal-only | |
• Samples must have an 'overall_passing_qc_status_flag' value of | |
'"TRUE"' | |
• Rank is determined by: | |
1. Presence in the 'normalized_rna' table | |
2. The most recent run date | |
Value: | |
a 'tbl_lazy' class 'tibble' with the following columns: | |
'patient_id', 'sample_family_id', 'analysis_id', 'sample_id', | |
'sequencing_date_indexed', 'run_date_indexed', 'intent', | |
'procedure_type', 'assay', 'in_normalized_rna', and | |
'rna_analysis_id_rank' | |
Sample condition ID matching | |
Description: | |
This table matches sequenced samples to the nearest curated | |
condition based on the date of sample collection and the date of | |
curated observation. | |
Usage: | |
.tbl_sample_condition(connection) | |
Arguments: | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
connect_bq is used. | |
Details: | |
• Gets all condition observations from | |
'primary_tumor_characterization' where clinical_status == | |
'primary' | |
• Join to '.tbl_sample()' on patient_id, ranks the diagnosis | |
observation based on proximity to sample_collection_date and | |
chooses the closest primary diagnosis observation | |
• Generally the vast majority of diagnosis observations occur | |
before sample collection. But because of how month-only dates | |
are imputed and how dates are shifted for de-identification, | |
there are cases when the closest (or only) diagnosis | |
observation is in the future w/r/t the | |
sample_collection_date. In this code, we set a threshold that | |
a diagnosis observation may be up to 45 days post sample | |
collection and still be accepted as the diagnosis of the | |
sample of interest. Any observation beyond 45 days | |
post-collection is discarded. | |
Value: | |
lazy query | |
Examples: | |
con <- | |
connect_bq() | |
sample_condition <- | |
.tbl_sample_condition(con) |> | |
collect() | |
Return a Table of Stage at Time of Sampling | |
Description: | |
Return a table of stage at time of sampling with metastatic status | |
derived from the 'primary_tumor_characterization' table. | |
Usage: | |
.tbl_sample_stage(connection = connect_bq()) | |
Arguments: | |
connection: a 'BigQueryConnection' object, 'connect_bq()' by default | |
Details: | |
• The table returned by '.tbl_sample_condition()' is inner | |
joined with the 'primary_tumor_characterization' table to | |
obtain a set of metastatic observations per | |
'patient_id'/'condition_id'. | |
• Where the M stage observation is M1 (metastatic), the imputed | |
value of 'overall_stage_rollup' will be '"Stage 4"'. | |
• Where the histology observation includes a '"metastatic"' | |
assertion and M stage is not '"M0"', the imputed value of | |
'overall_stage_rollup' will be '"Stage 4"'. | |
• The 'metastatic_status' column in the returned table is a | |
further rollup of the 'overall_stage_rollup' where '"Stage | |
4"' becomes '"Metastatic"' and lesser stages or '"M0"' become | |
'"Pre-Metastatic"'. | |
Value: | |
a 'tbl_sql' class 'tibble' with the following columns: | |
'patient_id', 'condition_id', 'diagnostic_report_id', | |
'tumor_characterization_time_from_index', | |
'tumor_characterization_date_indexed', | |
'tumor_characterization_date_precision', | |
'tumor_characterization_date_year_indexed', | |
'tumor_characterization_type', 'clinical_status', 't_stage', | |
't_stage_rollup', 'n_stage', 'n_stage_rollup', 'm_stage', | |
'm_stage_rollup', 'overall_stage', 'overall_stage_rollup', | |
'tumor_grade', 'histology_name', 'figo_stage', 'valg_stage', | |
'heme_stage', 'heme_stage_type', 'alternative_grade', | |
'primary_gleason_score', 'secondary_gleason_score', | |
'total_gleason_score', 'source_system_name', | |
'source_system_canonical_name', 'pdx_flag', 'sample_family_id', | |
'sample_collection_date', 'primary_diagnosis_date', | |
'primary_diagnosis_year', 'days_diagnosis_from_collection', | |
'metastatic_status', and 'days_stage_from_collection' | |
Return a Table of Medication to Therapy Rollups | |
Description: | |
Return a table of medication to therapy rollups per | |
'sample_family_id' by joining samples with molecular data to the | |
care plan rollups in 'onco_care_plan' by 'patient_id'. The | |
resulting table provides details about therapies administered for | |
each sample and the timing of sample collection relative to | |
therapy. | |
Usage: | |
.tbl_sample_therapy(connection = connect_bq()) | |
Arguments: | |
connection: a 'BigQueryConnection' object, 'connect_bq()' by default | |
Details: | |
• The '.tbl_sample()' function is used to return a set of | |
samples with 'patient_id', 'sample_family_id', and | |
'sample_collection_date', which is inner joined to | |
'onco_care_plan' by 'patient_id'. | |
• The timing of sample collection relative to therapy is | |
calculated and then categorized in the | |
'sample_relative_to_treatment' column of the table returned; | |
possible values include '"Pre-treatment"' and | |
'"Post-treatment"'. | |
• The determination of sample collection timing in relation to | |
treatment primarily relies on treatment start date. | |
Consequently, samples collected after the initiation of | |
treatment, even if they fall within the treatment period, are | |
categorized as '"Post-treatment"' for analysis purposes. | |
• To provide more granular information about sample timing, | |
the returned table also has a 'sample_treatment_timing' | |
column. This column includes modifiers '"Flanking"' and | |
'"Distant"' to further classify samples as Pre- or Post- | |
treatment. '"Flanking"' status is assigned to samples | |
collected immediately before or after treatment without any | |
other treatments in between, while '"Distant"' status is | |
applied to samples collected after one or more intervening | |
therapies between sample collection and the therapy. | |
Value: | |
a 'tbl_sql' class 'tibble' with the following columns: | |
'sample_family_id', 'care_plan_id', 'therapy', 'therapy_class', | |
'therapy_class_group', 'care_plan_order', | |
'care_plan_order_source', 'line_of_therapy_number', | |
'line_of_therapy_number_source', 'has_maintenance', | |
'has_neoadjuvant', 'has_adjuvant', 'has_induction', | |
'has_conditioning', 'has_consolidation', 'therapy_start_date', | |
'start_date_precision', 'start_date_year_indexed', | |
'therapy_end_date', 'end_date_precision', 'end_date_year_indexed', | |
'days_sample_from_therapy_start', 'days_sample_from_therapy_end', | |
'sample_relative_to_treatment', and 'sample_treatment_timing' | |
Sample tissue site to primary tumor site mapping | |
Description: | |
Get both the sample tissue site and the primary cancer expected | |
tissue site for a sample_family_id of interest. Map the sample | |
site to the expected primary site to determine whether the sample | |
in question is from primary tissue or distant (metastatic) tissue | |
Usage: | |
.tbl_sample_tissue_site(connection) | |
Arguments: | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
connect_bq is used. | |
Details: | |
This table provides a 1:1 map from the sample tissue site to the | |
primary tumor tissue and returns an assertion of whether the | |
sample comes from a primary cancer tissue site or from a "distant" | |
tissue site. | |
• unfortunately as of 2023-07 the site mappings currently in | |
the TMO cohort site ontology are imperfect, and there are | |
cases when a sample truly does come from what would be | |
considered primary tissue but that tissue site doesn't map as | |
a primary tissue. Thus "primary" vs. "distant" assertions | |
should be used carefully until this issue is resolved. | |
• the TMO site system has a property called | |
'has_physical_part_of_anatomic_structure' which can be very | |
useful to assess lymph tissue source. i.e. if the sample | |
comes from a lymph-associated tissue, it can be binned into | |
the "Lymph" source | |
• when the sample tissue is a descendant of the "heme", | |
"sarcoma", or "TUO" cohorts, the | |
'sample_to_primary_tissue_match' is always set to "ambiguous" | |
Value: | |
lazy query | |
Return a Table of Sample Metadata for Samples Sequenced by Tempus | |
Description: | |
Return a table of sample metadata for samples sequenced by Tempus, | |
filtered on a few important conditions. | |
Usage: | |
.tbl_sample(connection = connect_bq()) | |
Arguments: | |
connection: a 'BigQueryConnection' object, 'connect_bq()' by default | |
Details: | |
• The 'sample_id' values are obtained from the | |
'molecular_inventory_analysis' table and the earliest | |
collection date for a given sample is chosen where there are | |
multiple collection dates. | |
• This function uses '.tbl_dna_analysis' to gather | |
'dna_analysis_id' values and relevant DNA assay and | |
sequencing metadata. The 'analysis_id' prioritization logic | |
is outlined in the '.tbl_dna_analysis' documentation. | |
• This function uses '.tbl_rna_analysis' to gather | |
'rna_analysis_id' values and relevant RNA assay and | |
sequencing metadata. The 'analysis_id' prioritization logic | |
is outlined in the '.tbl_rna_analysis' documentation. | |
• The returned 'tibble' contains boolean columns for data | |
availability depending on assay type and DNA/RNA match | |
status. | |
Value: | |
a 'tbl_lazy' class 'tibble' with the following columns: | |
'patient_id', 'sample_family_id', 'sample_collection_date', | |
'sample_tissue_site', 'focus_of_curation', 'sample_source', | |
'tmo_tissue_site_concept_code', 'dna_sample_id', | |
'dna_analysis_id', 'dna_assay', 'dna_initial_sequencing_date', | |
'dna_sequencing_intent', 'dna_sample_procedure_type', | |
'rna_sample_id', 'rna_analysis_id', 'rna_assay', | |
'rna_sequencing_date', 'rna_sequencing_intent', | |
'rna_procedure_type', 'has_dna', 'has_rna', 'has_matched_dna_rna', | |
'has_collection_date', 'has_treatment', 'is_liquid_assay', | |
'is_solid_assay', and 'is_xe' | |
Smoking status table | |
Description: | |
This table gives the smoking status of the patient. | |
Usage: | |
.tbl_smoking_status(connection = tempusutils::connect_bq()) | |
Arguments: | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
connect_bq is used. | |
Value: | |
lazy query | |
Return a Table of SNV and Insertion/Deletion Variants | |
Description: | |
Return a table of SNV and insertion/deletion variants based on the | |
'molecular_master_file' table. | |
Usage: | |
.tbl_snv_indel(connection = connect_bq()) | |
Arguments: | |
connection: a 'BigQueryConnection' object, 'connect_bq()' by default | |
Details: | |
• This function obtains 'analysis_id' values from | |
'molecular_inventory_analysis' and returns variants for the | |
primary 'analysis_id' for a 'sample_family_id' | |
• The 'molecular_master_file' table is queried to obtain all | |
SNV and insertion/deletion variants for the primary | |
'analysis_id'. The function also performs some data cleaning | |
to convert all three-letter amino acid codes to single-letter | |
codes and concatenates 'chrom_pos_ref_alt' into a single | |
column | |
• This function returns low AF variants (<5%) even though they | |
don't meet the Tempus reporting threshold. | |
Value: | |
a 'tbl_lazy' class 'tibble' with the following columns: | |
'sample_family_id', 'gene_symbol', 'chr_pos_ref_alt', | |
'variant_type_canonical_name', 'mutation_effect', | |
'somatic_germline', 'classification', 'clonality', 'coverage', and | |
'supporting_reads' | |
Tempus Medical Ontology (TMO) Diagnosis Table | |
Description: | |
This function generates a lazy query joining the tables | |
'tbl_sample' and 'combined_diagnosis' on patient_id. | |
Usage: | |
.tbl_tmo_diagnosis(connection) | |
Arguments: | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
connect_bq is used. | |
Details: | |
• Maps the TMO diagnosis concept ID to the preferred diagnosis | |
label used by Lens. | |
• For a given TMO diagnosis code in 'combined_diagnosis', we | |
also return the path to the root cohort concept ID through | |
the diagnosis hierarchy. | |
• Each patient may have multiple observations of diagnosis at | |
different dates, therefore the function returns the | |
difference in days between the time of sample collection and | |
time of diagnosis observation. | |
Value: | |
lazy query | |
Download RNA | |
Description: | |
Copies 'vw_normalized_rna' to a GCP bucket and saves that data | |
locally as a collection of Parquet files. | |
Usage: | |
download_rna( | |
connection, | |
cohort = NULL, | |
raw_analysis_ids = NULL, | |
expression_value = "gene_tpm_cognizant_corrector", | |
local_dir = "./data", | |
project = Sys.getenv("TR_PROJECT"), | |
verbose = FALSE | |
) | |
Arguments: | |
connection: A connection to the database; use 'connect_bq()' to create one | |
cohort: A lazy query object containing rna analysis ids under the | |
column rna_analysis_id. | |
raw_analysis_ids: A Character vector of RNA analysis ids. | |
expression_value: Character. One of (gene_norm, | |
gene_norm_cognizant_corrector, gene_tpm, | |
gene_tpm_cognizant_corrector, gene_raw) | |
local_dir: Character. Directory where rna_unload folder will be | |
generated | |
project: Character. GCP Project ID | |
verbose: Logical. Print SQL query used for this function in the | |
console. | |
Details: | |
Use this function if you require whole transcriptome gene | |
expression data. | |
Value: | |
Character. Local directory where RNA was unloaded | |
Examples: | |
library(dplyr) | |
con <- connect_bq() | |
cohort <- con |> pull_snv_indel_cohort(genes = "ERBB2") | |
cohort <- con |> pull_deliverable_cohort(cohort, require_rna = TRUE) | |
collected_cohort <- cohort |> collect_retry() | |
rna_dir <- con |> | |
download_rna(cohort, | |
raw_analysis_ids = collected_cohort$rna_analysis_id[1:100] | |
) | |
if (interactive()) { | |
rna <- arrow::open_dataset(rna_dir) | |
} | |
Get CNV | |
Description: | |
Helper function which will get CNV data from '.tbl_cnv()'. This | |
table performs length weighted average for duplicate gene CNV | |
calls | |
Usage: | |
get_cnv( | |
connection, | |
cohort, | |
gene = NULL, | |
cnv_type = c("all", "amplification", "deletion"), | |
only_alterations = TRUE, | |
amplification_threshold = 8L, | |
verbose = FALSE | |
) | |
Arguments: | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
link[tempusutils]connect_bq is used. | |
cohort: A lazy query object containing the column of | |
'sample_family_id' and 'dna_analysis_id' of a cohort of | |
interest | |
gene: Character vector of genes of interest by HGNC name | |
cnv_type: Character - One of 'deletion', 'amplification', or 'all'. | |
Defaults to 'all' | |
only_alterations: Logical, Default TRUE. If TRUE, return only deletions | |
and amplifications at amplification_threshold. If FALSE, | |
return all copy status for all samples. | |
amplification_threshold: Integer. Copy number threshold for | |
amplifications. Defaults to 8. | |
verbose: Logical Print SQL query used for this function in the | |
console. | |
Value: | |
data.frame of cnv DNA variants from .tbl_cnv() | |
Examples: | |
library(dplyr) | |
## FIXME: tempusquery needs to be built | |
library(tempusquery) | |
## Step1: | |
con <- connect_bq() | |
## Step2: Define a cohort to get a list of SFIDs | |
tmo_cohort <- query_tmo_diagnosis( | |
connection = con, | |
tmo_codes = c("TMO02791501", "TMO93014375") | |
) | |
## Step3: Add filters to the cohort you defined in Step 2 | |
filters <- query_sample(con, | |
require_dna = TRUE, | |
require_rna = TRUE, | |
require_treatment = TRUE | |
) | |
## Step4: Apply filter to your cohort | |
final_cohort <- tmo_cohort |> | |
inner_join(filters) | |
## Step5: Collect all copy status for EGFR | |
cnv <- get_cnv(con, | |
final_cohort, | |
gene = "EGFR", | |
only_alterations = FALSE | |
) | |
## Other examples for get_cnv() | |
# Collect only amplifications/deletion at an amp threshold of 5 for MYC | |
cnv <- get_cnv(con, | |
final_cohort, | |
gene = "MYC", | |
only_alterations = TRUE, | |
amplification_threshold = 5L | |
) | |
# Collect only deletions for MYC | |
cnv <- get_cnv(con, | |
final_cohort, | |
gene = "MYC", | |
cnv_type = "deletion", | |
only_alterations = TRUE | |
) | |
Get last-known alive/curated date for patients | |
Description: | |
Gets the last-known alive / last-known curated date of a cohort of | |
patients from the DM2.0 'onco_patient' table. Sources of | |
last-known dates can be from several tables / types of curation | |
and potentially also include third-party claims data that can | |
extend the last-known date. | |
Usage: | |
get_last_known_date(cohort) | |
Arguments: | |
cohort: Lazy query with patient_id of cohort | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
link[tempusutils]connect_bq is used. | |
Value: | |
tbl of cohort left joined with last-known dates | |
Examples: | |
con <- connect_bq() | |
## Lung cancer | |
tmo_cohort <- trtools::pull_tmo_cohort(con, | |
tmo_codes = "TMO71944383") | |
cohort <- inner_join(tmo_cohort, filters, by = "sample_family_id") | |
lkd <- get_last_known_date(cohort = tmo_cohort) | |
Get patient-level medication rollups | |
Description: | |
Get patient-level medication rollups | |
Usage: | |
get_medication_rollups(connection, cohort) | |
Arguments: | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
link[tempusutils]connect_bq is used. | |
cohort: Lazy query with patient_id of cohort | |
Details: | |
• Uses curated only records from the 'medications' table to | |
derive regimens at the care_plan_id level. | |
Value: | |
data.frame of treatment data per patient_id | |
Examples: | |
medications <- get_medication_rollups(cohort) | |
Get patient diagnosis dates | |
Description: | |
Using curated cancer, stage, and metastatic assertions, return the | |
primary diagnosis date and earliest metastatic observation for | |
each patient/cancer | |
Usage: | |
get_patient_diagnosis_dates(connection, cohort, verbose = FALSE) | |
Arguments: | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
link[tempusutils]connect_bq is used. | |
cohort: a lazy query with 'sample_family_id' of cohort | |
verbose: Logical. Print SQL query used for this function in the | |
console | |
Details: | |
Provide a lazy query for cohort of interest | |
Value: | |
a 'data.frame' of patient diagnoses data | |
Examples: | |
## FIXME: tempusquery needs to be built | |
library(tempusquery) | |
## Step1: | |
con <- connect_bq() | |
## Step2: Define a cohort to get a list of SFIDs | |
tmo_cohort <- query_tmo_diagnosis( | |
connection = con, | |
tmo_codes = c("TMO02791501", "TMO93014375") | |
) | |
## Step3: Add filters to the cohort you defined in Step 2 | |
filters <- query_sample(con, | |
require_dna = TRUE, | |
require_rna = TRUE, | |
require_treatment = TRUE | |
) | |
## Step4: Apply filter to your cohort | |
final_cohort <- tmo_cohort |> | |
inner_join(filters) | |
final_cohort <- | |
inner_join(tmo_cohort, filters) | |
diagnosis_dates <- | |
get_patient_diagnosis_dates( | |
con, | |
cohort = final_cohort | |
) | |
Get the progression events of a cohort | |
Description: | |
Gets progression events of a cohort of patients from the DM2.0 | |
'onco_progression_event' table. This table includes both curated | |
therapy-associated progression events (e.g. "progressive | |
disease", "partial response") and also includes death events, | |
observed metastases, and recurrence events as real-world | |
progression events even if they are not necessarily | |
therapy-associated. | |
Usage: | |
get_progression_events(cohort) | |
Arguments: | |
cohort: Lazy query with patient_id of cohort | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
link[tempusutils]connect_bq is used. | |
Value: | |
tbl of cohort left joined with progression events. | |
Examples: | |
con <- connect_bq() | |
## Lung cancer | |
tmo_cohort <- trtools::pull_tmo_cohort(con, | |
tmo_codes = "TMO71944383") | |
cohort <- inner_join(tmo_cohort, filters, by = "sample_family_id") | |
progression_events <- get_progression_events(cohort = tmo_cohort) | |
Get treatment-associated response assessments | |
Description: | |
Gets response assessments of a cohort of patients from the DM2.0 | |
'onco_response_assessment' table. As opposed to the | |
'onco_progression_events' table that includes many real-world | |
observations outside of treatment response as progression events, | |
these results are only curated assertions of treatment response | |
keyed on care_plan_id | |
Usage: | |
get_response_assessment(cohort) | |
Arguments: | |
cohort: Lazy query with patient_id of cohort | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
link[tempusutils]connect_bq is used. | |
Value: | |
tbl of cohort left joined with response assertions | |
Examples: | |
con <- connect_bq() | |
## Lung cancer | |
tmo_cohort <- trtools::pull_tmo_cohort(con, | |
tmo_codes = "TMO71944383") | |
cohort <- inner_join(tmo_cohort, filters, by = "sample_family_id") | |
response_assessments <- get_response_assessment(cohort = tmo_cohort) | |
Get ID map for samples | |
Description: | |
Get sample-related IDs for a set of sample_family_id including | |
dna_analysis_id and rna_analysis_id | |
Usage: | |
get_sample_id_map(cohort, verbose = FALSE) | |
Arguments: | |
cohort: Lazy query with sample_family_ids of cohort | |
verbose: Logical. Print SQL query used for this function in the | |
console | |
Value: | |
data.frame of patient biopsy data | |
Examples: | |
library(dplyr) | |
con <- connect_bq() | |
## pull the deliverable cohort for NSCLC | |
cohort <- query_tmo_diagnosis( | |
connection = con, | |
tmo_codes = c("TMO02791501") | |
) | |
## get the molecular ID information associated with these samples | |
id_map <- get_sample_id_map(cohort, | |
verbose = FALSE | |
) | |
Get sample metadata | |
Description: | |
Get sample-related data for a set of sample_family_id. | |
Usage: | |
get_sample_metadata(connection, cohort, verbose = FALSE) | |
Arguments: | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
link[tempusutils]connect_bq is used. | |
cohort: Lazy query with sample_family_ids of cohort | |
verbose: Logical. Print SQL query used for this function in the | |
console | |
Details: | |
Uses TMO site codes and labels to get the sample tissue site | |
metadata. Additional sample metadata is collected from the | |
'.tbl_sample()' table. | |
Value: | |
a 'data.frame' of patient biopsy data | |
Examples: | |
library(dplyr) | |
# pull the deliverable cohort for lung adenocarcinoma | |
cohort <- | |
query_tmo_diagnosis(tmo_codes = "TMO02791501") | |
cohort <- | |
inner_join(cohort, query_sample(con, require_dna = TRUE)) | |
# get the sample information for these samples | |
samples <- get_sample_metadata(cohort) | |
Get sample stage | |
Description: | |
Get curated stage information at time of sampling from a cohort | |
using all staging data available. Many samples do not have curated | |
staging information and therefore are not returned in the results. | |
Provide a lazy query for cohort of interest including | |
'sample_family_id' | |
Usage: | |
get_sample_stage( | |
connection, | |
cohort, | |
window_pre_collection = 90L, | |
window_post_collection = 90L, | |
verbose = FALSE | |
) | |
Arguments: | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
link[tempusutils]connect_bq is used. | |
cohort: Lazy query with 'sample_family_id' of cohort | |
window_pre_collection: Integer. Window to accept stage prior to sample | |
collection date | |
window_post_collection: Integer. Window to accept stage after sample | |
collection date | |
verbose: Logical. Print SQL query used for this function in the | |
console | |
Details: | |
In most cases, multiple observations of staging are taken at | |
different time points for each patient + cancer. For the purposes | |
of this function, we're interested in the staging at the time of | |
sample collection. In cases where the stage observations are taken | |
on the same day as the sample collection, that is straightforward. | |
The "window_" parameters here are to define a time window pre- and | |
post- sample collection in which the observed staging of the tumor | |
is relevant to the target sample. By default these are set to 90 | |
days pre and 90 days post sample collection. In cases where there | |
are multiple staging observations in the desired window, the | |
staging nearest to the sample collection date is chosen under the | |
assumption that it is the "most correct". | |
However, there are cases where the nearest staging observation to | |
sample collection is sparse, e.g. it only consists of a single | |
data point of something like the grade or histology. In those | |
cases, there are often other observations that are within | |
days/weeks that are much more information-dense and contain full | |
TNM staging. The logic here is that we only match the nearest | |
staging observation IF it has a non-null 'overall_stage_rollup' | |
assertion. Otherwise, we ignore that observation and look for the | |
next closest in the desired window. | |
Additionally, there are observations outside of the defined time | |
window that we can use to infer not the exact stage, necessarily, | |
but the metastatic status (i.e. metastatic vs. pre-metastatic). | |
For example, if there is an observation of metastatic status | |
before sample collection, even if it is outside of the defined | |
window, we can infer that the sample is metastatic. Similarly, an | |
observation of a pre-metastatic staging in the future after sample | |
collection can be used to infer that the sample comes from a tumor | |
that has not yet metastasized. | |
These assumptions are used to infer metastatic status when | |
contemporary observations are not available. | |
Value: | |
data.frame of sample staging and metastatic status information | |
Examples: | |
con <- | |
connect_bq() | |
tmo_cohort <- | |
pull_tmo_cohort( | |
connection = con, | |
tmo_codes = "TMO39662164" | |
) | |
filters <- | |
query_sample(con, | |
require_dna = TRUE, | |
require_rna = TRUE, | |
require_treatment = TRUE | |
) | |
final_cohort <- | |
inner_join(tmo_cohort, filters, by = "sample_family_id") | |
stage_metadata <- | |
get_sample_stage(con, cohort = final_cohort) | |
Get sample treatment data | |
Description: | |
Gets therapy data using the DM2.0 onco_care_plan table. This | |
function also derives LoT assertions using earliest metastatic | |
observations when curated LoT does not exist. | |
Usage: | |
get_sample_therapy(connection, cohort) | |
Arguments: | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
link[tempusutils]connect_bq is used. | |
cohort: Lazy query with sample_family_ids of cohort | |
Details: | |
• We make some assumptions here to derive/impute | |
line-of-therapy associations. Most importantly, we use the | |
earliest metastatic observation date per patient/condition to | |
re-rank the order of care_plans given post earliest | |
metastatic observation. This ranking is available in the | |
'metastatic_care_plan_order' column. | |
• The 'derived_lot' field contains the best imputation of | |
line-of-therapy. In cases where line of therapy exists in the | |
'line_of_therapy_number' field, that value is always chosen | |
over the 'metastatic_care_plan_order'. However, when | |
'line_of_therapy_number' does not exist, we use the | |
'metastatic_care_plan_order' value. In practice this works | |
quite well and agreement between curated | |
'line_of_therapy_number' and 'metastatic_care_plan_order' is | |
high. | |
• Any observation of radiotherapy alone is always discarded for | |
the purposes of line-of-therapy calculation, although in the | |
cases where radiotherapy is given in addition to other drugs | |
it may not be excluded. | |
Value: | |
data.frame of treatment data per sample_family_id | |
Examples: | |
library(dplyr) | |
con <- connect_bq() | |
## pull the deliverable cohort for lung adenocarcinoma | |
cohort <- query_tmo_diagnosis( | |
connection = con, | |
tmo_codes = c("TMO02791501") | |
) | |
cohort <- cohort |> | |
inner_join( | |
query_sample(con, | |
require_dna = TRUE, | |
require_treatment = TRUE | |
) | |
) | |
## get the treatment information on these samples | |
samples <- con |> | |
get_sample_therapy(cohort) | |
Get smoking status of a cohort | |
Description: | |
Gets smoking status of a cohort of patients and the status will be | |
determined from the "assessments" table in expert_deid. The | |
smoking status is returned in the column 'smoking' and is | |
classified into "Current-smoker", "Never-smoker", "Ex-smoker", and | |
"Unknown" based on the "value_canonical_name" column in the | |
"assessments" table. If another status shows up eventually because | |
the roll-up isn't all-encompassing, please report the bug via an | |
issue in "tempustables". | |
Usage: | |
get_smoking_status(cohort, verbose = FALSE) | |
Arguments: | |
cohort: Lazy query with sample_family_ids of cohort | |
verbose: Logical. Print SQL query used for this function in the | |
console | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
link[tempusutils]connect_bq is used. | |
Value: | |
tbl of cohort left joined with smoking status information. | |
Examples: | |
con <- connect_bq() | |
## Lung cancer | |
tmo_cohort <- trtools::pull_tmo_cohort(con, | |
tmo_codes = "TMO71944383") | |
cohort <- inner_join(tmo_cohort, filters, by = "sample_family_id") | |
smoking_status <- get_smoking_status(cohort = tmo_cohort) | |
Get a Table of SNV and Insertion/Deletion Variants for a Given Cohort | |
Description: | |
When provided a 'cohort' tibble with a 'sample_family_id' column, | |
'get_snv_indel' will return a table of SNV and insertion/deletion | |
variants. | |
Usage: | |
get_snv_indel(cohort, gene = NULL, include_vus = TRUE, include_germline = TRUE) | |
Arguments: | |
cohort: a tibble that has a 'sample_family_id' column | |
gene: a gene of interest, 'NULL' by default | |
include_vus: a logical scalar indicating if variants of unknown | |
significance should be included, 'TRUE' by default | |
include_germline: a logical scalar indicating if germline variants | |
should be included, 'TRUE' by default | |
Value: | |
a tibble with the following columns: 'sample_family_id', | |
'gene_symbol', 'chr_pos_ref_alt', 'variant_type_canonical_name', | |
'mutation_effect', 'somatic_germline', 'classification', | |
'clonality', 'coverage', and 'supporting_reads' | |
Get sample diagnosis | |
Description: | |
Get TMO curated diagnostic information at time of sampling from a | |
cohort using TMO codes from the 'combined_diagnosis' table. | |
Usage: | |
get_tmo_diagnosis( | |
connection, | |
cohort, | |
tmo_codes = NULL, | |
days_post_collection = 30L, | |
days_pre_collection = 180, | |
verbose = FALSE | |
) | |
Arguments: | |
connection: BigQueryConnection object to expert_deid. If not provided, | |
connect_bq is used. | |
cohort: Lazy query with sample_family_id of cohort | |
tmo_codes: vector of TMO code rollups used in the cohort pull to roll | |
up diagnoses. Optional. Default NULL. | |
days_post_collection: integer. Days after sample collection from which | |
a diagnosis observation will be included | |
days_pre_collection: integer. Days prior to sample collection from | |
which a diagnosis observation will be included | |
verbose: Logical. Print SQL query used for this function in the | |
console. | |
Value: | |
data.frame of patient diagnoses data | |
Examples: | |
library(dplyr) | |
con <- connect_bq() | |
## Needs a cohort | |
diagnosis <- con |> | |
get_tmo_diagnosis(cohort = cohort, | |
tmo_codes = c("TMO39662164")) | |
Parallelized Copy Delete | |
Description: | |
Parallelized Copy Delete | |
Usage: | |
parallel_copy_delete( | |
bucket_name, | |
dir_name, | |
local_path = "./data", | |
overwrite = FALSE | |
) | |
Arguments: | |
bucket_name: Character. Path of GCP bucket | |
dir_name: Character. Name of temporary directory to be used | |
local_path: Character. Name of local directory where data should be | |
written | |
project: Character. GCP Project ID | |
Value: | |
invisible. parallel copy delete of objects to local dir. | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment