Skip to content

Instantly share code, notes, and snippets.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@smrgit
smrgit / gist:c170b5659d095a8e01455a358c0b70ee
Created May 24, 2018 16:00
PanCancer Atlas publications -- controlled-access supplementary files at GDC
0e1bc6-22f0-424b-8a21-305c08af22a7
12003a-6135-4869-852c-a451d45bf39f
1950b4-6695-49a5-a2b1-d0c2e14b8448
19a1e4-f6b9-48bd-b1f7-160f8e720f1e
1c4fee-3e91-495d-8b5e-036c1639bdc8
1fa5dc-5e86-47ee-8eb4-695cb48169b1
24ec2c-dd64-47f9-a44d-c88a317fd88f
258859-604a-4b5c-9227-0f75089eab68
2b862d-da2b-4d51-94e3-a2c7a52ac3fb
2e54aa-4cc0-4cbd-9647-0e039d1e3da1
@smrgit
smrgit / gist:cbba2e057211f3a05d74b0eac0b9fd5f
Created May 24, 2018 15:58
PanCancer Atlas publications -- open-access supplementary files at GDC
006ce014-e113-4d61-a944-bcc03661a84a
00a32f7a-c85f-4f86-850d-be53973cbc4d
04559f49-5066-4984-a9d0-27c27ce20458
0511725d-7522-4abc-8f07-172bba90fd13
06a124df-fa5b-4f2d-8bfa-0e73b685f222
06ba0f08-fa9b-41eb-a74d-a9225f14d47f
08096f8f-7b56-495a-be45-62d5a56f2ee8
081f7161-ae17-434d-b8e5-a3047852cb10
09f59368-470e-4311-a7b7-f665886620e4
0e526660-14cb-4742-a49b-11fe47f2f93c
@smrgit
smrgit / corr_TCGA_GTEx.sql
Created November 28, 2017 23:20
Spearman correlation between TCGA expression data and GTEx median tissue-type TPM
WITH
--
-- get the top-5000 most variable genes based on computing the standard
-- deviation on the GTEx "gene_median_tpm" table
GTEx_top5K AS (
SELECT
gene_id,
gene_description,
STDDEV(gene_exp) AS sigmaExp
FROM
@smrgit
smrgit / join_CGHub_GDC_legacy_DNA_bams_info.sql
Created September 28, 2017 11:44
join CGHub and GDC legacy information for TCGA DNA bams
WITH
--
-- first, we grab information about the ~48650 TCGA DNA bam files that were at CGHub
-- (and are aligned to GRCh37/HG19 and were classified as "Live")
CGHubInfo AS (
SELECT
SUBSTR(barcode,1,12) AS case_barcode,
SUBSTR(barcode,1,16) AS sample_barcode,
barcode AS aliquot_barcode,
SUBSTR(barcode,14,2) AS sample_type,
@smrgit
smrgit / find_normal_bams_in_gcs.sql
Created September 27, 2017 22:36
join several different metadata tables together to find normal BAMs in GCS, combined with their CGHub and/or GDC file identifiers (where available)
WITH
--
-- first, we grab information about the ~48650 TCGA DNA bam files that were at CGHub
-- (and are aligned to GRCh37/HG19 and were classified as "Live")
dbGaPinfo AS (
SELECT
SUBSTR(barcode,1,12) AS case_barcode,
SUBSTR(barcode,1,16) AS sample_barcode,
barcode AS aliquot_barcode,
SUBSTR(barcode,14,2) AS sample_type,
@smrgit
smrgit / find_AML_BAM_files.sql
Created September 13, 2017 20:08
GDC metadata exploration example
WITH
--
-- this initial table collects the case barcodes for all TCGA LAML and
-- TARGET AML cases from the two "Clinical" tables
-- Result: this sub-query returns a table with 1193 rows
t1 AS (
SELECT
program_name,
case_barcode
FROM
@smrgit
smrgit / egress_01.sql
Created August 16, 2017 17:30
egress charges query
SELECT
dateString,
dayOfYear,
week,
year,
projectID,
productCat,
resourceType,
SUM(cost) AS totCost,
currency,
@smrgit
smrgit / GDC_rel8_metadata_query01.sql
Last active October 26, 2017 02:23
Identify GDC data available in ISB-CGC GCS buckets
WITH
t1 AS (
SELECT
DISTINCT file_gdc_id
FROM
`isb-cgc.GDC_metadata.rel8_GDCfileID_to_GCSurl` ),
j1 AS (
SELECT
a.dbName,
a.file_id,
@smrgit
smrgit / hg19vs38_copy_number_mirna.sql
Created July 22, 2017 02:50
compare hg19 and hg38 copy-number segments at miRNA locations
WITH
hg38t1 AS (
SELECT
sample_barcode,
mirna_id AS mirna_name,
LOG(reads_per_million_miRNA_mapped+1,2) AS logRPM
FROM
`isb-cgc.TCGA_hg38_data_v0.miRNAseq_Expression`
WHERE
reads_per_million_miRNA_mapped > 4 ),