Last active
September 15, 2017 17:05
-
-
Save smrgit/b7177d455a04c1bf70a2d910223c9000 to your computer and use it in GitHub Desktop.
This query joins two ISB-CGC GDC_metadata tables to get a summary of the GDC data that is available in ISB-CGC buckets in Google Cloud Storage.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WITH | |
t1 AS ( | |
SELECT | |
dbName, | |
file_id AS file_gdc_id, | |
access, | |
cases__project__program__name AS program_name, | |
cases__project__project_id AS project_short_name, | |
experimental_strategy, | |
data_category, | |
data_format, | |
data_type, | |
file_size | |
FROM | |
`isb-cgc.GDC_metadata.rel8_fileData_current` | |
UNION ALL | |
SELECT | |
dbName, | |
file_id AS file_gdc_id, | |
access, | |
cases__project__program__name AS program_name, | |
cases__project__project_id AS project_short_name, | |
experimental_strategy, | |
data_category, | |
data_format, | |
data_type, | |
file_size | |
FROM | |
`isb-cgc.GDC_metadata.rel8_fileData_legacy` ), | |
j1 AS ( | |
SELECT | |
a.dbName, | |
a.access, | |
a.program_name, | |
a.project_short_name, | |
a.experimental_strategy, | |
a.data_category, | |
a.data_format, | |
a.data_type, | |
a.file_size, | |
a.file_gdc_id AS aFile_gdc_id, | |
b.file_gdc_id AS bFile_gdc_id, | |
b.file_gcs_url | |
FROM | |
t1 a | |
FULL JOIN | |
`isb-cgc.GDC_metadata.rel8_GDCfileID_to_GCSurl` b | |
ON | |
a.file_gdc_id=b.file_gdc_id ) | |
SELECT | |
dbName, | |
access, | |
program_name, | |
COUNT(*) AS n, | |
SUM(file_size)/1000000000000 AS total_size_TB | |
FROM | |
j1 | |
WHERE | |
file_gcs_url IS NOT NULL | |
GROUP BY | |
dbName, | |
access, | |
program_name | |
ORDER BY | |
total_size_TB DESC |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
updated SQL to use the release 8 tables