Skip to content

Instantly share code, notes, and snippets.

@wwood
Created February 20, 2023 01:04
Show Gist options
  • Save wwood/8c0c321ae1b189f3f09e8841b5edeedd to your computer and use it in GitHub Desktop.
Save wwood/8c0c321ae1b189f3f09e8841b5edeedd to your computer and use it in GitHub Desktop.
Estimate the total amount of public metagenome sequence using Google BigQuery
-- Loop counter
DECLARE counter int64 default 1;
-- Create intermediate table with organisms that are metagenomes
CREATE OR REPLACE TABLE test.emp
AS
WITH cte AS
(
SELECT 1 AS xlevel, tax_id, parent_id, sci_name
FROM `nih-sra-datastore.sra_tax_analysis_tool.taxonomy` e
WHERE e.sci_name = 'metagenomes'
)
SELECT *
FROM cte;
WHILE EXISTS (
SELECT c.*
FROM test.emp p
INNER JOIN `nih-sra-datastore.sra_tax_analysis_tool.taxonomy` c ON p.tax_id = c.parent_id
WHERE p.xlevel = counter
)
DO
-- Insert next level
INSERT INTO test.emp ( xlevel, tax_id, parent_id, sci_name )
SELECT counter + 1 AS xlevel, c.tax_id, c.parent_id, c.sci_name
FROM test.emp p
INNER JOIN `nih-sra-datastore.sra_tax_analysis_tool.taxonomy` c ON p.tax_id = c.parent_id
WHERE p.xlevel = counter;
SET counter = counter + 1;
-- Loop safely
IF counter > 10
THEN
BREAK;
END IF;
END WHILE;
SELECT
sum(mbases)
FROM
`nih-sra-datastore.sra.metadata`
WHERE
(
librarysource = 'METAGENOMIC'
or
organism IN (SELECT sci_name FROM test.emp)
)
AND consent = 'public'
AND libraryselection = 'RANDOM'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment