acanakoglu/all_flatten_view.sql

## all_flatten_view.sql
-- DROP MATERIALIZED VIEW all_flatten_view;
-- Wide, denormalised projection that joins virus, sequence, host_sample,
-- host_specie, experiment_type and sequencing_project into a single relation.
-- NOTE(review): NATURAL JOIN matches on every identically named column, so a
-- new same-named column in any of these tables silently changes the join.
-- Consider explicit USING/ON once the intended key columns are confirmed.
CREATE MATERIALIZED VIEW all_flatten_view
WITH (FILLFACTOR = 100)
AS
SELECT
    sequence_id,
    accession_id,
    strain_name,
    is_reference,
    is_complete,
    strand,
    length,
    gc_percentage,
    n_percentage,
    sequencing_technology,
    assembly_method,
    coverage,
    sequencing_lab,
    submission_date,
    bioproject_id,
    database_source,
    taxon_id,
    taxon_name,
    species,
    host_taxon_name,
    collection_date,
    isolation_source,
    geo_group,
    country,
    region,
    gender,
    age,
    nucleotide_sequence,
    lineage,
    clade,
    host_taxon_id,
    originating_lab,
    genus,
    sub_family,
    family,
    equivalent_list,
    molecule_type,
    is_single_stranded,
    is_positive_stranded
FROM virus
NATURAL JOIN sequence
NATURAL JOIN host_sample
NATURAL JOIN host_specie
NATURAL JOIN experiment_type
NATURAL JOIN sequencing_project
;

-- PRIMARY KEY: unique index on sequence_id.
CREATE UNIQUE INDEX all_flatten_view__sequence_id__uidx
    ON all_flatten_view (sequence_id)
    WITH (FILLFACTOR = 100);

-- Rewrite the stored rows in sequence_id order before building the
-- secondary indexes.
CLUSTER VERBOSE all_flatten_view
    USING all_flatten_view__sequence_id__uidx;

-- Secondary single-column indexes. Columns wrapped in LOWER() get an
-- expression index, which is usable only by predicates written against the
-- same LOWER(column) expression (presumably for case-insensitive matching).
CREATE INDEX all_flatten_view__accession_id_lower__idx
    ON all_flatten_view (LOWER(accession_id)) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__strain_name_lower__idx
    ON all_flatten_view (LOWER(strain_name)) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__is_reference__idx
    ON all_flatten_view (is_reference) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__is_complete__idx
    ON all_flatten_view (is_complete) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__strand_lower__idx
    ON all_flatten_view (LOWER(strand)) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__length__idx
    ON all_flatten_view (length) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__gc_percentage__idx
    ON all_flatten_view (gc_percentage) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__n_percentage__idx
    ON all_flatten_view (n_percentage) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__sequencing_technology_lower__idx
    ON all_flatten_view (LOWER(sequencing_technology)) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__assembly_method_lower__idx
    ON all_flatten_view (LOWER(assembly_method)) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__coverage__idx
    ON all_flatten_view (coverage) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__sequencing_lab_lower__idx
    ON all_flatten_view (LOWER(sequencing_lab)) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__submission_date__idx
    ON all_flatten_view (submission_date) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__bioproject_id_lower__idx
    ON all_flatten_view (LOWER(bioproject_id)) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__database_source_lower__idx
    ON all_flatten_view (LOWER(database_source)) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__taxon_id__idx
    ON all_flatten_view (taxon_id) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__taxon_name_lower__idx
    ON all_flatten_view (LOWER(taxon_name)) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__species_lower__idx
    ON all_flatten_view (LOWER(species)) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__host_taxon_name_lower__idx
    ON all_flatten_view (LOWER(host_taxon_name)) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__collection_date__idx
    ON all_flatten_view (collection_date) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__isolation_source_lower__idx
    ON all_flatten_view (LOWER(isolation_source)) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__geo_group_lower__idx
    ON all_flatten_view (LOWER(geo_group)) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__country_lower__idx
    ON all_flatten_view (LOWER(country)) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__region_lower__idx
    ON all_flatten_view (LOWER(region)) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__gender_lower__idx
    ON all_flatten_view (LOWER(gender)) WITH (FILLFACTOR = 100);

CREATE INDEX all_flatten_view__age__idx
    ON all_flatten_view (age) WITH (FILLFACTOR = 100);


-- Collect planner statistics for the view and its expression indexes.
ANALYZE VERBOSE all_flatten_view;


## nucleotide_variant_annotated.sql
-- DROP MATERIALIZED VIEW nucleotide_variant_annotated;
-- Nucleotide variants with variant_length <= 20, each paired with the feature
-- type, gene name and product of every annotation on the same sequence whose
-- [start, stop] range contains the variant's start position. Variants without
-- a covering annotation are kept with NULL n_* columns (LEFT JOIN); DISTINCT
-- collapses annotations that yield identical output rows.
CREATE MATERIALIZED VIEW nucleotide_variant_annotated
WITH (FILLFACTOR = 100)
AS
SELECT DISTINCT
    nc.nucleotide_variant_id,
    -- ann.annotation_id, -- could be exposed too, but for now we can skip that.
    nc.sequence_id,
    nc.variant_type,
    nc.start_original,
    nc.sequence_original,
    nc.sequence_alternative,
    nc.variant_length,
    ann.feature_type AS n_feature_type,
    ann.gene_name    AS n_gene_name,
    ann.product      AS n_product
FROM nucleotide_variant AS nc
LEFT JOIN annotation AS ann  -- open question from the author: keep this as a LEFT join?
    ON  nc.start_original BETWEEN ann.start AND ann.stop
    AND nc.sequence_id = ann.sequence_id
WHERE nc.variant_length <= 20  -- open question from the author: remove this limit of 20?
-- AND nc.sequence_id < 10 AND ann.sequence_id < 10 -- FOR TESTING
;


-- JOIN KEY: non-unique index on sequence_id, also used as the clustering order.
CREATE INDEX nucleotide_variant_annotated__sequence_id__idx
    ON nucleotide_variant_annotated (sequence_id)
    WITH (FILLFACTOR = 100);

-- Rewrite the stored rows in sequence_id order.
CLUSTER VERBOSE nucleotide_variant_annotated
    USING nucleotide_variant_annotated__sequence_id__idx;


-- Secondary single-column indexes; text columns are indexed on LOWER(column),
-- so only predicates written against that same expression can use them.
CREATE INDEX nucleotide_variant_annotated__nucleotide_variant_id__idx
    ON nucleotide_variant_annotated (nucleotide_variant_id) WITH (FILLFACTOR = 100);


CREATE INDEX nucleotide_variant_annotated__variant_type_lower__idx
    ON nucleotide_variant_annotated (LOWER(variant_type)) WITH (FILLFACTOR = 100);


CREATE INDEX nucleotide_variant_annotated__start_original__idx
    ON nucleotide_variant_annotated (start_original) WITH (FILLFACTOR = 100);


CREATE INDEX nucleotide_variant_annotated__sequence_original_lower__idx
    ON nucleotide_variant_annotated (LOWER(sequence_original)) WITH (FILLFACTOR = 100);


CREATE INDEX nucleotide_variant_annotated__sequence_alternative_lower__idx
    ON nucleotide_variant_annotated (LOWER(sequence_alternative)) WITH (FILLFACTOR = 100);


CREATE INDEX nucleotide_variant_annotated__n_feature_type_lower__idx
    ON nucleotide_variant_annotated (LOWER(n_feature_type)) WITH (FILLFACTOR = 100);


CREATE INDEX nucleotide_variant_annotated__n_gene_name_lower__idx
    ON nucleotide_variant_annotated (LOWER(n_gene_name)) WITH (FILLFACTOR = 100);


-- NOTE(review): this index is on LOWER(n_product) but its name lacks the
-- "_lower" suffix used by the other expression indexes; name kept unchanged.
CREATE INDEX nucleotide_variant_annotated__n_product__idx
    ON nucleotide_variant_annotated (LOWER(n_product)) WITH (FILLFACTOR = 100);


-- Collect planner statistics for the view and its expression indexes.
ANALYZE VERBOSE nucleotide_variant_annotated;


## nucleotide_variant_impact_annotated.sql
-- Annotated variants together with their impacts.
--   1. The CTE produces one row per nucleotide variant, carrying an array of
--      [effect, putative_impact, impact_gene_name] triples from variant_impact.
--   2. The outer query attaches every annotation on the same sequence whose
--      [start, stop] range contains the variant's start position and keeps
--      only variants with variant_length <= 20.
WITH nucleotide_variant_impact AS (
	SELECT
		nc.nucleotide_variant_id,
		nc.sequence_id,
		nc.variant_type,
		nc.start_original,
		nc.sequence_original,
		nc.sequence_alternative,
		nc.variant_length,
		-- TODO (from the author): the triples should be distinct.
		-- NOTE(review): PostgreSQL only accepts DISTINCT here if the ORDER BY
		-- expressions appear in the argument list, i.e. the ordering would have
		-- to be on the ARRAY[...] expression itself.
		-- NOTE(review): because of the LEFT JOIN, a variant with no impact row
		-- currently gets one all-NULL triple; the commented-out FILTER would
		-- yield a NULL array instead.
		array_agg(
			-- DISTINCT
			ARRAY[effect, putative_impact, impact_gene_name]
			ORDER BY effect, putative_impact, impact_gene_name
		) -- FILTER (WHERE imp.nucleotide_variant_id IS NOT NULL)
			AS variant_impact_array
	FROM nucleotide_variant AS nc
	LEFT JOIN variant_impact AS imp ON nc.nucleotide_variant_id = imp.nucleotide_variant_id
	--WHERE nc.sequence_id < 10 -- FOR TESTING
	-- Grouping by the id alone presumably relies on nucleotide_variant_id being
	-- the primary key of nucleotide_variant — confirm against the schema.
	GROUP BY nc.nucleotide_variant_id
)
SELECT nc.*,
	-- ann.annotation_id, -- could be exposed too, but for now we can skip that.
	ann.feature_type AS n_feature_type,
	ann.gene_name AS n_gene_name,
	ann.product AS n_product
FROM nucleotide_variant_impact AS nc
LEFT JOIN annotation AS ann  -- open question from the author: keep this as a LEFT join?
	ON nc.start_original >= ann.start
	AND nc.start_original <= ann.stop
	AND nc.sequence_id = ann.sequence_id
WHERE nc.variant_length <= 20  -- open question from the author: remove this limit of 20?
;  -- terminator added: without it this statement ran into the SELECT that follows


--second version
-- Single-statement form of the impact/annotation query: both LEFT JOINs are
-- applied directly to nucleotide_variant and the impact triples are aggregated
-- per (variant, annotation) pair. Only variants with variant_length <= 20 are kept.
SELECT
	nc.nucleotide_variant_id,
	nc.sequence_id,
	nc.variant_type,
	nc.start_original,
	nc.sequence_original,
	nc.sequence_alternative,
	nc.variant_length,
	-- TODO (from the author): the triples should be distinct.
	-- NOTE(review): PostgreSQL only accepts DISTINCT here if the ORDER BY
	-- expressions appear in the argument list, i.e. the ordering would have
	-- to be on the ARRAY[...] expression itself.
	-- NOTE(review): a variant with no impact row gets one all-NULL triple;
	-- the commented-out FILTER would yield a NULL array instead.
	array_agg(
		-- DISTINCT
		ARRAY[effect, putative_impact, impact_gene_name]
		ORDER BY effect, putative_impact, impact_gene_name
	) -- FILTER (WHERE imp.nucleotide_variant_id IS NOT NULL)
		AS variant_impact_array,
	--ann.annotation_id, -- could be exposed too, but for now we can skip that.
	ann.feature_type AS n_feature_type,
	ann.gene_name AS n_gene_name,
	ann.product AS n_product
FROM nucleotide_variant AS nc
LEFT JOIN variant_impact AS imp ON nc.nucleotide_variant_id = imp.nucleotide_variant_id
LEFT JOIN annotation AS ann  -- open question from the author: keep this as a LEFT join?
	ON nc.start_original >= ann.start
	AND nc.start_original <= ann.stop
	AND nc.sequence_id = ann.sequence_id
WHERE nc.variant_length <= 20  -- open question from the author: remove this limit of 20?
-- Selecting the other nc.* / ann.* columns presumably relies on these two ids
-- being the primary keys of their tables — confirm against the schema.
GROUP BY nc.nucleotide_variant_id, ann.annotation_id
;  -- terminator added: without it this query ran into the next statement
	-- DROP MATERIALIZED VIEW all_flatten_view;
	-- NOTE(review): this repeats the all_flatten_view definition found earlier
	-- in this file; running both without the DROP above in between fails
	-- because the view already exists.
	-- Wide, denormalised projection that joins virus, sequence, host_sample,
	-- host_specie, experiment_type and sequencing_project into a single relation.
	CREATE MATERIALIZED VIEW all_flatten_view
	WITH (FILLFACTOR = 100)
	AS
	SELECT
		sequence_id,
		accession_id,
		strain_name,
		is_reference,
		is_complete,
		strand,
		length,
		gc_percentage,
		n_percentage,
		sequencing_technology,
		assembly_method,
		coverage,
		sequencing_lab,
		submission_date,
		bioproject_id,
		database_source,
		taxon_id,
		taxon_name,
		species,
		host_taxon_name,
		collection_date,
		isolation_source,
		geo_group,
		country,
		region,
		gender,
		age,
		nucleotide_sequence,
		lineage,
		clade,
		host_taxon_id,
		originating_lab,
		genus,
		sub_family,
		family,
		equivalent_list,
		molecule_type,
		is_single_stranded,
		is_positive_stranded
	FROM virus
	NATURAL JOIN sequence
	NATURAL JOIN host_sample
	NATURAL JOIN host_specie
	NATURAL JOIN experiment_type
	NATURAL JOIN sequencing_project
	;

	-- PRIMARY KEY: unique index on sequence_id.
	CREATE UNIQUE INDEX all_flatten_view__sequence_id__uidx
		ON all_flatten_view (sequence_id)
		WITH (FILLFACTOR = 100);

	-- Rewrite the stored rows in sequence_id order before building the
	-- secondary indexes.
	CLUSTER VERBOSE all_flatten_view
		USING all_flatten_view__sequence_id__uidx;



	-- Secondary single-column indexes; text columns are indexed on
	-- LOWER(column), so only predicates on that same expression can use them.
	CREATE INDEX all_flatten_view__accession_id_lower__idx
		ON all_flatten_view (LOWER(accession_id)) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__strain_name_lower__idx
		ON all_flatten_view (LOWER(strain_name)) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__is_reference__idx
		ON all_flatten_view (is_reference) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__is_complete__idx
		ON all_flatten_view (is_complete) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__strand_lower__idx
		ON all_flatten_view (LOWER(strand)) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__length__idx
		ON all_flatten_view (length) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__gc_percentage__idx
		ON all_flatten_view (gc_percentage) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__n_percentage__idx
		ON all_flatten_view (n_percentage) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__sequencing_technology_lower__idx
		ON all_flatten_view (LOWER(sequencing_technology)) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__assembly_method_lower__idx
		ON all_flatten_view (LOWER(assembly_method)) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__coverage__idx
		ON all_flatten_view (coverage) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__sequencing_lab_lower__idx
		ON all_flatten_view (LOWER(sequencing_lab)) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__submission_date__idx
		ON all_flatten_view (submission_date) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__bioproject_id_lower__idx
		ON all_flatten_view (LOWER(bioproject_id)) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__database_source_lower__idx
		ON all_flatten_view (LOWER(database_source)) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__taxon_id__idx
		ON all_flatten_view (taxon_id) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__taxon_name_lower__idx
		ON all_flatten_view (LOWER(taxon_name)) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__species_lower__idx
		ON all_flatten_view (LOWER(species)) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__host_taxon_name_lower__idx
		ON all_flatten_view (LOWER(host_taxon_name)) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__collection_date__idx
		ON all_flatten_view (collection_date) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__isolation_source_lower__idx
		ON all_flatten_view (LOWER(isolation_source)) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__geo_group_lower__idx
		ON all_flatten_view (LOWER(geo_group)) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__country_lower__idx
		ON all_flatten_view (LOWER(country)) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__region_lower__idx
		ON all_flatten_view (LOWER(region)) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__gender_lower__idx
		ON all_flatten_view (LOWER(gender)) WITH (FILLFACTOR = 100);

	CREATE INDEX all_flatten_view__age__idx
		ON all_flatten_view (age) WITH (FILLFACTOR = 100);


	-- Collect planner statistics for the view and its expression indexes.
	ANALYZE VERBOSE all_flatten_view;
	-- DROP MATERIALIZED VIEW nucleotide_variant_annotated;
	-- NOTE(review): this repeats the nucleotide_variant_annotated definition
	-- found earlier in this file; running both without the DROP above in
	-- between fails because the view already exists.
	-- Nucleotide variants with variant_length <= 20, each paired with the
	-- feature type, gene name and product of every annotation on the same
	-- sequence whose [start, stop] range contains the variant's start position.
	CREATE MATERIALIZED VIEW nucleotide_variant_annotated
	WITH (FILLFACTOR = 100)
	AS
	SELECT DISTINCT
		nc.nucleotide_variant_id,
		-- ann.annotation_id, -- could be exposed too, but for now we can skip that.
		nc.sequence_id,
		nc.variant_type,
		nc.start_original,
		nc.sequence_original,
		nc.sequence_alternative,
		nc.variant_length,
		ann.feature_type AS n_feature_type,
		ann.gene_name    AS n_gene_name,
		ann.product      AS n_product
	FROM nucleotide_variant AS nc
	LEFT JOIN annotation AS ann  -- open question from the author: keep this as a LEFT join?
		ON  nc.start_original BETWEEN ann.start AND ann.stop
		AND nc.sequence_id = ann.sequence_id
	WHERE nc.variant_length <= 20  -- open question from the author: remove this limit of 20?
	-- AND nc.sequence_id < 10 AND ann.sequence_id < 10 -- FOR TESTING
	;



	-- JOIN KEY: non-unique index on sequence_id, also used as the clustering order.
	CREATE INDEX nucleotide_variant_annotated__sequence_id__idx
		ON nucleotide_variant_annotated (sequence_id)
		WITH (FILLFACTOR = 100);

	-- Rewrite the stored rows in sequence_id order.
	CLUSTER VERBOSE nucleotide_variant_annotated
		USING nucleotide_variant_annotated__sequence_id__idx;


	-- Secondary single-column indexes; text columns are indexed on
	-- LOWER(column), so only predicates on that same expression can use them.
	CREATE INDEX nucleotide_variant_annotated__nucleotide_variant_id__idx
		ON nucleotide_variant_annotated (nucleotide_variant_id) WITH (FILLFACTOR = 100);


	CREATE INDEX nucleotide_variant_annotated__variant_type_lower__idx
		ON nucleotide_variant_annotated (LOWER(variant_type)) WITH (FILLFACTOR = 100);


	CREATE INDEX nucleotide_variant_annotated__start_original__idx
		ON nucleotide_variant_annotated (start_original) WITH (FILLFACTOR = 100);



	CREATE INDEX nucleotide_variant_annotated__sequence_original_lower__idx
		ON nucleotide_variant_annotated (LOWER(sequence_original)) WITH (FILLFACTOR = 100);


	CREATE INDEX nucleotide_variant_annotated__sequence_alternative_lower__idx
		ON nucleotide_variant_annotated (LOWER(sequence_alternative)) WITH (FILLFACTOR = 100);


	CREATE INDEX nucleotide_variant_annotated__n_feature_type_lower__idx
		ON nucleotide_variant_annotated (LOWER(n_feature_type)) WITH (FILLFACTOR = 100);


	CREATE INDEX nucleotide_variant_annotated__n_gene_name_lower__idx
		ON nucleotide_variant_annotated (LOWER(n_gene_name)) WITH (FILLFACTOR = 100);


	-- NOTE(review): this index is on LOWER(n_product) but its name lacks the
	-- "_lower" suffix used by the other expression indexes; name kept unchanged.
	CREATE INDEX nucleotide_variant_annotated__n_product__idx
		ON nucleotide_variant_annotated (LOWER(n_product)) WITH (FILLFACTOR = 100);



	-- Collect planner statistics for the view and its expression indexes.
	ANALYZE VERBOSE nucleotide_variant_annotated;
	-- Annotated variants together with their impacts: the CTE builds one row
	-- per nucleotide variant with an array of
	-- [effect, putative_impact, impact_gene_name] triples; the outer query
	-- attaches every annotation on the same sequence whose [start, stop] range
	-- contains the variant's start position and keeps variant_length <= 20.
	WITH nucleotide_variant_impact AS (
		SELECT
			nc.nucleotide_variant_id,
			nc.sequence_id,
			nc.variant_type,
			nc.start_original,
			nc.sequence_original,
			nc.sequence_alternative,
			nc.variant_length,
			-- TODO (from the author): the triples should be distinct.
			-- NOTE(review): a variant with no impact row gets one all-NULL
			-- triple; the commented-out FILTER would yield a NULL array instead.
			array_agg(
				-- DISTINCT
				ARRAY[effect, putative_impact, impact_gene_name]
				ORDER BY effect, putative_impact, impact_gene_name
			) -- FILTER (WHERE imp.nucleotide_variant_id IS NOT NULL)
				AS variant_impact_array
		FROM nucleotide_variant AS nc
		LEFT JOIN variant_impact AS imp ON nc.nucleotide_variant_id = imp.nucleotide_variant_id
		--WHERE nc.sequence_id < 10 -- FOR TESTING
		GROUP BY nc.nucleotide_variant_id
	)
	SELECT nc.*,
		-- ann.annotation_id, -- could be exposed too, but for now we can skip that.
		ann.feature_type AS n_feature_type,
		ann.gene_name AS n_gene_name,
		ann.product AS n_product
	FROM nucleotide_variant_impact AS nc
	LEFT JOIN annotation AS ann  -- open question from the author: keep this as a LEFT join?
		ON nc.start_original >= ann.start
		AND nc.start_original <= ann.stop
		AND nc.sequence_id = ann.sequence_id
	WHERE nc.variant_length <= 20  -- open question from the author: remove this limit of 20?
	;  -- terminator added: without it this statement ran into the SELECT that follows


	--second version
	-- Single-statement form of the impact/annotation query: both LEFT JOINs
	-- are applied directly to nucleotide_variant and the impact triples are
	-- aggregated per (variant, annotation) pair; keeps variant_length <= 20.
	SELECT
		nc.nucleotide_variant_id,
		nc.sequence_id,
		nc.variant_type,
		nc.start_original,
		nc.sequence_original,
		nc.sequence_alternative,
		nc.variant_length,
		-- TODO (from the author): the triples should be distinct.
		-- NOTE(review): a variant with no impact row gets one all-NULL
		-- triple; the commented-out FILTER would yield a NULL array instead.
		array_agg(
			-- DISTINCT
			ARRAY[effect, putative_impact, impact_gene_name]
			ORDER BY effect, putative_impact, impact_gene_name
		) -- FILTER (WHERE imp.nucleotide_variant_id IS NOT NULL)
			AS variant_impact_array,
		--ann.annotation_id, -- could be exposed too, but for now we can skip that.
		ann.feature_type AS n_feature_type,
		ann.gene_name AS n_gene_name,
		ann.product AS n_product
	FROM nucleotide_variant AS nc
	LEFT JOIN variant_impact AS imp ON nc.nucleotide_variant_id = imp.nucleotide_variant_id
	LEFT JOIN annotation AS ann  -- open question from the author: keep this as a LEFT join?
		ON nc.start_original >= ann.start
		AND nc.start_original <= ann.stop
		AND nc.sequence_id = ann.sequence_id
	WHERE nc.variant_length <= 20  -- open question from the author: remove this limit of 20?
	GROUP BY nc.nucleotide_variant_id, ann.annotation_id
	;  -- terminator added so the statement is complete even if more SQL follows