Skip to content

Instantly share code, notes, and snippets.

@acanakoglu
Last active November 22, 2020 19:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save acanakoglu/ed9fa4585a8d3828c1fbcb405dd2cfb3 to your computer and use it in GitHub Desktop.
Save acanakoglu/ed9fa4585a8d3828c1fbcb405dd2cfb3 to your computer and use it in GitHub Desktop.
ViruSurf views
-- DROP MATERIALIZED VIEW all_flatten_view;
CREATE MATERIALIZED VIEW all_flatten_view
WITH (FILLFACTOR = 100)
AS
SELECT sequence_id, accession_id, strain_name, is_reference, is_complete, strand, length, gc_percentage, n_percentage, sequencing_technology, assembly_method, coverage, sequencing_lab, submission_date, bioproject_id, database_source, taxon_id, taxon_name, species, host_taxon_name, collection_date, isolation_source, geo_group, country, region, gender, age, nucleotide_sequence, lineage, clade, host_taxon_id, originating_lab, genus, sub_family, family, equivalent_list, molecule_type, is_single_stranded, is_positive_stranded
FROM virus
NATURAL JOIN sequence
NATURAL JOIN host_sample
NATURAL JOIN host_specie
NATURAL JOIN experiment_type
NATURAL JOIN sequencing_project
;
-- PRIMARY KEY --
CREATE UNIQUE INDEX all_flatten_view__sequence_id__uidx ON all_flatten_view(sequence_id) WITH (FILLFACTOR=100);
CLUSTER VERBOSE all_flatten_view USING all_flatten_view__sequence_id__uidx;
CREATE INDEX all_flatten_view__accession_id_lower__idx ON all_flatten_view(LOWER(accession_id)) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__strain_name_lower__idx ON all_flatten_view(LOWER(strain_name)) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__is_reference__idx ON all_flatten_view(is_reference) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__is_complete__idx ON all_flatten_view(is_complete) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__strand_lower__idx ON all_flatten_view(LOWER(strand)) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__length__idx ON all_flatten_view(length) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__gc_percentage__idx ON all_flatten_view(gc_percentage) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__n_percentage__idx ON all_flatten_view(n_percentage) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__sequencing_technology_lower__idx ON all_flatten_view(LOWER(sequencing_technology)) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__assembly_method_lower__idx ON all_flatten_view(LOWER(assembly_method)) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__coverage__idx ON all_flatten_view(coverage) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__sequencing_lab_lower__idx ON all_flatten_view(LOWER(sequencing_lab)) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__submission_date__idx ON all_flatten_view(submission_date) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__bioproject_id_lower__idx ON all_flatten_view(LOWER(bioproject_id)) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__database_source_lower__idx ON all_flatten_view(LOWER(database_source)) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__taxon_id__idx ON all_flatten_view(taxon_id) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__taxon_name_lower__idx ON all_flatten_view(LOWER(taxon_name)) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__species_lower__idx ON all_flatten_view(LOWER(species)) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__host_taxon_name_lower__idx ON all_flatten_view(LOWER(host_taxon_name)) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__collection_date__idx ON all_flatten_view(collection_date) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__isolation_source_lower__idx ON all_flatten_view(LOWER(isolation_source)) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__geo_group_lower__idx ON all_flatten_view(LOWER(geo_group)) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__country_lower__idx ON all_flatten_view(LOWER(country)) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__region_lower__idx ON all_flatten_view(LOWER(region)) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__gender_lower__idx ON all_flatten_view(LOWER(gender)) WITH (FILLFACTOR=100);
CREATE INDEX all_flatten_view__age__idx ON all_flatten_view(age) WITH (FILLFACTOR=100);
ANALYZE VERBOSE all_flatten_view;
-- DROP MATERIALIZED VIEW nucleotide_variant_annotated;
CREATE MATERIALIZED VIEW nucleotide_variant_annotated
WITH (FILLFACTOR = 100)
AS
SELECT DISTINCT
nc.nucleotide_variant_id,
-- ann.annotation_id, -- --> possibly have annotation_id, too. but for now, we an skip that.
nc.sequence_id,
nc.variant_type,
nc.start_original,
nc.sequence_original,
nc.sequence_alternative,
nc.variant_length,
ann.feature_type AS n_feature_type,
ann.gene_name AS n_gene_name,
ann.product AS n_product
FROM nucleotide_variant as nc
LEFT JOIN annotation as ann -- --> LEFT?
ON nc.start_original >= ann.start
AND nc.start_original <= ann.stop
AND nc.sequence_id = ann.sequence_id
WHERE variant_length <= 20 -----------------> ?remove LIMIT 20
-- and nc.sequence_id < 10 and ann.sequence_id < 10 -- FOR TESTING
;
-- JOIN KEY
CREATE INDEX nucleotide_variant_annotated__sequence_id__idx ON nucleotide_variant_annotated(sequence_id) WITH (FILLFACTOR = 100);
CLUSTER VERBOSE nucleotide_variant_annotated USING nucleotide_variant_annotated__sequence_id__idx;
CREATE INDEX nucleotide_variant_annotated__nucleotide_variant_id__idx ON nucleotide_variant_annotated(nucleotide_variant_id) WITH (FILLFACTOR = 100);
CREATE INDEX nucleotide_variant_annotated__variant_type_lower__idx ON nucleotide_variant_annotated(LOWER(variant_type)) WITH (FILLFACTOR = 100);
CREATE INDEX nucleotide_variant_annotated__start_original__idx ON nucleotide_variant_annotated(start_original) WITH (FILLFACTOR = 100);
CREATE INDEX nucleotide_variant_annotated__sequence_original_lower__idx ON nucleotide_variant_annotated(LOWER(sequence_original)) WITH (FILLFACTOR = 100);
CREATE INDEX nucleotide_variant_annotated__sequence_alternative_lower__idx ON nucleotide_variant_annotated(LOWER(sequence_alternative)) WITH (FILLFACTOR = 100);
CREATE INDEX nucleotide_variant_annotated__n_feature_type_lower__idx ON nucleotide_variant_annotated(LOWER(n_feature_type)) WITH (FILLFACTOR = 100);
CREATE INDEX nucleotide_variant_annotated__n_gene_name_lower__idx ON nucleotide_variant_annotated(LOWER(n_gene_name)) WITH (FILLFACTOR = 100);
CREATE INDEX nucleotide_variant_annotated__n_product__idx ON nucleotide_variant_annotated(LOWER(n_product)) WITH (FILLFACTOR = 100);
ANALYZE VERBOSE nucleotide_variant_annotated;
WITH nucleotide_variant_impact AS (
SELECT
nc.nucleotide_variant_id,
nc.sequence_id,
nc.variant_type,
nc.start_original,
nc.sequence_original,
nc.sequence_alternative,
nc.variant_length,
-- should be distinct!!!!
array_agg(--DISTINCT
ARRAY[effect, putative_impact, impact_gene_name] ORDER BY effect, putative_impact, impact_gene_name) --FILTER (WHERE imp.nucleotide_variant_id is not null)
as variant_impact_array
FROM nucleotide_variant as nc
LEFT JOIN variant_impact as imp ON nc.nucleotide_variant_id = imp.nucleotide_variant_id
--WHERE nc.sequence_id < 10 -- FOR TESTING
GROUP BY nc.nucleotide_variant_id
)
SELECT nc.*,
-- ann.annotation_id, -- --> possibly have annotation_id, too. but for now, we an skip that.
ann.feature_type AS n_feature_type,
ann.gene_name AS n_gene_name,
ann.product AS n_product
FROM nucleotide_variant_impact as nc
LEFT JOIN annotation as ann -- --> LEFT?
ON nc.start_original >= ann.start
AND nc.start_original <= ann.stop
AND nc.sequence_id = ann.sequence_id
WHERE variant_length <= 20 -----------------> ?remove LIMIT 20
--second version
SELECT
nc.nucleotide_variant_id,
nc.sequence_id,
nc.variant_type,
nc.start_original,
nc.sequence_original,
nc.sequence_alternative,
nc.variant_length,
-- should be distinct!!!!
array_agg(--DISTINCT
ARRAY[effect, putative_impact, impact_gene_name] ORDER BY effect, putative_impact, impact_gene_name) --FILTER (WHERE imp.nucleotide_variant_id is not null)
as variant_impact_array,
--ann.annotation_id, -- --> possibly have annotation_id, too. but for now, we an skip that.
ann.feature_type AS n_feature_type,
ann.gene_name AS n_gene_name,
ann.product AS n_product
FROM nucleotide_variant as nc
LEFT JOIN variant_impact as imp ON nc.nucleotide_variant_id = imp.nucleotide_variant_id
LEFT JOIN annotation as ann -- --> LEFT?
ON nc.start_original >= ann.start
AND nc.start_original <= ann.stop
AND nc.sequence_id = ann.sequence_id
WHERE variant_length <= 20 -----------------> ?remove LIMIT 20
GROUP BY nc.nucleotide_variant_id, ann.annotation_id
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment