Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save rsomani95/dfc1006fb63fbf861bb389cae894b97f to your computer and use it in GitHub Desktop.
Save rsomani95/dfc1006fb63fbf861bb389cae894b97f to your computer and use it in GitHub Desktop.
-- Ensure the extension is made
CREATE EXTENSION IF NOT EXISTS vectorscale CASCADE;
-- Create the collection table
CREATE TABLE collection (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
name TEXT NOT NULL
);
-- Create the asset table with a foreign key to the collection table
CREATE TABLE asset (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
collection_id UUID REFERENCES collection(id) ON DELETE CASCADE,
title TEXT NOT NULL
);
-- Vector Types (https://docs.pgvecto.rs/reference/vector-types.html)
-- - vecf16: FP16 vector
-- - bvector: Binary vector
-- Depending on the choice of vector type, we need to modify the following parts:
-- - `segment.embedding` dtype
-- - index creation operators (See https://docs.pgvecto.rs/reference/schema.html#list-of-operator-families)
-- - table insertion dtype casting
-- Create the segment table with a foreign key to the asset table
CREATE TABLE segment (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
asset_id UUID REFERENCES asset(id) ON DELETE CASCADE,
embedding vector(640) NOT NULL, -- 640 dimensions
category TEXT NOT NULL
);
-- INDEXES
-- Create index on asset's foreign key collection_id
CREATE INDEX idx_asset_collection_id ON asset(collection_id);
-- Create index on segment's foreign key asset_id
CREATE INDEX idx_segment_asset_id ON segment(asset_id);
-- Create index on segment's category column
CREATE INDEX idx_segment_category ON segment(category);
-- VECTOR INDEX
-- Ops options: https://docs.pgvecto.rs/reference/schema.html#list-of-operator-families
-- - vector_cos_ops
-- - vecf16_cos_ops
-- - veci8_cos_ops
-- - bvector_cos_ops
-- CREATE INDEX ON segment USING vectors (embedding vecf16_cos_ops);
-- CREATE INDEX ON segment USING vectors (embedding vector_cos_ops);
DO $$
DECLARE
-- num_collections INT := 10;
-- num_assets_per_collection INT := 100;
-- num_segments_per_asset INT := 1500;
num_collections INT := current_setting('app.num_collections')::INT;
num_assets_per_collection INT := current_setting('app.num_assets_per_collection')::INT;
num_segments_per_asset INT := current_setting('app.num_segments_per_asset')::INT;
category_counter INTEGER;
category TEXT;
collection_id UUID;
asset_id UUID;
embedding vector; -- May need to modify dtype
-- Dummy data
BEGIN
-- Insert dummy data for the collection table
FOR i IN 1..num_collections LOOP
INSERT INTO collection (name)
VALUES ('Library ' || i)
RETURNING id INTO collection_id;
-- Insert dummy data for the asset table
FOR j IN 1..num_assets_per_collection LOOP
INSERT INTO asset (collection_id, title)
VALUES (collection_id, 'Video Asset ' || j)
RETURNING id INTO asset_id;
category_counter := 0;
-- Insert dummy data for the segment table
FOR k IN 1..num_segments_per_asset LOOP
-- Generate a random vector embedding
-- embedding := ARRAY(SELECT round(random()::real) FROM generate_series(1, 640))::real[]::vector::bvector;
-- embedding := ARRAY(SELECT random()::real FROM generate_series(1, 640))::real[]::vector::vecf16;
embedding := ARRAY(SELECT random()::real FROM generate_series(1, 640))::real[]::vector;
category := CASE WHEN category_counter % 2 = 0 THEN 'visual' ELSE 'transcript' END;
category_counter := category_counter + 1;
-- Insert the segment data
INSERT INTO segment (asset_id, embedding, category)
VALUES (asset_id, embedding, category);
END LOOP;
END LOOP;
END LOOP;
END $$;
-- Build DiskANN index
CREATE INDEX segment_embedding_idx ON segment USING diskann (embedding);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment