Created
June 17, 2024 14:26
-
-
Save rsomani95/dfc1006fb63fbf861bb389cae894b97f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Ensure the extension is made | |
CREATE EXTENSION IF NOT EXISTS vectorscale CASCADE; | |
-- Create the collection table | |
CREATE TABLE collection ( | |
id UUID PRIMARY KEY DEFAULT gen_random_uuid(), | |
name TEXT NOT NULL | |
); | |
-- Create the asset table with a foreign key to the collection table | |
CREATE TABLE asset ( | |
id UUID PRIMARY KEY DEFAULT gen_random_uuid(), | |
collection_id UUID REFERENCES collection(id) ON DELETE CASCADE, | |
title TEXT NOT NULL | |
); | |
-- Vector Types (https://docs.pgvecto.rs/reference/vector-types.html) | |
-- - vecf16: FP16 vector | |
-- - bvector: Binary vector | |
-- Depending on the choice of vector type, we need to modify the following parts: | |
-- - `segment.embedding` dtype | |
-- - index creation operators (See https://docs.pgvecto.rs/reference/schema.html#list-of-operator-families) | |
-- - table insertion dtype casting | |
-- Create the segment table with a foreign key to the asset table | |
CREATE TABLE segment ( | |
id UUID PRIMARY KEY DEFAULT gen_random_uuid(), | |
asset_id UUID REFERENCES asset(id) ON DELETE CASCADE, | |
embedding vector(640) NOT NULL, -- 640 dimensions | |
category TEXT NOT NULL | |
); | |
-- INDEXES | |
-- Create index on asset's foreign key collection_id | |
CREATE INDEX idx_asset_collection_id ON asset(collection_id); | |
-- Create index on segment's foreign key asset_id | |
CREATE INDEX idx_segment_asset_id ON segment(asset_id); | |
-- Create index on segment's category column | |
CREATE INDEX idx_segment_category ON segment(category); | |
-- VECTOR INDEX | |
-- Ops options: https://docs.pgvecto.rs/reference/schema.html#list-of-operator-families | |
-- - vector_cos_ops | |
-- - vecf16_cos_ops | |
-- - veci8_cos_ops | |
-- - bvector_cos_ops | |
-- CREATE INDEX ON segment USING vectors (embedding vecf16_cos_ops); | |
-- CREATE INDEX ON segment USING vectors (embedding vector_cos_ops); | |
DO $$ | |
DECLARE | |
-- num_collections INT := 10; | |
-- num_assets_per_collection INT := 100; | |
-- num_segments_per_asset INT := 1500; | |
num_collections INT := current_setting('app.num_collections')::INT; | |
num_assets_per_collection INT := current_setting('app.num_assets_per_collection')::INT; | |
num_segments_per_asset INT := current_setting('app.num_segments_per_asset')::INT; | |
category_counter INTEGER; | |
category TEXT; | |
collection_id UUID; | |
asset_id UUID; | |
embedding vector; -- May need to modify dtype | |
-- Dummy data | |
BEGIN | |
-- Insert dummy data for the collection table | |
FOR i IN 1..num_collections LOOP | |
INSERT INTO collection (name) | |
VALUES ('Library ' || i) | |
RETURNING id INTO collection_id; | |
-- Insert dummy data for the asset table | |
FOR j IN 1..num_assets_per_collection LOOP | |
INSERT INTO asset (collection_id, title) | |
VALUES (collection_id, 'Video Asset ' || j) | |
RETURNING id INTO asset_id; | |
category_counter := 0; | |
-- Insert dummy data for the segment table | |
FOR k IN 1..num_segments_per_asset LOOP | |
-- Generate a random vector embedding | |
-- embedding := ARRAY(SELECT round(random()::real) FROM generate_series(1, 640))::real[]::vector::bvector; | |
-- embedding := ARRAY(SELECT random()::real FROM generate_series(1, 640))::real[]::vector::vecf16; | |
embedding := ARRAY(SELECT random()::real FROM generate_series(1, 640))::real[]::vector; | |
category := CASE WHEN category_counter % 2 = 0 THEN 'visual' ELSE 'transcript' END; | |
category_counter := category_counter + 1; | |
-- Insert the segment data | |
INSERT INTO segment (asset_id, embedding, category) | |
VALUES (asset_id, embedding, category); | |
END LOOP; | |
END LOOP; | |
END LOOP; | |
END $$; | |
-- Build DiskANN index | |
CREATE INDEX segment_embedding_idx ON segment USING diskann (embedding); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment