Skip to content

Instantly share code, notes, and snippets.

@louismullie
Last active November 29, 2023 23:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save louismullie/9b0b96ec61d424ed10035efb7aec0270 to your computer and use it in GitHub Desktop.
Save louismullie/9b0b96ec61d424ed10035efb7aec0270 to your computer and use it in GitHub Desktop.
-- Function to split the input query and generate all contiguous combinations
CREATE FUNCTION split_to_table(@input_query VARCHAR(MAX))
RETURNS @result TABLE (combination VARCHAR(MAX))
AS
BEGIN
DECLARE @words TABLE (id INT IDENTITY(1,1), word VARCHAR(255))
DECLARE @total_words INT, @i INT, @j INT, @current_combination VARCHAR(MAX)
-- Step 1: Splitting the input string into words
INSERT INTO @words (word)
SELECT value
FROM STRING_SPLIT(@input_query, ' ') -- This function varies based on SQL dialect
SELECT @total_words = COUNT(*) FROM @words
-- Step 2: Generating all contiguous combinations
SET @i = 1
WHILE @i <= @total_words
BEGIN
SET @j = @i
SET @current_combination = ''
WHILE @j <= @total_words
BEGIN
SELECT @current_combination = @current_combination + ' ' + word
FROM @words
WHERE id = @j
-- Insert the current combination into the result table
INSERT INTO @result (combination)
VALUES (LTRIM(@current_combination))
SET @j = @j + 1
END
SET @i = @i + 1
END
RETURN
END
-- Create tables
CREATE TABLE synonyms (
term VARCHAR(255),
longest_form VARCHAR(255)
);
CREATE TABLE abbreviations (
abbreviation VARCHAR(255),
full_form VARCHAR(255)
);
CREATE VIRTUAL TABLE documents USING fts5(title, body);
-- Example data
INSERT INTO synonyms (term, longest_form) VALUES
('anca', 'antineutrophil cytoplasmic antibodies'),
('anca associated vasculitis', 'antineutrophil cytoplasmic antibodies associated vasculitis');
INSERT INTO abbreviations (abbreviation, full_form) VALUES
('anca', 'antineutrophil cytoplasmic antibodies'),
('aav', 'anca associated vasculitis');
INSERT INTO documents (title, body) VALUES ('Example Title', 'Example body text');
-- Declare input query
DECLARE @input_query VARCHAR(255);
SET @input_query = 'your input query here';
-- Step 1: Breaking down the input query
WITH split_terms AS (
SELECT term
FROM split_to_table(@input_query, ' ')
)
-- Step 2: Matching terms against synonyms
, expanded_terms AS (
SELECT s.longest_form
FROM split_terms st
LEFT JOIN synonyms s ON st.term = s.term
)
-- Step 3: Apply expansion
, expanded_query AS (
SELECT DISTINCT longest_form
FROM expanded_terms
ORDER BY LENGTH(longest_form) DESC
)
-- Step 4: Apply contraction
, contracted_query AS (
SELECT DISTINCT a.abbreviation
FROM abbreviations a
WHERE a.full_form IN (SELECT longest_form FROM expanded_terms)
ORDER BY LENGTH(a.full_form) DESC
)
-- Step 5: Nested contraction
, nested_contracted_query AS (
SELECT DISTINCT a.abbreviation
FROM abbreviations a
WHERE a.full_form IN (
SELECT abbreviation FROM abbreviations WHERE full_form IN (
SELECT longest_form FROM expanded_terms
)
)
ORDER BY LENGTH(a.full_form) DESC
)
-- Step 6: Final Combined Query
WITH (SELECT * FROM expanded_query
UNION
SELECT * FROM contracted_query
UNION
SELECT * FROM nested_contracted_query) as search_queries;
-- Step 7: Query Documents using FTS5
SELECT * FROM documents
WHERE documents MATCH (group_concat(search_queries, ' OR '));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment