Skip to content

Instantly share code, notes, and snippets.

@yeliu84
Created January 29, 2015 07:30
Show Gist options
  • Save yeliu84/8f6fe86add8f7f732e46 to your computer and use it in GitHub Desktop.
Save yeliu84/8f6fe86add8f7f732e46 to your computer and use it in GitHub Desktop.
Answer for Seven Databases in Seven Weeks: Day 3 Do#2
/**
* Seven Databases in Seven Weeks: Day 3 Do#2
*
* Expand the movies database to track user comments and extract keywords
* (minus English stopwords). Cross-reference these keywords with actors’ last
* names, and try to find the most talked about actors.
*/
-- create user_comments table
DROP TABLE IF EXISTS user_comments;
CREATE TABLE user_comments (
comment_id SERIAL PRIMARY KEY,
comment TEXT
);
-- create keywords table
DROP TABLE IF EXISTS keywords;
CREATE TABLE keywords (
keyword_id SERIAL PRIMARY KEY,
keyword TEXT,
count INTEGER DEFAULT 1
);
CREATE INDEX keywords_keyword on keywords (keyword);
-- create a simple dictionary for filtering out English stop words
DROP TEXT SEARCH DICTIONARY IF EXISTS public.simple_dict;
CREATE TEXT SEARCH DICTIONARY public.simple_dict (
TEMPLATE = pg_catalog.simple,
STOPWORDS = english
);
-- create function to convert new comment to keywords
DROP FUNCTION IF EXISTS insert_keywords();
CREATE FUNCTION insert_keywords() RETURNS trigger AS $$
DECLARE
c TEXT;
lexemes TEXT[];
keyword_count INTEGER;
BEGIN
FOR c IN
SELECT *
FROM regexp_split_to_table(NEW.comment, '[^\w]+') AS word
WHERE word <> '' AND
word IN ( -- cross-referencing keyword with actors' last names
SELECT lower((regexp_split_to_array(name, '[^\w]+'))[2])
FROM actors
)
LOOP
SELECT ts_lexize('public.simple_dict', c) INTO lexemes;
IF array_length(lexemes, 1) > 0 THEN -- the word is not a stop word, insert as new keyword or update the counter
SELECT count INTO keyword_count FROM keywords WHERE keyword = lexemes[1];
IF FOUND THEN
RAISE NOTICE 'updating keyword count: % (%)', lexemes[1], keyword_count + 1;
UPDATE keywords SET count = keyword_count + 1 WHERE keyword = lexemes[1];
ELSE
RAISE NOTICE 'inserting keyword: %', lexemes[1];
INSERT INTO keywords (keyword) VALUES (lexemes[1]);
END IF;
END IF;
END LOOP;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- create trigger to convert new comment to keywords
CREATE TRIGGER convert_comment_to_keywords
AFTER INSERT ON user_comments
FOR EACH ROW EXECUTE PROCEDURE insert_keywords();
-- create a procedure to find the most talked actors
CREATE OR REPLACE FUNCTION get_most_talked_actors() RETURNS SETOF actors AS $$
DECLARE
lname TEXT;
BEGIN
SELECT keyword INTO lname FROM keywords ORDER BY count LIMIT 1;
IF FOUND THEN
RETURN QUERY
SELECT *
FROM actors
WHERE lower((regexp_split_to_array(name, '[^\w]+'))[2]) = lname
ORDER BY name;
END IF;
END;
$$ LANGUAGE plpgsql;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment