yeliu84/user_comments.sql

## user_comments.sql
/**
 * Seven Databases in Seven Weeks: Day 3 Do#2
 *
 * Expand the movies database to track user comments and extract keywords
 * (minus English stopwords). Cross-reference these keywords with actors’ last
 * names, and try to find the most talked about actors.
 */

-- create user_comments table
DROP TABLE IF EXISTS user_comments;
CREATE TABLE user_comments (
    comment_id SERIAL PRIMARY KEY,
    comment TEXT
);

-- create keywords table
DROP TABLE IF EXISTS keywords;
CREATE TABLE keywords (
    keyword_id SERIAL PRIMARY KEY,
    keyword TEXT,
    count INTEGER DEFAULT 1
);
CREATE INDEX keywords_keyword on keywords (keyword);

-- create a simple dictionary for filtering out English stop words
DROP TEXT SEARCH DICTIONARY IF EXISTS public.simple_dict;
CREATE TEXT SEARCH DICTIONARY public.simple_dict (
    TEMPLATE = pg_catalog.simple,
    STOPWORDS = english
);

-- create function to convert new comment to keywords
DROP FUNCTION IF EXISTS insert_keywords();
CREATE FUNCTION insert_keywords() RETURNS trigger AS $$
DECLARE
    c TEXT;
    lexemes TEXT[];
    keyword_count INTEGER;
BEGIN
    FOR c IN
        SELECT *
        FROM regexp_split_to_table(NEW.comment, '[^\w]+') AS word
        WHERE word <> '' AND
              word IN ( -- cross-referencing keyword with actors' last names
                  SELECT lower((regexp_split_to_array(name, '[^\w]+'))[2])
                  FROM actors
              )
    LOOP
        SELECT ts_lexize('public.simple_dict', c) INTO lexemes;
        IF array_length(lexemes, 1) > 0 THEN -- the word is not a stop word, insert as new keyword or update the counter
            SELECT count INTO keyword_count FROM keywords WHERE keyword = lexemes[1];
            IF FOUND THEN
                RAISE NOTICE 'updating keyword count: % (%)', lexemes[1], keyword_count + 1;
                UPDATE keywords SET count = keyword_count + 1 WHERE keyword = lexemes[1];
            ELSE
                RAISE NOTICE 'inserting keyword: %', lexemes[1];
                INSERT INTO keywords (keyword) VALUES (lexemes[1]);
            END IF;
        END IF;
    END LOOP;

    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- create trigger to convert new comment to keywords
CREATE TRIGGER convert_comment_to_keywords
    AFTER INSERT ON user_comments
    FOR EACH ROW EXECUTE PROCEDURE insert_keywords();

-- create a procedure to find the most talked actors
CREATE OR REPLACE FUNCTION get_most_talked_actors() RETURNS SETOF actors AS $$
DECLARE
    lname TEXT;
BEGIN
    SELECT keyword INTO lname FROM keywords ORDER BY count LIMIT 1;
    IF FOUND THEN
        RETURN QUERY
            SELECT *
            FROM actors
            WHERE lower((regexp_split_to_array(name, '[^\w]+'))[2]) = lname
            ORDER BY name;
    END IF;
END;
$$ LANGUAGE plpgsql;
	/**
	* Seven Databases in Seven Weeks: Day 3 Do#2
	*
	* Expand the movies database to track user comments and extract keywords
	* (minus English stopwords). Cross-reference these keywords with actors’ last
	* names, and try to find the most talked about actors.
	*/

	-- create user_comments table
	DROP TABLE IF EXISTS user_comments;
	CREATE TABLE user_comments (
	comment_id SERIAL PRIMARY KEY,
	comment TEXT
	);

	-- create keywords table
	DROP TABLE IF EXISTS keywords;
	CREATE TABLE keywords (
	keyword_id SERIAL PRIMARY KEY,
	keyword TEXT,
	count INTEGER DEFAULT 1
	);
	CREATE INDEX keywords_keyword on keywords (keyword);

	-- create a simple dictionary for filtering out English stop words
	DROP TEXT SEARCH DICTIONARY IF EXISTS public.simple_dict;
	CREATE TEXT SEARCH DICTIONARY public.simple_dict (
	TEMPLATE = pg_catalog.simple,
	STOPWORDS = english
	);

	-- create function to convert new comment to keywords
	DROP FUNCTION IF EXISTS insert_keywords();
	CREATE FUNCTION insert_keywords() RETURNS trigger AS $$
	DECLARE
	c TEXT;
	lexemes TEXT[];
	keyword_count INTEGER;
	BEGIN
	FOR c IN
	SELECT *
	FROM regexp_split_to_table(NEW.comment, '[^\w]+') AS word
	WHERE word <> '' AND
	word IN ( -- cross-referencing keyword with actors' last names
	SELECT lower((regexp_split_to_array(name, '[^\w]+'))[2])
	FROM actors
	)
	LOOP
	SELECT ts_lexize('public.simple_dict', c) INTO lexemes;
	IF array_length(lexemes, 1) > 0 THEN -- the word is not a stop word, insert as new keyword or update the counter
	SELECT count INTO keyword_count FROM keywords WHERE keyword = lexemes[1];
	IF FOUND THEN
	RAISE NOTICE 'updating keyword count: % (%)', lexemes[1], keyword_count + 1;
	UPDATE keywords SET count = keyword_count + 1 WHERE keyword = lexemes[1];
	ELSE
	RAISE NOTICE 'inserting keyword: %', lexemes[1];
	INSERT INTO keywords (keyword) VALUES (lexemes[1]);
	END IF;
	END IF;
	END LOOP;

	RETURN NEW;
	END;
	$$ LANGUAGE plpgsql;

	-- create trigger to convert new comment to keywords
	CREATE TRIGGER convert_comment_to_keywords
	AFTER INSERT ON user_comments
	FOR EACH ROW EXECUTE PROCEDURE insert_keywords();

	-- create a procedure to find the most talked actors
	CREATE OR REPLACE FUNCTION get_most_talked_actors() RETURNS SETOF actors AS $$
	DECLARE
	lname TEXT;
	BEGIN
	SELECT keyword INTO lname FROM keywords ORDER BY count LIMIT 1;
	IF FOUND THEN
	RETURN QUERY
	SELECT *
	FROM actors
	WHERE lower((regexp_split_to_array(name, '[^\w]+'))[2]) = lname
	ORDER BY name;
	END IF;
	END;
	$$ LANGUAGE plpgsql;