Created
January 29, 2015 07:30
-
-
Save yeliu84/8f6fe86add8f7f732e46 to your computer and use it in GitHub Desktop.
Answer for Seven Databases in Seven Weeks: Day 3 Do#2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Seven Databases in Seven Weeks: Day 3 Do#2 | |
* | |
* Expand the movies database to track user comments and extract keywords | |
* (minus English stopwords). Cross-reference these keywords with actors’ last | |
* names, and try to find the most talked about actors. | |
*/ | |
-- create user_comments table | |
DROP TABLE IF EXISTS user_comments; | |
CREATE TABLE user_comments ( | |
comment_id SERIAL PRIMARY KEY, | |
comment TEXT | |
); | |
-- create keywords table | |
DROP TABLE IF EXISTS keywords; | |
CREATE TABLE keywords ( | |
keyword_id SERIAL PRIMARY KEY, | |
keyword TEXT, | |
count INTEGER DEFAULT 1 | |
); | |
CREATE INDEX keywords_keyword on keywords (keyword); | |
-- create a simple dictionary for filtering out English stop words | |
DROP TEXT SEARCH DICTIONARY IF EXISTS public.simple_dict; | |
CREATE TEXT SEARCH DICTIONARY public.simple_dict ( | |
TEMPLATE = pg_catalog.simple, | |
STOPWORDS = english | |
); | |
-- create function to convert new comment to keywords | |
DROP FUNCTION IF EXISTS insert_keywords(); | |
CREATE FUNCTION insert_keywords() RETURNS trigger AS $$ | |
DECLARE | |
c TEXT; | |
lexemes TEXT[]; | |
keyword_count INTEGER; | |
BEGIN | |
FOR c IN | |
SELECT * | |
FROM regexp_split_to_table(NEW.comment, '[^\w]+') AS word | |
WHERE word <> '' AND | |
word IN ( -- cross-referencing keyword with actors' last names | |
SELECT lower((regexp_split_to_array(name, '[^\w]+'))[2]) | |
FROM actors | |
) | |
LOOP | |
SELECT ts_lexize('public.simple_dict', c) INTO lexemes; | |
IF array_length(lexemes, 1) > 0 THEN -- the word is not a stop word, insert as new keyword or update the counter | |
SELECT count INTO keyword_count FROM keywords WHERE keyword = lexemes[1]; | |
IF FOUND THEN | |
RAISE NOTICE 'updating keyword count: % (%)', lexemes[1], keyword_count + 1; | |
UPDATE keywords SET count = keyword_count + 1 WHERE keyword = lexemes[1]; | |
ELSE | |
RAISE NOTICE 'inserting keyword: %', lexemes[1]; | |
INSERT INTO keywords (keyword) VALUES (lexemes[1]); | |
END IF; | |
END IF; | |
END LOOP; | |
RETURN NEW; | |
END; | |
$$ LANGUAGE plpgsql; | |
-- create trigger to convert new comment to keywords | |
CREATE TRIGGER convert_comment_to_keywords | |
AFTER INSERT ON user_comments | |
FOR EACH ROW EXECUTE PROCEDURE insert_keywords(); | |
-- create a procedure to find the most talked actors | |
CREATE OR REPLACE FUNCTION get_most_talked_actors() RETURNS SETOF actors AS $$ | |
DECLARE | |
lname TEXT; | |
BEGIN | |
SELECT keyword INTO lname FROM keywords ORDER BY count LIMIT 1; | |
IF FOUND THEN | |
RETURN QUERY | |
SELECT * | |
FROM actors | |
WHERE lower((regexp_split_to_array(name, '[^\w]+'))[2]) = lname | |
ORDER BY name; | |
END IF; | |
END; | |
$$ LANGUAGE plpgsql; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment