Skip to content

Instantly share code, notes, and snippets.

@soaxelbrooke
Last active January 10, 2018 19:38
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save soaxelbrooke/11c045855b129dbdabb05d5a4c8cfbde to your computer and use it in GitHub Desktop.
Save soaxelbrooke/11c045855b129dbdabb05d5a4c8cfbde to your computer and use it in GitHub Desktop.
Phrase detection implemented in pure PostgreSQL
WITH tokens AS (
-- Just edit MY_TABLE, MY_TEXT_COL, and MY_PKEY_COL, and watch it go!
SELECT MY_PKEY_COL AS pkey, (unnest(to_tsvector(MY_TEXT_COL))).* FROM MY_TABLE
), token_stream AS (
SELECT pkey, unnest(positions) AS token_idx, lexeme
FROM tokens ORDER BY pkey, token_idx
), token_counts AS (
SELECT lexeme, sum(count) AS count
FROM (
SELECT lexeme, array_length(positions, 1) AS count FROM tokens
WHERE char_length(lexeme) < 12
) lexeme_counts
GROUP BY lexeme
), vocab_meta AS (
SELECT count(distinct(lexeme)) FROM token_counts
), phrase_scores AS (
SELECT
phrase,
phrase_count,
first_counts.count AS first_count,
second_counts.count AS last_count,
CAST(phrase_count - 5 AS DOUBLE PRECISION) / first_counts.count / second_counts.count * vocab_meta.count AS score
FROM vocab_meta, (
SELECT
first.lexeme AS first,
second.lexeme AS second,
first.lexeme || ' ' || second.lexeme AS phrase,
count(DISTINCT (first.pkey)) AS phrase_count
FROM token_stream first
JOIN token_stream second
ON first.pkey = second.pkey
AND second.token_idx = first.token_idx + 1
GROUP BY first.lexeme, second.lexeme
) phrase_counts
JOIN token_counts first_counts ON first = first_counts.lexeme
JOIN token_counts second_counts ON second = second_counts.lexeme
)
SELECT * FROM phrase_scores WHERE score > 5 ORDER BY score DESC;
-- BSD 0 Licensed - Copyright Stuart Axelbrooke 2017-2018
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment