soaxelbrooke/phrase_detection.sql

## phrase_detection.sql
WITH tokens AS (
  -- Just edit MY_TABLE, MY_TEXT_COL, and MY_PKEY_COL, and watch it go!
  SELECT MY_PKEY_COL AS pkey, (unnest(to_tsvector(MY_TEXT_COL))).* FROM MY_TABLE
), token_stream AS (
  SELECT pkey, unnest(positions) AS token_idx, lexeme
  FROM tokens ORDER BY pkey, token_idx
), token_counts AS (
  SELECT lexeme, sum(count) AS count
  FROM (
    SELECT lexeme, array_length(positions, 1) AS count FROM tokens
    WHERE char_length(lexeme) < 12
  ) lexeme_counts
  GROUP BY lexeme
), vocab_meta AS (
  SELECT count(distinct(lexeme)) FROM token_counts
), phrase_scores AS (
    SELECT
      phrase,
      phrase_count,
      first_counts.count                                                                                       AS first_count,
      second_counts.count                                                                                      AS last_count,
      CAST(phrase_count - 5 AS DOUBLE PRECISION) / first_counts.count / second_counts.count * vocab_meta.count AS score
    FROM vocab_meta, (
                       SELECT
                         first.lexeme                         AS first,
                         second.lexeme                        AS second,
                         first.lexeme || ' ' || second.lexeme AS phrase,
                         count(DISTINCT (first.pkey))    AS phrase_count
                       FROM token_stream first
                         JOIN token_stream second
                           ON first.pkey = second.pkey
                              AND second.token_idx = first.token_idx + 1
                       GROUP BY first.lexeme, second.lexeme
                     ) phrase_counts
      JOIN token_counts first_counts ON first = first_counts.lexeme
      JOIN token_counts second_counts ON second = second_counts.lexeme
)
SELECT * FROM phrase_scores WHERE score > 5 ORDER BY score DESC;

-- BSD 0 Licensed - Copyright Stuart Axelbrooke 2017-2018
	WITH tokens AS (
	-- Just edit MY_TABLE, MY_TEXT_COL, and MY_PKEY_COL, and watch it go!
	SELECT MY_PKEY_COL AS pkey, (unnest(to_tsvector(MY_TEXT_COL))).* FROM MY_TABLE
	), token_stream AS (
	SELECT pkey, unnest(positions) AS token_idx, lexeme
	FROM tokens ORDER BY pkey, token_idx
	), token_counts AS (
	SELECT lexeme, sum(count) AS count
	FROM (
	SELECT lexeme, array_length(positions, 1) AS count FROM tokens
	WHERE char_length(lexeme) < 12
	) lexeme_counts
	GROUP BY lexeme
	), vocab_meta AS (
	SELECT count(distinct(lexeme)) FROM token_counts
	), phrase_scores AS (
	SELECT
	phrase,
	phrase_count,
	first_counts.count AS first_count,
	second_counts.count AS last_count,
	CAST(phrase_count - 5 AS DOUBLE PRECISION) / first_counts.count / second_counts.count * vocab_meta.count AS score
	FROM vocab_meta, (
	SELECT
	first.lexeme AS first,
	second.lexeme AS second,
	first.lexeme \|\| ' ' \|\| second.lexeme AS phrase,
	count(DISTINCT (first.pkey)) AS phrase_count
	FROM token_stream first
	JOIN token_stream second
	ON first.pkey = second.pkey
	AND second.token_idx = first.token_idx + 1
	GROUP BY first.lexeme, second.lexeme
	) phrase_counts
	JOIN token_counts first_counts ON first = first_counts.lexeme
	JOIN token_counts second_counts ON second = second_counts.lexeme
	)
	SELECT * FROM phrase_scores WHERE score > 5 ORDER BY score DESC;

	-- BSD 0 Licensed - Copyright Stuart Axelbrooke 2017-2018