Skip to content

Instantly share code, notes, and snippets.

@danielfone
Created November 10, 2019 10:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danielfone/2b2a084c5cf8baa22d460b86a7e59f5f to your computer and use it in GitHub Desktop.
Save danielfone/2b2a084c5cf8baa22d460b86a7e59f5f to your computer and use it in GitHub Desktop.
with recursive
-- extract all the messages as rows from the several `conversations.history`
-- API responses in the `slack_history` table
messages as (
select jsonb_array_elements(response->'messages') as message from markov.slack_history
),
-- remove @heres, @names, exclude bot messages, etc
cleaned_messages as (
select message->>'ts' as id, regexp_replace(message->>'text', '\<[!@#]\S+\> ?', '', 'g') as text
from messages
where message->'subtype' is null
),
-- split each message into a text array
message_word_lists as (
select id, string_to_array(text, ' ') as list
from cleaned_messages
),
-- split all words into separate rows with a column for the original message
-- and a number that represents the word's sequence in the message. includes a
-- NULL word at the start and end of each message. This is initiator and
-- terminator for the chain.
word_indexes as (
select id, list[sequence] as word, sequence
from (select id, list, generate_series(0, array_length(list, 1)+1) as sequence from message_word_lists) indicies
),
-- self-join our words on neighbouring sequences to produce rows containing
-- words that occured in pairs in the original message (bigrams)
-- e.g. | Hello | how |
-- | how | are |
-- | are | you? |
bigrams as (
select lefts.word as left, rights.word as right
from word_indexes lefts
join word_indexes rights on lefts.sequence = rights.sequence - 1 and lefts.id = rights.id
),
-- 1. start with a random null-initiated bigram, this is the start of a
-- message
-- 2. shuffle the bigrams and pick a random word which has followed the first
-- word before. Probability of choice is determined by number of occurances
-- in the text, since it's random sorting
-- 3. Finish when we select a word with a null RHS
--
markov_chain(word) as (
(select bigrams.right as word from bigrams where bigrams.left is null order by random() limit 1)
union all
(select bigrams.right as word from bigrams join markov_chain on markov_chain.word = bigrams.left order by random() limit 1)
)
--select * from messages
--select * from cleaned_messages
--select * from message_word_lists
--select * from word_indexes
select string_agg(word, ' ') from markov_chain
--select * from bigrams where bigrams.left = 'perfect.'
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment