archiewood/word_frequency.sql

## word_frequency.sql
-- Word frequencies across all tweets (DuckDB dialect).
-- Pipeline: normalise whitespace and strip links -> split on spaces ->
-- lower-case and keep letters only -> drop short tokens and English stop
-- words -> count occurrences per token.
with cleaned_docs as (
    -- Step 1: turn every separator into a single space, then remove links.
    -- '\\n' is kept from the earlier version (presumably escaped newlines in
    -- the source data -- confirm); real whitespace runs, including newlines
    -- and tabs, are collapsed too so that splitting on ' ' cannot fuse words.
    -- POSIX classes are used instead of \s so the patterns do not depend on
    -- backslash escaping: in a plain string literal the class [^\\s] means
    -- "not a backslash and not the letter s", which let the link pattern run
    -- through spaces and stop at the next 's'.
    select
        tweet_id,
        regexp_replace(
            regexp_replace(text, '\\n|[[:space:]]+', ' ', 'g'),
            'https?://[^[:space:]]+',
            '',
            'g'
        ) as cleaned_text
    from tweets
    where text is not null
),

tokenized_docs as (
    -- Step 2: one row per space-separated token.
    select
        tweet_id,
        unnest(string_split(cleaned_text, ' ')) as token
    from cleaned_docs
),

normalized_tokens as (
    -- Step 3: lower-case and strip everything that is not a-z BEFORE any
    -- filtering, so 'The', 'the,' and 'the' are all treated as 'the'.
    select
        tweet_id,
        regexp_replace(lower(token), '[^a-z]', '', 'g') as token
    from tokenized_docs
),

stop_words as (
    -- English stop words from the stopwords-iso JSON file (key "en"),
    -- normalised exactly like the tokens so that entries containing
    -- punctuation (e.g. contractions, if present) still match.
    select distinct
        regexp_replace(lower(word), '[^a-z]', '', 'g') as word
    from (
        select
            unnest(en) as word
        from 'https://raw.githubusercontent.com/stopwords-iso/stopwords-iso/master/python/stopwordsiso/stopwords-iso.json'
    ) as raw_stop_words
),

filtered_tokens as (
    -- Step 4: the length check runs on the normalised token, which also
    -- removes tokens that became empty (pure punctuation, digits, emoji).
    -- NOT EXISTS is used instead of NOT IN so that a NULL in the stop-word
    -- list cannot silently filter out every row.
    select
        nt.tweet_id,
        nt.token
    from normalized_tokens as nt
    where
        length(nt.token) > 1
        and not exists (
            select 1
            from stop_words as sw
            where sw.word = nt.token
        )
),

word_frequencies as (
    -- Step 5: occurrences per token across all tweets.
    select
        token,
        count(*) as freq
    from filtered_tokens
    group by token
)

-- Ordering is applied on the final select: an ORDER BY inside a CTE is not a
-- guaranteed ordering for the outer query. Ties are broken alphabetically so
-- the output is deterministic.
select
    token,
    freq
from word_frequencies
order by
    freq desc,
    token;
	-- Word frequencies across all tweets (DuckDB dialect).
	-- Pipeline: normalise whitespace and strip links -> split on spaces ->
	-- lower-case and keep letters only -> drop short tokens and English stop
	-- words -> count occurrences per token.
	with cleaned_docs as (
		-- Step 1: turn every separator into a single space, then remove links.
		-- '\\n' is kept from the earlier version (presumably escaped newlines in
		-- the source data -- confirm); real whitespace runs, including newlines
		-- and tabs, are collapsed too so that splitting on ' ' cannot fuse words.
		-- POSIX classes are used instead of \s so the patterns do not depend on
		-- backslash escaping: in a plain string literal the class [^\\s] means
		-- "not a backslash and not the letter s", which let the link pattern run
		-- through spaces and stop at the next 's'.
		select
			tweet_id,
			regexp_replace(
				regexp_replace(text, '\\n|[[:space:]]+', ' ', 'g'),
				'https?://[^[:space:]]+',
				'',
				'g'
			) as cleaned_text
		from tweets
		where text is not null
	),

	tokenized_docs as (
		-- Step 2: one row per space-separated token.
		select
			tweet_id,
			unnest(string_split(cleaned_text, ' ')) as token
		from cleaned_docs
	),

	normalized_tokens as (
		-- Step 3: lower-case and strip everything that is not a-z BEFORE any
		-- filtering, so 'The', 'the,' and 'the' are all treated as 'the'.
		select
			tweet_id,
			regexp_replace(lower(token), '[^a-z]', '', 'g') as token
		from tokenized_docs
	),

	stop_words as (
		-- English stop words from the stopwords-iso JSON file (key "en"),
		-- normalised exactly like the tokens so that entries containing
		-- punctuation (e.g. contractions, if present) still match.
		select distinct
			regexp_replace(lower(word), '[^a-z]', '', 'g') as word
		from (
			select
				unnest(en) as word
			from 'https://raw.githubusercontent.com/stopwords-iso/stopwords-iso/master/python/stopwordsiso/stopwords-iso.json'
		) as raw_stop_words
	),

	filtered_tokens as (
		-- Step 4: the length check runs on the normalised token, which also
		-- removes tokens that became empty (pure punctuation, digits, emoji).
		-- NOT EXISTS is used instead of NOT IN so that a NULL in the stop-word
		-- list cannot silently filter out every row.
		select
			nt.tweet_id,
			nt.token
		from normalized_tokens as nt
		where
			length(nt.token) > 1
			and not exists (
				select 1
				from stop_words as sw
				where sw.word = nt.token
			)
	),

	word_frequencies as (
		-- Step 5: occurrences per token across all tweets.
		select
			token,
			count(*) as freq
		from filtered_tokens
		group by token
	)

	-- Ordering is applied on the final select: an ORDER BY inside a CTE is not a
	-- guaranteed ordering for the outer query. Ties are broken alphabetically so
	-- the output is deterministic.
	select
		token,
		freq
	from word_frequencies
	order by
		freq desc,
		token;