Skip to content

Instantly share code, notes, and snippets.

@yssymmt
Created September 19, 2022 16:51
Show Gist options
  • Save yssymmt/9b001b60aac975288fd7887378edf7bc to your computer and use it in GitHub Desktop.
Save yssymmt/9b001b60aac975288fd7887378edf7bc to your computer and use it in GitHub Desktop.
drop table jumbo.aud10_model_tfidf;
create table jumbo.aud10_model_tfidf (
token varchar(100) character set unicode,
category varchar(100) character set unicode,
prob double precision
) primary index(token,category)
;
insert into jumbo.aud10_model_tfidf
with src as (
select
a4.cat,
a4.word,
zeroifnull(tfidf) as tfidf,
count(a4.word) over(partition by a4.cat) as distinctwordcnt
from (
select
cat,
word
from (
select
cat
from jumbo.aud08_train
group by 1
) a2 cross join (
select
word
from jumbo.aud08_train
group by 1
) a3
) a4 left outer join (
select
cat,
word,
sum(tfidf) as tfidf
from jumbo.aud08_train
group by 1,2
) a5 on a4.cat=a5.cat and a4.word=a5.word
)
, probb as (
select
word as token,
cat as category,
tfidf,
cast(tfidf+1 as double precision) / cast(sum(tfidf) over(partition by cat) + distinctwordcnt as double precision) as prob
from src
)
, prr as (
/*事前確率*/
select
cast('ASTER_NAIVE_BAYES_PRIOR_PROB' as varchar(100)) as token,
category,
prob
from (
select
cat as category,
cast(sum(tfidf) over(partition by category) as double precision) / cast(sum(tfidf) over() as double precision) as prob
from src
) a6
group by 1,2,3
)
select * from (
(
/*初期確率*/
select
cast('ASTER_NAIVE_BAYES_TEXT_MODEL_TYPE' as varchar(100)) as token,
cast('MULTINOMIAL' as varchar(100)) as category,
sum(prob) as prob
from prr
group by 1,2
) union (
/*事前確率*/
select
token,
category,
prob
from prr
) union (
/*算出確率*/
select
token,
category,
prob
from probb
where tfidf<>0
) union (
/*未出現文字の確率*/
select
cast('ASTER_NAIVE_BAYES_MISSING_TOKEN_PROB' as varchar(100)) as token,
category,
prob
from probb
where tfidf=0
group by 1,2,3
)
) a7
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment