Created
September 19, 2022 16:51
-
-
Save yssymmt/9b001b60aac975288fd7887378edf7bc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
drop table jumbo.aud10_model_tfidf; | |
create table jumbo.aud10_model_tfidf ( | |
token varchar(100) character set unicode, | |
category varchar(100) character set unicode, | |
prob double precision | |
) primary index(token,category) | |
; | |
insert into jumbo.aud10_model_tfidf | |
with src as ( | |
select | |
a4.cat, | |
a4.word, | |
zeroifnull(tfidf) as tfidf, | |
count(a4.word) over(partition by a4.cat) as distinctwordcnt | |
from ( | |
select | |
cat, | |
word | |
from ( | |
select | |
cat | |
from jumbo.aud08_train | |
group by 1 | |
) a2 cross join ( | |
select | |
word | |
from jumbo.aud08_train | |
group by 1 | |
) a3 | |
) a4 left outer join ( | |
select | |
cat, | |
word, | |
sum(tfidf) as tfidf | |
from jumbo.aud08_train | |
group by 1,2 | |
) a5 on a4.cat=a5.cat and a4.word=a5.word | |
) | |
, probb as ( | |
select | |
word as token, | |
cat as category, | |
tfidf, | |
cast(tfidf+1 as double precision) / cast(sum(tfidf) over(partition by cat) + distinctwordcnt as double precision) as prob | |
from src | |
) | |
, prr as ( | |
/*事前確率*/ | |
select | |
cast('ASTER_NAIVE_BAYES_PRIOR_PROB' as varchar(100)) as token, | |
category, | |
prob | |
from ( | |
select | |
cat as category, | |
cast(sum(tfidf) over(partition by category) as double precision) / cast(sum(tfidf) over() as double precision) as prob | |
from src | |
) a6 | |
group by 1,2,3 | |
) | |
select * from ( | |
( | |
/*初期確率*/ | |
select | |
cast('ASTER_NAIVE_BAYES_TEXT_MODEL_TYPE' as varchar(100)) as token, | |
cast('MULTINOMIAL' as varchar(100)) as category, | |
sum(prob) as prob | |
from prr | |
group by 1,2 | |
) union ( | |
/*事前確率*/ | |
select | |
token, | |
category, | |
prob | |
from prr | |
) union ( | |
/*算出確率*/ | |
select | |
token, | |
category, | |
prob | |
from probb | |
where tfidf<>0 | |
) union ( | |
/*未出現文字の確率*/ | |
select | |
cast('ASTER_NAIVE_BAYES_MISSING_TOKEN_PROB' as varchar(100)) as token, | |
category, | |
prob | |
from probb | |
where tfidf=0 | |
group by 1,2,3 | |
) | |
) a7 | |
; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment