Created
September 19, 2022 16:50
-
-
Save yssymmt/79345f6584e923610406db54f9d14ab4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Rebuild the model table jumbo.aud09_model_bow from scratch.
 *
 * Columns:
 *   token    - a word, or one of the reserved ASTER_NAIVE_BAYES_* marker tokens
 *   category - class label (or the model type for the model-type marker row)
 *   prob     - probability stored for the (token, category) pair
 *
 * NOTE(review): the unconditional drop raises an error when the table does not
 * exist yet (e.g. on a first run) - confirm the runner tolerates that.
 */
drop table jumbo.aud09_model_bow;

create table jumbo.aud09_model_bow (
    token    varchar(100) character set unicode,
    category varchar(100) character set unicode,
    prob     double precision
) primary index (token, category)
;
/*
 * Populate jumbo.aud09_model_bow with a multinomial Naive Bayes text model
 * built from jumbo.aud08_train (columns used: cat, word, bow).
 *
 * Rows written:
 *   1. ASTER_NAIVE_BAYES_TEXT_MODEL_TYPE / MULTINOMIAL : sum of all priors
 *   2. ASTER_NAIVE_BAYES_PRIOR_PROB / <category>       : prior of the category
 *   3. <word> / <category>                             : smoothed P(word | category)
 *                                                        for words seen in the category
 *   4. ASTER_NAIVE_BAYES_MISSING_TOKEN_PROB / <category>: smoothed probability of a
 *                                                        word never seen in the category
 *
 * NOTE(review): the ASTER_NAIVE_BAYES_* token names presumably follow the model
 * layout expected by the downstream predictor - confirm with the consumer.
 */
insert into jumbo.aud09_model_bow
with src as (
    /* One row per (category, word) pair. The cross join gives every category
       the full vocabulary; pairs absent from the training data get bow = 0. */
    select
        a4.cat,
        a4.word,
        zeroifnull(a5.bow) as bow,
        /* rows per category = number of distinct words in the vocabulary */
        count(a4.word) over(partition by a4.cat) as distinctwordcnt
    from (
        select
            a2.cat,
            a3.word
        from (
            select
                cat
            from jumbo.aud08_train
            group by 1
        ) a2 cross join (
            select
                word
            from jumbo.aud08_train
            group by 1
        ) a3
    ) a4 left outer join (
        select
            cat,
            word,
            sum(bow) as bow
        from jumbo.aud08_train
        group by 1,2
    ) a5 on a4.cat = a5.cat and a4.word = a5.word
)
, probb as (
    /* Add-one smoothed likelihoods:
         prob         = (bow + 1) / (category total + vocabulary size)
         missing_prob = (0   + 1) / (category total + vocabulary size)
       missing_prob is carried on every row so that each category gets a
       missing-token probability even when it contains the whole vocabulary
       (i.e. has no bow = 0 row). */
    select
        word as token,
        cat as category,
        bow,
        cast(bow + 1 as double precision)
            / cast(sum(bow) over(partition by cat) + distinctwordcnt as double precision) as prob,
        cast(1 as double precision)
            / cast(sum(bow) over(partition by cat) + distinctwordcnt as double precision) as missing_prob
    from src
)
, prr as (
    /* Prior probability per category: category bow total / overall bow total.
       The inner query repeats the value on every row of the category; the
       outer group by collapses it to one row per category. */
    select
        cast('ASTER_NAIVE_BAYES_PRIOR_PROB' as varchar(100)) as token,
        category,
        prob
    from (
        select
            cat as category,
            cast(sum(bow) over(partition by cat) as double precision)
                / cast(sum(bow) over() as double precision) as prob
        from src
    ) a6
    group by 1,2,3
)
select
    a7.token,
    a7.category,
    a7.prob
from (
    (
        /* Model-type marker row; prob is the sum of all category priors. */
        select
            cast('ASTER_NAIVE_BAYES_TEXT_MODEL_TYPE' as varchar(100)) as token,
            cast('MULTINOMIAL' as varchar(100)) as category,
            sum(prob) as prob
        from prr
        group by 1,2
    ) union (
        /* Prior probability of each category. */
        select
            token,
            category,
            prob
        from prr
    ) union (
        /* Computed probability of each word actually observed in the category. */
        select
            token,
            category,
            prob
        from probb
        where bow <> 0
    ) union (
        /* Probability assigned to tokens not observed in the category.
           Taken from every row (not only bow = 0 rows) so no category is
           left without a missing-token entry; positional group by keeps one
           row per category and avoids alias/column ambiguity on prob. */
        select
            cast('ASTER_NAIVE_BAYES_MISSING_TOKEN_PROB' as varchar(100)) as token,
            category,
            missing_prob as prob
        from probb
        group by 1,2,3
    )
) a7
;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment