yssymmt/aud10_nbmodel_tfidf.sql

## aud10_nbmodel_tfidf.sql
drop table jumbo.aud10_model_tfidf;

create table jumbo.aud10_model_tfidf (
token varchar(100) character set unicode,
category varchar(100) character set unicode,
prob double precision
) primary index(token,category)
;

insert into jumbo.aud10_model_tfidf
with src as (
	select
	a4.cat,
	a4.word,
	zeroifnull(tfidf) as tfidf,
	count(a4.word) over(partition by a4.cat) as distinctwordcnt
	from (
		select
		cat,
		word
		from (
			select
			cat
			from jumbo.aud08_train
			group by 1
		) a2 cross join (
			select
			word
			from jumbo.aud08_train
			group by 1
		) a3
	) a4 left outer join (
		select
		cat,
		word,
		sum(tfidf) as tfidf
		from jumbo.aud08_train
		group by 1,2
	) a5 on a4.cat=a5.cat and a4.word=a5.word
)
, probb as (
	select
	word as token,
	cat as category,
	tfidf,
	cast(tfidf+1 as double precision) / cast(sum(tfidf) over(partition by cat) + distinctwordcnt as double precision) as prob
	from src
)
, prr as (
	 /*事前確率*/
	select
	cast('ASTER_NAIVE_BAYES_PRIOR_PROB' as varchar(100)) as token,
	category,
	prob
	from (
		select
		cat as category,
		cast(sum(tfidf) over(partition by category) as double precision) / cast(sum(tfidf) over() as double precision) as prob
		from src
	) a6
	group by 1,2,3
)
	select * from (
		(
			 /*初期確率*/
			select
			cast('ASTER_NAIVE_BAYES_TEXT_MODEL_TYPE' as varchar(100)) as token,
			cast('MULTINOMIAL' as varchar(100)) as category,
			sum(prob) as prob
			from prr
			group by 1,2
		) union (
			 /*事前確率*/
			select
			token,
			category,
			prob
			from prr
		) union (
			 /*算出確率*/
			select
			token,
			category,
			prob
			from probb
			where tfidf<>0
		) union (
			 /*未出現文字の確率*/
			select
			cast('ASTER_NAIVE_BAYES_MISSING_TOKEN_PROB' as varchar(100)) as token,
			category,
			prob
			from probb
			where tfidf=0
			group by 1,2,3
		)
	) a7
;
	drop table jumbo.aud10_model_tfidf;

	create table jumbo.aud10_model_tfidf (
	token varchar(100) character set unicode,
	category varchar(100) character set unicode,
	prob double precision
	) primary index(token,category)
	;

	insert into jumbo.aud10_model_tfidf
	with src as (
	select
	a4.cat,
	a4.word,
	zeroifnull(tfidf) as tfidf,
	count(a4.word) over(partition by a4.cat) as distinctwordcnt
	from (
	select
	cat,
	word
	from (
	select
	cat
	from jumbo.aud08_train
	group by 1
	) a2 cross join (
	select
	word
	from jumbo.aud08_train
	group by 1
	) a3
	) a4 left outer join (
	select
	cat,
	word,
	sum(tfidf) as tfidf
	from jumbo.aud08_train
	group by 1,2
	) a5 on a4.cat=a5.cat and a4.word=a5.word
	)
	, probb as (
	select
	word as token,
	cat as category,
	tfidf,
	cast(tfidf+1 as double precision) / cast(sum(tfidf) over(partition by cat) + distinctwordcnt as double precision) as prob
	from src
	)
	, prr as (
	/事前確率/
	select
	cast('ASTER_NAIVE_BAYES_PRIOR_PROB' as varchar(100)) as token,
	category,
	prob
	from (
	select
	cat as category,
	cast(sum(tfidf) over(partition by category) as double precision) / cast(sum(tfidf) over() as double precision) as prob
	from src
	) a6
	group by 1,2,3
	)
	select * from (
	(
	/初期確率/
	select
	cast('ASTER_NAIVE_BAYES_TEXT_MODEL_TYPE' as varchar(100)) as token,
	cast('MULTINOMIAL' as varchar(100)) as category,
	sum(prob) as prob
	from prr
	group by 1,2
	) union (
	/事前確率/
	select
	token,
	category,
	prob
	from prr
	) union (
	/算出確率/
	select
	token,
	category,
	prob
	from probb
	where tfidf<>0
	) union (
	/未出現文字の確率/
	select
	cast('ASTER_NAIVE_BAYES_MISSING_TOKEN_PROB' as varchar(100)) as token,
	category,
	prob
	from probb
	where tfidf=0
	group by 1,2,3
	)
	) a7
	;