yssymmt/aud20_cosine_tfidf.sql

## aud20_cosine_tfidf.sql
drop table jumbo.aud14_tfidf;

create table jumbo.aud14_tfidf (
docid integer,
cat varchar(10) character set unicode,
word varchar(10) character set unicode,
bow integer,
tf double precision,
idf double precision,
tfidf double precision,
idf_smoothfalse double precision,
idf_smoothtrue double precision,
tfidf_normfalse_smoothfalse double precision,
tfidf_normfalse_smoothtrue double precision,
l2_norm_smoothfalse double precision,
l2_norm_smoothtrue double precision,
tfidf_l2_smoothfalse double precision,
tfidf_l2_smoothtrue double precision
) primary index(docid)
;

insert into jumbo.aud14_tfidf
with bowtf as (
	 /*Bags-of-WordsおよびTFの計算*/
	select
	a3.docid,
	a3.word,
	zeroifnull(bow) as bow,
	zeroifnull(cast(bow as float) / cast(sum(bow) over(partition by a3.docid) as float)) as tf
	from (
		select
		docid,
		word
		from (
			select
			docid
			from jumbo.aud07_wordseq
			group by 1
		) a1 cross join (
			select
			word
			from jumbo.aud07_wordseq
			group by 1
		) a2
	) a3 left outer join (
		select
		docid,
		word,
		count(*) as bow
		from jumbo.aud07_wordseq
		group by 1,2
	) a4
	on a3.docid=a4.docid and a3.word=a4.word
)
, idf as (
	 /*IDF計算、合計文書数/ワードが所属している文書数の自然対数*/
	select
	word,
	count(case when bow>0 then 1 else null end) as numdoc,
	count(distinct docid) as numdocall,
	ln(cast(numdocall as double precision) / cast(numdoc as double precision)) as idf,
	idf + 1 as idf_smoothfalse,
	ln(cast(numdocall+1 as double precision) / cast(numdoc+1 as double precision)) +1 as idf_smoothtrue
	from bowtf
	group by 1
)
	 /*TFIDF計算: IDFのユニークはwordのみでgroup byすれば再現可能*/
	select
	a5.docid,
	cat,
	a5.word,
	bow, /*scikitのtf*/
	tf,
	idf,
	tf * idf as tfidf, /*一般的なtfidf*/
	idf_smoothfalse,
	idf_smoothtrue,
	bow * idf_smoothfalse as tfidf_normfalse_smoothfalse, /*scikit mimic, normがfalse(none)でsmooth_idfもfalseのとき*/
	bow * idf_smoothtrue as tfidf_normfalse_smoothtrue, /*scikit mimic, normがfalse(none)でsmooth_idfがtrueのとき*/
	power(sum(power(tfidf_normfalse_smoothfalse,2)) over(partition by a5.docid),0.5) as l2_norm_smoothfalse,
	power(sum(power(tfidf_normfalse_smoothtrue,2)) over(partition by a5.docid),0.5) as l2_norm_smoothtrue,
	tfidf_normfalse_smoothfalse / l2_norm_smoothfalse as tfidf_l2_smoothfalse, /*scikit mimic, normがtrueでsmooth_idfがfalseのとき*/
	tfidf_normfalse_smoothtrue / l2_norm_smoothtrue as tfidf_l2_smoothtrue /*scikit mimic, normがtrueでsmooth_idfがtrue (デフォルト)のとき*/
	from bowtf a5 inner join idf a6
	on a5.word=a6.word
	left outer join (
		select
		docid,
		cat
		from jumbo.aud01_org
		group by 1,2
	) a7
	on a5.docid=a7.docid
	;
	drop table jumbo.aud14_tfidf;

	create table jumbo.aud14_tfidf (
	docid integer,
	cat varchar(10) character set unicode,
	word varchar(10) character set unicode,
	bow integer,
	tf double precision,
	idf double precision,
	tfidf double precision,
	idf_smoothfalse double precision,
	idf_smoothtrue double precision,
	tfidf_normfalse_smoothfalse double precision,
	tfidf_normfalse_smoothtrue double precision,
	l2_norm_smoothfalse double precision,
	l2_norm_smoothtrue double precision,
	tfidf_l2_smoothfalse double precision,
	tfidf_l2_smoothtrue double precision
	) primary index(docid)
	;

	insert into jumbo.aud14_tfidf
	with bowtf as (
	/Bags-of-WordsおよびTFの計算/
	select
	a3.docid,
	a3.word,
	zeroifnull(bow) as bow,
	zeroifnull(cast(bow as float) / cast(sum(bow) over(partition by a3.docid) as float)) as tf
	from (
	select
	docid,
	word
	from (
	select
	docid
	from jumbo.aud07_wordseq
	group by 1
	) a1 cross join (
	select
	word
	from jumbo.aud07_wordseq
	group by 1
	) a2
	) a3 left outer join (
	select
	docid,
	word,
	count(*) as bow
	from jumbo.aud07_wordseq
	group by 1,2
	) a4
	on a3.docid=a4.docid and a3.word=a4.word
	)
	, idf as (
	/IDF計算、合計文書数/ワードが所属している文書数の自然対数/
	select
	word,
	count(case when bow>0 then 1 else null end) as numdoc,
	count(distinct docid) as numdocall,
	ln(cast(numdocall as double precision) / cast(numdoc as double precision)) as idf,
	idf + 1 as idf_smoothfalse,
	ln(cast(numdocall+1 as double precision) / cast(numdoc+1 as double precision)) +1 as idf_smoothtrue
	from bowtf
	group by 1
	)
	/TFIDF計算: IDFのユニークはwordのみでgroup byすれば再現可能/
	select
	a5.docid,
	cat,
	a5.word,
	bow, /scikitのtf/
	tf,
	idf,
	tf * idf as tfidf, /一般的なtfidf/
	idf_smoothfalse,
	idf_smoothtrue,
	bow * idf_smoothfalse as tfidf_normfalse_smoothfalse, /scikit mimic, normがfalse(none)でsmooth_idfもfalseのとき/
	bow * idf_smoothtrue as tfidf_normfalse_smoothtrue, /scikit mimic, normがfalse(none)でsmooth_idfがtrueのとき/
	power(sum(power(tfidf_normfalse_smoothfalse,2)) over(partition by a5.docid),0.5) as l2_norm_smoothfalse,
	power(sum(power(tfidf_normfalse_smoothtrue,2)) over(partition by a5.docid),0.5) as l2_norm_smoothtrue,
	tfidf_normfalse_smoothfalse / l2_norm_smoothfalse as tfidf_l2_smoothfalse, /scikit mimic, normがtrueでsmooth_idfがfalseのとき/
	tfidf_normfalse_smoothtrue / l2_norm_smoothtrue as tfidf_l2_smoothtrue /scikit mimic, normがtrueでsmooth_idfがtrue (デフォルト)のとき/
	from bowtf a5 inner join idf a6
	on a5.word=a6.word
	left outer join (
	select
	docid,
	cat
	from jumbo.aud01_org
	group by 1,2
	) a7
	on a5.docid=a7.docid
	;