Skip to content

Instantly share code, notes, and snippets.

@yssymmt
Created September 23, 2022 02:45
Show Gist options
  • Save yssymmt/a50256396e1df54a79f01e55a5a30673 to your computer and use it in GitHub Desktop.
Save yssymmt/a50256396e1df54a79f01e55a5a30673 to your computer and use it in GitHub Desktop.
drop table jumbo.aud14_tfidf;
create table jumbo.aud14_tfidf (
docid integer,
cat varchar(10) character set unicode,
word varchar(10) character set unicode,
bow integer,
tf double precision,
idf double precision,
tfidf double precision,
idf_smoothfalse double precision,
idf_smoothtrue double precision,
tfidf_normfalse_smoothfalse double precision,
tfidf_normfalse_smoothtrue double precision,
l2_norm_smoothfalse double precision,
l2_norm_smoothtrue double precision,
tfidf_l2_smoothfalse double precision,
tfidf_l2_smoothtrue double precision
) primary index(docid)
;
insert into jumbo.aud14_tfidf
with bowtf as (
/*Bags-of-WordsおよびTFの計算*/
select
a3.docid,
a3.word,
zeroifnull(bow) as bow,
zeroifnull(cast(bow as float) / cast(sum(bow) over(partition by a3.docid) as float)) as tf
from (
select
docid,
word
from (
select
docid
from jumbo.aud07_wordseq
group by 1
) a1 cross join (
select
word
from jumbo.aud07_wordseq
group by 1
) a2
) a3 left outer join (
select
docid,
word,
count(*) as bow
from jumbo.aud07_wordseq
group by 1,2
) a4
on a3.docid=a4.docid and a3.word=a4.word
)
, idf as (
/*IDF計算、合計文書数/ワードが所属している文書数の自然対数*/
select
word,
count(case when bow>0 then 1 else null end) as numdoc,
count(distinct docid) as numdocall,
ln(cast(numdocall as double precision) / cast(numdoc as double precision)) as idf,
idf + 1 as idf_smoothfalse,
ln(cast(numdocall+1 as double precision) / cast(numdoc+1 as double precision)) +1 as idf_smoothtrue
from bowtf
group by 1
)
/*TFIDF計算: IDFのユニークはwordのみでgroup byすれば再現可能*/
select
a5.docid,
cat,
a5.word,
bow, /*scikitのtf*/
tf,
idf,
tf * idf as tfidf, /*一般的なtfidf*/
idf_smoothfalse,
idf_smoothtrue,
bow * idf_smoothfalse as tfidf_normfalse_smoothfalse, /*scikit mimic, normがfalse(none)でsmooth_idfもfalseのとき*/
bow * idf_smoothtrue as tfidf_normfalse_smoothtrue, /*scikit mimic, normがfalse(none)でsmooth_idfがtrueのとき*/
power(sum(power(tfidf_normfalse_smoothfalse,2)) over(partition by a5.docid),0.5) as l2_norm_smoothfalse,
power(sum(power(tfidf_normfalse_smoothtrue,2)) over(partition by a5.docid),0.5) as l2_norm_smoothtrue,
tfidf_normfalse_smoothfalse / l2_norm_smoothfalse as tfidf_l2_smoothfalse, /*scikit mimic, normがtrueでsmooth_idfがfalseのとき*/
tfidf_normfalse_smoothtrue / l2_norm_smoothtrue as tfidf_l2_smoothtrue /*scikit mimic, normがtrueでsmooth_idfがtrue (デフォルト)のとき*/
from bowtf a5 inner join idf a6
on a5.word=a6.word
left outer join (
select
docid,
cat
from jumbo.aud01_org
group by 1,2
) a7
on a5.docid=a7.docid
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment