Created
September 23, 2022 02:45
-
-
Save yssymmt/a50256396e1df54a79f01e55a5a30673 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
drop table jumbo.aud14_tfidf; | |
create table jumbo.aud14_tfidf ( | |
docid integer, | |
cat varchar(10) character set unicode, | |
word varchar(10) character set unicode, | |
bow integer, | |
tf double precision, | |
idf double precision, | |
tfidf double precision, | |
idf_smoothfalse double precision, | |
idf_smoothtrue double precision, | |
tfidf_normfalse_smoothfalse double precision, | |
tfidf_normfalse_smoothtrue double precision, | |
l2_norm_smoothfalse double precision, | |
l2_norm_smoothtrue double precision, | |
tfidf_l2_smoothfalse double precision, | |
tfidf_l2_smoothtrue double precision | |
) primary index(docid) | |
; | |
insert into jumbo.aud14_tfidf | |
with bowtf as ( | |
/*Bags-of-WordsおよびTFの計算*/ | |
select | |
a3.docid, | |
a3.word, | |
zeroifnull(bow) as bow, | |
zeroifnull(cast(bow as float) / cast(sum(bow) over(partition by a3.docid) as float)) as tf | |
from ( | |
select | |
docid, | |
word | |
from ( | |
select | |
docid | |
from jumbo.aud07_wordseq | |
group by 1 | |
) a1 cross join ( | |
select | |
word | |
from jumbo.aud07_wordseq | |
group by 1 | |
) a2 | |
) a3 left outer join ( | |
select | |
docid, | |
word, | |
count(*) as bow | |
from jumbo.aud07_wordseq | |
group by 1,2 | |
) a4 | |
on a3.docid=a4.docid and a3.word=a4.word | |
) | |
, idf as ( | |
/*IDF計算、合計文書数/ワードが所属している文書数の自然対数*/ | |
select | |
word, | |
count(case when bow>0 then 1 else null end) as numdoc, | |
count(distinct docid) as numdocall, | |
ln(cast(numdocall as double precision) / cast(numdoc as double precision)) as idf, | |
idf + 1 as idf_smoothfalse, | |
ln(cast(numdocall+1 as double precision) / cast(numdoc+1 as double precision)) +1 as idf_smoothtrue | |
from bowtf | |
group by 1 | |
) | |
/*TFIDF計算: IDFのユニークはwordのみでgroup byすれば再現可能*/ | |
select | |
a5.docid, | |
cat, | |
a5.word, | |
bow, /*scikitのtf*/ | |
tf, | |
idf, | |
tf * idf as tfidf, /*一般的なtfidf*/ | |
idf_smoothfalse, | |
idf_smoothtrue, | |
bow * idf_smoothfalse as tfidf_normfalse_smoothfalse, /*scikit mimic, normがfalse(none)でsmooth_idfもfalseのとき*/ | |
bow * idf_smoothtrue as tfidf_normfalse_smoothtrue, /*scikit mimic, normがfalse(none)でsmooth_idfがtrueのとき*/ | |
power(sum(power(tfidf_normfalse_smoothfalse,2)) over(partition by a5.docid),0.5) as l2_norm_smoothfalse, | |
power(sum(power(tfidf_normfalse_smoothtrue,2)) over(partition by a5.docid),0.5) as l2_norm_smoothtrue, | |
tfidf_normfalse_smoothfalse / l2_norm_smoothfalse as tfidf_l2_smoothfalse, /*scikit mimic, normがtrueでsmooth_idfがfalseのとき*/ | |
tfidf_normfalse_smoothtrue / l2_norm_smoothtrue as tfidf_l2_smoothtrue /*scikit mimic, normがtrueでsmooth_idfがtrue (デフォルト)のとき*/ | |
from bowtf a5 inner join idf a6 | |
on a5.word=a6.word | |
left outer join ( | |
select | |
docid, | |
cat | |
from jumbo.aud01_org | |
group by 1,2 | |
) a7 | |
on a5.docid=a7.docid | |
; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment