Created
September 23, 2022 02:46
-
-
Save yssymmt/89a2f9c5446da1c4673af0d15f201a8e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
with src1 as ( | |
select | |
word1, | |
word2, | |
sum(a1.tfidf*a2.tfidf) as tfidfsum, | |
sqrt(sum(tfidfsum*tfidfsum) over(partition by word1))* | |
sqrt(sum(tfidfsum*tfidfsum) over(partition by word2)) as norm | |
from ( | |
select | |
docid, | |
word as word1, | |
tfidf | |
from jumbo.aud14_tfidf | |
) a1 inner join ( | |
select | |
docid, | |
word as word2, | |
tfidf | |
from jumbo.aud14_tfidf | |
) a2 on a1.docid=a2.docid | |
group by 1,2 | |
) | |
, src2 as ( | |
select | |
a3.word1 as word1, | |
a4.word1 as word2, | |
sum(a3.tfidfsum*a4.tfidfsum) as dott | |
from ( | |
select | |
word1, | |
word2, | |
tfidfsum | |
from src1 | |
) a3 inner join ( | |
select | |
word1, | |
word2, | |
tfidfsum | |
from src1 | |
) a4 on a3.word2=a4.word2 | |
group by 1,2 | |
) | |
select | |
a5.word1, | |
a5.word2, | |
dott / norm as コサイン類似度 | |
from src1 a5 left outer join src2 a6 | |
on a5.word1=a6.word1 and a5.word2=a6.word2 | |
where a5.word1<>a5.word2 | |
and a5.word1<a5.word2 | |
and コサイン類似度>=0.5 | |
order by 3 desc | |
; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment