Created
September 23, 2022 02:43
-
-
Save yssymmt/eb2aa276559b8dc788343e122233d488 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
with cossim as ( | |
/*コサイン類似度の計算*/ | |
select | |
a1.docid as docid1, | |
a2.docid as docid2, | |
左側ベクトル長*右側ベクトル長 as norm, | |
sum(a1.tfidf*a2.tfidf) as 内積, | |
内積 / (左側ベクトル長*右側ベクトル長) as コサイン類似度 | |
from ( | |
select | |
docid, | |
word, | |
tfidf, | |
sqrt(sum(tfidf*tfidf) over(partition by docid)) as 左側ベクトル長 | |
from jumbo.aud14_tfidf | |
) a1 inner join ( | |
select | |
docid, | |
word, | |
tfidf, | |
sqrt(sum(tfidf*tfidf) over(partition by docid)) as 右側ベクトル長 | |
from jumbo.aud14_tfidf | |
) a2 on a1.word=a2.word | |
group by 1,2,3 | |
) | |
/*絞り込みと文書の横付け*/ | |
select | |
b1.docid1, | |
b2.docdesc as docdesc1, | |
b1.docid2, | |
b3.docdesc as docdesc2, | |
コサイン類似度 | |
from ( | |
select | |
docid1, | |
docid2, | |
コサイン類似度 | |
from cossim | |
where docid1<>docid2 | |
and docid1<docid2 | |
and コサイン類似度>=0.1 | |
) b1 | |
left outer join jumbo.aud03_neologdn b2 | |
on b1.docid1=b2.docid | |
left outer join jumbo.aud03_neologdn b3 | |
on b1.docid2=b3.docid | |
order by 5 desc | |
; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment