Skip to content

Instantly share code, notes, and snippets.

@yssymmt
Created September 23, 2022 02:43
Show Gist options
  • Save yssymmt/eb2aa276559b8dc788343e122233d488 to your computer and use it in GitHub Desktop.
Save yssymmt/eb2aa276559b8dc788343e122233d488 to your computer and use it in GitHub Desktop.
with cossim as (
/*コサイン類似度の計算*/
select
a1.docid as docid1,
a2.docid as docid2,
左側ベクトル長*右側ベクトル長 as norm,
sum(a1.tfidf*a2.tfidf) as 内積,
内積 / (左側ベクトル長*右側ベクトル長) as コサイン類似度
from (
select
docid,
word,
tfidf,
sqrt(sum(tfidf*tfidf) over(partition by docid)) as 左側ベクトル長
from jumbo.aud14_tfidf
) a1 inner join (
select
docid,
word,
tfidf,
sqrt(sum(tfidf*tfidf) over(partition by docid)) as 右側ベクトル長
from jumbo.aud14_tfidf
) a2 on a1.word=a2.word
group by 1,2,3
)
/*絞り込みと文書の横付け*/
select
b1.docid1,
b2.docdesc as docdesc1,
b1.docid2,
b3.docdesc as docdesc2,
コサイン類似度
from (
select
docid1,
docid2,
コサイン類似度
from cossim
where docid1<>docid2
and docid1<docid2
and コサイン類似度>=0.1
) b1
left outer join jumbo.aud03_neologdn b2
on b1.docid1=b2.docid
left outer join jumbo.aud03_neologdn b3
on b1.docid2=b3.docid
order by 5 desc
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment