Skip to content

Instantly share code, notes, and snippets.

@yssymmt
Created September 23, 2022 02:46
Show Gist options
  • Save yssymmt/89a2f9c5446da1c4673af0d15f201a8e to your computer and use it in GitHub Desktop.
Save yssymmt/89a2f9c5446da1c4673af0d15f201a8e to your computer and use it in GitHub Desktop.
with src1 as (
select
word1,
word2,
sum(a1.tfidf*a2.tfidf) as tfidfsum,
sqrt(sum(tfidfsum*tfidfsum) over(partition by word1))*
sqrt(sum(tfidfsum*tfidfsum) over(partition by word2)) as norm
from (
select
docid,
word as word1,
tfidf
from jumbo.aud14_tfidf
) a1 inner join (
select
docid,
word as word2,
tfidf
from jumbo.aud14_tfidf
) a2 on a1.docid=a2.docid
group by 1,2
)
, src2 as (
select
a3.word1 as word1,
a4.word1 as word2,
sum(a3.tfidfsum*a4.tfidfsum) as dott
from (
select
word1,
word2,
tfidfsum
from src1
) a3 inner join (
select
word1,
word2,
tfidfsum
from src1
) a4 on a3.word2=a4.word2
group by 1,2
)
select
a5.word1,
a5.word2,
dott / norm as コサイン類似度
from src1 a5 left outer join src2 a6
on a5.word1=a6.word1 and a5.word2=a6.word2
where a5.word1<>a5.word2
and a5.word1<a5.word2
and コサイン類似度>=0.5
order by 3 desc
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment