Skip to content

Instantly share code, notes, and snippets.

@wmichi wmichi/cosineSim.R
Last active Jul 27, 2019

Embed
What would you like to do?
Extract top N similar items from embeddings
# Load the two embedding output files (space-separated, read without a header).
df_first <- read.csv(
  "tmp/emb_first",
  sep = " ",
  header = FALSE,
  stringsAsFactors = FALSE
)
df_second <- read.csv(
  "tmp/emb_second",
  sep = " ",
  header = FALSE,
  stringsAsFactors = FALSE
)

# Item names: column 1, skipping row 1 (presumably a header/count line in the
# embedding file -- confirm). Per the original note, both files list the items
# in the same order, so the names are taken from the first file only.
names <- df_first[[1]][seq(2, nrow(df_first))]

# Embedding values: columns 2..33 (32 dimensions) of every row after the first,
# reshaped into an (items x 32) matrix for each file.
emb_first <- matrix(
  unlist(df_first[seq(2, nrow(df_first)), 2:33]),
  ncol = 32,
  nrow = nrow(df_first) - 1
)
emb_second <- matrix(
  unlist(df_second[seq(2, nrow(df_second)), 2:33]),
  ncol = 32,
  nrow = nrow(df_second) - 1
)

# Concatenate the two embeddings side by side (items x 64).
embs <- cbind(emb_first, emb_second)
# Cosine similarity between two numeric vectors.
#
# param:  x: numeric vector
#         y: numeric vector of the same length as x
# return: sum(x * y) / (|x| * |y|) as a single numeric value.
#         The result is NaN when either vector has zero norm (0 / 0).
cosineSim <- function(x, y) {
  # Guard against silent recycling of vectors with different lengths.
  stopifnot(length(x) == length(y))
  # Inner products are computed with base-R sum(x * y); the previous dot()
  # helper is not part of base R and no package providing it is loaded here.
  norm_x <- sqrt(sum(x * x))
  norm_y <- sqrt(sum(y * y))
  # Return the value visibly (the last expression is not an assignment).
  sum(x * y) / (norm_x * norm_y)
}
# Print the target item's name and return its top-k most similar items,
# ranked by cosine similarity.
#
# param:  emb_matrix: embedding matrix, one item per row (numeric matrix)
#         name_list:  item names, indexed like the rows of emb_matrix
#         target_idx: row index of the item whose neighbours are wanted
#                     (single integer in 1..nrow(emb_matrix))
#         k:          maximum number of similar items to return
#                     (single non-negative integer)
# return: matrix with one row per similar item (best match first) and three
#         columns: row index, item name, cosine similarity. The columns are
#         coerced to a common type, so with character names every cell is a
#         string. The target item itself is never included, so at most
#         nrow(emb_matrix) - 1 rows are returned even when k is larger.
coSimTopk <- function(emb_matrix, name_list, target_idx, k) {
  stopifnot(is.matrix(emb_matrix))
  n_items <- nrow(emb_matrix)
  stopifnot(
    length(target_idx) == 1, target_idx >= 1, target_idx <= n_items,
    length(k) == 1, k >= 0
  )

  # Cosine similarity of every row against the target row in one pass:
  # (row . target) / (|row| * |target|). Zero-norm rows yield NaN.
  target_vec <- emb_matrix[target_idx, ]
  row_norms <- sqrt(rowSums(emb_matrix^2))
  scores <- drop(emb_matrix %*% target_vec) /
    (row_norms * row_norms[target_idx])

  # Exclude the target from the candidates instead of forcing its score to 0:
  # a score of 0 would still outrank every item with negative similarity.
  candidates <- seq_len(n_items)[-target_idx]
  # order() places NaN scores last, so undefined similarities never lead.
  ranked <- candidates[order(scores[candidates], decreasing = TRUE)]
  top_idxs <- head(ranked, k)

  # c() coerces index, name and score to one common type; matrix() then lays
  # them out as three unnamed columns (and gives a 0 x 3 matrix when empty).
  result <- matrix(
    c(top_idxs, name_list[top_idxs], scores[top_idxs]),
    ncol = 3
  )

  print(name_list[target_idx])
  result
}
# Example run: show the 10 items most similar to the item in row 2.
coSimTopk(emb_matrix = embs, name_list = names, target_idx = 2, k = 10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.