Last active
July 27, 2019 06:00
-
-
Save wmichi/6b60b12543bfeb3205cff32d6adc3995 to your computer and use it in GitHub Desktop.
Extract top N similar items from embeddings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the two embedding output files. Fields are space-separated and there is
# no column-name header, so every line is read in as a data row.
df_first <- read.csv("tmp/emb_first", header = FALSE, sep = " ",
                     stringsAsFactors = FALSE)
df_second <- read.csv("tmp/emb_second", header = FALSE, sep = " ",
                      stringsAsFactors = FALSE)

# Item names live in column 1. Both files are expected to list items in the
# same order, so the names are taken from the first file only.
# Row 1 is dropped (presumably a "<count> <dim>" metadata line -- confirm
# against the embedding file format). Negative indexing is used instead of
# `2:nrow(df)` because that sequence counts backwards (2, 1) for a 1-row file.
names <- df_first[-1, 1]

# Columns 2..33 hold the 32 embedding dimensions of each file. Flatten them
# column-wise and rebuild a plain matrix (one row per item), then put the two
# 32-dimensional embeddings side by side to get one 64-dimensional vector
# per item.
emb_first <- matrix(unlist(df_first[-1, 2:33]),
                    nrow = nrow(df_first) - 1, ncol = 32)
emb_second <- matrix(unlist(df_second[-1, 2:33]),
                     nrow = nrow(df_second) - 1, ncol = 32)
embs <- cbind(emb_first, emb_second)
# Cosine similarity between two numeric vectors.
# param: x: first vector   numeric
#        y: second vector  numeric, same length as x
# return: a single numeric value, sum(x * y) / (|x| * |y|).
#         The result is NaN when either vector has zero norm (0 / 0).
cosineSim <- function(x, y) {
  stopifnot(length(x) == length(y))
  # Base R only: the inner product of two vectors is sum(x * y), so no
  # external `dot()` helper is required.
  norm_x <- sqrt(sum(x * x))
  norm_y <- sqrt(sum(y * y))
  # Return the value as the last expression (not an assignment) so the result
  # is visible to the caller and auto-prints at the console.
  sum(x * y) / (norm_x * norm_y)
}
# Return the top-k items most similar to a target item by cosine similarity.
# param: emb_matrix: embedding matrix, one row per item          matrix
#        name_list:  item (node) names, aligned with the rows    character
#        target_idx: row index of the item to find neighbours of integer
#        k:          number of top items to return               integer
# return: a matrix with one row per selected item and 3 columns:
#         row index, item name, cosine similarity. Because a matrix holds a
#         single type, all columns are character when name_list is character.
#         At most nrow(emb_matrix) - 1 rows are returned; the target item
#         itself is never included.
# side effect: prints the name of the target item.
coSimTopk <- function(emb_matrix, name_list, target_idx, k) {
  n_items <- nrow(emb_matrix)
  stopifnot(
    length(target_idx) == 1L, target_idx >= 1, target_idx <= n_items,
    length(k) == 1L, k >= 0
  )

  # Cosine similarity of every row against the target row in one pass:
  # (E %*% x) / (|E_i| * |x|). Rows with zero norm yield NaN, which order()
  # places last.
  target_vec <- emb_matrix[target_idx, ]
  row_norms <- sqrt(rowSums(emb_matrix^2))
  scores <- drop(emb_matrix %*% target_vec) /
    (row_norms * row_norms[target_idx])

  # Exclude the target from the candidates outright. Masking its score with 0
  # would let it back into the ranking whenever the other similarities are
  # negative.
  candidates <- seq_len(n_items)[-target_idx]
  ranked <- candidates[order(scores[candidates], decreasing = TRUE)]
  # head() clamps k to the number of available candidates (and handles k = 0).
  top_idxs <- head(ranked, k)

  result <- matrix(
    c(top_idxs, name_list[top_idxs], scores[top_idxs]),
    nrow = length(top_idxs), ncol = 3L
  )
  print(name_list[target_idx])
  result
}
# Example run: show the 10 items most similar to the item in row 2.
coSimTopk(emb_matrix = embs, name_list = names, target_idx = 2, k = 10)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment