wmichi/cosineSim.R

## cosineSim.R
# Load the two embedding output files (space-delimited text, read without a
# header line).
df_first <- read.csv("tmp/emb_first", sep = " ", header = FALSE,
                     stringsAsFactors = FALSE)
df_second <- read.csv("tmp/emb_second", sep = " ", header = FALSE,
                      stringsAsFactors = FALSE)
# Item names: column 1 of the first file, skipping row 1. Per the original
# author's note, both files list the items in the same order.
names <- df_first[[1]][2:nrow(df_first)]
# Take columns 2-33 (32 values per item) of each file, skipping row 1, and
# place the two 32-column matrices side by side.
emb_first <- matrix(unlist(df_first[2:nrow(df_first), 2:33]),
                    ncol = 32, nrow = nrow(df_first) - 1)
emb_second <- matrix(unlist(df_second[2:nrow(df_second), 2:33]),
                     ncol = 32, nrow = nrow(df_second) - 1)
embs <- cbind(emb_first, emb_second)

# Cosine similarity between two numeric vectors.
# param: x: numeric vector
# y: numeric vector of the same length as x
# return: sum(x * y) / (|x| * |y|); NaN when either vector has zero norm
cosineSim <- function(x, y) {
	# Guard against silent recycling of a shorter vector.
	stopifnot(length(x) == length(y))

	# Plain base-R dot products: `dot()` is not a base R function and this
	# script never loads a package that provides it.
	norm_x <- sqrt(sum(x * x))
	norm_y <- sqrt(sum(y * y))

	# Return the value visibly (a trailing assignment would return it invisibly).
	sum(x * y) / (norm_x * norm_y)
}

# Print the target item's name and return its top-k most similar items by
# cosine similarity.
# param: emb_matrix: numeric embedding matrix, one row per item
# name_list: item (node) names, indexed by row number of emb_matrix
# target_idx: row index of the item whose neighbours are wanted (integer)
# k: number of top items to return (integer)
# return: matrix with three columns (row index, name, cosine similarity) and
#   one row per returned item, best match first. A matrix holds a single type,
#   so all columns become character when name_list is character. The target
#   itself is never returned, so there are at most nrow(emb_matrix) - 1 rows.
coSimTopk <- function(emb_matrix, name_list, target_idx, k) {
	stopifnot(
		is.matrix(emb_matrix),
		length(target_idx) == 1, target_idx >= 1, target_idx <= nrow(emb_matrix),
		length(k) == 1, k >= 0
	)

	# Cosine similarity of every row against the target row, computed with
	# matrix operations instead of a per-row loop.
	x <- emb_matrix[target_idx, ]
	row_norms <- sqrt(rowSums(emb_matrix^2))
	scores <- as.vector(emb_matrix %*% x) / (row_norms * row_norms[target_idx])

	# Remove the target from the candidates outright. Overwriting its score
	# with 0 would still let it appear whenever the other similarities are
	# <= 0 or k is at least the number of items.
	candidates <- setdiff(seq_len(nrow(emb_matrix)), target_idx)
	# NaN scores (zero-norm rows) are sorted last by order().
	ranked <- candidates[order(scores[candidates], decreasing = TRUE)]
	top_idxs <- head(ranked, k)

	# Column-wise fill: indices, then names, then scores.
	result <- matrix(
		c(top_idxs, name_list[top_idxs], scores[top_idxs]),
		ncol = 3
	)

	print(name_list[target_idx])
	result
}

# Usage example: the 10 items most similar to the item in row 2.
coSimTopk(embs, names, target_idx = 2, k = 10)
	# Load the two embedding output files (space-delimited text, read without a
	# header line).
	df_first <- read.csv("tmp/emb_first", sep = " ", header = FALSE,
	                     stringsAsFactors = FALSE)
	df_second <- read.csv("tmp/emb_second", sep = " ", header = FALSE,
	                      stringsAsFactors = FALSE)
	# Item names: column 1 of the first file, skipping row 1. Per the original
	# author's note, both files list the items in the same order.
	names <- df_first[[1]][2:nrow(df_first)]
	# Take columns 2-33 (32 values per item) of each file, skipping row 1, and
	# place the two 32-column matrices side by side.
	emb_first <- matrix(unlist(df_first[2:nrow(df_first), 2:33]),
	                    ncol = 32, nrow = nrow(df_first) - 1)
	emb_second <- matrix(unlist(df_second[2:nrow(df_second), 2:33]),
	                     ncol = 32, nrow = nrow(df_second) - 1)
	embs <- cbind(emb_first, emb_second)

	# Cosine similarity between two numeric vectors.
	# param: x: numeric vector
	# y: numeric vector of the same length as x
	# return: sum(x * y) / (|x| * |y|); NaN when either vector has zero norm
	cosineSim <- function(x, y) {
		# Guard against silent recycling of a shorter vector.
		stopifnot(length(x) == length(y))

		# Plain base-R dot products: `dot()` is not a base R function and this
		# script never loads a package that provides it.
		norm_x <- sqrt(sum(x * x))
		norm_y <- sqrt(sum(y * y))

		# Return the value visibly (a trailing assignment would return it invisibly).
		sum(x * y) / (norm_x * norm_y)
	}

	# Print the target item's name and return its top-k most similar items by
	# cosine similarity.
	# param: emb_matrix: numeric embedding matrix, one row per item
	# name_list: item (node) names, indexed by row number of emb_matrix
	# target_idx: row index of the item whose neighbours are wanted (integer)
	# k: number of top items to return (integer)
	# return: matrix with three columns (row index, name, cosine similarity) and
	#   one row per returned item, best match first. A matrix holds a single type,
	#   so all columns become character when name_list is character. The target
	#   itself is never returned, so there are at most nrow(emb_matrix) - 1 rows.
	coSimTopk <- function(emb_matrix, name_list, target_idx, k) {
		stopifnot(
			is.matrix(emb_matrix),
			length(target_idx) == 1, target_idx >= 1, target_idx <= nrow(emb_matrix),
			length(k) == 1, k >= 0
		)

		# Cosine similarity of every row against the target row, computed with
		# matrix operations instead of a per-row loop.
		x <- emb_matrix[target_idx, ]
		row_norms <- sqrt(rowSums(emb_matrix^2))
		scores <- as.vector(emb_matrix %*% x) / (row_norms * row_norms[target_idx])

		# Remove the target from the candidates outright. Overwriting its score
		# with 0 would still let it appear whenever the other similarities are
		# <= 0 or k is at least the number of items.
		candidates <- setdiff(seq_len(nrow(emb_matrix)), target_idx)
		# NaN scores (zero-norm rows) are sorted last by order().
		ranked <- candidates[order(scores[candidates], decreasing = TRUE)]
		top_idxs <- head(ranked, k)

		# Column-wise fill: indices, then names, then scores.
		result <- matrix(
			c(top_idxs, name_list[top_idxs], scores[top_idxs]),
			ncol = 3
		)

		print(name_list[target_idx])
		result
	}

	# Usage example: the 10 items most similar to the item in row 2.
	coSimTopk(embs, names, target_idx = 2, k = 10)