# ByungSunBae/jaccard_sim_starwars.R

## jaccard_sim_starwars.R
# 스타워즈 시리즈들의 유사도를 등장인물들을 통해 계산하고 시각화 하는 문제
# From : https://github.com/R-Korea/weekly_R_quiz/blob/master/201808/1.movie_similarity/movie_similarity_quiz.Rmd
library(dplyr)
library(data.table)
library(ggplot2)
library(scales)

# NOTE: the former `rm(list = ls())` call was removed. Sourcing this script
# must not wipe the caller's workspace, and every object the script creates is
# assigned below before it is read.

## Keep each character's name and the list column of films they appear in.
raw.data <-
  starwars %>%
  select(name, films)

# Start -----
## Convert to a data.table.
raw.data.dt <- as.data.table(raw.data)
## Vector of distinct film titles.
film_unique <- raw.data.dt[, unlist(films) %>% unique()]

## Appearance flags: one row per film, one column per character.
tmp <- raw.data.dt[, lapply(films, function(x) film_unique %in% x)] %>% as.matrix()
## Label the columns with the character names.
colnames(tmp) <- raw.data.dt[, name]
## Cast of each film as a list of character vectors (one element per film).
## lapply() always yields a list; apply(tmp, 1, ...) would simplify to a
## matrix whenever every film happened to have the same number of characters.
name_by_film <- lapply(
  seq_len(nrow(tmp)),
  function(i) colnames(tmp)[tmp[i, ]]
)
## One row per film, with its cast stored in the list column `name`.
rec_dt <- data.table(films = film_unique, name = name_by_film)

## Cross join of the film titles with themselves, followed by two merges that
## attach a cast list to each side of the pair.
cross_names <- CJ(rec_dt[, films], rec_dt[, films])
fnl.data <- merge(cross_names, rec_dt, by.x = "V1", by.y = "films", all = TRUE)
fnl.data <- merge(fnl.data, rec_dt, by.x = "V2", by.y = "films", all = TRUE)
setnames(fnl.data, 1:4, c("films_1", "films_2", "name_1", "name_2"))

## Jaccard similarity |A n B| / |A u B| between two collections.
##
## x, y : atomic vectors or lists; each is flattened with unlist() and
##        de-duplicated, so a repeated member is counted only once.
## Returns a single numeric value; NaN when both inputs are empty (0 / 0).
jaccard_sim <- function(x, y) {
  x_ <- unique(unlist(x))
  y_ <- unique(unlist(y))
  # Size of the intersection; the union size is |A| + |B| - |A n B|.
  n_common <- sum(x_ %in% y_)
  n_common / (length(x_) + length(y_) - n_common)
}

## Jaccard similarity of every film pair, stored in the jac_sim column.
fnl.data[, jac_sim := jaccard_sim(name_1, name_2), by = c("films_1", "films_2")]

## Keep one triangle (diagonal included) of the film-by-film similarity matrix.
## The mask is lower.tri() applied to jac_sim laid out column-wise, so it
## relies on the current row order of fnl.data. The matrix side is derived
## from the data instead of being hard-coded to 7.
n_films <- length(unique(fnl.data[["films_1"]]))
sim_matrix <- matrix(fnl.data[["jac_sim"]], nrow = n_films, byrow = FALSE)
keep_pair <- as.vector(lower.tri(sim_matrix, diag = TRUE))
fnl.data.v2 <- fnl.data[keep_pair, ]

## Diagonal cells show a head count instead of a similarity: the number of
## distinct characters in that film. This has to run while name_1 / name_2
## still exist, i.e. before the columns are dropped below.
fnl.data.v2[
  films_1 == films_2,
  counts := c(unlist(name_1), unlist(name_2)) %>% unique() %>% length(),
  by = c("films_1", "films_2")
]

## The cast list columns are no longer needed in either table.
fnl.data[, c("name_1", "name_2") := NULL]
fnl.data.v2[, c("name_1", "name_2") := NULL]

## Visualise: tiles coloured by similarity, labelled with the head count on the
## diagonal and the similarity as a percentage elsewhere.
ggplot(fnl.data.v2) +
  geom_tile(aes(x = films_1, y = films_2, fill = jac_sim)) +
  geom_text(
    aes(
      x = films_1,
      y = films_2,
      label = ifelse(
        films_1 == films_2,
        paste0(counts, "명"),
        paste0(sprintf("%0.1f", round(jac_sim, digits = 4) * 100), "%")
      )
    ),
    fontface = "bold"
  ) +
  xlab(NULL) +
  ylab(NULL) +
  theme(panel.background = NULL, rect = NULL, legend.position = "none", title = element_text(face = "bold")) +
  scale_fill_gradient(low = "gray90", high = "red") +
  ggtitle(label = "영화별 유사도", subtitle = "Jaccard similarity")
	# 스타워즈 시리즈들의 유사도를 등장인물들을 통해 계산하고 시각화 하는 문제
	# From : https://github.com/R-Korea/weekly_R_quiz/blob/master/201808/1.movie_similarity/movie_similarity_quiz.Rmd
	library(dplyr)
	library(data.table)
	library(ggplot2)
	library(scales)

	# NOTE: the former `rm(list = ls())` call was removed. Sourcing this script
	# must not wipe the caller's workspace, and every object the script creates is
	# assigned below before it is read.

	## Keep each character's name and the list column of films they appear in.
	raw.data <-
	  starwars %>%
	  select(name, films)

	# Start -----
	## Convert to a data.table.
	raw.data.dt <- as.data.table(raw.data)
	## Vector of distinct film titles.
	film_unique <- raw.data.dt[, unlist(films) %>% unique()]

	## Appearance flags: one row per film, one column per character.
	tmp <- raw.data.dt[, lapply(films, function(x) film_unique %in% x)] %>% as.matrix()
	## Label the columns with the character names.
	colnames(tmp) <- raw.data.dt[, name]
	## Cast of each film as a list of character vectors (one element per film).
	## lapply() always yields a list; apply(tmp, 1, ...) would simplify to a
	## matrix whenever every film happened to have the same number of characters.
	name_by_film <- lapply(
	  seq_len(nrow(tmp)),
	  function(i) colnames(tmp)[tmp[i, ]]
	)
	## One row per film, with its cast stored in the list column `name`.
	rec_dt <- data.table(films = film_unique, name = name_by_film)

	## Cross join of the film titles with themselves, followed by two merges that
	## attach a cast list to each side of the pair.
	cross_names <- CJ(rec_dt[, films], rec_dt[, films])
	fnl.data <- merge(cross_names, rec_dt, by.x = "V1", by.y = "films", all = TRUE)
	fnl.data <- merge(fnl.data, rec_dt, by.x = "V2", by.y = "films", all = TRUE)
	setnames(fnl.data, 1:4, c("films_1", "films_2", "name_1", "name_2"))

	## Jaccard similarity |A n B| / |A u B| between two collections.
	##
	## x, y : atomic vectors or lists; each is flattened with unlist() and
	##        de-duplicated, so a repeated member is counted only once.
	## Returns a single numeric value; NaN when both inputs are empty (0 / 0).
	jaccard_sim <- function(x, y) {
	  x_ <- unique(unlist(x))
	  y_ <- unique(unlist(y))
	  # Size of the intersection; the union size is |A| + |B| - |A n B|.
	  n_common <- sum(x_ %in% y_)
	  n_common / (length(x_) + length(y_) - n_common)
	}

	## Jaccard similarity of every film pair, stored in the jac_sim column.
	fnl.data[, jac_sim := jaccard_sim(name_1, name_2), by = c("films_1", "films_2")]

	## Keep one triangle (diagonal included) of the film-by-film similarity matrix.
	## The mask is lower.tri() applied to jac_sim laid out column-wise, so it
	## relies on the current row order of fnl.data. The matrix side is derived
	## from the data instead of being hard-coded to 7.
	n_films <- length(unique(fnl.data[["films_1"]]))
	sim_matrix <- matrix(fnl.data[["jac_sim"]], nrow = n_films, byrow = FALSE)
	keep_pair <- as.vector(lower.tri(sim_matrix, diag = TRUE))
	fnl.data.v2 <- fnl.data[keep_pair, ]

	## Diagonal cells show a head count instead of a similarity: the number of
	## distinct characters in that film. This has to run while name_1 / name_2
	## still exist, i.e. before the columns are dropped below.
	fnl.data.v2[
	  films_1 == films_2,
	  counts := c(unlist(name_1), unlist(name_2)) %>% unique() %>% length(),
	  by = c("films_1", "films_2")
	]

	## The cast list columns are no longer needed in either table.
	fnl.data[, c("name_1", "name_2") := NULL]
	fnl.data.v2[, c("name_1", "name_2") := NULL]

	## Visualise: tiles coloured by similarity, labelled with the head count on the
	## diagonal and the similarity as a percentage elsewhere.
	ggplot(fnl.data.v2) +
	  geom_tile(aes(x = films_1, y = films_2, fill = jac_sim)) +
	  geom_text(
	    aes(
	      x = films_1,
	      y = films_2,
	      label = ifelse(
	        films_1 == films_2,
	        paste0(counts, "명"),
	        paste0(sprintf("%0.1f", round(jac_sim, digits = 4) * 100), "%")
	      )
	    ),
	    fontface = "bold"
	  ) +
	  xlab(NULL) +
	  ylab(NULL) +
	  theme(panel.background = NULL, rect = NULL, legend.position = "none", title = element_text(face = "bold")) +
	  scale_fill_gradient(low = "gray90", high = "red") +
	  ggtitle(label = "영화별 유사도", subtitle = "Jaccard similarity")