# ratsgo/analysis.R

## analysis.R
library(stringr)

# loading
# DTM: document-term matrix restored from an RDS file in the working directory.
# vec: space-separated embedding text file. The first line is skipped; the
# script later uses column 1 as the token and the remaining columns as the
# vector components.
DTM <- readRDS('dtm.rds')
# `header = FALSE` is spelled out because `F` is an ordinary variable that can
# be reassigned. `stringsAsFactors = FALSE` keeps the token column as character
# on R versions older than 4.0 as well.
vec <- read.csv(
  'word2vec.txt',
  fileEncoding = 'utf-8',
  sep = " ",
  header = FALSE,
  skip = 1,
  stringsAsFactors = FALSE
)

# distance matrix
# Pairwise distances between the embedding rows (dist() uses the Euclidean
# metric by default), expanded to a full square matrix. Both axes are labelled
# with the tokens from the first column of vec so rows and columns can be
# looked up by word.
distance <- as.matrix(dist(vec[, -1]))
dimnames(distance) <- list(vec[, 1], vec[, 1])

# weight matrix
# Feature keywords; each one found in the vocabulary becomes one row of weight.
func <- c('디자인','화면','음질','스펙','카메라','소프트웨어','배터리')
# A keyword absent from the embedding vocabulary would otherwise vanish without
# any notice, silently removing a score column downstream.
if (!all(func %in% rownames(distance))) {
  warning(
    "feature keywords missing from the embedding vocabulary: ",
    paste(setdiff(func, rownames(distance)), collapse = ", "),
    call. = FALSE
  )
}
# which() returns ascending positions, so the rows of weight follow the
# vocabulary order of distance, not the order in which func is written.
funclocation <- which(rownames(distance) %in% func)
# drop = FALSE keeps weight a matrix even when exactly one keyword matches;
# without it the single row collapses to a plain vector, colnames(weight) is
# NULL and the column reordering below fails.
weight <- distance[funclocation, , drop = FALSE]
# Turn distances into similarities: 1 at distance 0, decaying towards 0 as the
# distance grows. The constant 100 controls how fast the decay is.
weight <- exp(-weight^2 / 100)
# Sort the columns by token. order() on character data follows the current
# locale's collation.
# NOTE(review): DTM's columns are presumably sorted the same way - confirm.
location <- order(colnames(weight))
weight <- weight[, location, drop = FALSE]

# The matrix product below pairs the columns of weight and DTM purely by
# position, so it is only meaningful when both list the same words in the same
# order. The expression prints the positions that disagree; the expected output
# is empty (integer(0)).
which(colnames(weight) != colnames(DTM))
# The printed check is easy to miss (and prints nothing under source()), so a
# mismatch is also raised as a warning instead of silently yielding wrong scores.
if (!identical(colnames(weight), colnames(DTM))) {
  warning(
    "column names of weight and DTM differ; keyword scores will be unreliable",
    call. = FALSE
  )
}

# inner-product
# One row per DTM row and one column per keyword row of weight: each cell is
# the sum over words of (DTM entry) x (similarity of that word to the keyword).
result <- as.matrix(DTM) %*% t(weight)

# Length of each document: the row sums of the document-term matrix.
doc.length <- rowSums(DTM)

# post-processing
# Vectorized over all rows, which also makes an empty result matrix a no-op
# (a `1:dim(result)[1]` loop would iterate over c(1, 0) and fail).
# A missing length is treated as 1 so the row is filtered out below.
doc.length[is.na(doc.length)] <- 1
# Normalize each document's scores by its length. The vector is recycled down
# the columns, so row i is divided by doc.length[i].
result <- result / doc.length
# Reviews with two or fewer words are excluded: their scores are forced to 0
# (this also overwrites the NaN/Inf produced by dividing by a zero length).
result[doc.length < 3, ] <- 0
	library(stringr)

	# loading
	# DTM: document-term matrix restored from an RDS file in the working directory.
	# vec: space-separated embedding text file. The first line is skipped; the
	# script later uses column 1 as the token and the remaining columns as the
	# vector components.
	DTM <- readRDS('dtm.rds')
	# `header = FALSE` is spelled out because `F` is an ordinary variable that can
	# be reassigned. `stringsAsFactors = FALSE` keeps the token column as character
	# on R versions older than 4.0 as well.
	vec <- read.csv(
	  'word2vec.txt',
	  fileEncoding = 'utf-8',
	  sep = " ",
	  header = FALSE,
	  skip = 1,
	  stringsAsFactors = FALSE
	)

	# distance matrix
	# Pairwise distances between the embedding rows (dist() uses the Euclidean
	# metric by default), expanded to a full square matrix. Both axes are labelled
	# with the tokens from the first column of vec so rows and columns can be
	# looked up by word.
	distance <- as.matrix(dist(vec[, -1]))
	dimnames(distance) <- list(vec[, 1], vec[, 1])

	# weight matrix
	# Feature keywords; each one found in the vocabulary becomes one row of weight.
	func <- c('디자인','화면','음질','스펙','카메라','소프트웨어','배터리')
	# A keyword absent from the embedding vocabulary would otherwise vanish without
	# any notice, silently removing a score column downstream.
	if (!all(func %in% rownames(distance))) {
	  warning(
	    "feature keywords missing from the embedding vocabulary: ",
	    paste(setdiff(func, rownames(distance)), collapse = ", "),
	    call. = FALSE
	  )
	}
	# which() returns ascending positions, so the rows of weight follow the
	# vocabulary order of distance, not the order in which func is written.
	funclocation <- which(rownames(distance) %in% func)
	# drop = FALSE keeps weight a matrix even when exactly one keyword matches;
	# without it the single row collapses to a plain vector, colnames(weight) is
	# NULL and the column reordering below fails.
	weight <- distance[funclocation, , drop = FALSE]
	# Turn distances into similarities: 1 at distance 0, decaying towards 0 as the
	# distance grows. The constant 100 controls how fast the decay is.
	weight <- exp(-weight^2 / 100)
	# Sort the columns by token. order() on character data follows the current
	# locale's collation.
	# NOTE(review): DTM's columns are presumably sorted the same way - confirm.
	location <- order(colnames(weight))
	weight <- weight[, location, drop = FALSE]

	# The matrix product below pairs the columns of weight and DTM purely by
	# position, so it is only meaningful when both list the same words in the same
	# order. The expression prints the positions that disagree; the expected output
	# is empty (integer(0)).
	which(colnames(weight) != colnames(DTM))
	# The printed check is easy to miss (and prints nothing under source()), so a
	# mismatch is also raised as a warning instead of silently yielding wrong scores.
	if (!identical(colnames(weight), colnames(DTM))) {
	  warning(
	    "column names of weight and DTM differ; keyword scores will be unreliable",
	    call. = FALSE
	  )
	}

	# inner-product
	# One row per DTM row and one column per keyword row of weight: each cell is
	# the sum over words of (DTM entry) x (similarity of that word to the keyword).
	result <- as.matrix(DTM) %*% t(weight)

	# Length of each document: the row sums of the document-term matrix.
	doc.length <- rowSums(DTM)

	# post-processing
	# Vectorized over all rows, which also makes an empty result matrix a no-op
	# (a `1:dim(result)[1]` loop would iterate over c(1, 0) and fail).
	# A missing length is treated as 1 so the row is filtered out below.
	doc.length[is.na(doc.length)] <- 1
	# Normalize each document's scores by its length. The vector is recycled down
	# the columns, so row i is divided by doc.length[i].
	result <- result / doc.length
	# Reviews with two or fewer words are excluded: their scores are forced to 0
	# (this also overwrites the NaN/Inf produced by dividing by a zero length).
	result[doc.length < 3, ] <- 0