Skip to content

Instantly share code, notes, and snippets.

@ratsgo
Last active November 26, 2017 17:14
Show Gist options
  • Save ratsgo/75f40be1cc3efb76c3ea206146eae959 to your computer and use it in GitHub Desktop.
Save ratsgo/75f40be1cc3efb76c3ea206146eae959 to your computer and use it in GitHub Desktop.
Word2Vec 분석
library(stringr)
# Load the two inputs used by the rest of the script.
# `dtm.rds` is a serialized document-term matrix; further down it is accessed
# through colnames(), as.matrix() and rowSums().
DTM <- readRDS("dtm.rds")
# `word2vec.txt` is read as a space-separated table with no header row. The
# first line of the file is skipped (presumably the "<vocab size> <dim>" header
# of the word2vec text format -- confirm against the file). Column 1 is later
# used as the word label, the remaining columns as vector components.
# NOTE(review): read.csv() keeps double-quote processing on by default; if the
# vocabulary can contain a literal `"` token, consider adding quote = "".
vec <- read.csv(
  "word2vec.txt",
  fileEncoding = "utf-8",
  sep = " ",
  header = FALSE, # spelled out: `F` is an ordinary variable and can be reassigned
  skip = 1
)
# Pairwise distances between all word vectors, as a square matrix.
# Column 1 of `vec` holds the word labels; every other column is passed to
# dist(), which is called with its default method.
vocab <- vec[, 1]
distance <- as.matrix(dist(vec[, -1]))
# Label both axes with the words so rows and columns can be looked up by name.
dimnames(distance) <- list(vocab, vocab)
# Weight matrix: one row per feature word found in the vocabulary, one column
# per DTM term.
func <- c('디자인','화면','음질','스펙','카메라','소프트웨어','배터리')
funclocation <- which(rownames(distance) %in% func)
# drop = FALSE keeps `weight` a matrix even when exactly one feature word is
# present in the vocabulary (plain `[i, ]` would collapse it to a vector and
# lose the column names used below).
weight <- distance[funclocation, , drop = FALSE]
# Turn distances into similarity weights: 1 at distance 0, decaying toward 0
# as the distance grows.
weight <- exp(-weight^2 / 100)
# The inner product with the DTM is only meaningful when column j of `weight`
# and column j of `DTM` refer to the same word. Align by name instead of
# sorting, so the result does not depend on the DTM happening to be sorted
# under the current locale's collation order.
dtm_terms <- colnames(DTM)
if (is.null(dtm_terms)) {
  stop(
    "DTM has no column names; cannot align it with the word vectors",
    call. = FALSE
  )
}
location <- match(dtm_terms, colnames(weight))
# Fail loudly on a vocabulary mismatch rather than printing indices that are
# easy to overlook (and that are not printed at all under source()).
if (anyNA(location)) {
  stop(
    sum(is.na(location)),
    " DTM term(s) have no word vector, e.g. '",
    dtm_terms[which(is.na(location))[1]],
    "'",
    call. = FALSE
  )
}
weight <- weight[, location, drop = FALSE]
# Per-document length: row sums of the DTM.
doc.length <- rowSums(DTM)
# Inner product of every document row with every feature word's weight row:
# (documents x terms) %*% (terms x feature words).
weight_t <- t(weight)
result <- as.matrix(DTM) %*% weight_t
# Post-processing: normalize each document's scores by its length.
# Vectorized over rows; seq_len() keeps this a no-op when `result` has zero
# rows (1:0 would iterate over c(1, 0) and fail).
doc_rows <- seq_len(nrow(result))
row_length <- doc.length[doc_rows]
# A missing length is treated as 1, which then falls under the "too short"
# rule below. Write the replacement back so `doc.length` reflects it as well.
row_length[is.na(row_length)] <- 1
doc.length[doc_rows] <- row_length
# Reviews with 2 or fewer words are excluded entirely: their scores become 0.
too_short <- row_length < 3
result[too_short, ] <- 0
# Every other review has its scores divided by its word count. The divisor has
# one entry per selected row, so it lines up with the rows of the
# column-major matrix.
result[!too_short, ] <-
  result[!too_short, , drop = FALSE] / row_length[!too_short]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment