Skip to content

Instantly share code, notes, and snippets.

@yamano357
Created October 26, 2015 03:23
Show Gist options
  • Save yamano357/8a31b2dc0c7a20a30d36 to your computer and use it in GitHub Desktop.
Save yamano357/8a31b2dc0c7a20a30d36 to your computer and use it in GitHub Desktop.
library(hadleyverse)
library(PythonInR)
# http://textminingonline.com/getting-started-with-word2vec-and-glove-in-python
# https://github.com/maciejkula/glove-python
# git clone https://github.com/maciejkula/glove-python.git
# $ sudo python setup.py develop
# $ sudo python setup.py install
# PythonInR::pyIsConnected()
# PythonInR::pyExit()
# PythonInR::pyConnect()
# Push every element of a named list into the connected Python session.
#
# Each element is sent with PythonInR::pySet(), using the lower-cased element
# name as the Python-side key. Integer values are passed through as.integer();
# every other value is coerced with as.numeric().
#
# Args:
#   param_list: named list of values to define in Python. An empty list is a
#     no-op.
#
# Returns:
#   The sapply()-collected results of the individual pySet() calls (an empty
#   list() when param_list is empty).
defPyConst <- function (
  param_list
) {
  sapply(
    # seq_along() gives an empty sequence for an empty list, whereas
    # seq(from = 1, to = 0) would iterate over c(1, 0) and index out of range.
    X = seq_along(param_list),
    FUN = function (i) {
      value <- param_list[[i]]
      # Scalar test choosing between two functions: plain if/else, not ifelse().
      cast_fun <- if (is.integer(x = value)) as.integer else as.numeric
      PythonInR::pySet(
        key = stringr::str_to_lower(string = names(param_list)[i]),
        value = cast_fun(value)
      )
    }
  )
}
# Apply defPyConst() to each parameter list contained in param_list_vec.
#
# Args:
#   param_list_vec: list whose elements are each handed to defPyConst() as its
#     param_list argument.
#
# Returns:
#   The sapply()-collected results of the defPyConst() calls.
callPyConst <- function (
  param_list_vec
) {
  sapply(
    X = param_list_vec,
    FUN = function (single_param_list) {
      defPyConst(param_list = single_param_list)
    }
  )
}
# Import the Glove and Corpus names from the Python "glove" module.
PythonInR::pyImport(import = "Glove", from = "glove")
PythonInR::pyImport(import = "Corpus", from = "glove")

# Corpus settings: the input file, plus the window size that is later handed
# to corpus.fit() as the Python variable `window_size`.
SET_CORPUS <- list(
  FILE_NAME = "ptb.train.txt",
  CORPUS_PARAM = list(WINDOW_SIZE = 10L)
)

# GloVe settings: MODEL values feed the Glove() constructor, TRAIN values feed
# glove_model.fit().
SET_GLOVE_PARAM <- list(
  MODEL = list(NO_COMPONENTS = 100L, LEARNING_RATE = 0.05),
  TRAIN = list(EPOCHS = 30L, NO_THREADS = 1L)
)

# Define every setting above as a lower-cased Python variable.
callPyConst(
  param_list_vec = list(
    SET_CORPUS[["CORPUS_PARAM"]],
    SET_GLOVE_PARAM[["MODEL"]],
    SET_GLOVE_PARAM[["TRAIN"]]
  )
)
# Read the corpus file line by line and split each line into tokens, giving a
# list with one character vector per line.
# NOTE(review): splitting on a single whitespace character presumably leaves
# empty-string tokens for leading, trailing or repeated whitespace; confirm
# that this is intended before the tokens reach the Python side.
sentences <- local({
  corpus_lines <- readr::read_lines(file = SET_CORPUS$FILE_NAME, n_max = -1)
  stringr::str_split(string = corpus_lines, pattern = "[:space:]", n = Inf)
})

# Expose the tokenised corpus to Python under the name `sentences`.
PythonInR::pySet(key = "sentences", value = sentences)
# Python snippet: create a Corpus object and fit it on the `sentences`
# variable, using the `window_size` variable as the window argument. Both
# variables are defined in the Python session earlier in this script.
create_corpus <- '
corpus = Corpus()
corpus.fit(corpus = sentences, window = window_size)
'
# Run the snippet in the connected Python session.
PythonInR::pyExec(code = create_corpus)
# Debugging aids (disabled): print the dictionary size / the dictionary itself.
# PythonInR::pyPrint(objName = 'len(corpus.dictionary)')
# PythonInR::pyPrint(objName = 'corpus.dictionary')
# Python snippet: create a Glove model from the `no_components` and
# `learning_rate` variables, fit it on corpus.matrix using `epochs` and
# `no_threads`, then attach corpus.dictionary to the model. It relies on the
# `corpus` object created by the previous snippet, so the order matters.
train_glove <- '
glove_model = Glove(no_components = no_components, learning_rate = learning_rate)
glove_model.fit(matrix = corpus.matrix, epochs = epochs, no_threads = no_threads, verbose = False)
glove_model.add_dictionary(corpus.dictionary)
'
# Run the training snippet in the connected Python session.
PythonInR::pyExec(code = train_glove)
# Fetch the model's word_vectors attribute from the Python session.
word_vectors <- PythonInR::pyGet(key = "glove_model.word_vectors")

# Fetch the corpus dictionary, ordered by its values.
# NOTE(review): the + 1 presumably shifts Python's 0-based ids to R's 1-based
# indexing; confirm against the glove Corpus documentation.
words <- 1 + sort(x = PythonInR::pyGet(key = "corpus.dictionary"))

# Label each row of word_vectors with the dictionary names, in sorted order.
rownames(word_vectors) <- names(words)

# Query the 10 most similar entries to "man" and row-bind the returned
# elements into a single object.
word_similar <- do.call(
  what = rbind,
  args = PythonInR::pyGet(
    key = 'glove_model.most_similar(word = "man", number = 10)'
  )
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment