| library(hadleyverse) | |
| library(PythonInR) | |
# Background and setup notes for the Python GloVe dependency (glove-python):
#   Tutorial: http://textminingonline.com/getting-started-with-word2vec-and-glove-in-python
#   Source:   https://github.com/maciejkula/glove-python
#   Install (run inside the cloned repository):
#     git clone https://github.com/maciejkula/glove-python.git
#     $ sudo python setup.py develop
#     $ sudo python setup.py install
# PythonInR session helpers, kept for manual use:
# PythonInR::pyIsConnected()
# PythonInR::pyExit()
# PythonInR::pyConnect()
#' Publish a named list of numeric constants as Python variables.
#'
#' Each element of `param_list` is assigned in the Python session through
#' `PythonInR::pySet()`, using the lower-cased element name as the Python
#' variable name (e.g. `WINDOW_SIZE` becomes `window_size`).
#'
#' @param param_list Named list of values. Integer elements are passed through
#'   `as.integer()`; every other element is coerced with `as.numeric()`.
#' @return The `sapply()`-collected results of the individual `pySet()` calls;
#'   an empty list when `param_list` is empty.
defPyConst <- function(
  param_list
) {
  keys <- names(param_list)
  # A Python variable needs a name: fail early with a clear message instead
  # of handing an empty or missing key to pySet().
  if (length(param_list) > 0L &&
        (is.null(keys) || anyNA(keys) || !all(nzchar(keys)))) {
    stop("every element of `param_list` must be named", call. = FALSE)
  }
  sapply(
    # seq_along() is empty for an empty list, whereas seq(1, 0) counts 1, 0
    # and made the function fail on empty input.
    X = seq_along(param_list),
    FUN = function(i) {
      raw_value <- param_list[[i]]
      # Plain scalar branch: integers stay integers, anything else is sent
      # to Python as a double.
      py_value <- if (is.integer(raw_value)) {
        as.integer(raw_value)
      } else {
        as.numeric(raw_value)
      }
      PythonInR::pySet(
        key = stringr::str_to_lower(string = keys[i]),
        value = py_value
      )
    }
  )
}
#' Publish several parameter groups to Python in one call.
#'
#' @param param_list_vec List whose elements are each a parameter list that is
#'   handed to `defPyConst()` in turn.
#' @return The `sapply()`-collected results of the `defPyConst()` calls.
callPyConst <- function(
  param_list_vec
) {
  sapply(
    param_list_vec,
    function(param_group) defPyConst(param_list = param_group)
  )
}
# Import the two classes used below from the Python `glove` module.
PythonInR::pyImport(import = "Glove", from = "glove")
PythonInR::pyImport(import = "Corpus", from = "glove")

# Corpus settings: the input text file and the parameters for Corpus.fit().
SET_CORPUS <- list(
  FILE_NAME = "ptb.train.txt",
  CORPUS_PARAM = list(WINDOW_SIZE = 10L)
)

# GloVe settings, grouped by the Python call that consumes them:
# MODEL feeds the Glove() constructor, TRAIN feeds glove_model.fit().
SET_GLOVE_PARAM <- list(
  MODEL = list(
    NO_COMPONENTS = 100L,
    LEARNING_RATE = 0.05
  ),
  TRAIN = list(
    EPOCHS = 30L,
    NO_THREADS = 1L
  )
)

# Publish every numeric setting to Python under its lower-cased name
# (window_size, no_components, learning_rate, epochs, no_threads).
callPyConst(
  param_list_vec = list(
    SET_CORPUS$CORPUS_PARAM,
    SET_GLOVE_PARAM$MODEL,
    SET_GLOVE_PARAM$TRAIN
  )
)
# Read the corpus (one sentence per line) and tokenise each line on whitespace.
# Lines are trimmed and split on *runs* of whitespace so that leading,
# trailing or repeated blanks do not produce empty-string tokens, which would
# otherwise enter the vocabulary as a spurious word.
sentences <- stringr::str_split(
  string = stringr::str_trim(
    string = readr::read_lines(file = SET_CORPUS$FILE_NAME, n_max = -1),
    side = "both"
  ),
  pattern = "[:space:]+", n = Inf
)
# A blank line still splits to a single "", so drop empty tokens and then any
# sentence left without tokens.
sentences <- lapply(
  X = sentences,
  FUN = function(tokens) tokens[nzchar(tokens)]
)
sentences <- sentences[lengths(sentences) > 0L]
# Hand the tokenised sentences to Python as the variable `sentences`.
# NOTE(review): confirm how PythonInR converts a one-token sentence
# (length-1 character vector) — it may arrive as a string, not a list.
PythonInR::pySet(key = "sentences", value = sentences)
# Python snippet: create a Corpus and fit it on `sentences`, using the
# `window_size` constant already published to the Python session.
create_corpus <- paste(
  "",
  "corpus = Corpus()",
  "corpus.fit(corpus = sentences, window = window_size)",
  "",
  sep = "\n"
)
PythonInR::pyExec(code = create_corpus)
# Optional checks of the fitted vocabulary (uncomment to print):
# PythonInR::pyPrint(objName = 'len(corpus.dictionary)')
# PythonInR::pyPrint(objName = 'corpus.dictionary')
# Python snippet: construct the Glove model, fit it on the corpus matrix and
# attach the corpus dictionary so the model can be queried by word.
train_glove <- paste(
  "",
  "glove_model = Glove(no_components = no_components, learning_rate = learning_rate)",
  "glove_model.fit(matrix = corpus.matrix, epochs = epochs, no_threads = no_threads, verbose = False)",
  "glove_model.add_dictionary(corpus.dictionary)",
  "",
  sep = "\n"
)
PythonInR::pyExec(code = train_glove)
# Pull the trained word vectors back into R.
word_vectors <- PythonInR::pyGet(key = "glove_model.word_vectors")

# Vocabulary ordered by its dictionary index; the + 1 shifts the indices
# (presumably 0-based on the Python side — TODO confirm) to R's 1-based form.
words <- sort(
  x = PythonInR::pyGet(key = "corpus.dictionary")
) + 1

# Label each row of the embedding matrix with its word.
rownames(word_vectors) <- names(words)

# Ten nearest neighbours of "man", row-bound into one object.
word_similar <- do.call(
  "rbind",
  PythonInR::pyGet(key = 'glove_model.most_similar(word = "man", number = 10)')
)