Last active
April 27, 2018 18:17
-
-
Save jnolis/e23f00d752b47671c9f0feccd333e1d4 to your computer and use it in GitHub Desktop.
A better generator for using Keras in R for word2vec
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The example code provided by RStudio for using R for word2vec raises an error
# if you don't have enough data. This fixes the error by resetting the generator
# whenever it runs out of text.
# Build a batch generator of skip-gram training pairs that never runs dry.
#
# Arguments:
#   text             Vector of texts; it is shuffled with sample() every time
#                    the underlying sequence generator is (re)built.
#   tokenizer        Tokenizer handed to texts_to_sequences_generator(); its
#                    `num_words` field is used as the vocabulary size.
#   window_size      Forwarded to skipgrams() as `window_size`.
#   negative_samples Forwarded to skipgrams() as `negative_samples`.
#
# Returns a zero-argument function. Each call returns list(x, y), where `x` is
# a list of single-column matrices (one per position of the skip-gram couples)
# and `y` is a single-column matrix of the labels.
skipgrams_generator <- function(text, tokenizer, window_size, negative_samples) {
  new_sequence_generator <- function() {
    texts_to_sequences_generator(tokenizer, sample(text))
  }
  gen <- new_sequence_generator()

  function() {
    next_value <- generator_next(gen)
    if (is.null(next_value)) {
      # NULL is treated as "generator exhausted": rebuild it over a reshuffled
      # copy of the texts. `<<-` is intentional here, it updates the `gen`
      # captured by this closure so the new generator persists across calls.
      gen <<- new_sequence_generator()
      next_value <- generator_next(gen)
    }
    if (is.null(next_value)) {
      # Still nothing right after a reset: there is no text to draw from.
      stop(
        "skipgrams_generator: `text` yielded no sequences, ",
        "even after resetting the generator",
        call. = FALSE
      )
    }

    skip <- skipgrams(
      next_value,
      vocabulary_size = tokenizer$num_words,
      window_size = window_size,
      # Honour the caller's setting (this used to be hard-coded to 1).
      negative_samples = negative_samples
    )

    # transpose() turns the list of couples into one list per couple position;
    # as.matrix() on a plain vector/list already yields a single column.
    x <- map(transpose(skip$couples), function(column) as.matrix(unlist(column)))
    y <- as.matrix(skip$labels)
    list(x, y)
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment