Skip to content

Instantly share code, notes, and snippets.

@jmclawson jmclawson/w2v_utilities.R
Last active Aug 13, 2019

Embed
What would you like to do?
library(tidyverse)
library(tidytext)
library(dplyr)
library(reshape2)
library(wordVectors)
library(ggplot2)
#######################
## Modeling a Corpus
# This process for preparing and modeling the corpus is adapted from Women Writers Project's template_word2vec.Rmd
# These adaptations should allow for for preservation of modeling settings to aid in reproducibility
# After training the model, recall its setting parameters by exploring the object's attributes.
# Example 1: attributes(w2vModel)$window
# Example 2: attributes(w2vModel)$negative_samples
# Example 3: attributes(w2vModel)$vectors
readTextFiles <- function(file, path2file) {
message(file)
rawText = paste(scan(file, sep="\n", what="raw", strip.white = TRUE))
output = tibble(filename=gsub(path2file, "", file), text=rawText) %>%
group_by(filename) %>%
summarise(text = paste(rawText, collapse = " "))
return(output)
}
# If the name of the folder with your corpus differs from your model name, be sure to set source.dir
prep_model <- function(model="w2vModel",
lowercase=TRUE,
bundle_ngrams=1,
source.dir=NULL){
modelInput <- paste0("data/",model,".txt")
modelCleaned <- paste0("data/",model,"_cleaned.txt")
if (is.null(source.dir)) {source.dir=paste0("data/",model)}
if (!file.exists(modelInput)){
fileList <- list.files(source.dir,full.names = TRUE)
combinedTexts <- tibble(filename=fileList) %>%
group_by(filename) %>%
do(readTextFiles(.$filename, source.dir))
combinedTexts$text %>% write_lines(modelInput)
} else {message("'", getwd(), "/",
modelInput,
"' already exists.")}
if (!file.exists(modelCleaned)){
prep_word2vec(origin=modelInput,
destination=modelCleaned,
lowercase=lowercase,
bundle_ngrams=bundle_ngrams)
} else {message("'", getwd(), "/",
modelCleaned,
"' already exists.")}
}
# For later recall, this function saves a metadata_model.Rdata file beside the model.bin in your data folder
train_model <- function(model="w2vModel",
vectors=100,
window=6,
iter=10,
negative_samples=15,
threads=3){
if(!exists(".Random.seed")) set.seed(NULL)
modelSeed <- .Random.seed
modelBin <- paste0("data/", model, ".bin")
modelInput <- paste0("data/", model, ".txt")
modelCleaned <- paste0("data/", model, "_cleaned.txt")
if (!file.exists(modelBin)) {
the_model <- train_word2vec(
modelCleaned,
output_file=modelBin,
vectors=vectors,
threads=threads,
window=window, iter=iter, negative_samples=negative_samples
)
# This metadata gets lost after first run, so store it
attributes(the_model)$vectors <- vectors
attributes(the_model)$window <- window
attributes(the_model)$iter <- iter
attributes(the_model)$negative_samples <- negative_samples
attributes(the_model)$seed <- modelSeed
model_metadata <- c("vectors"=vectors,
"window"=window,
"iter"=iter,
"negative_samples"=negative_samples,
"seed"=modelSeed)
save(model_metadata,file=paste0("data/metadata_",
model,
".Rdata"))
# Save this model in a global object
assign(model, the_model, envir = .GlobalEnv)
} else {
the_model <- read.vectors(modelBin)
meta_filename <- paste0("data/metadata_", model, ".Rdata")
if (file.exists(meta_filename)){
load(file=meta_filename)
attributes(the_model)$vectors <- model_metadata["vectors"]
attributes(the_model)$window <- model_metadata["window"]
attributes(the_model)$iter <- model_metadata["iter"]
attributes(the_model)$negative_samples <- model_metadata["negative_samples"]
attributes(the_model)$seed <- model_metadata["seed"]
}
assign(model, the_model, envir = .GlobalEnv)
}
}
#######################
## Managing Results
# The get_siml() function returns distances between one word 'x' and a vector of words 'y' for model 'wem'
# Example: get_siml(w2vModel, "salty", c("food", "ocean", "attitude", "air"))
get_siml <- function(wem, x, y){
sapply(y, function(z) {
cosineSimilarity(wem[[x]],
wem[[z]]) %>%
round(9)
})
}
# The make_siml_matrix() function returns a matrix of distances between two vectors of words 'x' and 'y' in model 'wem'
# Example: make_siml_matrix(w2vModel, c("salty", "sweet", "fresh"), c("food", "ocean", "attitude", "air"))
make_siml_matrix <- function(wem, x, y){
dis_col <- read.table(text="",
colClasses="double",
col.names = y)
for (each in x){
dis_col[each,] <- wem %>%
get_siml(each, y) %>%
data.frame() %>%
t()
}
as.matrix(dis_col) %>% t()
}
# The scale_matrix() function scales values in a matrix, amplifying the signal in each row and column
# Example 1: scale_matrix(my_matrix)
# Example 2: make_siml_matrix(w2vModel, my_adj, my_nouns) %>% scale_matrix()
scale_matrix <- function(x, diagonal=TRUE){
if (!diagonal){x[x==1] <- NA}
# scale each column 0 to 1
scale_cols <- x %>%
apply(1, function(x) {
sapply(x, function(y) {
suppressWarnings((y-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE)))
})
}) %>%
as.matrix()
# scale each row 0 to 1
scale_rows <- x %>%
apply(2, function(x) {
sapply(x, function(y) {
suppressWarnings(y-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))
})
}) %>%
as.matrix() %>%
t()
# add these matrices together
scale_join <- scale_cols + scale_rows
return(t(scale_join))
}
#######################
## Mapping Proximities
# For cosine_heatmap(), set x and y to words you'd like to compare, distance-wise, along x and y axes.
# For instance, x=c("salty","sweet","fresh"), y=c("food","ocean","air") or x=my_flavors, etc.
# Try setting 'labeled' to "title" or "simple"; toggle 'values' to TRUE or FALSE
# Example 1: cosine_heatmap(w2vModel, my_adj, my_nouns)
# Example 2: cosine_heatmap(w2vModel, my_words, my_words)
cosine_heatmap <- function(wem, x, y,
labeled="simple",
round=2,
legend=TRUE,
values=TRUE,
redundant=TRUE){
if (!identical(x, y)){
if (!redundant){
redundant <- TRUE
cat("Values for 'x' and 'y' don't match. Setting 'redundant' to TRUE.")
}
}
the_matrix <- make_siml_matrix(wem, x, y)
if (!redundant){
the_matrix[upper.tri(the_matrix)] <- NA
the_matrix <- the_matrix %>% melt(na.rm = TRUE)
} else {
the_matrix <- the_matrix %>% melt()
}
the_plot <- the_matrix %>%
ggplot(aes(x=Var2, y=reorder(Var1, desc(Var1)), fill=value)) +
geom_tile(color="white") +
scale_fill_gradient2(low = "blue",
high = "red",
mid = "white",
midpoint = 0,
limit = c(-1,1),
name="Similarity") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45,
vjust = 1,
size = 12,
hjust = 1),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank())
if(values){
the_plot <- the_plot +
geom_text(aes(label=round(value,round)), color="black")
legend <- FALSE
}
if(labeled=="title"){
the_plot <- the_plot +
labs(title=paste0("Comparing '",
deparse(substitute(y)),
"' by '",
deparse(substitute(x)),
"' in ",
deparse(substitute(wem))),
x=element_blank(),
y=element_blank())
} else if (labeled=="simple") {
the_plot <- the_plot +
labs(title=deparse(substitute(wem)),
x=element_blank(),
y=element_blank())
} else {
the_plot <- the_plot +
labs(title=deparse(substitute(wem)),
x=deparse(substitute(x)),
y=deparse(substitute(y)))
}
if (!legend) {
the_plot <- the_plot +
guides(fill=FALSE)
}
the_plot
}
# The amplified_heatmap() function merely amplifies signals it finds; it doesn't validate these signals.
# Try setting 'labeled' to "title" or "simple".
# Example 1: amplified_heatmap(w2vModel, my_adj, my_nouns)
# Example 2: amplified_heatmap(w2vModel, my_words, my_words)
amplified_heatmap <- function(wem, x, y,
labeled="simple",
legend=TRUE,
diagonal=TRUE,
redundant=TRUE){
if (!identical(x, y)){
if (!redundant){
redundant <- TRUE
cat("Values for 'x' and 'y' don't match. Setting 'redundant' to TRUE.")
}
}
if (!redundant){diagonal <- TRUE}
the_matrix <- make_siml_matrix(wem, x, y) %>%
scale_matrix(diagonal)
if (!redundant){
the_matrix[upper.tri(the_matrix)] <- NA
the_matrix <- the_matrix %>% melt(na.rm = TRUE)
} else {
the_matrix <- the_matrix %>% melt()
}
the_plot <- the_matrix %>%
ggplot(aes(x=Var2, y=reorder(Var1, desc(Var1)), fill=value)) +
geom_tile(color="white") +
scale_fill_gradient2(low = "blue",
high = "red",
mid = "white",
midpoint = 1,
limit = c(0, 2),
name=element_blank(),
breaks=c(0, 2),
labels=c("far", "near")) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45,
vjust = 1,
size = 12,
hjust = 1),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank()) +
if(labeled=="title"){
labs(title=paste0("Comparing '",
deparse(substitute(y)),
"' by '",
deparse(substitute(x)),
"' in ",
deparse(substitute(wem))),
x=element_blank(),
y=element_blank())
} else if (labeled=="simple") {
labs(title=deparse(substitute(wem)),
x=element_blank(),
y=element_blank())
} else {
labs(title=deparse(substitute(wem)),
x=deparse(substitute(x)),
y=deparse(substitute(y)))
}
if (!legend){
the_plot <- the_plot +
guides(fill=FALSE)
}
the_plot
}
@jmclawson

This comment has been minimized.

Copy link
Owner Author

jmclawson commented Jul 30, 2019

This code builds on a workflow shared by the Women Writers Project, which relies on Benjamin Schmidt's wordVectors package. To use these functions in R, paste the following line into the console or a code chunk: devtools::source_gist("21c6a40c78fd66d708bec45d5c0b52e2"). Alternatively, copy the functions from the gist and save them locally to run them from your machine.

  1. Put corpus files in "data\YourModelName" and then prep the texts with the function prep_model(model="YourModelName").
  • Alternatively, put them in some other directory and set the path to that directory with the source.dir parameter.
  • Optionally, set the bundle_ngrams parameter to some other value to combine common phrases using underscores, and set lowercase=FALSE to retain uppercase characters. These settings are passed along to the prep_word2vec() function from the wordVectors package.
  1. Train the model with the function train_model(model="YourModelName").
  • Setting defaults are vectors=100, window=6, iter=10, negative_samples=15, threads=3, but these can each be changed within the train_model() call. These settings are passed along to the train_word2vec() function from the wordVectors package.
  1. Recall these setting parameters with, e.g., attributes(YourModelName)$window
  2. Visualize a heatmap of cosine similarities between two groups of words with the command cosine_heatmap(YourModelName, x=c("red","green","blue"), y=c("happy","sad","excited))
  3. Amplify these comparisons within each row and column using the command amplified_heatmap(YourModelName, x=c("red","green","blue"), y=c("happy","sad","excited))

Everything is explained further in a blog post with examples here: https://jmclawson.net/blog/posts/word-vector-utilities/

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.