Skip to content

Instantly share code, notes, and snippets.

@tslumley
Last active September 23, 2018 06:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tslumley/c36db10b4b316277482619451675f580 to your computer and use it in GitHub Desktop.
Save tslumley/c36db10b4b316277482619451675f580 to your computer and use it in GitHub Desktop.
Read GloVe word embeddings
# Based on https://gist.github.com/tjvananne/8b0e7df7dcad414e8e6d5bf3947439a9
# Rewritten to work chunk by chunk, so the 42B file can be read with only 8GB
# of memory.
#
# Read a GloVe-style text file of word vectors. Each non-empty line is split
# on single spaces: the first field is taken as the word, the remaining fields
# are converted with as.numeric().
#
# Arguments:
#   filename   path of the .txt file to read.
#   chunksize  number of lines read from the file per iteration.
#   guess_size initial number of slots to preallocate; storage is doubled
#              whenever it turns out to be too small, so this only affects
#              speed, not the result.
#
# Returns a data.frame with one column per line of the file (named by the
# word on that line) and one row per vector component.
#
# Side effect: prints the running count of words read after every chunk.
proc_pretrained_vec <- function(filename, chunksize=1000, guess_size=100000) {
  stopifnot(
    is.numeric(chunksize), length(chunksize) == 1, chunksize >= 1,
    is.numeric(guess_size), length(guess_size) == 1, guess_size >= 1
  )

  size <- guess_size
  here <- 0

  # initialize space for values and the names of each word in vocab
  vals <- vector(mode = "list", length = size)
  words <- character(size)

  filecon <- file(filename, open = "rt")
  # make sure the connection is released even if parsing fails part-way
  on.exit(close(filecon), add = TRUE)

  repeat {
    lines <- readLines(filecon, n = chunksize, warn = FALSE)
    n_raw <- length(lines)
    if (n_raw == 0) break

    # blank lines carry no word/vector; drop them instead of storing NA names
    lines <- lines[nzchar(lines)]
    n_read <- length(lines)

    if (n_read > 0) {
      # too small; keep doubling until this chunk fits
      if (here + n_read > size) {
        while (here + n_read > size) size <- size * 2
        length(vals) <- size
        length(words) <- size
      }

      # split the whole chunk at once; fixed = TRUE avoids regex overhead
      parts <- strsplit(lines, " ", fixed = TRUE)
      idx <- here + seq_len(n_read)
      words[idx] <- vapply(parts, function(p) p[1], character(1))
      vals[idx] <- lapply(parts, function(p) as.numeric(p[-1]))

      here <- here + n_read
      print(here)
    }

    # a short read means the end of the file has been reached
    if (n_raw < chunksize) break
  }

  # drop the unused preallocated tail before building the data.frame
  keep <- seq_len(here)
  vals <- vals[keep]
  words <- words[keep]

  # convert lists to data.frame and attach the names
  glove <- data.frame(vals)
  names(glove) <- words
  glove
}
# using the function -------------------------------------------------------------------------
# here we are reading in the unzipped, raw, GloVe pre-trained word vector file (.txt)
# all you have to change is the file path to where your GloVe file has been unzipped
# NOTE: the variable name says 6B but the path below is the 42B/300d file;
# the name is kept unchanged so any later code referring to it still works.
# The argument is spelled out in full (filename) rather than relying on
# partial argument matching of `file`.
g6b_300 <- proc_pretrained_vec(filename = "WORDS/glove.42B.300d.txt")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment