Skip to content

Instantly share code, notes, and snippets.

@alabrashJr
Created March 21, 2019 13:53
Show Gist options
  • Save alabrashJr/d71cf74bc9713bb0a5bb12ccd331a405 to your computer and use it in GitHub Desktop.
Save alabrashJr/d71cf74bc9713bb0a5bb12ccd331a405 to your computer and use it in GitHub Desktop.
Load pre-trained word embedding vectors from a pre-trained binary file such as google_w2v.bin.
"""
    load_bin_vec(fname, vocab)

Load pre-trained word vectors from a word2vec-style binary file (e.g. `google_w2v.bin`).

The file is read as a text header line holding two integers, `vocab_size` and
`layer1_size`, followed by `vocab_size` entries. Each entry is the bytes of a word, a
single space, and then `layer1_size` raw `Float32` values in host byte order. Newline
bytes encountered while reading a word are ignored.

# Arguments
- `fname`: path of the binary embedding file.
- `vocab`: vocabulary dictionary; only words that are keys of `vocab` are kept, the
  vectors of all other words are skipped.

# Returns
A `Dict{String,Vector{Float32}}` mapping each kept word to its embedding vector.

# Throws
- `ArgumentError` if the header does not consist of exactly two integers.
- `EOFError` if the file ends before all announced entries have been read.
"""
function load_bin_vec(fname, vocab)
    word_vecs = Dict{String,Vector{Float32}}()
    open(fname, "r") do io
        header = readline(io)
        fields = split(header)
        length(fields) == 2 || throw(
            ArgumentError("expected header \"<vocab_size> <layer1_size>\", got: $(repr(header))"),
        )
        vocab_size, layer1_size = parse.(Int, fields)
        binary_len = sizeof(Float32) * layer1_size
        @debug "word2vec header" vocab_size layer1_size binary_len

        for _ in 1:vocab_size
            # Collect raw bytes and decode once, so multi-byte UTF-8 words stay intact.
            bytes = UInt8[]
            while true
                b = read(io, UInt8)
                b == UInt8(' ') && break
                # The newline that terminates the previous entry is not part of the word.
                b == UInt8('\n') || push!(bytes, b)
            end
            word = String(bytes)

            if haskey(vocab, word)
                # read! throws EOFError on a truncated file instead of returning short data.
                word_vecs[word] = read!(io, Vector{Float32}(undef, layer1_size))
            else
                # Jump over the unwanted vector without allocating a buffer for it.
                skip(io, binary_len)
            end
        end
    end
    return word_vecs
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment