Skip to content

Instantly share code, notes, and snippets.

@vyraun
Forked from ili3p/process_word2vec.lua
Created December 5, 2016 08:52
Show Gist options
  • Save vyraun/2fe5d27222f39168321229b9ba778a82 to your computer and use it in GitHub Desktop.
Save vyraun/2fe5d27222f39168321229b9ba778a82 to your computer and use it in GitHub Desktop.
Reading 5.3GB text file with LuaJIT
-- Build an (opt.vocabsz x opt.dim) embedding matrix from a large text file of
-- pre-trained vectors, one "<word> <v1> ... <vdim>" entry per line.
--
-- Reads:    opt.words   path passed to torch.load (vocabulary container)
--           opt.input   path of the text file with the vectors
--           opt.vocabsz number of rows of the resulting matrix
--           opt.dim     number of components per vector
--           opt.nvec    total used for the progress bar
-- Produces: word2vec    rows filled for every word found in words['word2id']
--           unk         vector of the 'UNK' entry (nil if the file has none)
local words = torch.load(opt.words) -- it's a tds.Hash
-- Zero-fill the matrix: the sized constructor leaves its storage
-- uninitialised, so rows of vocabulary words that never show up in the file
-- would otherwise contain arbitrary values.
local word2vec = torch.FloatTensor(opt.vocabsz, opt.dim):zero()
local buffsz = 2^13 -- == 8k
-- Use a dedicated handle (closed after the loop) instead of io.input(), which
-- would also replace the process-wide default input file.
local f = assert(io.open(opt.input, 'r'))
local done = 0
local unk
local word2id = words['word2id'] -- hoisted: looked up once, not per line
local dim = opt.dim

-- Convert fields[first .. first + dim - 1] into a FloatTensor.
-- Returns nil when any of those fields is not a valid number.
local function to_vector(fields, first)
  local values = {}
  for j = 1, dim do
    local v = tonumber(fields[first + j - 1])
    if not v then return nil end
    values[j] = v
  end
  return torch.FloatTensor(values)
end

-- read huge word2vec file with 2,196,017 lines
while true do
  -- Read a block of about buffsz bytes plus the remainder of the line the
  -- block ended in, so that no line is ever split between two iterations.
  local lines, leftover = f:read(buffsz, '*line')
  if not lines then break end -- no more lines
  if leftover then lines = lines .. leftover .. '\n' end -- join the leftover
  lines = lines:split('\n')
  for i = 1, #lines do
    if done % 1000 == 0 then
      xlua.progress(done, opt.nvec)
    end
    local line = lines[i]:split(' ')
    -- The last `dim` fields are the vector; everything before them is the
    -- word. This keeps entries whose token itself contains spaces aligned,
    -- and lets blank or truncated lines (first < 2) be skipped instead of
    -- raising an error.
    local first = #line - dim + 1
    if first >= 2 then
      local word = line[1]
      if first > 2 then
        word = table.concat(line, ' ', 1, first - 1)
      end
      if word == 'UNK' then
        local vec = to_vector(line, first)
        if vec then unk = vec end
      else
        local index = word2id[word]
        if index then
          -- Only convert the numbers for words that are actually needed.
          local vec = to_vector(line, first)
          if vec then word2vec[index] = vec end
        end
      end
    end
    done = done + 1
  end
end
f:close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment