Created
March 30, 2015 06:19
-
-
Save wktk/91b2187f3e5edbcac5ed to your computer and use it in GitHub Desktop.
Read word2vec binary file in Ruby
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# In-memory view of a word2vec binary model file.
#
# The parser expects a text header holding two decimal integers (word count,
# then vector size), each terminated by a single non-digit byte, followed by
# entries of the form `word`, one space, then `size` native-endian
# single-precision floats. An optional newline between entries is tolerated.
class Vocabulary
  # Number of bytes occupied by one packed single-precision float.
  FLOAT_BYTES = 4
  # Byte values of the ASCII digits accepted in the header.
  DIGIT_BYTES = (?0.ord..?9.ord)
  # Separator between a word and its packed vector.
  WORD_SEPARATOR = ' '

  # @return [String] path the vocabulary was loaded from
  attr_reader :file_name
  # @return [Integer] vector dimensionality declared in the file header
  attr_reader :size
  # @return [Integer] number of words declared in the file header
  attr_reader :word_count
  # @return [Hash{String => Array<Float>}] word => vector for every entry read
  attr_reader :vocabulary

  # @param [String] file_name Path to a word2vec vocabulary binary file
  def initialize(file_name)
    @file_name = file_name
    @size = 0
    @vocabulary = {}
    @word_count = 0
    read_file
  end

  # Suppressed inspection
  #
  # Since @vocabulary can be very large, this class does not include it in
  # the inspection output. If you need to look at @vocabulary, inspect it
  # directly.
  #
  # @return [String] Suppressed inspection
  def inspect
    "#<#{self.class} file_name=#{@file_name}, size=#{@size}, word_count=#{@word_count}>"
  end

  private

  # Loads the header and every entry from @file_name.
  #
  # The block form of File.open guarantees the handle is closed, and 'rb'
  # keeps the packed float bytes free of newline/encoding translation.
  def read_file
    File.open(@file_name, 'rb') do |file|
      @word_count = read_integer(file)
      @size = read_integer(file)
      read_entries(file)
    end
  end

  # Reads consecutive ASCII digits and consumes the single byte that ends them.
  #
  # @param [IO] file stream positioned at the start of a number
  # @return [Integer] parsed value, 0 when no digit was found
  def read_integer(file)
    digits = String.new
    while (byte = file.getbyte) && DIGIT_BYTES.cover?(byte)
      digits << byte.chr
    end
    digits.to_i
  end

  # Reads "word<space><packed floats>" entries until the stream is exhausted.
  # A truncated trailing entry is dropped rather than stored half-filled.
  #
  # @param [IO] file stream positioned just after the header
  def read_entries(file)
    vector_bytes = @size * FLOAT_BYTES
    while (token = file.gets(WORD_SEPARATOR))
      # Residue without a separator (e.g. the final newline) is not a word.
      break unless token.end_with?(WORD_SEPARATOR)

      word = token.chomp(WORD_SEPARATOR)
      # Entries may be newline-separated; that newline is not part of the word.
      word = word[1..-1] if word.start_with?("\n")

      packed = file.read(vector_bytes)
      break if packed.nil? || packed.bytesize < vector_bytes

      # Words are assumed to be UTF-8 text; tagging them as such lets ordinary
      # string literals be used as lookup keys.
      # 'f*' unpacks every float at once into a flat Array<Float>.
      @vocabulary[word.force_encoding(Encoding::UTF_8)] = packed.unpack('f*')
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment