Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kanekomasahiro/841331529a083d68eca843f1217f12f5 to your computer and use it in GitHub Desktop.
Save kanekomasahiro/841331529a083d68eca843f1217f12f5 to your computer and use it in GitHub Desktop.
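Save a dict-format embedding (word -> vector) as a word2vec binary (.bin) file that gensim can load. (dict形式のembeddingをgensimのbinで保存する.)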
import gensim
import argparse
import numpy as np
def parse_args(argv=None):
    """Parse the command-line options of the converter.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse falls back to ``sys.argv[1:]``, so existing zero-argument
            calls behave as before.

    Returns:
        argparse.Namespace with the string attributes ``input`` and ``output``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', type=str, required=True,
                        help='text embedding file: one "word v1 v2 ..." entry per line')
    parser.add_argument('--output', type=str, required=True,
                        help='output path without extension (".bin" is appended)')
    return parser.parse_args(argv)
def save_embeddings(dict_embedding, output):
    """Write a word -> vector mapping to ``{output}.bin`` in word2vec binary layout.

    The file starts with the header line ``"<count> <dim>\\n"``. Every entry
    that follows is the UTF-8 encoded word, one space, and the vector as raw
    float32 bytes (nothing is written after the vector).

    Args:
        dict_embedding: Mapping from word (``str`` or already-encoded
            ``bytes``) to a 1-D vector: a NumPy array or any sequence NumPy
            can cast to float32 (numeric strings included).
        output: Output path without extension; ``.bin`` is appended.

    Raises:
        ValueError: If the mapping is empty, or if a vector's length differs
            from the length of the first vector.
    """
    if not dict_embedding:
        raise ValueError('dict_embedding is empty; nothing to save')

    emb_num = len(dict_embedding)
    emb_size = len(next(iter(dict_embedding.values())))

    # Validate before creating the file: a single vector of the wrong length
    # would shift every following record and silently corrupt the output.
    for word, vector in dict_embedding.items():
        if len(vector) != emb_size:
            raise ValueError(
                f'vector for {word!r} has {len(vector)} dimensions, '
                f'expected {emb_size}'
            )

    with open(f'{output}.bin', 'wb') as fw:
        fw.write(f'{emb_num} {emb_size}\n'.encode('utf-8'))
        for word, vector in dict_embedding.items():
            token = word if isinstance(word, bytes) else word.encode('utf-8')
            # tobytes() replaces ndarray.tostring(), which NumPy deprecated
            # and later removed.
            payload = np.asarray(vector).astype(np.float32).tobytes()
            fw.write(token + b' ' + payload)
def main(args):
    """Read a text embedding file and save it through ``save_embeddings``.

    Every non-blank line of ``args.input`` is split on whitespace: the first
    field is the word, the remaining fields are its vector components. If a
    word occurs more than once, the last occurrence wins.

    Args:
        args: Namespace-like object with ``input`` (path of the text file)
            and ``output`` (path forwarded to ``save_embeddings``).

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    dict_embedding = {}
    # Explicit UTF-8 so non-ASCII vocabulary is decoded the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(args.input, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. trailing newline at end of file).
                continue
            word, *components = fields
            # Parse to float32 right away: far smaller than an array of
            # strings, and malformed numbers fail here, while reading.
            dict_embedding[word] = np.array(components, dtype=np.float32)
    save_embeddings(dict_embedding, args.output)
if __name__ == "__main__":
    # Run the conversion only when executed as a script, not on import.
    main(parse_args())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment