Skip to content

Instantly share code, notes, and snippets.

@thisismattmiller
Created December 19, 2017 15:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thisismattmiller/3451da479c51dc84c01864b0144f3288 to your computer and use it in GitHub Desktop.
Save thisismattmiller/3451da479c51dc84c01864b0144f3288 to your computer and use it in GitHub Desktop.
import gensim
import os
import collections
import smart_open
import random
import json
import logging
import numpy as np
# Dump every document vector of a trained Doc2Vec model to newline-delimited
# JSON, splitting the output across numbered files (vectors_0.ndjson,
# vectors_1.ndjson, ...). Each output line is one JSON object with keys:
#   "i": integer index of the vector in model.docvecs
#   "d": the doctag returned by index_to_doctag for that index
#   "v": the vector converted to a plain Python list
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Roll over to a new output file each time the index hits a non-zero multiple
# of this value.
ROTATE_EVERY = 5000000
# Print the current index to stdout each time it is a multiple of this value.
PROGRESS_EVERY = 10000

model = gensim.models.doc2vec.Doc2Vec.load('all_docs_simple_model')

file_counter = 0
out = open('vectors_' + str(file_counter) + '.ndjson', 'w')
try:
    for x in range(len(model.docvecs)):
        data = json.dumps({
            "i": x,
            "d": model.docvecs.index_to_doctag(x),
            # Go through a numpy array so the values become plain Python
            # numbers that json can serialize.
            "v": np.array(model.docvecs[x]).tolist(),
        })
        out.write(data + "\n")

        # The rotation check runs AFTER the write, so the record whose index
        # is exactly a multiple of ROTATE_EVERY is the last line of the file
        # being closed, not the first line of the next one.
        if x % ROTATE_EVERY == 0 and x != 0:
            file_counter = file_counter + 1
            out.close()
            out = open('vectors_' + str(file_counter) + '.ndjson', 'w')

        if x % PROGRESS_EVERY == 0:
            print(x)
finally:
    # Always close whichever file is current so the last chunk is flushed
    # to disk, even if serialization or a write fails partway through.
    out.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment