Skip to content

Instantly share code, notes, and snippets.

@thisismattmiller
Created December 19, 2017 15:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thisismattmiller/3d32d5751fd732321bbf5ffabfa08adb to your computer and use it in GitHub Desktop.
Save thisismattmiller/3d32d5751fd732321bbf5ffabfa08adb to your computer and use it in GitHub Desktop.
import gensim
import os
import collections
import smart_open
import random
import json
import logging
# Configure the root logger: INFO level and above, each record prefixed with
# a timestamp and its level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
def read_corpus(fname, tokens_only=False):
    """Stream tokenized documents from a JSON-lines file.

    Every line of *fname* is parsed as a JSON object; its ``'id'`` and
    ``'doc'`` entries are read and the ``'doc'`` value is tokenized with
    ``gensim.utils.simple_preprocess``. Lines that cannot be parsed or
    tokenized are reported on stdout and skipped, so one bad record does
    not abort the whole pass.

    Args:
        fname: File name passed to ``smart_open.smart_open``; the file is
            decoded as UTF-8.
        tokens_only: If true, yield only the token list for each line.
            Otherwise yield a ``TaggedDocument`` whose single tag is the
            line's ``'id'`` value (the form used for training).

    Yields:
        A list of tokens, or a ``gensim.models.doc2vec.TaggedDocument``,
        one per successfully processed line.
    """
    with smart_open.smart_open(fname, encoding="utf-8") as f:
        for i, line in enumerate(f):
            # Only the parsing/tokenizing work is guarded. The yields stay
            # outside the try so that closing the generator early (which
            # raises GeneratorExit at the yield) is never mistaken for a
            # bad line, and Ctrl-C / SystemExit are not swallowed.
            try:
                record = json.loads(line)
                doc_id = record['id']
                tokens = gensim.utils.simple_preprocess(record['doc'])
            except Exception as err:
                # Best-effort: report the offending line and keep going.
                print("Error on line #", i, "-", repr(err))
                continue

            # Lightweight progress indicator on every 1000th line index.
            if i % 1000 == 0:
                print(i)

            if tokens_only:
                yield tokens
            else:
                # For training data, attach the document id as its tag.
                yield gensim.models.doc2vec.TaggedDocument(tokens, [doc_id])
# Read the whole tagged corpus into memory up front; it is passed to Doc2Vec
# as a list rather than as a one-shot generator.
train_corpus = [*read_corpus('all_docs_simple')]

# Keyword names follow the gensim API this script was written against
# (`size` / `iter`); see the gensim Doc2Vec documentation for their meaning.
model = gensim.models.doc2vec.Doc2Vec(
    train_corpus,
    size=100,
    window=8,
    min_count=5,
    iter=20,
    workers=32,
)

# Persist the model under a fixed file name in the working directory.
model.save('all_docs_simple_model')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment