Skip to content

Instantly share code, notes, and snippets.

@piskvorky
Last active October 18, 2020 01:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save piskvorky/85015d52254df77cd23ca43bc2e8322d to your computer and use it in GitHub Desktop.
Save piskvorky/85015d52254df77cd23ca43bc2e8322d to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2020 Radim Rehurek <me@radimrehurek.com>
"""
Helper script (template) for benchmarking. Run with:
/usr/bin/time --format "%E elapsed\n%Mk peak RAM" python gensim_benchmark.py ~/gensim-data/text9/text9.txt
"""
import logging
import sys
from gensim.models.word2vec import Text8Corpus, LineSentence
from gensim.models import FastText, Word2Vec, Doc2Vec, Phrases
from gensim import __version__
logger = logging.getLogger(__name__)


def main(argv=None):
    """Train the benchmarked model on a text8/text9 corpus and save it under /tmp.

    Args:
        argv: Command-line arguments including the program name; defaults
            to ``sys.argv``. ``argv[1]`` is the path to the corpus file.

    Returns:
        Process exit status: 0 once the model has been trained and saved,
        1 if the corpus path is missing (the usage text is printed instead).
    """
    if argv is None:
        argv = sys.argv

    logging.basicConfig(
        format='%(asctime)s [%(processName)s/%(process)d] [%(levelname)s] %(name)s:%(lineno)d: %(message)s',
        level=logging.INFO,
    )

    if len(argv) < 2:
        # Print the module docstring verbatim. It must not be %-formatted:
        # the `%E` / `%Mk` directives of the /usr/bin/time example it contains
        # are not valid Python conversions, so formatting it raised instead of
        # showing the usage message.
        print(__doc__)
        return 1

    corpus = Text8Corpus(argv[1])  # text8/text9 format from http://mattmahoney.net/dc/textdata.html

    # The class under benchmark; change to e.g. Word2Vec to time a different model.
    cls = FastText
    # The output file name records both the model class and the gensim version.
    cls(corpus, workers=12, epochs=1).save(f'/tmp/{cls.__name__}.gensim{__version__}')
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment