Created
June 5, 2020 03:03
-
-
Save applenob/0f89640179b29d15f1f036a02cbb8285 to your computer and use it in GitHub Desktop.
w2v training script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
训练向量 | |
python w2v.py corpus.txt w2v_char.txt | |
""" | |
import logging | |
import os.path | |
import sys | |
import multiprocessing | |
from gensim.corpora import WikiCorpus | |
from gensim.models import Word2Vec | |
from gensim.models.word2vec import LineSentence | |
def dimension_kwarg(model_cls, dim):
    """Return the keyword argument that sets the embedding width for *model_cls*.

    gensim 4.0 renamed the ``size`` parameter of ``Word2Vec`` to
    ``vector_size``. Inspecting the constructor signature lets the script run
    unchanged on either side of that rename.

    Args:
        model_cls: Class whose ``__init__`` signature is inspected.
        dim: Desired vector dimensionality.

    Returns:
        ``{"vector_size": dim}`` when the constructor accepts ``vector_size``,
        otherwise ``{"size": dim}``.
    """
    import inspect  # local import: only this helper needs it

    accepted = inspect.signature(model_cls.__init__).parameters
    name = "vector_size" if "vector_size" in accepted else "size"
    return {name: dim}


if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    # Lazy %-args: the message is only formatted if the record is emitted.
    logger.info("running %s", ' '.join(sys.argv))

    # Exactly two positional arguments are required: the input corpus path
    # and the output vectors path. Otherwise show the module docstring as
    # the usage message and exit with a non-zero status.
    if len(sys.argv) != 3:
        print(__doc__)
        sys.exit(1)
    inp, outp = sys.argv[1:3]

    # sg=1 selects the skip-gram architecture; the corpus is streamed one
    # sentence per line by LineSentence.
    model = Word2Vec(
        LineSentence(inp),
        sg=1,
        window=5,
        min_count=5,
        workers=multiprocessing.cpu_count(),
        **dimension_kwarg(Word2Vec, 200),
    )
    print("model: ", model)

    # Export only the word vectors, in the plain-text word2vec format.
    model.wv.save_word2vec_format(outp, binary=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment