Last active
January 29, 2021 21:21
-
-
Save iamaziz/8d8d8c08c7eeda707b9e to your computer and use it in GitHub Desktop.
Computing the accuracy of a word2vec model (used GoogleNews-vectors-negative300.bin as an example).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from gensim.models import Word2Vec

# read the evaluation file, get it at:
# https://word2vec.googlecode.com/svn/trunk/questions-words.txt
>>> questions = 'questions-words.txt'
>>> evals = open(questions, 'r').readlines()
>>> num_sections = len([l for l in evals if l.startswith(':')])
>>> print('total evaluation sentences: {} '.format(len(evals) - num_sections))
total evaluation sentences: 19544

# load the pre-trained model of the GoogleNews dataset (100 billion words), get it at:
# https://code.google.com/p/word2vec/#Pre-trained_word_and_phrase_vectors
>>> google = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# test the model accuracy*
>>> w2v_model_accuracy(google)
Total sentences: 7614, Correct: 74.26%, Incorrect: 25.74%
def w2v_model_accuracy(model, questions='questions-words.txt'):
    """Evaluate a word2vec model on an analogy question file and print a summary.

    Parameters
    ----------
    model : object exposing ``accuracy(questions)``
        A gensim word2vec model.  NOTE(review): gensim >= 4.0 moved this to
        ``model.wv.evaluate_word_analogies`` — confirm the installed version.
    questions : str
        Path to the analogy evaluation file.  The default matches the
        module-level ``questions`` value the original code relied on, so
        existing one-argument calls behave identically.

    Returns
    -------
    tuple
        ``(total, percent_correct, percent_incorrect)``.  The original
        implementation returned ``None``; callers that ignore the return
        value are unaffected.
    """
    accuracy = model.accuracy(questions)
    # The last entry of the report aggregates all sections ('total').
    sum_corr = len(accuracy[-1]['correct'])
    sum_incorr = len(accuracy[-1]['incorrect'])
    total = sum_corr + sum_incorr

    def percent(a):
        # Guard against an empty evaluation set (avoids ZeroDivisionError).
        return (a / total * 100) if total else 0.0

    pct_corr = percent(sum_corr)
    pct_incorr = percent(sum_incorr)
    print('Total sentences: {}, Correct: {:.2f}%, Incorrect: {:.2f}%'.format(
        total, pct_corr, pct_incorr))
    return total, pct_corr, pct_incorr
# *took around 1hr45mins on Mac Book Pro (3.1 GHz Intel Core i7)
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.