Last active
January 29, 2021 21:21
-
-
Save iamaziz/8d8d8c08c7eeda707b9e to your computer and use it in GitHub Desktop.
Computing the accuracy of a word2vec model (used GoogleNews-vectors-negative300.bin as an example).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from gensim.models import Word2Vec

# read the evaluation file, get it at:
# https://word2vec.googlecode.com/svn/trunk/questions-words.txt
>>> questions = 'questions-words.txt'
>>> evals = open(questions, 'r').readlines()
>>> num_sections = len([l for l in evals if l.startswith(':')])
>>> print('total evaluation sentences: {} '.format(len(evals) - num_sections))
total evaluation sentences: 19544

# load the pre-trained model of the GoogleNews dataset (100 billion words), get it at:
# https://code.google.com/p/word2vec/#Pre-trained_word_and_phrase_vectors
>>> google = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# test the model accuracy*
>>> w2v_model_accuracy(google)
Total sentences: 7614, Correct: 74.26%, Incorrect: 25.74%
def w2v_model_accuracy(model, questions='questions-words.txt'):
    """Evaluate a word2vec model on an analogy question file and print a summary.

    Parameters
    ----------
    model : object exposing ``accuracy(questions)``
        A gensim word2vec model.  NOTE(review): gensim >= 4.0 moved this to
        ``model.wv.evaluate_word_analogies`` — confirm the installed version.
    questions : str
        Path to the analogy evaluation file.  The default matches the
        module-level ``questions`` value the original code relied on, so
        existing one-argument calls behave identically.

    Returns
    -------
    tuple
        ``(total, percent_correct, percent_incorrect)``.  The original
        implementation returned ``None``; callers that ignore the return
        value are unaffected.
    """
    accuracy = model.accuracy(questions)
    # The last entry of the report aggregates all sections ('total').
    sum_corr = len(accuracy[-1]['correct'])
    sum_incorr = len(accuracy[-1]['incorrect'])
    total = sum_corr + sum_incorr

    def percent(a):
        # Guard against an empty evaluation set (avoids ZeroDivisionError).
        return (a / total * 100) if total else 0.0

    pct_corr = percent(sum_corr)
    pct_incorr = percent(sum_incorr)
    print('Total sentences: {}, Correct: {:.2f}%, Incorrect: {:.2f}%'.format(
        total, pct_corr, pct_incorr))
    return total, pct_corr, pct_incorr
# *took around 1hr45mins on Mac Book Pro (3.1 GHz Intel Core i7)
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.