Last active
December 10, 2017 17:22
-
-
Save thomasniebler/26f3e972f8faad61b33ae7fb19150921 to your computer and use it in GitHub Desktop.
Analogy Evaluation for Word Embeddings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas | |
import numpy as np | |
from sklearn.preprocessing import normalize | |
# Load any set of word embeddings as a mapping from word to its vector.
# The single entry below is only a placeholder that shows the expected layout;
# words are looked up in lowercase by analogy(), so keys should be lowercase.
vecs = {"word": np.array([0, 1, 2, 3, 4, 5])}
# Read the word2vec analogy questions from disk. Each line is split on single
# spaces and its fields are exposed as the columns "a", "b", "c" and "d".
analogies = pandas.read_csv(
    "questions-words.txt",
    sep=" ",
    names=["a", "b", "c", "d"],
)
def _unit_rows(matrix):
    """Return a float copy of a 2-D array with every row scaled to unit L2 norm.

    All-zero rows are left as zeros instead of producing NaNs.
    """
    matrix = np.asarray(matrix, dtype=float)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return matrix / norms


def analogy(vecs, analogies, batch_size=1000):
    """Print and return the analogy accuracy of a set of word embeddings.

    For every question "a is to b as c is to d" the vector ``b - a + c`` is
    built from the raw embeddings, and the vocabulary word with the highest
    cosine similarity to it is taken as the prediction. A question counts as
    correct when the prediction is ``d``. The question words themselves are
    not excluded from the candidate answers.

    Args:
        vecs: Mapping from word to a 1-D embedding vector. All vectors must
            have the same length. Question words are lowercased before the
            lookup, so keys are expected to be lowercase.
        analogies: pandas DataFrame with string columns "a", "b", "c" and "d".
            Rows in which any of the four words is missing from ``vecs``
            (including NaN cells) are skipped. The frame is not modified.
        batch_size: Number of questions scored per matrix product; bounds the
            size of the intermediate similarity matrix.

    Returns:
        The fraction of answerable questions that were predicted correctly,
        or 0.0 when no question can be answered.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If one of the columns "a", "b", "c", "d" is missing.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    keys = list(vecs)
    index_of = {word: idx for idx, word in enumerate(keys)}

    # Lowercase into fresh Series so the caller's DataFrame stays untouched.
    lowered = [analogies[col].str.lower() for col in ("a", "b", "c", "d")]

    # Keep only questions whose four words are all in the vocabulary and
    # translate them to row indices of the embedding matrix.
    questions = [
        [index_of[word] for word in quad]
        for quad in zip(*lowered)
        if all(word in index_of for word in quad)
    ]
    if not questions:
        print("Accuracy:\t" + str(0.0))
        return 0.0

    a_ids, b_ids, c_ids, true_ids = np.array(questions).T

    # np.vstack needs a real sequence; generators are rejected by current NumPy.
    raw = np.asarray(np.vstack([vecs[key] for key in keys]), dtype=float)
    vocab = _unit_rows(raw)
    targets = _unit_rows(raw[b_ids] - raw[a_ids] + raw[c_ids])

    # Score in batches so the (vocabulary x questions) similarity matrix never
    # has to be held in memory all at once.
    best_ids = np.concatenate([
        np.argmax(vocab.dot(targets[start:start + batch_size].T), axis=0)
        for start in range(0, len(targets), batch_size)
    ])

    hits = int(np.count_nonzero(best_ids == true_ids))
    accuracy = float(hits) / len(true_ids)
    print("Accuracy:\t" + str(accuracy))
    return accuracy
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Not clean, not really beautiful, but should work.