from collections import Counter
import numpy as np
from sklearn.metrics import euclidean_distances
from pyemd import emd as pyemd


def word_movers_distance(a, b, embeddings):
    """Word Mover's Distance.

    A measure of text similarity: earth mover's distance in embedding
    metric space. Computes the distance between texts `a` and `b`.

    Parameters
    ----------
    a : iterable
        One of the two documents to compute the distance between.
        Represented as a sequence of indexes into the `embeddings`.

    b : iterable
        The other document to compute the distance between.

    embeddings : array-like, shape: (vocab_size, n_features)
        Word representations. The embedding of the first word in `a`
        should be `embeddings[a[0]]`.

    Returns
    -------
    distance : double
        The distance between `a` and `b`.

    References
    ----------
    Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger,
    From Word Embeddings To Document Distances. ICML 2015
    http://matthewkusner.com/publications/WMD.pdf

    Notes
    -----
    This implementation is OK for one-off cases. When looping over many
    document pairs, precomputing the pairwise word distances can speed
    things up; a hedged sketch follows this function.
    """
    # Bag-of-words histograms over the union of both documents' vocabularies.
    counter_a = Counter(a)
    counter_b = Counter(b)
    uniq_words = list(set(counter_a.keys()).union(counter_b.keys()))
    bow_a = np.array([counter_a[w] for w in uniq_words], dtype=np.double)
    bow_b = np.array([counter_b[w] for w in uniq_words], dtype=np.double)

    # Normalize counts to probability distributions, as required by the EMD.
    bow_a /= bow_a.sum()
    bow_b /= bow_b.sum()

    # Ground metric: Euclidean distances between the embeddings of the
    # words that actually occur in `a` or `b`.
    D = euclidean_distances(embeddings[uniq_words])
    return pyemd(bow_a, bow_b, D)
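

# The variant below is NOT part of the original gist: it is a hedged sketch
# of the "precomputing" idea mentioned in the Notes above. It assumes you can
# afford a full (vocab_size, vocab_size) distance matrix (~800 MB of float64
# for vocab_size=10000), computed once and sliced per document pair.
def word_movers_distance_precomputed(a, b, D_full):
    """Like `word_movers_distance`, but takes `D_full`, a precomputed
    (vocab_size, vocab_size) matrix of pairwise word distances."""
    counter_a = Counter(a)
    counter_b = Counter(b)
    uniq_words = list(set(counter_a.keys()).union(counter_b.keys()))
    bow_a = np.array([counter_a[w] for w in uniq_words], dtype=np.double)
    bow_b = np.array([counter_b[w] for w in uniq_words], dtype=np.double)
    bow_a /= bow_a.sum()
    bow_b /= bow_b.sum()
    # Slice out the submatrix for the words in this document pair; pyemd
    # wants a C-contiguous float64 array, which np.ascontiguousarray ensures.
    D = np.ascontiguousarray(D_full[np.ix_(uniq_words, uniq_words)])
    return pyemd(bow_a, bow_b, D)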


if __name__ == '__main__':
    from time import time

    vocab_size = 10000
    n_words = 500

    # make up random word embeddings, normalized to unit length
    embeddings = np.random.randn(vocab_size, 100)
    embeddings /= np.linalg.norm(embeddings, axis=1)[:, np.newaxis]

    # make up two random texts as sequences of word indexes
    a = np.random.randint(vocab_size, size=n_words)
    b = np.random.randint(vocab_size, size=n_words)

    t0 = time()
    dist = word_movers_distance(a, b, embeddings)
    t = time() - t0
    print("Distance = {:.4f}, computed in {:.2f}s.".format(dist, t))
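
    # Hedged usage sketch for the precomputed variant above (an addition to
    # the original gist): build the full distance matrix once, then reuse it
    # across document pairs. This pays off only when many pairs are compared,
    # since building D_full is itself expensive in time and memory.
    D_full = euclidean_distances(embeddings)
    t0 = time()
    dist = word_movers_distance_precomputed(a, b, D_full)
    t = time() - t0
    print("Precomputed: distance = {:.4f}, in {:.2f}s.".format(dist, t))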