Skip to content

Instantly share code, notes, and snippets.

@minhlab
Created June 15, 2017 08:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save minhlab/1f9b5a8d5a303e9a91606db0767e20ca to your computer and use it in GitHub Desktop.
Save minhlab/1f9b5a8d5a303e9a91606db0767e20ca to your computer and use it in GitHub Desktop.
import gzip
import re

import numpy as np
# from deep-coref project: https://github.com/clarkkev/deep-coref
# download w2v_50d.txt from here: https://drive.google.com/file/d/0B5Y5rz_RUKRmdEFPcGIwZ2xLRW8/view
# Parse the embedding table: each line is a word followed by exactly 50 floats.
with open('w2v_50d.txt') as f:
    words, vectors = [], []
    for row in f:
        tokens = row.strip().split()
        token = tokens[0]
        values = list(map(float, tokens[1:]))
        assert len(values) == 50
        words.append(token)
        vectors.append(values)
    # Map each word to its row index; a repeated word keeps its last row.
    word2id = {token: row_id for row_id, token in enumerate(words)}
vectors_np = np.array(vectors)
# Scale every row to unit L2 norm so a dot product of rows is a cosine.
vectors_normed = vectors_np / np.linalg.norm(vectors_np, axis=1, keepdims=True)
def print_nn_and_cos(word, k=10):
    """Print the nearest neighbours of *word* ranked by dot-product similarity.

    Reads the module-level ``word2id``, ``words`` and ``vectors_normed``.
    The score of each vocabulary entry is the dot product of its row in
    ``vectors_normed`` with the row of *word* (a cosine similarity when the
    rows are unit-length).

    Prints a header, the neighbour words, their scores, and the mean and
    standard deviation of those scores.

    Args:
        word: Query word; must be a key of ``word2id``.
        k: Maximum number of neighbours to report (default 10).

    Raises:
        KeyError: If *word* is not in ``word2id``.
    """
    print('*** %s ***' % word)
    query_id = word2id[word]
    cos = np.dot(vectors_normed, vectors_normed[query_id])
    ranked = np.argsort(-cos)
    # Exclude the query by index rather than by rank: when another row ties
    # with it (e.g. a duplicate vector) the query is not guaranteed to sort
    # first, and dropping position 0 would discard a real neighbour.
    nn = ranked[ranked != query_id][:k]
    print([words[i] for i in nn])
    print(cos[nn])
    print('mean = %.3f, std = %.3f' % (cos[nn].mean(), cos[nn].std()))
# Report cosine neighbours for a fixed set of probe words, in this order.
for query_word in ('the', 'dog', 'river', 'man', 'space', 'truth'):
    print_nn_and_cos(query_word)
# download Stanford's nndep model from: https://nlp.stanford.edu/software/nndep.shtml
with gzip.open('PTB_Stanford_params.txt.gz', 'rt') as f:
    # The first line is expected to look like "dict=<N>"; N is the number of
    # embedding rows read below.
    header = f.readline()
    header_match = re.match(r'dict=(\d+)', header)
    if header_match is None:
        raise ValueError(
            'unexpected first line in PTB_Stanford_params.txt.gz: %r' % header)
    ndict = int(header_match.group(1))
    # Skip the six lines that follow the "dict=" line before the rows start.
    for _ in range(6):
        f.readline()
    word2id = {}
    vectors = []
    words = []
    # Each of the next ndict lines is a word followed by its vector components.
    for _ in range(ndict):
        parts = f.readline().strip().split()
        word = parts[0]
        nums = [float(n) for n in parts[1:]]
        word2id[word] = len(words)
        words.append(word)
        vectors.append(nums)
vectors_np = np.array(vectors)
# Unit-length copy of every row (row divided by its L2 norm).
vectors_normed = vectors_np/np.linalg.norm(vectors_np, axis=1)[:,np.newaxis]
def print_nn_and_distance(word, k=10):
    """Print the nearest neighbours of *word* ranked by Euclidean distance.

    Reads the module-level ``word2id``, ``words`` and ``vectors_np``.
    The distance of each vocabulary entry is the L2 norm of the difference
    between its row in ``vectors_np`` and the row of *word*.

    Prints a header, the neighbour words, their distances, and the mean and
    standard deviation of those distances.

    Args:
        word: Query word; must be a key of ``word2id``.
        k: Maximum number of neighbours to report (default 10).

    Raises:
        KeyError: If *word* is not in ``word2id``.
    """
    print('*** %s ***' % word)
    query_id = word2id[word]
    d = np.linalg.norm(vectors_np - vectors_np[query_id], axis=1)
    ranked = np.argsort(d)
    # Exclude the query by index rather than by rank: a duplicate vector also
    # has distance 0, so the query is not guaranteed to sort first and
    # dropping position 0 could discard a real neighbour.
    nn = ranked[ranked != query_id][:k]
    print([words[i] for i in nn])
    print(d[nn])
    print('mean = %.3f, std = %.3f' % (d[nn].mean(), d[nn].std()))
# Report Euclidean neighbours for the same fixed probe words, in this order.
for query_word in ('the', 'dog', 'river', 'man', 'space', 'truth'):
    print_nn_and_distance(query_word)
@minhlab
Copy link
Author

minhlab commented Jun 15, 2017

Results of w2v_50d.txt:

*** the ***
['of', '.', 'in', 'as', 'which', 'a', ',', 'from', 'and', 'part']
[ 0.95630862  0.95557567  0.94625047  0.93489123  0.9338221   0.93277277
  0.9283411   0.92372776  0.9208091   0.90351672]
mean = 0.934, std = 0.015
*** dog ***
['cat', 'puppy', 'dachshund', 'dogs', 'rabbit', 'puppies', 'raccoon', 'chihuahuas', 'rottweiler', 'doberman']
[ 0.92129055  0.91466229  0.88206141  0.87930401  0.87678949  0.87291358
  0.8718119   0.87106787  0.86945022  0.86920737]
mean = 0.883, std = 0.018
*** river ***
['tributary', 'empties', 'tributaries', 'confluence', 'headwaters', 'basin', 'north-flowing', 'sub-basin', 'distributary', 'nnw']
[ 0.93032231  0.89797763  0.89537829  0.87337773  0.87038715  0.86998711
  0.86233695  0.86015036  0.85655437  0.85566725]
mean = 0.877, std = 0.023
*** man ***
['boy', 'stranger', 'killer', 'himself', 'deranged', 'schoolmate', 'woman', 'confesses', 'murderer', 'crazed']
[ 0.8301589   0.82253605  0.82014207  0.80086525  0.8004424   0.80004879
  0.79883606  0.79246782  0.79067237  0.79028027]
mean = 0.805, std = 0.014
*** space ***
['sirtf', 'module', 'spacecraft', 'skylab', 'orbiting', 'orbit', 'transhab', 'earth-orbiting', 'heliospheric', 'low-earth']
[ 0.85021054  0.84761857  0.83769804  0.83094069  0.82613307  0.8224817
  0.82096958  0.8147794   0.80670971  0.80444771]
mean = 0.826, std = 0.015
*** truth ***
['falseness', 'falsehood', 'half-truth', 'truths', 'untruth', 'confess', 'liars', 'evilness', 'betrayal', 'guilt']
[ 0.81823948  0.80875277  0.80711669  0.80709574  0.80589443  0.79790151
  0.79355611  0.79346315  0.78589431  0.78581251]
mean = 0.800, std = 0.010

Results of PTB_Stanford_params.txt.gz:

*** the ***
['THE', 'The', 'an', 'AN', 'S.p', 'An', '180', 'another', 'G.m.b', 'Another']
[ 2.52918536  3.66520565  6.24821837  6.3463501   6.46181631  6.48635947
  6.73231996  6.73712092  6.73833322  6.74346129]
mean = 5.869, std = 1.419
*** dog ***
['Dog', 'girl', 'Girl', 'cat', 'boy', 'crab', 'Boy', 'goose', 'sheep', 'baby']
[ 0.19307155  4.94744398  4.96510756  5.05930523  5.15690922  5.16321572
  5.18603107  5.31667825  5.34221699  5.44591579]
mean = 4.678, std = 1.503
*** river ***
['RIVER', 'River', 'valley', 'VALLEY', 'Valley', 'Canyon', 'Peninsula', 'Ridge', 'peninsula', 'Forest']
[ 0.20460781  0.72840849  4.02730687  4.03969697  4.05952527  4.34050865
  4.72512918  4.73398893  4.75385519  4.90019596]
mean = 3.651, std = 1.627
*** man ***
['Man', 'girl', 'hero', 'Girl', 'boy', 'Woman', 'woman', 'Boy', 'figure', 'boss']
[ 1.40921062  4.41261459  4.43513685  4.4669663   4.49299735  4.52174227
  4.53455685  4.64653384  5.23157296  5.29558111]
mean = 4.345, std = 1.025
*** space ***
['Space', 'System', 'system', 'machine', 'Machine', 'power', 'Power', 'Technology', 'line', 'earth']
[ 1.18936447  5.33776326  5.38445055  5.43205926  5.46524485  5.48349733
  5.56612404  5.59520173  5.64357568  5.67654112]
mean = 5.077, std = 1.300
*** truth ***
['facts', 'identity', 'punishment', 'Punishment', 'Darkness', 'secrets', 'Gods', 'reasoning', 'emotion', 'Reasoning']
[ 5.43819436  5.44826786  5.48064205  5.51750067  5.66504278  5.77240799
  5.94669463  6.01730846  6.01860553  6.02418689]
mean = 5.733, std = 0.240

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment