Skip to content

Instantly share code, notes, and snippets.

@zyocum
Last active July 19, 2020 01:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zyocum/2ba0457246a4d0075149aa7d607432c1 to your computer and use it in GitHub Desktop.
Save zyocum/2ba0457246a4d0075149aa7d607432c1 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
from itertools import chain
from itertools import combinations
import numpy as np
def distances(sentence, window=3, scale=True):
    """Map sorted pairs of words in ``sentence`` to co-occurrence scores.

    Each pair of token positions (i, j) with ``|i - j| <= window``
    contributes ``1 / |i - j|`` when ``scale`` is true, or ``1.0``
    otherwise.  Word pairs are keyed in sorted order so (a, b) and (b, a)
    accumulate into the same entry.  Every pair of words that co-occurs in
    the sentence gets a key, even if all of its occurrences fall outside
    the window (such entries keep a score of 0.0).

    sentence: a tokenized sentence (list of word strings)
    window: maximum token distance for a pair to contribute (default 3)
    scale: whether to weight each contribution by 1/distance (default True)

    s1 = 'A B X B'.split()
    distances(s1, 2) -> {
        ('A', 'B'): 1.0,
        ('A', 'X'): 0.5,
        ('B', 'B'): 0.5,
        ('B', 'X'): 2.0
    }
    distances(s1, 3, scale=False) -> {
        ('A', 'B'): 2.0,
        ('A', 'X'): 1.0,
        ('B', 'B'): 1.0,
        ('B', 'X'): 2.0
    }
    """
    scores = {}
    for (i, word1), (j, word2) in combinations(enumerate(sentence), 2):
        # canonicalize the pair so (a, b) and (b, a) share one key
        key = tuple(sorted((word1, word2)))
        scores.setdefault(key, 0.0)
        distance = abs(i - j)  # always >= 1, so 1/distance cannot divide by zero
        if distance <= window:
            scores[key] += (1.0 / distance) if scale else 1.0
    return scores
def vocabulary(sentences):
    """Flatten ``sentences`` and index the unique words they contain.

    The words are sorted so that the index -> word mapping is
    deterministic across runs: plain ``set`` iteration order varies with
    Python's string-hash randomization, which would make any downstream
    co-occurrence matrix irreproducible from run to run.

    s1 = 'A B X B'.split()
    s2 = 'X B B'.split()
    sentences = s1, s2
    vocabulary(sentences) -> {0: 'A', 1: 'B', 2: 'X'}
    """
    return dict(enumerate(sorted(set(chain.from_iterable(sentences)))))
def co_occurrence_matrix(sentences, window=3, scale=True):
    """Compute a symmetric co-occurrence matrix over the sentences' vocabulary.

    Sums the pairwise distance scores (see distances()) for all word
    tokens across all sentences.

    sentences: iterable of tokenized sentences (lists of word strings)
    window: window size for considering word tokens when computing the
        distance (default is to look up to 3 tokens away)
    scale: whether to scale each co-occurrence by 1/distance (default
        True), so a pair of tokens 3 apart contributes only 1/3 while
        adjacent tokens contribute 1/1

    Returns (index, matrix) where index maps each word to its row/column
    position and matrix is a len(vocab) x len(vocab) numpy array.

    NOTE: row/column order is whatever vocabulary() produces; the
    examples below assume the index {'A': 0, 'B': 1, 'X': 2}.

    s1 = 'A B X B'.split()
    s2 = 'X B B'.split()
    sentences = s1, s2
    index, matrix = co_occurrence_matrix(sentences, window=3, scale=False)
    index -> {'A': 0, 'B': 1, 'X': 2}
    matrix -> array([
        [0., 2., 1.],
        [2., 2., 4.],
        [1., 4., 0.]
    ])
    index, matrix = co_occurrence_matrix(sentences, window=3, scale=True)
    matrix -> array([
        [0.        , 1.33333333, 0.5       ],
        [1.33333333, 1.5       , 3.5       ],
        [0.5       , 3.5       , 0.        ]
    ])
    cell = index['B'], index['X']
    matrix[cell] -> 3.5
    """
    vocab = vocabulary(sentences)
    index = {word: i for (i, word) in vocab.items()}
    matrix = np.zeros((len(vocab), len(vocab)))
    for sentence in sentences:
        for (word1, word2), distance in distances(
            sentence,
            window=window,
            scale=scale
        ).items():
            matrix[index[word1], index[word2]] += distance
            # mirror into the transposed cell to keep the matrix symmetric;
            # skip the diagonal so same-word pairs aren't double-counted
            if word1 != word2:
                matrix[index[word2], index[word1]] += distance
    return index, matrix
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment