#!/usr/bin/env python3
from itertools import chain
from itertools import combinations

import numpy as np

def distances(sentence, window=3, scale=True):
    """Compute a dictionary mapping each pair of words in the sentence to the
    sum of their (optionally scaled) co-occurrence contributions.

    Pairs of tokens farther apart than `window` contribute nothing; pairs
    within the window contribute 1/distance if `scale` is True, else 1.

    s1 = 'A B X B'.split()
    distances(s1, window=2) -> {
        ('A', 'B'): 1.0,
        ('A', 'X'): 0.5,
        ('B', 'X'): 2.0,
        ('B', 'B'): 0.5
    }
    distances(s1, window=3) -> {
        ('A', 'B'): 1.3333333333333333,
        ('A', 'X'): 0.5,
        ('B', 'X'): 2.0,
        ('B', 'B'): 0.5
    }
    """
    distances = {}
    for pair in combinations(enumerate(sentence), 2):
        (i, word1), (j, word2) = pair
        # sort the pair so (word1, word2) and (word2, word1) share one key
        word1, word2 = sorted((word1, word2))
        if (word1, word2) not in distances:
            distances[(word1, word2)] = 0.0
        distance = abs(i - j)
        if distance <= window:
            distances[(word1, word2)] += (1.0 / distance) if scale else 1.0
    return distances
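
# Note on scaling (illustrative; uses the docstring's example sentence above):
# the pair ('A', 'B') in 'A B X B' occurs at distances 1 and 3, so with
# window=3 it contributes 1 + 1 = 2.0 unscaled, or 1/1 + 1/3 ≈ 1.33 scaled.
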
def vocabulary(sentences):
    """Flatten and index the vocabulary of words in the sentences.

    s1 = 'A B X B'.split()
    s2 = 'X B B'.split()
    sentences = s1, s2
    vocabulary(sentences) -> {0: 'A', 1: 'B', 2: 'X'}
    """
    # sort the vocabulary so the index (and thus the matrix layout) is
    # deterministic across runs, matching the docstring examples
    return dict(enumerate(sorted(set(chain(*sentences)))))

def co_occurrence_matrix(sentences, window=3, scale=True):
    """Compute a co-occurrence matrix summing the (optionally scaled)
    distances between all word tokens in the sentences.

    window: window size for considering pairs of word tokens (default is to
        look up to 3 tokens away)
    scale: whether to scale each co-occurrence by 1/distance, so that a pair
        of tokens 3 positions apart contributes only 1/3, while adjacent
        tokens contribute 1/1 (default is True)

    s1 = 'A B X B'.split()
    s2 = 'X B B'.split()
    sentences = s1, s2
    index, matrix = co_occurrence_matrix(sentences, window=3, scale=False)
    index -> {'A': 0, 'B': 1, 'X': 2}
    matrix -> array([
        [0., 2., 1.],
        [2., 2., 4.],
        [1., 4., 0.]
    ])
    index, matrix = co_occurrence_matrix(sentences, window=3, scale=True)
    index -> {'A': 0, 'B': 1, 'X': 2}
    matrix -> array([
        [0.        , 1.33333333, 0.5       ],
        [1.33333333, 1.5       , 3.5       ],
        [0.5       , 3.5       , 0.        ]
    ])
    cell = index['B'], index['X']
    matrix[cell] -> 3.5
    """
    vocab = vocabulary(sentences)
    shape = len(vocab), len(vocab)
    matrix = np.zeros(shape)
    # invert the vocabulary so each word maps to its row/column index
    index = {word: i for (i, word) in vocab.items()}
    for sentence in sentences:
        for (word1, word2), distance in distances(
            sentence,
            window=window,
            scale=scale
        ).items():
            cell = index[word1], index[word2]
            matrix[cell] += distance
            # maintain symmetry
            if word1 != word2:
                inverse_cell = index[word2], index[word1]
                matrix[inverse_cell] += distance
    return index, matrix
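

if __name__ == '__main__':
    # Minimal usage sketch (assumes the example sentences from the docstrings
    # above): build the scaled co-occurrence matrix and look up one cell.
    sentences = 'A B X B'.split(), 'X B B'.split()
    index, matrix = co_occurrence_matrix(sentences, window=3, scale=True)
    print(index)         # {'A': 0, 'B': 1, 'X': 2}
    print(matrix)
    # Read off the co-occurrence weight for a single pair, e.g. ('B', 'X'):
    cell = index['B'], index['X']
    print(matrix[cell])  # 3.5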