Skip to content

Instantly share code, notes, and snippets.

@zyocum
Last active July 19, 2020 01:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zyocum/2ba0457246a4d0075149aa7d607432c1 to your computer and use it in GitHub Desktop.
Save zyocum/2ba0457246a4d0075149aa7d607432c1 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
from itertools import chain
from itertools import combinations
import numpy as np
def distances(sentence, window=3, scale=True):
    """Map sorted pairs of words in ``sentence`` to co-occurrence scores.

    Each pair of token positions (i, j) with ``|i - j| <= window``
    contributes ``1 / |i - j|`` when ``scale`` is true, or ``1.0``
    otherwise.  Word pairs are keyed in sorted order so (a, b) and (b, a)
    accumulate into the same entry.  Every pair of words that co-occurs in
    the sentence gets a key, even if all of its occurrences fall outside
    the window (such entries keep a score of 0.0).

    sentence: a tokenized sentence (list of word strings)
    window: maximum token distance for a pair to contribute (default 3)
    scale: whether to weight each contribution by 1/distance (default True)

    s1 = 'A B X B'.split()
    distances(s1, 2) -> {
        ('A', 'B'): 1.0,
        ('A', 'X'): 0.5,
        ('B', 'B'): 0.5,
        ('B', 'X'): 2.0
    }
    distances(s1, 3, scale=False) -> {
        ('A', 'B'): 2.0,
        ('A', 'X'): 1.0,
        ('B', 'B'): 1.0,
        ('B', 'X'): 2.0
    }
    """
    scores = {}
    for (i, word1), (j, word2) in combinations(enumerate(sentence), 2):
        # canonicalize the pair so (a, b) and (b, a) share one key
        key = tuple(sorted((word1, word2)))
        scores.setdefault(key, 0.0)
        distance = abs(i - j)  # always >= 1, so 1/distance cannot divide by zero
        if distance <= window:
            scores[key] += (1.0 / distance) if scale else 1.0
    return scores
def vocabulary(sentences):
    """Flatten ``sentences`` and index the unique words they contain.

    The words are sorted so that the index -> word mapping is
    deterministic across runs: plain ``set`` iteration order varies with
    Python's string-hash randomization, which would make any downstream
    co-occurrence matrix irreproducible from run to run.

    s1 = 'A B X B'.split()
    s2 = 'X B B'.split()
    sentences = s1, s2
    vocabulary(sentences) -> {0: 'A', 1: 'B', 2: 'X'}
    """
    return dict(enumerate(sorted(set(chain.from_iterable(sentences)))))
def co_occurrence_matrix(sentences, window=3, scale=True):
    """Compute a symmetric co-occurrence matrix over the sentences' vocabulary.

    Sums the pairwise distance scores (see distances()) for all word
    tokens across all sentences.

    sentences: iterable of tokenized sentences (lists of word strings)
    window: window size for considering word tokens when computing the
        distance (default is to look up to 3 tokens away)
    scale: whether to scale each co-occurrence by 1/distance (default
        True), so a pair of tokens 3 apart contributes only 1/3 while
        adjacent tokens contribute 1/1

    Returns (index, matrix) where index maps each word to its row/column
    position and matrix is a len(vocab) x len(vocab) numpy array.

    NOTE: row/column order is whatever vocabulary() produces; the
    examples below assume the index {'A': 0, 'B': 1, 'X': 2}.

    s1 = 'A B X B'.split()
    s2 = 'X B B'.split()
    sentences = s1, s2
    index, matrix = co_occurrence_matrix(sentences, window=3, scale=False)
    index -> {'A': 0, 'B': 1, 'X': 2}
    matrix -> array([
        [0., 2., 1.],
        [2., 2., 4.],
        [1., 4., 0.]
    ])
    index, matrix = co_occurrence_matrix(sentences, window=3, scale=True)
    matrix -> array([
        [0.        , 1.33333333, 0.5       ],
        [1.33333333, 1.5       , 3.5       ],
        [0.5       , 3.5       , 0.        ]
    ])
    cell = index['B'], index['X']
    matrix[cell] -> 3.5
    """
    vocab = vocabulary(sentences)
    index = {word: i for (i, word) in vocab.items()}
    matrix = np.zeros((len(vocab), len(vocab)))
    for sentence in sentences:
        for (word1, word2), distance in distances(
            sentence,
            window=window,
            scale=scale
        ).items():
            matrix[index[word1], index[word2]] += distance
            # mirror into the transposed cell to keep the matrix symmetric;
            # skip the diagonal so same-word pairs aren't double-counted
            if word1 != word2:
                matrix[index[word2], index[word1]] += distance
    return index, matrix
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment