# internaut/cooc.py

## cooc.py
import numpy as np


def word_cooccurrence(dtm):
    """
    Calculate the co-document frequency (aka word co-occurrence) matrix for a document-term matrix `dtm`, i.e. how often
    each pair of tokens occurs together at least once in the same document.

    Entry (i, j) of the result is the number of documents that contain both term i and term j; the diagonal therefore
    holds the plain document frequency of each term.

    :param dtm: (sparse) document-term-matrix of size NxM (N docs, M is vocab size) with raw term counts.
    :return: co-document frequency (aka word co-occurrence) matrix with shape MxM
    :raises ValueError: if `dtm` is not two-dimensional
    """
    if dtm.ndim != 2:
        raise ValueError('`dtm` must be a 2D array/matrix')

    # Binarize: only presence of a term in a document matters, not its count.
    # The builtin `int` is used because the `np.int` alias was removed in NumPy 1.24.
    bin_dtm = (dtm >= 1).astype(int)

    # (M x N) @ (N x M): summing over documents counts shared occurrences per term pair.
    return bin_dtm.T @ bin_dtm

## test_cooc.py
import numpy as np
from hypothesis import given, strategies as st
from hypothesis.extra.numpy import arrays, array_shapes

from cooc import word_cooccurrence


# `array_shapes` takes keyword-only arguments in current Hypothesis releases, and the `np.int` alias was removed in
# NumPy 1.24, so the builtin `int` is used as dtype throughout.
@given(dtm=arrays(int, array_shapes(min_dims=2, max_dims=2), elements=st.integers(min_value=0, max_value=1000)))
def test_word_cooccurrence(dtm):
    """
    Property test for `word_cooccurrence` on random 2D integer count matrices.

    Checks type, dtype, shape, value bounds and symmetry of the result, plus two special cases: an all-zero input
    and an identity-matrix input.
    """
    res = word_cooccurrence(dtm)

    n_docs, vocab_size = dtm.shape

    assert isinstance(res, np.ndarray)
    assert res.dtype == int
    assert res.ndim == 2
    assert res.shape == (vocab_size, vocab_size)
    # a pair of terms cannot co-occur in more documents than exist
    assert np.all((res >= 0) & (res <= n_docs))
    assert np.array_equal(res, res.T)

    # no term occurs anywhere -> no co-occurrences
    if np.array_equal(dtm, np.zeros(dtm.shape, dtype=int)):
        assert np.array_equal(res, np.zeros(res.shape, dtype=int))

    # every document holds exactly one distinct term -> result is the identity again
    ident = np.eye(n_docs)
    if n_docs == vocab_size and np.array_equal(dtm, ident):
        assert np.array_equal(res, ident)
	import numpy as np


	def word_cooccurrence(dtm):
		"""
		Calculate the co-document frequency (aka word co-occurrence) matrix for a document-term matrix `dtm`, i.e. how often
		each pair of tokens occurs together at least once in the same document.

		:param dtm: (sparse) document-term-matrix of size NxM (N docs, M is vocab size) with raw term counts.
		:return: co-document frequency (aka word co-occurrence) matrix with shape MxM
		:raises ValueError: if `dtm` is not two-dimensional
		"""
		if dtm.ndim != 2:
			raise ValueError('`dtm` must be a 2D array/matrix')

		# Binarize: only presence of a term in a document matters, not its count.
		# The builtin `int` is used because the `np.int` alias was removed in NumPy 1.24.
		bin_dtm = (dtm >= 1).astype(int)

		# (M x N) @ (N x M): summing over documents counts shared occurrences per term pair.
		return bin_dtm.T @ bin_dtm
	import numpy as np
	from hypothesis import given, strategies as st
	from hypothesis.extra.numpy import arrays, array_shapes

	from cooc import word_cooccurrence


	# `array_shapes` takes keyword-only arguments in current Hypothesis releases, and the `np.int` alias was removed in
	# NumPy 1.24, so the builtin `int` is used as dtype throughout.
	@given(dtm=arrays(int, array_shapes(min_dims=2, max_dims=2), elements=st.integers(min_value=0, max_value=1000)))
	def test_word_cooccurrence(dtm):
		"""
		Property test for `word_cooccurrence` on random 2D integer count matrices.

		Checks type, dtype, shape, value bounds and symmetry of the result, plus two special cases: an all-zero input
		and an identity-matrix input.
		"""
		res = word_cooccurrence(dtm)

		n_docs, vocab_size = dtm.shape

		assert isinstance(res, np.ndarray)
		assert res.dtype == int
		assert res.ndim == 2
		assert res.shape == (vocab_size, vocab_size)
		# a pair of terms cannot co-occur in more documents than exist
		assert np.all((res >= 0) & (res <= n_docs))
		assert np.array_equal(res, res.T)

		# no term occurs anywhere -> no co-occurrences
		if np.array_equal(dtm, np.zeros(dtm.shape, dtype=int)):
			assert np.array_equal(res, np.zeros(res.shape, dtype=int))

		# every document holds exactly one distinct term -> result is the identity again
		ident = np.eye(n_docs)
		if n_docs == vocab_size and np.array_equal(dtm, ident):
			assert np.array_equal(res, ident)