Skip to content

Instantly share code, notes, and snippets.

@toobaz
Last active February 28, 2024 14:36
Show Gist options
  • Save toobaz/e40ecfcf0e77ba158d8ac60a39faeaf9 to your computer and use it in GitHub Desktop.
import numpy as np
import scipy.sparse
def _alt_lexsort(arr, kind='stable'):
"""
Optimized and more flexible (allows to choose sorting algorithm) version of
np.lexsort.
Assumes positive elements such that the product of the maximum values in
each layer of arr, multiplied by the length of each layer
(i.e., arr[0].shape[0]), plus some margin for roundings, is lower than
maxint.
Only tested with integers.
"""
n_layers = len(arr)
bits_for_layers = [int(np.ceil(np.log2(max(arr[i].max(), 1))))
for i in range(n_layers-1)] + [0]
# Go backwards for consistency with np.lexsort
prod = np.bitwise_or.reduce([arr[i] << bits_for_layers[n_layers - i-1]
for i in range(n_layers)])
return np.argsort(prod, kind=kind)
def coo_get_block(mat, rows, columns):
    """
    Fancy-indexed submatrix getter.

    Parameters
    ----------
    mat : scipy.sparse COO matrix/array.
    rows, columns : 1-D sequences of (possibly negative) indices, without
        repetitions.

    Returns
    -------
    scipy.sparse.coo_array of shape (len(rows), len(columns)) holding the
    selected block of ``mat``.

    Raises
    ------
    NotImplementedError
        If ``rows`` or ``columns`` contain repeated indices.
    """
    # Copy so the negative-index translation below cannot mutate arrays
    # owned by the caller (np.asarray would alias an ndarray argument):
    rows = np.array(rows, copy=True)
    columns = np.array(columns, copy=True)
    if (len(rows) > len(np.unique(rows))
            or len(columns) > len(np.unique(columns))):
        raise NotImplementedError("Repeated rows or columns are unsupported")
    new_shape = len(rows), len(columns)
    # Translation of negative indices:
    rows[rows < 0] += mat.shape[0]
    columns[columns < 0] += mat.shape[1]
    # Position of each original row/column in the new matrix (-1 = absent):
    rows_pos = -np.ones((mat.shape[0],), dtype=int)
    rows_pos[rows] = np.arange(new_shape[0], dtype=int)
    cols_pos = -np.ones((mat.shape[1],), dtype=int)
    cols_pos[columns] = np.arange(new_shape[1], dtype=int)
    # Keep only the stored entries falling inside the selected block:
    is_sel = np.isin(mat.row, rows) & np.isin(mat.col, columns)
    data, row, col = (arr[is_sel] for arr in (mat.data, mat.row, mat.col))
    return scipy.sparse.coo_array((data, (rows_pos[row], cols_pos[col])),
                                  shape=new_shape, dtype=mat.dtype)
def coo_set_block(mat, other, rows, columns):
    """
    Fancy-indexed submatrix setter.

    Overwrites, in place, the block of ``mat`` addressed by
    ``rows`` x ``columns`` with the content of ``other``; any existing
    entries of ``mat`` inside the block are discarded first.

    Parameters
    ----------
    mat : scipy.sparse COO matrix/array, modified in place.
    other : array-like or sparse, of shape (len(rows), len(columns)).
    rows, columns : 1-D sequences of (possibly negative) indices.

    Raises
    ------
    ValueError
        If ``other``'s shape does not match the addressed block.
    """
    # Copy so the negative-index translation below cannot mutate arrays
    # owned by the caller (np.asarray would alias an ndarray argument):
    rows = np.array(rows, copy=True)
    columns = np.array(columns, copy=True)
    # Public API check instead of the private scipy.sparse._coo._coo_base,
    # which can move or disappear between scipy releases:
    if not (scipy.sparse.issparse(other) and other.format == 'coo'):
        other = scipy.sparse.coo_array(other)
    new_shape = len(rows), len(columns)
    if other.shape != new_shape:
        raise ValueError(f"other has shape {other.shape}, "
                         f"expected {new_shape}")
    # Translation of negative indices:
    rows[rows < 0] += mat.shape[0]
    columns[columns < 0] += mat.shape[1]
    # Entries of mat outside the addressed block are kept:
    keep_mask = ~np.isin(mat.row, rows) | ~np.isin(mat.col, columns)
    kept_row, kept_col, kept_data = (arr[keep_mask]
                                     for arr in (mat.row, mat.col, mat.data))
    # Map the block's local coordinates back into mat's coordinates:
    new_row = np.concatenate([kept_row, rows[other.row]])
    new_col = np.concatenate([kept_col, columns[other.col]])
    new_data = np.concatenate([kept_data, other.data])
    # Keep entries ordered (by column, then row):
    order = np.lexsort((new_row, new_col))
    mat.data = new_data[order]
    mat.col = new_col[order]
    mat.row = new_row[order]
OP_ATTRS = {'sum': 'add', 'sub': 'sub', 'div': 'truediv', 'mul': 'mul'}


def coo_binary_op(mat, other, op='sum', sorter=None):
    """
    Binary operation between COO sparse matrices.

    Only supports operations such that op(0, 0) = 0, hence preserving
    sparsity (e.g. not np.power).  ``mat`` is always the left operand:
    the result holds op(mat, other), with missing entries treated as 0.

    Note that sum could be fastpathed (relying on .sum_duplicates()).

    Parameters
    ----------
    mat, other : scipy.sparse COO matrices/arrays of the same shape.
        Canonicalized in place via .sum_duplicates().
    op : str
        A key of OP_ATTRS, or directly a dunder name such as 'add'.
    sorter : callable or None
        np.lexsort-compatible callable used to order the concatenated
        coordinates; defaults to _alt_lexsort, which better exploits the
        partial sortedness of the data.

    Returns
    -------
    scipy.sparse.coo_array with the shape of ``mat``.
    """
    if sorter is None:
        sorter = _alt_lexsort
    # We need canonical format (we don't necessarily care about order -
    # except for performance - but we do care about unicity):
    mat.sum_duplicates()
    other.sum_duplicates()
    all_row = np.concatenate([mat.row, other.row])
    all_col = np.concatenate([mat.col, other.col])
    all_data = np.concatenate([mat.data, other.data])
    # 0 marks entries coming from mat, 1 entries coming from other:
    all_ind = np.concatenate([np.zeros(len(mat.data), dtype=int),
                              np.ones(len(other.data), dtype=int)])
    # Sort by (row, col, ind): entries sharing a cell become adjacent,
    # with the mat entry first.
    order = sorter((all_ind, all_col, all_row))
    ord_row, ord_col, ord_data, ord_ind = (arr[order]
                                           for arr in (all_row, all_col,
                                                       all_data, all_ind))
    # common_cells[i] (padded): entry i shares its cell with entry i-1.
    common_cells = np.zeros(len(ord_data) + 1, dtype=bool)
    common_cells[1:-1] = ((ord_row[1:] == ord_row[:-1]) &
                          (ord_col[1:] == ord_col[:-1]))
    # Last element cannot possibly be a duplicate with indicator 0
    only_mat = (ord_ind == 0) & ~common_cells[1:]
    # First element cannot possibly be a duplicate with indicator 1
    only_other = (ord_ind == 1) & ~common_cells[:-1]
    op = OP_ATTRS.get(op, op)
    if np.any(common_cells):
        # For each cell present in both operands, combine in place into
        # the first (mat) entry; the second (other) entry is dropped
        # below.  mat must be the LEFT operand so that non-commutative
        # ops (sub, div) are correct - the original code applied
        # op(other, mat), i.e. swapped operands, for these cells.
        mat_vals = ord_data[common_cells[1:]]
        other_vals = ord_data[common_cells[:-1]]
        ord_data[common_cells[1:]] = getattr(mat_vals, f'__{op}__')(other_vals)
    # Cells present in a single operand are combined with 0.
    # NOTE(review): for 'div' on integer data the float result is
    # truncated by these in-place assignments - confirm intended.
    ord_data[only_mat] = getattr(ord_data[only_mat], f'__{op}__')(0)
    ord_data[only_other] = getattr(ord_data[only_other], f'__r{op}__')(0)
    # Now "squeeze" by removing duplicate indices:
    keep = ~common_cells[:-1]
    return scipy.sparse.coo_array((ord_data[keep],
                                   (ord_row[keep], ord_col[keep])),
                                  shape=mat.shape, dtype=ord_data.dtype)
def test():
    """
    End-to-end check: extract a block of a large sparse matrix, add a
    second matrix to it, write the block back, and verify both the grand
    total and one specific cell.
    """
    # Local seeded generator so failures are reproducible (the original
    # drew from the unseeded global state):
    rng = np.random.RandomState(0)
    # 7.28 TiB in dense representation:
    m1 = scipy.sparse.lil_array((10**6, 10**6), dtype=int)
    # 1.82 TiB in dense representation:
    m2 = scipy.sparse.lil_array((5*10**5, 5*10**5), dtype=int)
    coords_1 = rng.randint(0, 10**6, (10**3, 2))
    coords_2 = rng.randint(0, 10**5, (10**3, 2))
    for idx, (x, y) in enumerate(coords_1):
        m1[x, y] = idx
    for idx, (x, y) in enumerate(coords_2):
        m2[x, y] = idx
    # Two cells that will actually add up below:
    m1[1*10**5 + 10, 2*10**5 + 10] = m2[10, 10] = -1
    m1 = scipy.sparse.coo_array(m1)
    m2 = scipy.sparse.coo_array(m2)
    rows = np.concatenate([np.arange(1*10**5, 3*10**5),
                           np.arange(4*10**5, 7*10**5)])
    cols = np.concatenate([np.arange(2*10**5, 5*10**5),
                           np.arange(7*10**5, 9*10**5)])
    extracted = coo_get_block(m1, rows, cols)
    added = coo_binary_op(extracted, m2)
    coo_set_block(m1, added, rows, cols)
    # Sum from 0 to 999, times 2, minus 2 manually set:
    total = m1.sum()
    assert total == 998998, total
    the_element = scipy.sparse.csr_array(m1)[1*10**5 + 10, 2*10**5 + 10]
    assert the_element == -2, the_element


if __name__ == '__main__':
    test()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment