Skip to content

Instantly share code, notes, and snippets.

@Linux-cpp-lisp
Last active October 12, 2017 16:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Linux-cpp-lisp/688487fa81c64c9799d34b56b09d6be5 to your computer and use it in GitHub Desktop.
A memory-optimized algorithm for computing N(epsilon) when finding the capacity dimension of a dataset
#boxes.py: Compute N(epsilon) for the "box-counting" or capacity dimension of a data set.
# Alby Musaelian, 12/2016.
import numpy as np
import warnings
def memory_optimized_n_eps(traj,
                           eps,
                           memory_slot_limit=100*1024*1024,
                           memory_limit_mode='error',
                           save_extra_memory=False,
                           logger=None):
    """Compute N(eps) for an n-dimensional time series.

    Compute the number of squares/cubes/hypercubes of side length `eps`
    needed to cover all points in the time-series `traj` (the box count
    used for the "box-counting" or capacity dimension).

    Returns N(eps) (an integer).

    IMPORTANT: !! -- Modifies the order and values of the trajectory in place. -- !!

    Params:
     - traj: The n-dimensional trajectory to compute on, as a 2D array of
        shape (n_points, n_dimensions). Modified IN-PLACE!
     - eps: The side length of the squares/cubes/hypercubes to count.
     - memory_slot_limit: Approximate hard limit on the size (in bytes) of the
        box grid matrix used by the algorithm. This parameter can be used to
        control the memory use of the algorithm; however, it has a lower bound
        that depends on epsilon and the data itself. The algorithm's behaviour
        if the limit is below the lower bound is controlled by
        `memory_limit_mode`. Defaults to 100Mb.
     - memory_limit_mode: Either 'error' or 'increase'. When 'error' is set, if
        the least memory needed to run the computation is above
        `memory_slot_limit`, an exception is raised. In 'increase' mode, the
        amount of memory used will be increased to the lowest amount above the
        limit needed to run the computation. Defaults to 'error'.
     - save_extra_memory: When true, the algorithm will sort completely in
        place without creating an extra array of indexes. Defaults to False.
        When this mode is enabled, only C contiguous arrays (meaning, for
        example, no views or slices) can be provided for `traj`.
     - logger: A `Logger` to log information to. Defaults to `None`, or no
        logging.

    Raises:
     - ValueError: if `memory_slot_limit` is too small in 'error' mode, or if
        `traj` is not C contiguous in `save_extra_memory` mode.
    """
    assert memory_limit_mode in ['error', 'increase']
    if not traj.flags.owndata:
        warnings.warn("`memory_optimized_n_eps` modifies `traj` in place; "
                      "the provided `traj` does not own its data and the computation may therefore unintentionally modify data elsewhere")
    if save_extra_memory and not traj.flags.c_contiguous:
        raise ValueError("`traj` must be C contiguous when in `save_extra_memory` mode.\n"
                         "(Generally, this error means a view or slice was passed to `memory_optimized_n_eps`; "
                         "try using `np.ascontiguousarray()` or `.copy(order='C')` or setting `save_extra_memory` to False)")
    mins = np.amin(traj, axis=0)
    maxes = np.amax(traj, axis=0)
    # Number of boxes along each dimension. A point's box index is
    # floor((coord - min) / eps), so indices run 0..floor(range / eps)
    # inclusive and the grid needs floor(range / eps) + 1 slots per dimension.
    # (A ceil() formulation is one slot short -- an IndexError -- whenever a
    # data range is an exact multiple of eps, and degenerates to a 0-size grid
    # for a constant dimension.) Integer dtype is required because np.zeros
    # rejects float shapes.
    orig_shape = list(np.floor((maxes - mins) / float(eps)).astype(int) + 1)
    # Slice along the dimension with the most boxes so each slice is as thin
    # (cheap) as possible in memory.
    slicing_dimension = int(np.argmax(orig_shape))
    if logger:
        logger.info("Slicing Dimension: %d", slicing_dimension)
    # Compute the slice size: one bool (1 byte) per box, so a slice of
    # thickness k costs k * (product of the other dimensions) bytes.
    other_dimension_product = int(np.prod(orig_shape[0:slicing_dimension] + orig_shape[slicing_dimension+1:]))
    slice_size = int(memory_slot_limit // other_dimension_product)
    if slice_size < 1:
        if memory_limit_mode == 'error':
            error = ("memory_slot_limit must be large enough for a step size of at least 1. "
                     "For these parameters, that implies a memory_slot_limit of at least %s"
                     )
            raise ValueError(error % other_dimension_product)
        else:
            slice_size = 1
    # If the slice size is bigger than the data, clamp it to the size of the data.
    if slice_size > orig_shape[slicing_dimension]:
        slice_size = int(orig_shape[slicing_dimension])
    # Sort trajectory (ascending) along the axis with the most boxes (largest
    # data range) so the data can be consumed slice by slice.
    if save_extra_memory:
        # In-place sorting hack: view the rows as a structured array and sort
        # on the field corresponding to the slicing dimension.
        # From http://stackoverflow.com/questions/2828059/sorting-arrays-in-numpy-by-column
        traj.view(','.join([traj.dtype.str] * traj.shape[1])).sort(order=['f%d' % slicing_dimension], axis=0)
    else:
        # argsort-based fancy indexing: simpler, but allocates an index array
        # and a reordered copy of the data.
        traj = traj[traj[:, slicing_dimension].argsort()]
    # Move data origin to lower left corner of box grid.
    traj -= mins
    # Create the slice grid: full extent in every dimension except the slicing
    # one, which is truncated to `slice_size`.
    shape = orig_shape[:]
    shape[slicing_dimension] = slice_size
    slice_grid = np.zeros(shape=shape, dtype=bool)
    box_count = 0
    slice_index = 0
    if logger:
        logger.info("Slice Size (boxes): %s", slice_size)
        logger.info("Slice Size (input units): %s", slice_size * eps)
        logger.info("Slice Grid Size (bytes): %s", slice_grid.nbytes)
    lastpt = -np.inf
    for pt in traj:
        # Sanity check: the slice-walking below relies on points arriving in
        # ascending order along the slicing dimension.
        assert lastpt <= pt[slicing_dimension], \
            "Fatal error: an out-of-order point was found. If using `save_extra_memory` mode, try turning it off."
        lastpt = pt[slicing_dimension]
        # Get the box index of the point in the overall grid:
        box_index = np.floor(pt / eps).astype(int)
        # Translate that index to be relative to the current slice in the slicing dimension:
        box_index[slicing_dimension] -= slice_index * slice_size
        # If the index is bigger than the slice size, move to another slice:
        if box_index[slicing_dimension] >= slice_size:
            # Finish the current slice:
            box_count += slice_grid.sum()
            slice_grid.fill(False)
            # Skip enough slices to bring the index within range:
            slices_moved, new_index = divmod(box_index[slicing_dimension], slice_size)
            slice_index += int(slices_moved)
            box_index[slicing_dimension] = new_index
        # Mark the point's box as used:
        slice_grid[tuple(box_index)] = True
    # Add any remaining boxes to the sum:
    box_count += slice_grid.sum()
    return int(box_count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment