Skip to content

Instantly share code, notes, and snippets.

@Linux-cpp-lisp
Last active October 12, 2017 16:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Linux-cpp-lisp/688487fa81c64c9799d34b56b09d6be5 to your computer and use it in GitHub Desktop.
A memory-optimized algorithm for computing N(epsilon) when finding the capacity dimension of a dataset
#boxes.py: Compute N(epsilon) for the "box-counting" or capacity dimension of a data set.
# Alby Musaelian, 12/2016.
import numpy as np
import warnings
def memory_optimized_n_eps(traj,
                           eps,
                           memory_slot_limit=100*1024*1024,
                           memory_limit_mode='error',
                           save_extra_memory=False,
                           logger=None):
    """Compute N(eps) for an n-dimensional time series.

    Compute the number of squares/cubes/hypercubes of side length `eps`
    needed to cover all points in the time-series `traj` (the box count
    used for the "box-counting" or capacity dimension).

    Returns N(eps) (an integer).

    IMPORTANT: !! -- Modifies the order and values of the trajectory in place. -- !!

    Params:
     - traj: The n-dimensional trajectory to compute on, as a 2D array of
        shape (n_points, n_dimensions). Modified IN-PLACE!
     - eps: The side length of the squares/cubes/hypercubes to count.
     - memory_slot_limit: Approximate hard limit on the size (in bytes) of the
        box grid matrix used by the algorithm. This parameter can be used to
        control the memory use of the algorithm; however, it has a lower bound
        that depends on epsilon and the data itself. The algorithm's behaviour
        if the limit is below the lower bound is controlled by
        `memory_limit_mode`. Defaults to 100Mb.
     - memory_limit_mode: Either 'error' or 'increase'. When 'error' is set, if
        the least memory needed to run the computation is above
        `memory_slot_limit`, an exception is raised. In 'increase' mode, the
        amount of memory used will be increased to the lowest amount above the
        limit needed to run the computation. Defaults to 'error'.
     - save_extra_memory: When true, the algorithm will sort completely in
        place without creating an extra array of indexes. Defaults to False.
        When this mode is enabled, only C contiguous arrays (meaning, for
        example, no views or slices) can be provided for `traj`.
     - logger: A `Logger` to log information to. Defaults to `None`, or no
        logging.

    Raises:
     - ValueError: if `memory_slot_limit` is too small in 'error' mode, or if
        `traj` is not C contiguous in `save_extra_memory` mode.
    """
    assert memory_limit_mode in ['error', 'increase']
    if not traj.flags.owndata:
        warnings.warn("`memory_optimized_n_eps` modifies `traj` in place; "
                      "the provided `traj` does not own its data and the computation may therefore unintentionally modify data elsewhere")
    if save_extra_memory and not traj.flags.c_contiguous:
        raise ValueError("`traj` must be C contiguous when in `save_extra_memory` mode.\n"
                         "(Generally, this error means a view or slice was passed to `memory_optimized_n_eps`; "
                         "try using `np.ascontiguousarray()` or `.copy(order='C')` or setting `save_extra_memory` to False)")
    mins = np.amin(traj, axis=0)
    maxes = np.amax(traj, axis=0)
    # Number of boxes along each dimension. A point's box index is
    # floor((coord - min) / eps), so indices run 0..floor(range / eps)
    # inclusive and the grid needs floor(range / eps) + 1 slots per dimension.
    # (A ceil() formulation is one slot short -- an IndexError -- whenever a
    # data range is an exact multiple of eps, and degenerates to a 0-size grid
    # for a constant dimension.) Integer dtype is required because np.zeros
    # rejects float shapes.
    orig_shape = list(np.floor((maxes - mins) / float(eps)).astype(int) + 1)
    # Slice along the dimension with the most boxes so each slice is as thin
    # (cheap) as possible in memory.
    slicing_dimension = int(np.argmax(orig_shape))
    if logger:
        logger.info("Slicing Dimension: %d", slicing_dimension)
    # Compute the slice size: one bool (1 byte) per box, so a slice of
    # thickness k costs k * (product of the other dimensions) bytes.
    other_dimension_product = int(np.prod(orig_shape[0:slicing_dimension] + orig_shape[slicing_dimension+1:]))
    slice_size = int(memory_slot_limit // other_dimension_product)
    if slice_size < 1:
        if memory_limit_mode == 'error':
            error = ("memory_slot_limit must be large enough for a step size of at least 1. "
                     "For these parameters, that implies a memory_slot_limit of at least %s"
                     )
            raise ValueError(error % other_dimension_product)
        else:
            slice_size = 1
    # If the slice size is bigger than the data, clamp it to the size of the data.
    if slice_size > orig_shape[slicing_dimension]:
        slice_size = int(orig_shape[slicing_dimension])
    # Sort trajectory (ascending) along the axis with the most boxes (largest
    # data range) so the data can be consumed slice by slice.
    if save_extra_memory:
        # In-place sorting hack: view the rows as a structured array and sort
        # on the field corresponding to the slicing dimension.
        # From http://stackoverflow.com/questions/2828059/sorting-arrays-in-numpy-by-column
        traj.view(','.join([traj.dtype.str] * traj.shape[1])).sort(order=['f%d' % slicing_dimension], axis=0)
    else:
        # argsort-based fancy indexing: simpler, but allocates an index array
        # and a reordered copy of the data.
        traj = traj[traj[:, slicing_dimension].argsort()]
    # Move data origin to lower left corner of box grid.
    traj -= mins
    # Create the slice grid: full extent in every dimension except the slicing
    # one, which is truncated to `slice_size`.
    shape = orig_shape[:]
    shape[slicing_dimension] = slice_size
    slice_grid = np.zeros(shape=shape, dtype=bool)
    box_count = 0
    slice_index = 0
    if logger:
        logger.info("Slice Size (boxes): %s", slice_size)
        logger.info("Slice Size (input units): %s", slice_size * eps)
        logger.info("Slice Grid Size (bytes): %s", slice_grid.nbytes)
    lastpt = -np.inf
    for pt in traj:
        # Sanity check: the slice-walking below relies on points arriving in
        # ascending order along the slicing dimension.
        assert lastpt <= pt[slicing_dimension], \
            "Fatal error: an out-of-order point was found. If using `save_extra_memory` mode, try turning it off."
        lastpt = pt[slicing_dimension]
        # Get the box index of the point in the overall grid:
        box_index = np.floor(pt / eps).astype(int)
        # Translate that index to be relative to the current slice in the slicing dimension:
        box_index[slicing_dimension] -= slice_index * slice_size
        # If the index is bigger than the slice size, move to another slice:
        if box_index[slicing_dimension] >= slice_size:
            # Finish the current slice:
            box_count += slice_grid.sum()
            slice_grid.fill(False)
            # Skip enough slices to bring the index within range:
            slices_moved, new_index = divmod(box_index[slicing_dimension], slice_size)
            slice_index += int(slices_moved)
            box_index[slicing_dimension] = new_index
        # Mark the point's box as used:
        slice_grid[tuple(box_index)] = True
    # Add any remaining boxes to the sum:
    box_count += slice_grid.sum()
    return int(box_count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment