Skip to content

Instantly share code, notes, and snippets.

@Susensio
Last active October 26, 2023 16:10
Show Gist options
  • Star 23 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save Susensio/61f4fee01150caaac1e10fc5f005eb75 to your computer and use it in GitHub Desktop.
Save Susensio/61f4fee01150caaac1e10fc5f005eb75 to your computer and use it in GitHub Desktop.
Make function of numpy array cacheable

How to cache slow functions with numpy.array as function parameter on Python

TL;DR

from numpy_lru_cache_decorator import np_cache

@np_cache()
def function(array):
    ...

Explanation

Sometimes processing numpy arrays can be slow, even more if we are doing image analysis. Simply using functools.lru_cache won't work because numpy.array is mutable and not hashable. This workaround allows caching functions that take an arbitrary numpy.array as first parameter, other parameters are passed as is. Decorator accepts lru_cache standard parameters (maxsize=128, typed=False).

Example:

>>> array = np.array([[1, 2, 3], [4, 5, 6]])

>>> @np_cache(maxsize=256)
... def multiply(array, factor):
...     print("Calculating...")
...     return factor*array

>>> product = multiply(array, 2)
Calculating...
>>> product
array([[ 2,  4,  6],
       [ 8, 10, 12]])

>>> multiply(array, 2)
array([[ 2,  4,  6],
       [ 8, 10, 12]])

Warning: about lru_cache decorator caveats

Users must be very careful when mutable objects (list, dict, numpy.array...) are returned. A reference to the same object in memory is returned each time from the cache, not a copy. If that object is then modified, the cache itself loses its validity.

Example of this caveat:

>>> array = np.array([1, 2, 3])

>>> @np_cache()
... def to_list(array):
...     print("Calculating...")
...     return array.tolist()

>>> result = to_list(array)
Calculating...
>>> result
[1, 2, 3]

>>> result.append("this shouldn't be here")  # WARNING, DO NOT do this
>>> result
[1, 2, 3, "this shouldn't be here"]

>>> new_result = to_list(array)
>>> result
[1, 2, 3, "this shouldn't be here"]  # CACHE BROKEN!!

To avoid this mutability problem, the usual approaches must be followed. In this case, either list(result) or result[:] will create a (shallow) copy. If result were a nested list, deepcopy must be used. For numpy.array, array.copy() must be used, as neither array[:] nor numpy.array(array) will make a copy.

from functools import lru_cache, wraps
import numpy as np
def np_cache(*args, **kwargs):
    """LRU cache implementation for functions whose FIRST parameter is a numpy array.

    The array is converted into a nested tuple so it becomes hashable and can be
    used as an ``lru_cache`` key; inside the cache it is converted back to an
    array before the wrapped function runs. Accepts the standard ``lru_cache``
    parameters (``maxsize=128``, ``typed=False``).

    >>> array = np.array([[1, 2, 3], [4, 5, 6]])
    >>> @np_cache(maxsize=256)
    ... def multiply(array, factor):
    ...     print("Calculating...")
    ...     return factor * array
    >>> multiply(array, 2)
    Calculating...
    array([[ 2,  4,  6],
           [ 8, 10, 12]])
    >>> multiply(array, 2)
    array([[ 2,  4,  6],
           [ 8, 10, 12]])
    >>> multiply.cache_info()
    CacheInfo(hits=1, misses=1, maxsize=256, currsize=1)
    """
    def decorator(function):
        @wraps(function)
        def wrapper(np_array, *args, **kwargs):
            hashable_array = array_to_tuple(np_array)
            return cached_wrapper(hashable_array, *args, **kwargs)

        @lru_cache(*args, **kwargs)
        def cached_wrapper(hashable_array, *args, **kwargs):
            # NOTE(review): np.array() on nested tuples re-infers the dtype, so
            # the reconstructed array may not keep the original dtype exactly
            # (e.g. float32 input becomes float64 inside the function).
            array = np.array(hashable_array)
            return function(array, *args, **kwargs)

        def array_to_tuple(np_array):
            """Iterates recursively, so arrays of any dimension are supported."""
            try:
                return tuple(array_to_tuple(_) for _ in np_array)
            except TypeError:
                # Scalar element: recursion bottoms out here.
                return np_array

        # copy lru_cache attributes over too
        wrapper.cache_info = cached_wrapper.cache_info
        wrapper.cache_clear = cached_wrapper.cache_clear
        return wrapper
    return decorator
@Argysh
Copy link

Argysh commented Jul 22, 2019

Hey,

thanks a lot for the snippet. That's exactly what I was hoping to find!
However, this implementations seems to be rather slow when dealing with large arrays. This is due to the way the arrays are being converted to tuples. If you replace your custom function array_to_tuple(np_array) with tuple(map(tuple, np_array)) you'll get much better performance.

in:

import numpy as np
from functools import lru_cache, wraps
from timeit import default_timer

def vocalTimeit(*args, **kwargs):
    ''' provides the decorator @vocalTime which will print the name of the function as well as the
        execution time in seconds '''

    def decorator(function):
        @wraps(function)
        def wrapper(*call_args, **call_kwargs):
            started = default_timer()
            result = function(*call_args, **call_kwargs)
            elapsed = default_timer() - started
            print('{} execution time: {} s'.format(function.__name__, elapsed))
            return result
        return wrapper
    return decorator


def npCacheMap(*args, **kwargs):
    ''' LRU cache implementation for functions whose FIRST parameter is a numpy array
        forked from: https://gist.github.com/Susensio/61f4fee01150caaac1e10fc5f005eb75

        The cache key is the array's raw bytes plus its shape and dtype. This is
        much faster than a recursive tuple conversion and, unlike the previous
        ``tuple(map(tuple, ...))`` trick, works for arrays of ANY number of
        dimensions (the map-based key raised TypeError for 1-D input and produced
        an unhashable key for >2-D input). It also preserves the dtype when the
        array is rebuilt inside the cache. '''

    def decorator(function):
        @wraps(function)
        def wrapper(np_array, *args, **kwargs):
            # shape and dtype are part of the key so e.g. (2, 3) and (3, 2)
            # arrays with the same bytes do not collide
            key = (np_array.tobytes(), np_array.shape, np_array.dtype.str)
            return cached_wrapper(key, *args, **kwargs)

        @lru_cache(*args, **kwargs)
        def cached_wrapper(key, *args, **kwargs):
            data, shape, dtype = key
            # frombuffer returns a read-only view; copy so the wrapped
            # function receives an ordinary writable array
            array = np.frombuffer(data, dtype=dtype).reshape(shape).copy()
            return function(array, *args, **kwargs)

        # copy lru_cache attributes over too
        wrapper.cache_info = cached_wrapper.cache_info
        wrapper.cache_clear = cached_wrapper.cache_clear
        return wrapper
    return decorator


def npCacheFun(*args, **kwargs):
    ''' LRU cache implementation for functions whose FIRST parameter is a numpy array
        Source: https://gist.github.com/Susensio/61f4fee01150caaac1e10fc5f005eb75 '''

    def decorator(function):
        def to_nested_tuple(value):
            '''Recursively converts an array into nested tuples so it is hashable.'''
            try:
                return tuple(to_nested_tuple(item) for item in value)
            except TypeError:
                # scalar element: recursion bottoms out here
                return value

        @lru_cache(*args, **kwargs)
        def cached(key, *inner_args, **inner_kwargs):
            # rebuild a real array from the hashable key before delegating
            return function(np.array(key), *inner_args, **inner_kwargs)

        @wraps(function)
        def wrapper(np_array, *call_args, **call_kwargs):
            return cached(to_nested_tuple(np_array), *call_args, **call_kwargs)

        # expose lru_cache introspection on the public wrapper
        wrapper.cache_info = cached.cache_info
        wrapper.cache_clear = cached.cache_clear
        return wrapper
    return decorator


def npCacheMethod(*args, **kwargs):
    ''' LRU cache implementation for methods whose FIRST parameter is a numpy array
        modified from: https://gist.github.com/Susensio/61f4fee01150caaac1e10fc5f005eb75

        NOTE(review): cache entries include the instance, so the cache holds a
        strong reference to every instance it has seen for its lifetime. '''

    def decorator(function):
        @lru_cache(*args, **kwargs)
        def cached(instance, key, *inner_args, **inner_kwargs):
            # rebuild the array from the hashable key before delegating
            return function(instance, np.array(key), *inner_args, **inner_kwargs)

        @wraps(function)
        def wrapper(instance, np_array, *call_args, **call_kwargs):
            # tuple-of-tuples key: valid for exactly 2-D arrays, as in the original
            key = tuple(map(tuple, np_array))
            return cached(instance, key, *call_args, **call_kwargs)

        # expose lru_cache introspection on the public wrapper
        wrapper.cache_info = cached.cache_info
        wrapper.cache_clear = cached.cache_clear
        return wrapper
    return decorator


if __name__ == "__main__":
    from copy import copy
    from time import sleep

    # Benchmark fixture: a large 2-D array and an arbitrary extra parameter.
    array = np.random.rand(900,600)
    param = int(np.random.rand()*1000)

    print('\nwith @npCacheFun:')
    @vocalTimeit()
    @npCacheFun()
    def doSomethingFun(array, param):
        print("Calculating...")
        sleep(1)
        return True

    # The first call pays the 1 s sleep; the remaining four should hit the
    # cache, so the printed times measure the array-hashing overhead.
    for i in range(5):
        res = copy(doSomethingFun(array, param))

    print('\nwith @npCacheMap:')
    @vocalTimeit()
    @npCacheMap()
    def doSomethingMap(array, param):
        print("Calculating...")
        sleep(1)
        return True

    for i in range(5):
        res = copy(doSomethingMap(array, param))

    print('\nwith @npCacheMethod')
    class DummyClass():
        def __init__(self):
            # Run the same 5-call benchmark from inside a method.
            for i in range(5):
                res = copy(self.doSomethingMethod(array, param))

        @vocalTimeit()
        @npCacheMethod()
        def doSomethingMethod(self, array, param):
            print("Calculating...")
            sleep(1)
            return True
    dummyClass = DummyClass()

out:

with @npCacheFun:
Calculating...
doSomethingFun execution time: 1.4808940216898918 s
doSomethingFun execution time: 0.4669961463660002 s
doSomethingFun execution time: 0.46817597933113575 s
doSomethingFun execution time: 0.46563387755304575 s
doSomethingFun execution time: 0.4685899028554559 s

with @npCacheMap:
Calculating...
doSomethingMap execution time: 1.073955507017672 s
doSomethingMap execution time: 0.0814172513782978 s
doSomethingMap execution time: 0.08347152546048164 s
doSomethingMap execution time: 0.08153043873608112 s
doSomethingMap execution time: 0.08101261034607887 s

with @npCacheMethod
Calculating...
doSomethingMethod execution time: 1.075349354185164 s
doSomethingMethod execution time: 0.08380787633359432 s
doSomethingMethod execution time: 0.08389369770884514 s
doSomethingMethod execution time: 0.0843976391479373 s
doSomethingMethod execution time: 0.0834763403981924 s

@joerivstrien
Copy link

Do note that the implementation with tuple(map(tuple, np_array)) does not work on arrays with >2 Dimensions, while the recursive implementation does.

@CarloNicolini
Copy link

I've been trying to adapt this implementation to pandas DataFrame, but I am still struggling with arguments other than the first one which in this case is a Pandas dataframe.

My idea is to decompose the dataframe into a tuple with three elements, the first one is a tuple containing the index, the second one a tuple with the columns and the third one, using the array_to_tuple function contains the dataframe values.

def df_cache(*args, **kwargs):
    """LRU cache implementation for functions whose FIRST parameter is a pandas dataframe.

    The dataframe is decomposed into a hashable (index, columns, values) triple
    for use as the ``lru_cache`` key, and rebuilt before the wrapped function
    runs. Accepts the standard ``lru_cache`` parameters (``maxsize``, ``typed``).

    Fixes over the previous version:
    - ``tuple(a, b, c)`` raised ``TypeError`` (``tuple`` takes one iterable);
      the triple is now built with a plain tuple literal.
    - the rebuilt dataframe was accidentally wrapped in a 1-tuple, so the
      wrapped function received a tuple instead of a DataFrame.
    """

    def decorator(function):
        @wraps(function)
        def wrapper(df: pd.DataFrame, *args, **kwargs):
            df_tuple = dataframe_to_tuple(df)
            return cached_wrapper(df_tuple, *args, **kwargs)

        @lru_cache(*args, **kwargs)
        def cached_wrapper(df_tuple, *args, **kwargs):
            # Rebuild a real DataFrame (NOT wrapped in a tuple) before delegating.
            df = pd.DataFrame(index=df_tuple[0], columns=df_tuple[1], data=df_tuple[2])
            return function(df, *args, **kwargs)

        def array_to_tuple(np_array):
            """Iterates recursively."""
            try:
                return tuple(array_to_tuple(_) for _ in np_array)
            except TypeError:
                # Scalar element: recursion bottoms out here.
                return np_array

        def dataframe_to_tuple(df: pd.DataFrame):
            try:
                # Tuple literal, not tuple(...): tuple() takes a single iterable.
                return (
                    tuple(df.index), tuple(df.columns), array_to_tuple(df.values)
                )
            except TypeError:
                return df

        # copy lru_cache attributes over too
        wrapper.cache_info = cached_wrapper.cache_info
        wrapper.cache_clear = cached_wrapper.cache_clear

        return wrapper

    return decorator

However I am having problems with a code like this:

@df_cache
def test(df, a, b=2):
    return df +a/b

df = pd.DataFrame(list(range(10)))
test(df, 1)

getting the following error:

TypeError: decorator() takes 1 positional argument but 2 were given

I cannot understand where is the difference with the numpy cached version.

@Susensio
Copy link
Author

Susensio commented May 3, 2021

Hi, @CarloNicolini
I spotted 3 lines that were giving you errors:

First, the decorator must be called, not only mentioned, as in @df_cache() instead of @df_cache. Notice the parenthesis. This could be worked around with something like this https://stackoverflow.com/questions/3931627/how-to-build-a-decorator-with-optional-parameters

Second, my exception handling in array_to_tuple was intended to handle the final case when recursion tries to unpack a single element. In your code dataframe_to_tuple, that exception handling was hiding an error thrown when constructing the returned tupled. That error is that tuple expects a single iterable parameter. You can instead create a tuple implicitly as

return (
    tuple(df.index), tuple(df.columns), array_to_tuple(df.values)
)

instead of

return tuple(
    tuple(df.index), tuple(df.columns), array_to_tuple(df.values)
)

Finally, test function was failing to me because cached_wrapper was passing to function a tuple, and not a dataframe as is expected. So this

@lru_cache(*args, **kwargs)
def cached_wrapper(df_tuple, *args, **kwargs):
    df = (
        pd.DataFrame(index=df_tuple[0], columns=df_tuple[1], data=df_tuple[2]),
    )
    return function(df, *args, **kwargs)

should be just

@lru_cache(*args, **kwargs)
def cached_wrapper(df_tuple, *args, **kwargs):
    df = pd.DataFrame(index=df_tuple[0], columns=df_tuple[1], data=df_tuple[2])
    return function(df, *args, **kwargs)

This is the complete code that works for me

def df_cache(*args, **kwargs):
    """LRU cache implementation for functions whose FIRST parameter is a pandas dataframe"""

    def decorator(function):
        def values_to_tuple(values):
            """Recursively converts an ndarray into nested tuples."""
            try:
                return tuple(values_to_tuple(v) for v in values)
            except TypeError:
                # scalar element: recursion bottoms out here
                return values

        def freeze(df: pd.DataFrame):
            """Turns a dataframe into a hashable (index, columns, values) triple."""
            try:
                return (
                    tuple(df.index), tuple(df.columns), values_to_tuple(df.values)
                )
            except TypeError:
                return df

        @lru_cache(*args, **kwargs)
        def cached(frozen, *inner_args, **inner_kwargs):
            # rebuild a real DataFrame from the hashable triple before delegating
            rebuilt = pd.DataFrame(index=frozen[0], columns=frozen[1], data=frozen[2])
            return function(rebuilt, *inner_args, **inner_kwargs)

        @wraps(function)
        def wrapper(df: pd.DataFrame, *call_args, **call_kwargs):
            return cached(freeze(df), *call_args, **call_kwargs)

        # expose lru_cache introspection on the public wrapper
        wrapper.cache_info = cached.cache_info
        wrapper.cache_clear = cached.cache_clear

        return wrapper

    return decorator

Bear in mind that caching full dataframes this way could result in a lot of RAM usage. Maybe this post could be interesting if you run into memory overhead issues https://stackoverflow.com/questions/23477284/memory-aware-lru-caching-in-python

@kuchi
Copy link

kuchi commented Jul 16, 2021

Thanks for the helpful writeup. One additional tip, if you are returning a numpy array from the cached function, you can make the numpy array immutable by setting the WRITEABLE flag. If your return variable is x, then:

@np_cache()
def my_cached_func(in):
....
x.flags['WRITEABLE'] = False # prevent it from being changed
return x

@mcleantom
Copy link

Is there any way to modify this for a class function, so that the second item is the numpy array?

@MischaPanch
Copy link

MischaPanch commented Nov 30, 2021

@Susensio Thanks for the snippet! @mcleantom I also needed this functionality so here a slightly extended version of the snippet, where it is possible to configure the position of the numpy array. I also used @Argysh 's suggestion to optimize runtime for low-dimensional arrays. You should change to the original version if you need it to work with higher dimensional arrays

def np_cache(*lru_args, array_argument_index=0, **lru_kwargs):
    """
    LRU cache implementation for functions whose parameter at ``array_argument_index``
    is a numpy array of dimensions <= 2.

    The array argument is swapped for a tuple-of-tuples key before hitting the
    cache and rebuilt into an array before the wrapped function runs. Arrays of
    more than two dimensions are rejected with a ``RuntimeError``.

    Example:
    >>> array = np.array([[1, 2, 3], [4, 5, 6]])
    >>> @np_cache(maxsize=256)
    ... def multiply(array, factor):
    ...     return factor * array
    >>> multiply(array, 2)
    >>> multiply(array, 2)
    >>> multiply.cache_info()
    CacheInfo(hits=1, misses=1, maxsize=256, currsize=1)
    """

    def decorator(function):
        @lru_cache(*lru_args, **lru_kwargs)
        def cached_wrapper(*cached_args, **cached_kwargs):
            # swap the hashable tuple back for a real array before delegating
            rebuilt = list(cached_args)
            rebuilt[array_argument_index] = np.array(cached_args[array_argument_index])
            return function(*rebuilt, **cached_kwargs)

        @wraps(function)
        def wrapper(*call_args, **call_kwargs):
            np_array = call_args[array_argument_index]
            if len(np_array.shape) > 2:
                raise RuntimeError(
                    f"np_cache is currently only supported for arrays of dim. less than 3 but got shape: {np_array.shape}"
                )
            keyed = list(call_args)
            keyed[array_argument_index] = tuple(map(tuple, np_array))
            return cached_wrapper(*keyed, **call_kwargs)

        # copy lru_cache attributes over too
        wrapper.cache_info = cached_wrapper.cache_info
        wrapper.cache_clear = cached_wrapper.cache_clear
        return wrapper

    return decorator

@domvwt
Copy link

domvwt commented Dec 7, 2021

FYI, the joblib library implements caching arrays and dataframes to disk. It's one of the key libraries that scikit-learn depends on so it's well tried and tested.

@mcleantom
Copy link

@domvwt Thanks, I haven't heard of joblib before — it looks very useful.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment