# Sorted float secondary index: an in-memory comparison of Python, NumPy, and Redis data structures
# (C) By Athanassios I. Hatzis
# 18th of September 2018
# Written to explain issue: https://github.com/RedisLabsModules/RediSearch/issues/493
#
# OS Environment: Linux Ubuntu x64
# Processor: Intel(R) Core(TM) i3 CPU 540 @ 3.07GHz
# Memory 16 GB
from bisect import bisect_left, bisect_right
from collections import OrderedDict
import numpy as np
import time
import sys
from walrus import Database
from redisearch import Client, NumericField, Query
class SlicableOrderedDict(OrderedDict):
    """An OrderedDict that additionally supports slicing by position."""
    def __getitem__(self, k):
        if not isinstance(k, slice):
            return OrderedDict.__getitem__(self, k)
        # positional slice: walk all keys in insertion order, O(n) per slice
        x = SlicableOrderedDict()
        for idx, key in enumerate(self.keys()):
            if k.start <= idx < k.stop:
                x[key] = self[key]
        return x
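# A minimal illustration of the slicing semantics above (hypothetical toy data,
# not part of the benchmark): the slice bounds are positions, not key values,
# and both bounds must be given since the loop compares against them directly.
_demo = SlicableOrderedDict([(1.5, 'a'), (2.7, 'b'), (3.9, 'c')])
assert list(_demo[0:2].keys()) == [1.5, 2.7]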
# **********************************************************************************************
# --------------------- Utility Functions -------------------------------------------------------------
# **********************************************************************************************
def ndx_range(low, high, ndx):
    """
    :param low: lower bound of index values (exclusive)
    :param high: upper bound of index values (exclusive)
    :param ndx: sorted SlicableOrderedDict of values, i.e. the index
    :return: the slice of the sorted index whose keys lie strictly between low and high
    """
    lower_bound = find_gt(ndx, low)
    upper_bound = find_lt(ndx, high)
    return ndx[lower_bound:upper_bound]
def find_gt(ndx, k):
    """
    :param ndx: sorted OrderedDict of values
    :param k: a key value to search for
    :return: the position of the leftmost key strictly greater than k
    """
    arr = list(ndx.keys())
    arr_pos = bisect_right(arr, k)
    if arr_pos != len(arr):
        return arr_pos
    raise ValueError(f'no key greater than {k}')
def find_lt(ndx, k):
    """
    :param ndx: sorted OrderedDict of values
    :param k: a key value to search for
    :return: one past the position of the rightmost key strictly less than k,
             suitable as an exclusive slice endpoint
    """
    arr = list(ndx.keys())
    arr_pos = bisect_left(arr, k)
    if arr_pos:
        return arr_pos
    raise ValueError(f'no key less than {k}')
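# A quick sanity sketch for the three helpers above, on a hypothetical
# four-entry index: ndx_range returns the entries strictly inside (low, high).
_tiny = SlicableOrderedDict([(1.0, 0), (2.0, 1), (3.0, 2), (4.0, 3)])
assert list(ndx_range(1.5, 3.5, _tiny).keys()) == [2.0, 3.0]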
def bytes2mb(num_bytes):
    return round(num_bytes / 1024 ** 2, 1)
def array2dict(arr):
    # invert {position: value} into {value: position}
    d = dict(enumerate(arr))
    return {v: k for k, v in d.items()}
def array2ordered_dict(arr):
    # create a Python dictionary keyed by float value from the NumPy array
    d = array2dict(arr)
    # return the dictionary sorted by its float keys
    return SlicableOrderedDict(sorted(d.items(), key=lambda t: t[0]))
def floats_sample(size, interval):
    # scale NumPy's uniform [0, 1) samples into [interval[0], interval[1])
    return (interval[1] - interval[0]) * np.random.sample(size) + interval[0]
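# Quick check of the scaling above (illustrative toy draw, not benchmark data):
# three samples from [-1, 1) should all land inside that half-open interval.
_s = floats_sample(3, (-1.0, 1.0))
assert _s.shape == (3,) and ((-1.0 <= _s) & (_s < 1.0)).all()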
def redis_dict(arr):
    """
    :param arr: NumPy array of floats
    :return: dictionary mapping numerical-string-integer keys to float values
    """
    d = dict(enumerate(arr))
    return {str(k): v for k, v in d.items()}
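# Illustration of the output shape (toy input, not benchmark data):
assert redis_dict(np.array([0.5, 1.5])) == {'0': 0.5, '1': 1.5}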
# **********************************************************************************************
########## 1. Create float secondary index with Python/Numpy data structure ####################
# **********************************************************************************************
# %%%%%%%%%%%%%%%%%%%%%%%%%%% Start Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%
t1_start = time.perf_counter()
t2_start = time.process_time()
# ----------------- Create NumPy Array of Float Values ------------------------
sample_size = 250000 # sample size of 250,000 float values
sample_interval = (-5000, 5000) # sampling range of values min=-5000, max=5000
flt_arr = floats_sample(sample_size, sample_interval)
# ---------------- Create Python Ordered Dictionary to represent sorted float index
flt_sorted_ndx = array2ordered_dict(flt_arr)
t1_stop = time.perf_counter()
t2_stop = time.process_time()
print(f"Python/NumPy - elapsed time for {sample_size} floats : {int(round((t1_stop-t1_start)))} [sec]")
print(f"Python/NumPy - cpu process time for {sample_size} floats: {int(round((t2_stop-t2_start)))} [sec]")
# %%%%%%%%%%%%%%%%% End Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Python/NumPy - elapsed time for 250000 floats : 1 [sec]
# Python/NumPy - cpu process time for 250000 floats: 1 [sec]
# Sanity check: looking up a float value in the index returns its original array position
assert flt_sorted_ndx[flt_arr[29]] == 29
# To build the Redis sorted set, member keys must be strings:
# here the keys are numerical string integers and the scores are the float values
flt_scores = redis_dict(flt_arr)
# ------------------ Print Memory Size Info ------------------------------
print(f'Size: {flt_arr.size}')
print(f'Data Type: {flt_arr.dtype}')
print(f'Memory Size: {flt_arr.nbytes}')
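# Caveat: sys.getsizeof reports only the container objects themselves; the
# float keys and values inside the OrderedDict are not counted, so the total
# below is a lower bound on the true in-memory footprint.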
in_memory_total = bytes2mb(sys.getsizeof(flt_arr) + sys.getsizeof(flt_sorted_ndx))
print(f'Numpy Array and Ordered Dictionary - Total memory size: {in_memory_total} MB ')
# Size: 250000
# Data Type: float64
# Memory Size: 2000000
# Numpy Array and Ordered Dictionary - Total memory size: 23.5 MB
# ************************************************************************************************
# ############### 2. Create float secondary index with Walrus/Redis Data Structure ###############
# *************************************************************************************************
# -------------------- Create Redis-Walrus Structures --------------------------------------------
db = Database(host='localhost', port=6379, db=9)
db.flushdb()
db.dbsize()
# Float Values are stored in a Redis LIST (Walrus Array)
FloatField = db.Array('FloatField')
# Float Index is created and stored with a Redis ZSET
FloatNDX = db.ZSet('FloatNDX')
# %%%%%%%%%%%%%%%%%%%%%%%%%%% Start Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%
t1_start = time.perf_counter()
t2_start = time.process_time()
FloatField.extend(flt_arr)
FloatNDX.add(**flt_scores)
t1_stop = time.perf_counter()
t2_stop = time.process_time()
print(f"Walrus-Redis - elapsed time for {sample_size} floats : {int(round((t1_stop-t1_start)))} [sec]")
print(f"Walrus-Redis - cpu process time for {sample_size} floats: {int(round((t2_stop-t2_start)))} [sec]")
# %%%%%%%%%%%%%%%%% End Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Walrus-Redis - elapsed time for 250000 floats : 6 [sec]
# Walrus-Redis - cpu process time for 250000 floats: 5 [sec]
# Redis Server Before Insertion
# used_memory_human : 0.99 MB
# used_memory_dataset: 0.05 MB
# Redis Server After Insertion
# used_memory_human : 36.92 MB
# used_memory_dataset: 35.96 MB
# Save RDB policy: after 600 sec (10 min) if at least 10 keys changed
# Redis 4.0.8 (00000000/0) 64 bit
# ************************************************************************************************
# ############### 3. Create float secondary index with RediSearch Module ###############
# *************************************************************************************************
# Creating a client with a given index name - DVS (Data Value Store)
redclient = Client('DVS')
# Creating the index definition and schema
redclient.create_index([NumericField('flt', sortable=True)])
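# For reference (assuming the RediSearch 1.x command syntax current at the time
# of writing), the call above corresponds roughly to the raw command:
#   FT.CREATE DVS SCHEMA flt NUMERIC SORTABLE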
batchndx = redclient.batch_indexer(chunk_size=500)
# %%%%%%%%%%%%%%%%%%%%%%%%%%% Start Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%
t1_start = time.perf_counter()
t2_start = time.process_time()
for n in range(sample_size):
    batchndx.add_document(doc_id=n, flt=flt_arr[n])
batchndx.commit()
t1_stop = time.perf_counter()
t2_stop = time.process_time()
print(f"RediSearch Module - elapsed time for {sample_size} floats : {int(round((t1_stop-t1_start)))} [sec]")
print(f"RediSearch Module - cpu process time for {sample_size} floats: {int(round((t2_stop-t2_start)))} [sec]")
# %%%%%%%%%%%%%%%%% End Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# RediSearch Module - elapsed time for 250000 floats (500 chunks) : 25 [sec]
# RediSearch Module - cpu process time for 250000 floats (500 chunks) : 17 [sec]
# Redis Server Before Insertion
# used_memory_human : 0.99 MB
# used_memory_dataset: 0.05 MB
# Redis Server After Insertion
# used_memory_human : 66.62 MB
# used_memory_dataset: 54.12 MB
# Redis 4.0.8 (00000000/0) 64 bit
# Save RDB policy: after 600 sec (10 min) if at least 10 keys changed
print(f"Fields : {redclient.info()['fields']}")
print(f"Total values : {redclient.info()['num_docs']}")
print(f"Total size mb : {redclient.info()['doc_table_size_mb']}")
print(f"Sortable values size mb : {redclient.info()['sortable_values_size_mb']}")
#Fields : [[b'flt', b'type', b'NUMERIC', b'SORTABLE']]
#Total values : 250000
#Total size mb : 20.874872207641602
#Sortable values size mb : 5.7220458984375
# ************************************************************************************************
# +++++++++++++++++++ SEARCH BENCHMARKS USING THE INDEXES OF CASES 1, 2, 3 +++++++++++++++++++
# ***********************************************************************************************
val_range = (2300, 2303)
# 1. Python/NumPy
len(ndx_range(*val_range, flt_sorted_ndx))
ndx_range(*val_range, flt_sorted_ndx)
# 2. Redis/Walrus
FloatNDX.count(*val_range)
FloatNDX.range_by_score(*val_range, with_scores=True)
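# For reference (walrus internals, stated as an assumption): range_by_score
# maps to the raw Redis command ZRANGEBYSCORE FloatNDX 2300 2303 WITHSCORES,
# returning the string-index members together with their float scores.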
# 3. RediSearch Module
res = redclient.search('@flt:[2300 2303]')
res.total
res.docs
redclient.search(Query('@flt:[2300 2303]').return_fields('flt').sort_by('flt')).docs
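# A hedged variation using redisearch-py's Query paging (parameter values are
# illustrative): fetch only the first five matches, sorted ascending by flt.
redclient.search(Query('@flt:[2300 2303]').sort_by('flt').paging(0, 5)).docs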
# 1. Python Range Query Time Benchmark
%timeit -r 7 -n 10 ndx_range(*val_range, flt_sorted_ndx)
# 245 ms ± 1.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# 2. Redis Range Query Time Benchmark
%timeit -r 7 -n 1000 FloatNDX.range_by_score(*val_range, with_scores=True)
# 490 µs ± 82.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# 3. RediSearch Query Time Benchmark
%timeit -r 7 -n 1000 redclient.search('@flt:[2300 2303]')
# 434 µs ± 6.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
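# Summary of the three range-query benchmarks above: both Redis-backed indexes
# answer in under half a millisecond, roughly 500x faster than the pure-Python
# SlicableOrderedDict, whose positional slicing walks all 250,000 keys per query.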