# Sorted float secondary index: an in-memory comparison of Python, NumPy, and Redis data structures
# (C) By Athanassios I. Hatzis
# 18th of September 2018
# Written to explain issue: https://github.com/RedisLabsModules/RediSearch/issues/493
#
# OS Environment: Linux Ubuntu x64
# Processor: Intel(R) Core(TM) i3 CPU 540 @ 3.07GHz
# Memory 16 GB
from bisect import bisect_left, bisect_right
from collections import OrderedDict
import numpy as np
import time
import sys
from walrus import Database
from redisearch import Client, NumericField, Query
class SlicableOrderedDict(OrderedDict):
    """An OrderedDict that additionally supports slicing by position."""
    def __getitem__(self, k):
        if not isinstance(k, slice):
            return OrderedDict.__getitem__(self, k)
        # positional slice: walk all keys in insertion order, O(n) per slice
        x = SlicableOrderedDict()
        for idx, key in enumerate(self.keys()):
            if k.start <= idx < k.stop:
                x[key] = self[key]
        return x
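# A minimal illustration of the slicing semantics above (hypothetical toy data,
# not part of the benchmark): the slice bounds are positions, not key values,
# and both bounds must be given since the loop compares against them directly.
_demo = SlicableOrderedDict([(1.5, 'a'), (2.7, 'b'), (3.9, 'c')])
assert list(_demo[0:2].keys()) == [1.5, 2.7]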
# **********************************************************************************************
# --------------------- Utility Functions -------------------------------------------------------------
# **********************************************************************************************
def ndx_range(low, high, ndx):
    """
    :param low: lower bound of index values (exclusive)
    :param high: upper bound of index values (exclusive)
    :param ndx: sorted SlicableOrderedDict of values, i.e. the index
    :return: the slice of the sorted index whose keys lie strictly between low and high
    """
    lower_bound = find_gt(ndx, low)
    upper_bound = find_lt(ndx, high)
    return ndx[lower_bound:upper_bound]
def find_gt(ndx, k):
    """
    :param ndx: sorted OrderedDict of values
    :param k: a key value to search for
    :return: the position of the leftmost key strictly greater than k
    """
    arr = list(ndx.keys())
    arr_pos = bisect_right(arr, k)
    if arr_pos != len(arr):
        return arr_pos
    raise ValueError(f'no key greater than {k}')
def find_lt(ndx, k):
    """
    :param ndx: sorted OrderedDict of values
    :param k: a key value to search for
    :return: one past the position of the rightmost key strictly less than k,
             suitable as an exclusive slice endpoint
    """
    arr = list(ndx.keys())
    arr_pos = bisect_left(arr, k)
    if arr_pos:
        return arr_pos
    raise ValueError(f'no key less than {k}')
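# A quick sanity sketch for the three helpers above, on a hypothetical
# four-entry index: ndx_range returns the entries strictly inside (low, high).
_tiny = SlicableOrderedDict([(1.0, 0), (2.0, 1), (3.0, 2), (4.0, 3)])
assert list(ndx_range(1.5, 3.5, _tiny).keys()) == [2.0, 3.0]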
def bytes2mb(num_bytes):
    return round(num_bytes / 1024 ** 2, 1)
def array2dict(arr):
    # invert {position: value} into {value: position}
    d = dict(enumerate(arr))
    return {v: k for k, v in d.items()}
def array2ordered_dict(arr):
    # create a Python dictionary keyed by float value from the NumPy array
    d = array2dict(arr)
    # return the dictionary sorted by its float keys
    return SlicableOrderedDict(sorted(d.items(), key=lambda t: t[0]))
def floats_sample(size, interval):
    # scale NumPy's uniform [0, 1) samples into [interval[0], interval[1])
    return (interval[1] - interval[0]) * np.random.sample(size) + interval[0]
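# Quick check of the scaling above (illustrative toy draw, not benchmark data):
# three samples from [-1, 1) should all land inside that half-open interval.
_s = floats_sample(3, (-1.0, 1.0))
assert _s.shape == (3,) and ((-1.0 <= _s) & (_s < 1.0)).all()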
def redis_dict(arr):
    """
    :param arr: NumPy array of floats
    :return: dictionary mapping numerical-string-integer keys to float values
    """
    d = dict(enumerate(arr))
    return {str(k): v for k, v in d.items()}
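# Illustration of the output shape (toy input, not benchmark data):
assert redis_dict(np.array([0.5, 1.5])) == {'0': 0.5, '1': 1.5}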
# **********************************************************************************************
########## 1. Create float secondary index with Python/Numpy data structure ####################
# **********************************************************************************************
# %%%%%%%%%%%%%%%%%%%%%%%%%%% Start Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%
t1_start = time.perf_counter()
t2_start = time.process_time()
# ----------------- Create NumPy Array of Float Values ------------------------
sample_size = 250000 # sample size of 250,000 float values
sample_interval = (-5000, 5000) # sampling range of values min=-5000, max=5000
flt_arr = floats_sample(sample_size, sample_interval)
# ---------------- Create Python Ordered Dictionary to represent sorted float index
flt_sorted_ndx = array2ordered_dict(flt_arr)
t1_stop = time.perf_counter()
t2_stop = time.process_time()
print(f"Python/NumPy - elapsed time for {sample_size} floats : {int(round((t1_stop-t1_start)))} [sec]")
print(f"Python/NumPy - cpu process time for {sample_size} floats: {int(round((t2_stop-t2_start)))} [sec]")
# %%%%%%%%%%%%%%%%% End Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Python/NumPy - elapsed time for 250000 floats : 1 [sec]
# Python/NumPy - cpu process time for 250000 floats: 1 [sec]
# Sanity check: looking up a float value in the index returns its original array position
assert flt_sorted_ndx[flt_arr[29]] == 29
# To build the Redis sorted set, member keys must be strings:
# here the keys are numerical string integers and the scores are the float values
flt_scores = redis_dict(flt_arr)
# ------------------ Print Memory Size Info ------------------------------
print(f'Size: {flt_arr.size}')
print(f'Data Type: {flt_arr.dtype}')
print(f'Memory Size: {flt_arr.nbytes}')
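# Caveat: sys.getsizeof reports only the container objects themselves; the
# float keys and values inside the OrderedDict are not counted, so the total
# below is a lower bound on the true in-memory footprint.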
in_memory_total = bytes2mb(sys.getsizeof(flt_arr) + sys.getsizeof(flt_sorted_ndx))
print(f'Numpy Array and Ordered Dictionary - Total memory size: {in_memory_total} MB ')
# Size: 250000
# Data Type: float64
# Memory Size: 2000000
# Numpy Array and Ordered Dictionary - Total memory size: 23.5 MB
# ************************************************************************************************
# ############### 2. Create float secondary index with Walrus/Redis Data Structure ###############
# *************************************************************************************************
# -------------------- Create Redis-Walrus Structures --------------------------------------------
db = Database(host='localhost', port=6379, db=9)
db.flushdb()
db.dbsize()
# Float Values are stored in a Redis LIST (Walrus Array)
FloatField = db.Array('FloatField')
# Float Index is created and stored with a Redis ZSET
FloatNDX = db.ZSet('FloatNDX')
# %%%%%%%%%%%%%%%%%%%%%%%%%%% Start Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%
t1_start = time.perf_counter()
t2_start = time.process_time()
FloatField.extend(flt_arr)
FloatNDX.add(**flt_scores)
t1_stop = time.perf_counter()
t2_stop = time.process_time()
print(f"Walrus-Redis - elapsed time for {sample_size} floats : {int(round((t1_stop-t1_start)))} [sec]")
print(f"Walrus-Redis - cpu process time for {sample_size} floats: {int(round((t2_stop-t2_start)))} [sec]")
# %%%%%%%%%%%%%%%%% End Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Walrus-Redis - elapsed time for 250000 floats : 6 [sec]
# Walrus-Redis - cpu process time for 250000 floats: 5 [sec]
# Redis Server Before Insertion
# used_memory_human : 0.99 MB
# used_memory_dataset: 0.05 MB
# Redis Server After Insertion
# used_memory_human : 36.92 MB
# used_memory_dataset: 35.96 MB
# Save RDB policy: after 600 sec (10 min) if at least 10 keys changed
# Redis 4.0.8 (00000000/0) 64 bit
# ************************************************************************************************
# ############### 3. Create float secondary index with RediSearch Module ###############
# *************************************************************************************************
# Creating a client with a given index name - DVS (Data Value Store)
redclient = Client('DVS')
# Creating the index definition and schema
redclient.create_index([NumericField('flt', sortable=True)])
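# For reference (assuming the RediSearch 1.x command syntax current at the time
# of writing), the call above corresponds roughly to the raw command:
#   FT.CREATE DVS SCHEMA flt NUMERIC SORTABLE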
batchndx = redclient.batch_indexer(chunk_size=500)
# %%%%%%%%%%%%%%%%%%%%%%%%%%% Start Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%
t1_start = time.perf_counter()
t2_start = time.process_time()
for n in range(sample_size):
    batchndx.add_document(doc_id=n, flt=flt_arr[n])
batchndx.commit()
t1_stop = time.perf_counter()
t2_stop = time.process_time()
print(f"RediSearch Module - elapsed time for {sample_size} floats : {int(round((t1_stop-t1_start)))} [sec]")
print(f"RediSearch Module - cpu process time for {sample_size} floats: {int(round((t2_stop-t2_start)))} [sec]")
# %%%%%%%%%%%%%%%%% End Timing Benchmark %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# RediSearch Module - elapsed time for 250000 floats (500 chunks) : 25 [sec]
# RediSearch Module - cpu process time for 250000 floats (500 chunks) : 17 [sec]
# Redis Server Before Insertion
# used_memory_human : 0.99 MB
# used_memory_dataset: 0.05 MB
# Redis Server After Insertion
# used_memory_human : 66.62 MB
# used_memory_dataset: 54.12 MB
# Redis 4.0.8 (00000000/0) 64 bit
# Save RDB policy: after 600 sec (10 min) if at least 10 keys changed
print(f"Fields : {redclient.info()['fields']}")
print(f"Total values : {redclient.info()['num_docs']}")
print(f"Total size mb : {redclient.info()['doc_table_size_mb']}")
print(f"Sortable values size mb : {redclient.info()['sortable_values_size_mb']}")
#Fields : [[b'flt', b'type', b'NUMERIC', b'SORTABLE']]
#Total values : 250000
#Total size mb : 20.874872207641602
#Sortable values size mb : 5.7220458984375
# ************************************************************************************************
# +++++++++++++++++++ SEARCH BENCHMARKS USING THE INDEXES OF CASES 1, 2, 3 +++++++++++++++++++
# ***********************************************************************************************
val_range = (2300, 2303)
# 1. Python/NumPy
len(ndx_range(*val_range, flt_sorted_ndx))
ndx_range(*val_range, flt_sorted_ndx)
# 2. Redis/Walrus
FloatNDX.count(*val_range)
FloatNDX.range_by_score(*val_range, with_scores=True)
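# For reference (walrus internals, stated as an assumption): range_by_score
# maps to the raw Redis command ZRANGEBYSCORE FloatNDX 2300 2303 WITHSCORES,
# returning the string-index members together with their float scores.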
# 3. RediSearch Module
res = redclient.search('@flt:[2300 2303]')
res.total
res.docs
redclient.search(Query('@flt:[2300 2303]').return_fields('flt').sort_by('flt')).docs
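# A hedged variation using redisearch-py's Query paging (parameter values are
# illustrative): fetch only the first five matches, sorted ascending by flt.
redclient.search(Query('@flt:[2300 2303]').sort_by('flt').paging(0, 5)).docs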
# 1. Python Range Query Time Benchmark
%timeit -r 7 -n 10 ndx_range(*val_range, flt_sorted_ndx)
# 245 ms ± 1.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# 2. Redis Range Query Time Benchmark
%timeit -r 7 -n 1000 FloatNDX.range_by_score(*val_range, with_scores=True)
# 490 µs ± 82.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# 3. RediSearch Query Time Benchmark
%timeit -r 7 -n 1000 redclient.search('@flt:[2300 2303]')
# 434 µs ± 6.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
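# Summary of the three range-query benchmarks above: both Redis-backed indexes
# answer in under half a millisecond, roughly 500x faster than the pure-Python
# SlicableOrderedDict, whose positional slicing walks all 250,000 keys per query.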