@wphicks · Created April 10, 2023 16:36
Comparison of perf for various forest inference implementations
import cupy as cp
import os
import numpy as np
import treelite
import treelite_runtime
import xgboost as xgb
from time import perf_counter
from cuml.common.device_selection import using_device_type
from cuml.experimental import ForestInference
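

# NOTE: This benchmark assumes an XGBoost model saved as '0.model' with 792
# input features (the shape of the random benchmark data used below). If you
# need a stand-in model of the right shape, something along these lines should
# work; the hyperparameters here are arbitrary, illustrative choices:
#
#     rng = np.random.default_rng(0)
#     X_train = rng.uniform(0, 1, (10_000, 792))
#     y_train = rng.integers(0, 2, 10_000)
#     booster = xgb.train(
#         {"objective": "binary:logistic", "tree_method": "hist"},
#         xgb.DMatrix(X_train, label=y_train),
#         num_boost_round=100,
#     )
#     booster.save_model("0.model")
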
def find_optimal_fil_params(model_path, iterations, batch_size):
    """Load a FIL model and search for the fastest chunk_size at this batch size.

    Tuning chunk_size is optional; it controls how each batch is subdivided
    during inference, and the best value depends on batch size and hardware,
    so it is searched here for each benchmark configuration.
    """
    fil_model = ForestInference.load(model_path)
    optimal_chunk_size = 1
    optimal_timing = float('inf')
    optimization_data = np.random.uniform(0, 1, (iterations, batch_size, 792))
    for log_chunk_size in range(6):
        chunk_size = 2 ** log_chunk_size
        print(chunk_size)
        start = perf_counter()
        for iter_index in range(iterations):
            X = optimization_data[iter_index]
            y = fil_model.predict(X, chunk_size=chunk_size)
        end = perf_counter()
        elapsed = end - start
        if elapsed < optimal_timing:
            optimal_timing = elapsed
            optimal_chunk_size = chunk_size
    return fil_model, optimal_chunk_size
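

# test_predicts runs each implementation on `iterations` random batches of
# shape (batch_size, 792) and records the mean per-batch latency; note that
# DMatrix construction and any host-to-device copies fall inside the timed
# loops, so they count toward the end-to-end inference cost.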
def test_predicts(batch_size, iterations):
    results = {}
    model_path = '0.model'
    data = np.random.uniform(0, 1, (iterations, batch_size, 792))
    data_gpu = cp.asarray(data)

    tree_model = xgb.Booster()
    tree_model.load_model(model_path)
    tree_model.set_param({'predictor': 'gpu_predictor'})
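
    # XGBoost with the GPU predictor on host (NumPy) input; DMatrix
    # construction is part of the timed loop.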
    start = perf_counter()
    for iter_index in range(iterations):
        X = xgb.DMatrix(data[iter_index])
        y = tree_model.predict(X)
    end = perf_counter()
    average_time = (end - start) / iterations
    print('xgboost GPU time: ', average_time)
    results['XGB_GPU'] = average_time
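
    # Same GPU predictor, but with data already resident on the device as a
    # CuPy array, which avoids a host-to-device copy for each batch.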
    start = perf_counter()
    for iter_index in range(iterations):
        X = xgb.DMatrix(data_gpu[iter_index])
        y = tree_model.predict(X)
    end = perf_counter()
    average_time = (end - start) / iterations
    print('xgboost GPU (Native I/O) time: ', average_time)
    results['XGB_GPU_native'] = average_time
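
    # XGBoost with the CPU predictor on host data.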
    tree_model.set_param({'predictor': 'cpu_predictor'})
    start = perf_counter()
    for iter_index in range(iterations):
        X = xgb.DMatrix(data[iter_index])
        y = tree_model.predict(X)
    end = perf_counter()
    average_time = (end - start) / iterations
    print('xgboost CPU time: ', average_time)
    results['XGB_CPU'] = average_time
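
    # Convert the booster to a Treelite model and compile it to a shared
    # library once; parallel_comp splits the generated source into 40 files so
    # compilation can run in parallel.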
    tl_model = treelite.Model.from_xgboost(tree_model)
    compiled_lib_name = '0.so'
    if not os.path.exists(compiled_lib_name):
        tl_model.export_lib(
            toolchain="gcc",
            libpath=compiled_lib_name,
            params={"parallel_comp": 40},
            verbose=False,
        )

    # WARNING: Compiled Treelite models currently have an odd interaction with
    # CPU FIL. Loading a compiled model in the same process where CPU FIL is
    # used causes CPU FIL to run with only one thread. To get correct results
    # for CPU FIL, run the compiled-Treelite benchmark separately from the
    # other benchmarks.
    #
    # compiled_model = treelite_runtime.Predictor(
    #     compiled_lib_name
    # )
    # start = perf_counter()
    # for iter_index in range(iterations):
    #     X = treelite_runtime.DMatrix(data[iter_index])
    #     y = compiled_model.predict(X)
    # end = perf_counter()
    # average_time = (end - start) / iterations
    # results['TL_compiled'] = average_time
    # print('TL compiled time: ', average_time)
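
    # GTIL (General Tree Inference Library): Treelite's reference CPU
    # implementation, run directly on the in-memory model with no compilation
    # step.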
    start = perf_counter()
    for iter_index in range(iterations):
        X = data[iter_index]
        y = treelite.gtil.predict(tl_model, X)
    end = perf_counter()
    average_time = (end - start) / iterations
    results['GTIL'] = average_time
    print('GTIL time: ', average_time)
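
    # Experimental cuML FIL, executing on the CPU.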
    with using_device_type('cpu'):
        # You do not have to find an optimal chunk size to use FIL, but doing
        # so can squeeze some extra performance out of your model.
        fil_model, chunk_size = find_optimal_fil_params(
            model_path, iterations, batch_size
        )
        start = perf_counter()
        for iter_index in range(iterations):
            X = data[iter_index]
            y = fil_model.predict(X, chunk_size=chunk_size)
        end = perf_counter()
        average_time = (end - start) / iterations
        results['FIL_CPU'] = average_time
        print('FIL CPU time: ', average_time)
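
    # Experimental cuML FIL on the GPU, fed from host (NumPy) data.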
    with using_device_type('gpu'):
        fil_model, chunk_size = find_optimal_fil_params(
            model_path, iterations, batch_size
        )
        start = perf_counter()
        for iter_index in range(iterations):
            X = data[iter_index]
            y = fil_model.predict(X, chunk_size=chunk_size)
        end = perf_counter()
        average_time = (end - start) / iterations
        results['FIL_GPU'] = average_time
        print('FIL GPU time: ', average_time)
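
    # The same GPU FIL model, but with CuPy (device-resident) input, reusing
    # the chunk size found above.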
    with using_device_type('gpu'):
        start = perf_counter()
        for iter_index in range(iterations):
            X = data_gpu[iter_index]
            y = fil_model.predict(X, chunk_size=chunk_size)
        end = perf_counter()
        average_time = (end - start) / iterations
        results['FIL_GPU_native'] = average_time
        print('FIL GPU (native I/O) time: ', average_time)

    return results


if __name__ == "__main__":
    all_results = {}
    iterations = 10
    batch_sizes = [1, 10, 1_000, 10_000]
    for batch_size_ in batch_sizes:
        print(f'Benchmarking batch size {batch_size_}')
        all_results[batch_size_] = test_predicts(
            batch_size_, iterations
        )
    # Print results as CSV: one row per batch size, one column per algorithm.
    algo_names = sorted(all_results[1].keys())
    print(",".join(('batch_size', *algo_names)))
    for batch_size_ in batch_sizes:
        results = [
            str(all_results[batch_size_][algo_name_])
            for algo_name_ in algo_names
        ]
        print(",".join((str(batch_size_), *results)))