@wphicks · Created April 10, 2023 16:36
Comparison of perf for various forest inference implementations
import cupy as cp
import os
import numpy as np
import treelite
import treelite_runtime
import xgboost as xgb
from time import perf_counter
from cuml.common.device_selection import using_device_type
from cuml.experimental import ForestInference
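

# NOTE: This benchmark assumes an XGBoost model saved as '0.model' with 792
# input features (the shape of the random benchmark data used below). If you
# need a stand-in model of the right shape, something along these lines should
# work; the hyperparameters here are arbitrary, illustrative choices:
#
#     rng = np.random.default_rng(0)
#     X_train = rng.uniform(0, 1, (10_000, 792))
#     y_train = rng.integers(0, 2, 10_000)
#     booster = xgb.train(
#         {"objective": "binary:logistic", "tree_method": "hist"},
#         xgb.DMatrix(X_train, label=y_train),
#         num_boost_round=100,
#     )
#     booster.save_model("0.model")
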
def find_optimal_fil_params(model_path, iterations, batch_size):
    """Load a FIL model and search for the fastest chunk_size at this batch size.

    Tuning chunk_size is optional; it controls how each batch is subdivided
    during inference, and the best value depends on batch size and hardware,
    so it is searched here for each benchmark configuration.
    """
    fil_model = ForestInference.load(model_path)
    optimal_chunk_size = 1
    optimal_timing = float('inf')
    optimization_data = np.random.uniform(0, 1, (iterations, batch_size, 792))
    for log_chunk_size in range(6):
        chunk_size = 2 ** log_chunk_size
        print(chunk_size)
        start = perf_counter()
        for iter_index in range(iterations):
            X = optimization_data[iter_index]
            y = fil_model.predict(X, chunk_size=chunk_size)
        end = perf_counter()
        elapsed = end - start
        if elapsed < optimal_timing:
            optimal_timing = elapsed
            optimal_chunk_size = chunk_size
    return fil_model, optimal_chunk_size
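

# test_predicts runs each implementation on `iterations` random batches of
# shape (batch_size, 792) and records the mean per-batch latency; note that
# DMatrix construction and any host-to-device copies fall inside the timed
# loops, so they count toward the end-to-end inference cost.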
def test_predicts(batch_size, iterations):
    results = {}
    model_path = '0.model'
    data = np.random.uniform(0, 1, (iterations, batch_size, 792))
    data_gpu = cp.asarray(data)

    tree_model = xgb.Booster()
    tree_model.load_model(model_path)
    tree_model.set_param({'predictor': 'gpu_predictor'})
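
    # XGBoost with the GPU predictor on host (NumPy) input; DMatrix
    # construction is part of the timed loop.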
    start = perf_counter()
    for iter_index in range(iterations):
        X = xgb.DMatrix(data[iter_index])
        y = tree_model.predict(X)
    end = perf_counter()
    average_time = (end - start) / iterations
    print('xgboost GPU time: ', average_time)
    results['XGB_GPU'] = average_time
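
    # Same GPU predictor, but with data already resident on the device as a
    # CuPy array, which avoids a host-to-device copy for each batch.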
    start = perf_counter()
    for iter_index in range(iterations):
        X = xgb.DMatrix(data_gpu[iter_index])
        y = tree_model.predict(X)
    end = perf_counter()
    average_time = (end - start) / iterations
    print('xgboost GPU (Native I/O) time: ', average_time)
    results['XGB_GPU_native'] = average_time
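
    # XGBoost with the CPU predictor on host data.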
    tree_model.set_param({'predictor': 'cpu_predictor'})
    start = perf_counter()
    for iter_index in range(iterations):
        X = xgb.DMatrix(data[iter_index])
        y = tree_model.predict(X)
    end = perf_counter()
    average_time = (end - start) / iterations
    print('xgboost CPU time: ', average_time)
    results['XGB_CPU'] = average_time
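
    # Convert the booster to a Treelite model and compile it to a shared
    # library once; parallel_comp splits the generated source into 40 files so
    # compilation can run in parallel.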
    tl_model = treelite.Model.from_xgboost(tree_model)
    compiled_lib_name = '0.so'
    if not os.path.exists(compiled_lib_name):
        tl_model.export_lib(
            toolchain="gcc",
            libpath=compiled_lib_name,
            params={"parallel_comp": 40},
            verbose=False,
        )

    # WARNING: Compiled Treelite models currently have an odd interaction with
    # CPU FIL. Loading a compiled model in the same process where CPU FIL is
    # used causes CPU FIL to run with only one thread. To get correct results
    # for CPU FIL, run the compiled-Treelite benchmark separately from the
    # other benchmarks.
    #
    # compiled_model = treelite_runtime.Predictor(
    #     compiled_lib_name
    # )
    # start = perf_counter()
    # for iter_index in range(iterations):
    #     X = treelite_runtime.DMatrix(data[iter_index])
    #     y = compiled_model.predict(X)
    # end = perf_counter()
    # average_time = (end - start) / iterations
    # results['TL_compiled'] = average_time
    # print('TL compiled time: ', average_time)
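
    # GTIL (General Tree Inference Library): Treelite's reference CPU
    # implementation, run directly on the in-memory model with no compilation
    # step.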
    start = perf_counter()
    for iter_index in range(iterations):
        X = data[iter_index]
        y = treelite.gtil.predict(tl_model, X)
    end = perf_counter()
    average_time = (end - start) / iterations
    results['GTIL'] = average_time
    print('GTIL time: ', average_time)
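
    # Experimental cuML FIL, executing on the CPU.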
    with using_device_type('cpu'):
        # You do not have to find an optimal chunk size to use FIL, but doing
        # so can squeeze some extra performance out of your model.
        fil_model, chunk_size = find_optimal_fil_params(
            model_path, iterations, batch_size
        )
        start = perf_counter()
        for iter_index in range(iterations):
            X = data[iter_index]
            y = fil_model.predict(X, chunk_size=chunk_size)
        end = perf_counter()
        average_time = (end - start) / iterations
        results['FIL_CPU'] = average_time
        print('FIL CPU time: ', average_time)
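
    # Experimental cuML FIL on the GPU, fed from host (NumPy) data.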
    with using_device_type('gpu'):
        fil_model, chunk_size = find_optimal_fil_params(
            model_path, iterations, batch_size
        )
        start = perf_counter()
        for iter_index in range(iterations):
            X = data[iter_index]
            y = fil_model.predict(X, chunk_size=chunk_size)
        end = perf_counter()
        average_time = (end - start) / iterations
        results['FIL_GPU'] = average_time
        print('FIL GPU time: ', average_time)
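
    # The same GPU FIL model, but with CuPy (device-resident) input, reusing
    # the chunk size found above.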
    with using_device_type('gpu'):
        start = perf_counter()
        for iter_index in range(iterations):
            X = data_gpu[iter_index]
            y = fil_model.predict(X, chunk_size=chunk_size)
        end = perf_counter()
        average_time = (end - start) / iterations
        results['FIL_GPU_native'] = average_time
        print('FIL GPU (native I/O) time: ', average_time)

    return results


if __name__ == "__main__":
    all_results = {}
    iterations = 10
    batch_sizes = [1, 10, 1_000, 10_000]
    for batch_size_ in batch_sizes:
        print(f'Benchmarking batch size {batch_size_}')
        all_results[batch_size_] = test_predicts(
            batch_size_, iterations
        )
    # Print results as CSV: one row per batch size, one column per algorithm.
    algo_names = sorted(all_results[1].keys())
    print(",".join(('batch_size', *algo_names)))
    for batch_size_ in batch_sizes:
        results = [
            str(all_results[batch_size_][algo_name_])
            for algo_name_ in algo_names
        ]
        print(",".join((str(batch_size_), *results)))