Skip to content

Instantly share code, notes, and snippets.

@mitmul
Last active April 18, 2018 11:50
Show Gist options
  • Save mitmul/880df6826a746cacacf0c9ca69244b79 to your computer and use it in GitHub Desktop.
Save mitmul/880df6826a746cacacf0c9ca69244b79 to your computer and use it in GitHub Desktop.
(gdb) bt
#0 0x0000000000000007 in ?? ()
#1 0x00007fffccf8c116 in google::protobuf::MessageLite::AppendToString(std::string*) const ()
from /home/ubuntu/miniconda/lib/libprotobuf.so.14
#2 0x00007fffccf8c472 in google::protobuf::MessageLite::SerializeAsString() const ()
from /home/ubuntu/miniconda/lib/libprotobuf.so.14
#3 0x00007fffcd5d613c in tc::ExecutionEngine::getHandle(std::string const&, std::vector<DLTensor const*, std::allocator<DLTensor const*> > const&, tc::MappingOptions const&)::{lambda(std::unique_ptr<tc::ExecutionEngine::ExecutorInfo, std::default_delete<tc::ExecutionEngine::ExecutorInfo> > const&)#1}::operator()(std::unique_ptr<tc::ExecutionEngine::ExecutorInfo, std::default_delete<tc::ExecutionEngine::ExecutorInfo> > const&) const () from /home/ubuntu/miniconda/lib/libtc_core.so
#4 0x00007fffcd5d63d7 in tc::ExecutionEngine::getHandle(std::string const&, std::vector<DLTensor const*, std::allocator<DLTensor const*> > const&, tc::MappingOptions const&) ()
from /home/ubuntu/miniconda/lib/libtc_core.so
#5 0x00007fffcd5d73bb in tc::ExecutionEngine::compile(std::string const&, std::vector<DLTensor const*, std::allocator<DLTensor const*> > const&, tc::MappingOptions const&) ()
from /home/ubuntu/miniconda/lib/libtc_core.so
#6 0x00007fffcdc3705d in __pyx_pf_4cupy_4core_2tc_8TCKernel_8autotune (
__pyx_v_tuner_min_launch_total_threads=<optimized out>, __pyx_v_log_generations=<optimized out>,
__pyx_v_restore_number=<optimized out>, __pyx_v_restore_from_proto=<optimized out>,
__pyx_v_proto=<optimized out>, __pyx_v_gpus=<optimized out>, __pyx_v_threads=<optimized out>,
__pyx_v_number_elites=<optimized out>, __pyx_v_generations=<optimized out>,
__pyx_v_mutation_rate=<optimized out>, __pyx_v_crossover_rate=<optimized out>,
__pyx_v_pop_size=<optimized out>, __pyx_v_starting_points=<optimized out>,
__pyx_v_base_mapping=<optimized out>, __pyx_v_cache_file=<optimized out>,
__pyx_v_inputs=<optimized out>, __pyx_v_self=0x7ffff6c6dd50) at cupy/core/tc.cpp:5796
#7 __pyx_pw_4cupy_4core_2tc_8TCKernel_9autotune (__pyx_v_self=0x7ffff6c6dd50,
__pyx_args=<optimized out>, __pyx_kwds=<optimized out>) at cupy/core/tc.cpp:4854
#8 0x0000555555662fd4 in _PyCFunction_FastCallDict ()
#9 0x0000555555690f24 in _PyCFunction_FastCallKeywords ()
#10 0x00005555556f0bec in call_function ()
#11 0x0000555555715eb1 in _PyEval_EvalFrameDefault ()
#12 0x00005555556eb529 in PyEval_EvalCodeEx ()
#13 0x00005555556ec2cc in PyEval_EvalCode ()
#14 0x0000555555768af4 in run_mod ()
#15 0x0000555555768ef1 in PyRun_FileExFlags ()
#16 0x00005555557690f4 in PyRun_SimpleFileExFlags ()
#17 0x000055555576cc28 in Py_Main ()
#18 0x000055555563471e in main ()
# Reproduction script: define a Tensor Comprehensions matmul kernel,
# autotune it through CuPy's TCKernel wrapper, run it, and compare the
# result against cupy's built-in dot product.
import cupy as cp
import tensor_comprehensions as tc

# TC language source: output(i, j) accumulates A(i, kk) * B(kk, j) over kk,
# i.e. a plain matrix multiply of an (M, N) matrix by an (N, K) matrix.
lang = """
def matmul1(float(M,N) A, float(N,K) B) -> (output) {
output(i, j) +=! A(i, kk) * B(kk, j)
}
"""

# Random float32 operands: (100, 400) x (400, 500) -> (100, 500).
a = cp.random.randn(100, 400).astype(cp.float32)
b = cp.random.randn(400, 500).astype(cp.float32)
input_tensors = [a, b]

# Build the kernel from the TC source, autotune it (persisting the tuned
# mapping options to 'matmul1_cache'), then execute it on the inputs.
kernel = cp.TCKernel(lang, 'matmul1', input_tensors)
kernel.autotune(input_tensors, cache_file='matmul1_cache', base_mapping='mlp')

result = kernel(a, b)
print(result)
# Reference result computed with cupy's own matrix multiply for comparison.
print(a.dot(b))
@mitmul
Copy link
Author

mitmul commented Apr 18, 2018

Running tc_test.py results in the error below:

$ python ../test.py
Generation 0 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 484/773/1708
Generation 1 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 204/487/1709
Dumping cache to matmul1_cache.cuda/options
[1] 6136 segmentation fault (core dumped) python ../test.py

The code used here: https://github.com/cupy/cupy/pull/1122/files#diff-b049f56ccadf7e9f004c20b3fa09c1ceR251

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment