inferrna/run-tests.cc

## run-tests.cc
$ make run-tests
[  1%] Built target clew
[  8%] Built target patch-hostside
[ 16%] Built target easycl
[ 56%] Built target clblast
[ 78%] Built target cocl
Scanning dependencies of target run-singlebuffer
Scanning dependencies of target run-teststream
Scanning dependencies of target run-testnullpointer
Scanning dependencies of target run-testpartialcopy
[ 78%] /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/bin/cocl test/cocl/singlebuffer.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/singlebuffer --add_ir_to_cl
[ 80%] /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/bin/cocl test/cocl/teststream.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/teststream --add_ir_to_cl
[ 81%] /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/bin/cocl test/cocl/testnullpointer.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testnullpointer --add_ir_to_cl
[ 81%] /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/bin/cocl test/cocl/testpartialcopy.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testpartialcopy --add_ir_to_cl
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-gpu-arch=sm_30 --cuda-device-only -emit-llvm -O0 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_deviceside.h -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include test/cocl/testnullpointer.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testnullpointer-device-noopt.ll
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-gpu-arch=sm_30 --cuda-device-only -emit-llvm -O0 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_deviceside.h -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include test/cocl/teststream.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/teststream-device-noopt.ll
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-gpu-arch=sm_30 --cuda-device-only -emit-llvm -O0 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_deviceside.h -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include test/cocl/testpartialcopy.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testpartialcopy-device-noopt.ll
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-gpu-arch=sm_30 --cuda-device-only -emit-llvm -O0 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_deviceside.h -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include test/cocl/singlebuffer.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/singlebuffer-device-noopt.ll
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
warning: unknown warning option warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
'-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
1 warning generated.
+ /usr/lib/llvm-3.8/bin/opt -S -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testpartialcopy-device.ll /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testpartialcopy-device-noopt.ll
1 warning generated.
+ + /usr/lib/llvm-3.8/bin/opt/usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-host-only -emit-llvm -O3 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -S -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testnullpointer-device.ll /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testnullpointer-device-noopt.ll
 -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_hostside.h test/cocl/testpartialcopy.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testpartialcopy-hostraw.ll
1 warning generated.
+ /usr/lib/llvm-3.8/bin/opt -S -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/singlebuffer-device.ll /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/singlebuffer-device-noopt.ll
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-host-only -emit-llvm -O3 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_hostside.h test/cocl/testnullpointer.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testnullpointer-hostraw.ll
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-host-only -emit-llvm -O3 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_hostside.h test/cocl/singlebuffer.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/singlebuffer-hostraw.ll
test/cocl/teststream.cu:71:21: warning: variable length arrays are a C99 feature [-Wvla-extension]
    float hostFloats[N];
                    ^
2 warnings generated.
+ /usr/lib/llvm-3.8/bin/opt -S -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/teststream-device.ll /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/teststream-device-noopt.ll
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-host-only -emit-llvm -O3 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_hostside.h test/cocl/teststream.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/teststream-hostraw.ll
warning: unknown warning option warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
'-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
2 warnings generated.
2 warnings generated.
+ /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build/patch-hostside --hostrawfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testnullpointer-hostraw.ll --devicellfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testnullpointer-device.ll --hostpatchedfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testnullpointer-hostpatched.ll
+ /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build/patch-hostside --hostrawfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testpartialcopy-hostraw.ll --devicellfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testpartialcopy-device.ll --hostpatchedfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testpartialcopy-hostpatched.ll
+ /usr/lib/llvm-3.8/bin/clang++ -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2+ /usr/lib/llvm-3.8/bin/clang++ -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -DUSE_CLEW -c -fexceptions /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testpartialcopy-hostpatched.ll -O3 -o -D_GNU_SOURCE /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testpartialcopy.o
 -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -DUSE_CLEW -c /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testnullpointer-hostpatched.ll -O3 -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testnullpointer.o
2 warnings generated.
+ /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build/patch-hostside --hostrawfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/singlebuffer-hostraw.ll --devicellfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/singlebuffer-device.ll --hostpatchedfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/singlebuffer-hostpatched.ll
+ /usr/lib/llvm-3.8/bin/clang++ -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -DUSE_CLEW -c /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/singlebuffer-hostpatched.ll -O3 -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/singlebuffer.o
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
1 warning generated.
+ [ !  ]
+ g++ -Wl,-rpath,/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -Wl,-rpath,24890ORIGIN -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testnullpointer /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testnullpointer.o -L/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -lcocl -lclblast -leasycl -lclew -lpthread -L/usr/lib/llvm-3.8/lib -lLLVMLTO -lLLVMObjCARCOpts -lLLVMSymbolize -lLLVMDebugInfoPDB -lLLVMDebugInfoDWARF -lLLVMXCoreDisassembler -lLLVMXCoreCodeGen -lLLVMXCoreDesc -lLLVMXCoreInfo -lLLVMXCoreAsmPrinter -lLLVMSystemZDisassembler -lLLVMSystemZCodeGen -lLLVMSystemZAsmParser -lLLVMSystemZDesc -lLLVMSystemZInfo -lLLVMSystemZAsmPrinter -lLLVMSparcDisassembler -lLLVMSparcCodeGen -lLLVMSparcAsmParser -lLLVMSparcDesc -lLLVMSparcInfo -lLLVMSparcAsmPrinter -lLLVMPowerPCDisassembler -lLLVMPowerPCCodeGen -lLLVMPowerPCAsmParser -lLLVMPowerPCDesc -lLLVMPowerPCInfo -lLLVMPowerPCAsmPrinter -lLLVMNVPTXCodeGen -lLLVMNVPTXDesc -lLLVMNVPTXInfo -lLLVMNVPTXAsmPrinter -lLLVMMSP430CodeGen -lLLVMMSP430Desc -lLLVMMSP430Info -lLLVMMSP430AsmPrinter -lLLVMMipsDisassembler -lLLVMMipsCodeGen -lLLVMMipsAsmParser -lLLVMMipsDesc -lLLVMMipsInfo -lLLVMMipsAsmPrinter -lLLVMHexagonDisassembler -lLLVMHexagonCodeGen -lLLVMHexagonAsmParser -lLLVMHexagonDesc -lLLVMHexagonInfo -lLLVMCppBackendCodeGen -lLLVMCppBackendInfo -lLLVMBPFCodeGen -lLLVMBPFDesc -lLLVMBPFInfo -lLLVMBPFAsmPrinter -lLLVMARMDisassembler -lLLVMARMCodeGen -lLLVMARMAsmParser -lLLVMARMDesc -lLLVMARMInfo -lLLVMARMAsmPrinter -lLLVMAMDGPUCodeGen -lLLVMAMDGPUAsmParser -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMAMDGPUInfo -lLLVMAMDGPUAsmPrinter -lLLVMAArch64Disassembler -lLLVMAArch64CodeGen -lLLVMAArch64AsmParser -lLLVMAArch64Desc -lLLVMAArch64Info -lLLVMAArch64AsmPrinter -lLLVMAArch64Utils -lLLVMMIRParser -lLLVMLibDriver -lLLVMOption -lLLVMTableGen -lLLVMLineEditor -lLLVMX86Disassembler -lLLVMX86AsmParser -lLLVMX86CodeGen -lLLVMSelectionDAG -lLLVMAsmPrinter -lLLVMX86Desc -lLLVMMCDisassembler -lLLVMX86Info -lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCJIT -lLLVMPasses -lLLVMipo -lLLVMVectorize -lLLVMLinker -lLLVMIRReader -lLLVMAsmParser -lLLVMDebugInfoCodeView -lLLVMInterpreter -lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMInstrumentation -lLLVMProfileData -lLLVMBitWriter -lLLVMOrcJIT -lLLVMTransformUtils -lLLVMExecutionEngine -lLLVMTarget -lLLVMAnalysis -lLLVMRuntimeDyld -lLLVMObject -lLLVMMCParser -lLLVMBitReader -lLLVMMC -lLLVMCore -lLLVMSupport -lrt -ldl -ltinfo -lpthread -lz -lm
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
1 warning generated.
+ [ !  ]
+ g++ -Wl,-rpath,/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -Wl,-rpath,24911ORIGIN -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testpartialcopy /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testpartialcopy.o -L/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -lcocl -lclblast -leasycl -lclew -lpthread -L/usr/lib/llvm-3.8/lib -lLLVMLTO -lLLVMObjCARCOpts -lLLVMSymbolize -lLLVMDebugInfoPDB -lLLVMDebugInfoDWARF -lLLVMXCoreDisassembler -lLLVMXCoreCodeGen -lLLVMXCoreDesc -lLLVMXCoreInfo -lLLVMXCoreAsmPrinter -lLLVMSystemZDisassembler -lLLVMSystemZCodeGen -lLLVMSystemZAsmParser -lLLVMSystemZDesc -lLLVMSystemZInfo -lLLVMSystemZAsmPrinter -lLLVMSparcDisassembler -lLLVMSparcCodeGen -lLLVMSparcAsmParser -lLLVMSparcDesc -lLLVMSparcInfo -lLLVMSparcAsmPrinter -lLLVMPowerPCDisassembler -lLLVMPowerPCCodeGen -lLLVMPowerPCAsmParser -lLLVMPowerPCDesc -lLLVMPowerPCInfo -lLLVMPowerPCAsmPrinter -lLLVMNVPTXCodeGen -lLLVMNVPTXDesc -lLLVMNVPTXInfo -lLLVMNVPTXAsmPrinter -lLLVMMSP430CodeGen -lLLVMMSP430Desc -lLLVMMSP430Info -lLLVMMSP430AsmPrinter -lLLVMMipsDisassembler -lLLVMMipsCodeGen -lLLVMMipsAsmParser -lLLVMMipsDesc -lLLVMMipsInfo -lLLVMMipsAsmPrinter -lLLVMHexagonDisassembler -lLLVMHexagonCodeGen -lLLVMHexagonAsmParser -lLLVMHexagonDesc -lLLVMHexagonInfo -lLLVMCppBackendCodeGenwarning -lLLVMCppBackendInfo:  -lLLVMBPFCodeGen -lLLVMBPFDesc -lLLVMBPFInfo -lLLVMBPFAsmPrinter -lLLVMARMDisassembler -lLLVMARMCodeGen -lLLVMARMAsmParser -lLLVMARMDescunknown  -lLLVMARMInfo -lLLVMARMAsmPrinter -lLLVMAMDGPUCodeGen -lLLVMAMDGPUAsmParser -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMAMDGPUInfo -lLLVMAMDGPUAsmPrinter -lLLVMAArch64Disassembler -lLLVMAArch64CodeGen -lLLVMAArch64AsmParser -lLLVMAArch64Desc -lLLVMAArch64Info -lLLVMAArch64AsmPrinter -lLLVMAArch64Utils -lLLVMMIRParser -lLLVMLibDriver -lLLVMOption -lLLVMTableGen -lLLVMLineEditor -lLLVMX86Disassembler -lLLVMX86AsmParser -lLLVMX86CodeGen -lLLVMSelectionDAG -lLLVMAsmPrinter -lLLVMX86Desc -lLLVMMCDisassembler -lLLVMX86Info -lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCJIT -lLLVMPasses -lLLVMipo -lLLVMVectorize -lLLVMLinker -lLLVMIRReader -lLLVMAsmParser -lLLVMDebugInfoCodeView -lLLVMInterpreter -lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMInstrumentation -lLLVMProfileData -lLLVMBitWriter -lLLVMOrcJIT -lLLVMTransformUtils -lLLVMExecutionEngine -lLLVMTarget -lLLVMAnalysis -lLLVMRuntimeDyld -lLLVMObject -lLLVMMCParser -lLLVMBitReader -lLLVMMC -lLLVMCore -lLLVMSupport -lrt -ldl -ltinfo -lpthread -lz -lm
warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
test/cocl/teststream.cu:71:21: warning: variable length arrays are a C99 feature [-Wvla-extension]
    float hostFloats[N];
                    ^
1 warning generated.
+ [ !  ]
+ g++ -Wl,-rpath,/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -Wl,-rpath,24872ORIGIN -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/singlebuffer /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/singlebuffer.o -L/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -lcocl -lclblast -leasycl -lclew -lpthread -L/usr/lib/llvm-3.8/lib -lLLVMLTO -lLLVMObjCARCOpts -lLLVMSymbolize -lLLVMDebugInfoPDB -lLLVMDebugInfoDWARF -lLLVMXCoreDisassembler -lLLVMXCoreCodeGen -lLLVMXCoreDesc -lLLVMXCoreInfo -lLLVMXCoreAsmPrinter -lLLVMSystemZDisassembler -lLLVMSystemZCodeGen -lLLVMSystemZAsmParser -lLLVMSystemZDesc -lLLVMSystemZInfo -lLLVMSystemZAsmPrinter -lLLVMSparcDisassembler -lLLVMSparcCodeGen -lLLVMSparcAsmParser -lLLVMSparcDesc -lLLVMSparcInfo -lLLVMSparcAsmPrinter -lLLVMPowerPCDisassembler -lLLVMPowerPCCodeGen -lLLVMPowerPCAsmParser -lLLVMPowerPCDesc -lLLVMPowerPCInfo -lLLVMPowerPCAsmPrinter -lLLVMNVPTXCodeGen -lLLVMNVPTXDesc -lLLVMNVPTXInfo -lLLVMNVPTXAsmPrinter -lLLVMMSP430CodeGen -lLLVMMSP430Desc -lLLVMMSP430Info -lLLVMMSP430AsmPrinter -lLLVMMipsDisassembler -lLLVMMipsCodeGen -lLLVMMipsAsmParser -lLLVMMipsDesc -lLLVMMipsInfo -lLLVMMipsAsmPrinter -lLLVMHexagonDisassembler -lLLVMHexagonCodeGen -lLLVMHexagonAsmParser -lLLVMHexagonDesc -lLLVMHexagonInfo -lLLVMCppBackendCodeGen -lLLVMCppBackendInfo -lLLVMBPFCodeGen -lLLVMBPFDesc -lLLVMBPFInfo -lLLVMBPFAsmPrinter -lLLVMARMDisassembler -lLLVMARMCodeGen -lLLVMARMAsmParser -lLLVMARMDesc -lLLVMARMInfo -lLLVMARMAsmPrinter -lLLVMAMDGPUCodeGen -lLLVMAMDGPUAsmParser -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMAMDGPUInfo -lLLVMAMDGPUAsmPrinter -lLLVMAArch64Disassembler -lLLVMAArch64CodeGen -lLLVMAArch64AsmParser -lLLVMAArch64Desc -lLLVMAArch64Info -lLLVMAArch64AsmPrinter -lLLVMAArch64Utils -lLLVMMIRParser -lLLVMLibDriver -lLLVMOption -lLLVMTableGen -lLLVMLineEditor -lLLVMX86Disassembler -lLLVMX86AsmParser -lLLVMX86CodeGen -lLLVMSelectionDAG -lLLVMAsmPrinter -lLLVMX86Desc -lLLVMMCDisassembler -lLLVMX86Info -lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCJIT -lLLVMPasses -lLLVMipo -lLLVMVectorize -lLLVMLinker -lLLVMIRReader -lLLVMAsmParser -lLLVMDebugInfoCodeView -lLLVMInterpreter -lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMInstrumentation -lLLVMProfileData -lLLVMBitWriter -lLLVMOrcJIT -lLLVMTransformUtils -lLLVMExecutionEngine -lLLVMTarget -lLLVMAnalysis -lLLVMRuntimeDyld -lLLVMObject -lLLVMMCParser -lLLVMBitReader -lLLVMMC -lLLVMCore -lLLVMSupport -lrt -ldl -ltinfo -lpthread -lz -lm
3 warnings generated.
+ /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build/patch-hostside --hostrawfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/teststream-hostraw.ll --devicellfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/teststream-device.ll --hostpatchedfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/teststream-hostpatched.ll
+ /usr/lib/llvm-3.8/bin/clang++ -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -DUSE_CLEW -c /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/teststream-hostpatched.ll -O3 -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/teststream.o
cuStreamCreate redirected
cuStreamCreate current context=0
creating default context
Context() 0x10e2030
cuStreamCreate redirected
cuStreamCreate current context=0
creating default context
Context() 0x7d5030
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
1 warning generated.
+ [ !  ]
+ g++ -Wl,-rpath,/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -Wl,-rpath,24874ORIGIN -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/teststream /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/teststream.o -L/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -lcocl -lclblast -leasycl -lclew -lpthread -L/usr/lib/llvm-3.8/lib -lLLVMLTO -lLLVMObjCARCOpts -lLLVMSymbolize -lLLVMDebugInfoPDB -lLLVMDebugInfoDWARF -lLLVMXCoreDisassembler -lLLVMXCoreCodeGen -lLLVMXCoreDesc -lLLVMXCoreInfo -lLLVMXCoreAsmPrinter -lLLVMSystemZDisassembler -lLLVMSystemZCodeGen -lLLVMSystemZAsmParser -lLLVMSystemZDesc -lLLVMSystemZInfo -lLLVMSystemZAsmPrinter -lLLVMSparcDisassembler -lLLVMSparcCodeGen -lLLVMSparcAsmParser -lLLVMSparcDesc -lLLVMSparcInfo -lLLVMSparcAsmPrinter -lLLVMPowerPCDisassembler -lLLVMPowerPCCodeGen -lLLVMPowerPCAsmParser -lLLVMPowerPCDesc -lLLVMPowerPCInfo -lLLVMPowerPCAsmPrinter -lLLVMNVPTXCodeGen -lLLVMNVPTXDesc -lLLVMNVPTXInfo -lLLVMNVPTXAsmPrinter -lLLVMMSP430CodeGen -lLLVMMSP430Desc -lLLVMMSP430Info -lLLVMMSP430AsmPrinter -lLLVMMipsDisassembler -lLLVMMipsCodeGen -lLLVMMipsAsmParser -lLLVMMipsDesc -lLLVMMipsInfo -lLLVMMipsAsmPrinter -lLLVMHexagonDisassembler -lLLVMHexagonCodeGen -lLLVMHexagonAsmParser -lLLVMHexagonDesc -lLLVMHexagonInfo -lLLVMCppBackendCodeGen -lLLVMCppBackendInfo -lLLVMBPFCodeGen -lLLVMBPFDesc -lLLVMBPFInfo -lLLVMBPFAsmPrinter -lLLVMARMDisassembler -lLLVMARMCodeGen -lLLVMARMAsmParser -lLLVMARMDesc -lLLVMARMInfo -lLLVMARMAsmPrinter -lLLVMAMDGPUCodeGen -lLLVMAMDGPUAsmParser -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMAMDGPUInfo -lLLVMAMDGPUAsmPrinter -lLLVMAArch64Disassembler -lLLVMAArch64CodeGen -lLLVMAArch64AsmParser -lLLVMAArch64Desc -lLLVMAArch64Info -lLLVMAArch64AsmPrinter -lLLVMAArch64Utils -lLLVMMIRParser -lLLVMLibDriver -lLLVMOption -lLLVMTableGen -lLLVMLineEditor -lLLVMX86Disassembler -lLLVMX86AsmParser -lLLVMX86CodeGen -lLLVMSelectionDAG -lLLVMAsmPrinter -lLLVMX86Desc -lLLVMMCDisassembler -lLLVMX86Info -lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCJIT -lLLVMPasses -lLLVMipo -lLLVMVectorize -lLLVMLinker -lLLVMIRReader -lLLVMAsmParser -lLLVMDebugInfoCodeView -lLLVMInterpreter -lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMInstrumentation -lLLVMProfileData -lLLVMBitWriter -lLLVMOrcJIT -lLLVMTransformUtils -lLLVMExecutionEngine -lLLVMTarget -lLLVMAnalysis -lLLVMRuntimeDyld -lLLVMObject -lLLVMMCParser -lLLVMBitReader -lLLVMMC -lLLVMCore -lLLVMSupport -lrt -ldl -ltinfo -lpthread -lz -lm
cuStreamCreate redirected
cuStreamCreate current context=0
creating default context
Context() 0x166e030
test1
cuStreamCreate redirected
cuStreamCreate current context=0
creating default context
Context() 0x1a18030
Using Advanced Micro Devices, Inc. , OpenCL platform: AMD Accelerated Parallel Processing
Using OpenCL device: Pitcairn
cuStreamCreate redirected new stream 0x18f2810
Memory::newDeviceAlloc context=0x10e2030 bytes=4096 memory=0x18f2de0 clmem=0x18f2c70
cuMemcpyHtoDAsync dst=384 src=0x18ff070 bytes=16
found memory: 0x18f2de0 fakepos=128 bytes=4096
cudaStreamSynchronize queue=0x18f2830
cuMemcpyDtoHAsync queue=0x18f2830 dst=0x18fee70 src=128 bytes=4096
found memory: 0x18f2de0 fakepos=128 bytes=4096
cuMemcpyDtoHAsync dst[0] 6.06749e-08
cudaStreamSynchronize queue=0x18f2830
123.456
444
321
111
found memory: 0x18f2de0 fakepos=128 bytes=4096
cuStreamDestroy_v2 redirected stream=0x18f2810
Using Advanced Micro Devices, Inc. , OpenCL platform: AMD Accelerated Parallel Processing
Using OpenCL device: Pitcairn
cuStreamCreate redirected new stream 0x1043280
Memory::newDeviceAlloc context=0x7d5030 bytes=4096 memory=0xbdd780 clmem=0xbdd610
cuMemcpyHtoDAsync dst=128 src=0x1047880 bytes=4096
found memory: 0xbdd780 fakepos=128 bytes=4096
cudaConfigureCall queue=0x10432a0
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePfS_
building kernel _Z8getValuePfS_
cocl dump cl set
cl: [
kernel void _Z8getValuePfS_(global float* outdata, uint outdata_offset, global float* indata, uint indata_offset, local int *scratch);

kernel void _Z8getValuePfS_(global float* outdata, uint outdata_offset, global float* indata, uint indata_offset, local int *scratch) {
    indata += indata_offset;
    outdata += outdata_offset;

    float v4;

v1:;
    /* bool v2 = icmp indata <unk> */;
    /* float v4 = select v2 <unk> <unk> */;
    v4 = indata == 0 ? 3.0f : 2.0f;
    /* void v7 = store v4 outdata */;
    outdata[0] = v4;
    return;
}
]
[ 81%] Built target run-testpartialcopy
Using Advanced Micro Devices, Inc. , OpenCL platform: AMD Accelerated Parallel Processing
Using OpenCL device: Pitcairn
Using Advanced Micro Devices, Inc. , OpenCL platform: AMD Accelerated Parallel Processing
Using OpenCL device: Pitcairn
Scanning dependencies of target run-offsetkernelargs
cuStreamCreate redirected new stream 0x22343e0
got stream
Memory::newDeviceAlloc context=0x1a18030 bytes=409600 memory=0x2234a00 clmem=0x2234890
cudaConfigureCall queue=0x2234400
grid(3200, 1, 1)
block(32, 1, 1)
configureKernel name=_Z10longKernelPfif
building kernel _Z10longKernelPfif
[ 82%] /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/bin/cocl test/cocl/offsetkernelargs.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/offsetkernelargs --add_ir_to_cl
__internal__ build log:
"/tmp/OCL25092T1.cl", line 10: warning: label "v1" was declared but never
          referenced
  v1:;
  ^


 ... built
setKernelArgCharStar 0x80
found memory: 0xbdd780 fakepos=128 bytes=4096
setKernelArgCharStar 0
kernelGo queue=0x10432a0
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cuMemcpyDtoHAsync queue=0x10432a0 dst=0x1047880 src=128 bytes=4096
found memory: 0xbdd780 fakepos=128 bytes=4096
cuMemcpyDtoHAsync dst[0] 3
cudaStreamSynchronize queue=0x10432a0
3
found memory: 0xbdd780 fakepos=128 bytes=4096
cuStreamDestroy_v2 redirected stream=0x1043280
cuStreamCreate redirected
cuStreamCreate current context=0x7d5030
cocl dump cl set
cl: [
kernel void _Z10longKernelPfif(global float* data, uint data_offset, int N, float value, local int *scratch);

kernel void _Z10longKernelPfif(global float* data, uint data_offset, int N, float value, local int *scratch) {
    data += data_offset;

    float v24;
    float v25;
    float v36;
    float v37;
    float v42;
    float v43;
    float v49;
    float v50;
    float v55;
    float v56;
    global float* v23;
    global float* v35;
    global float* v41;
    global float* v48;
    global float* v54;
    int v16;
    int v19;
    int v20;
    int v21;
    int v27;
    int v29;
    int v31;
    int v33;
    int v58;

v1:;
    /* bool v12 = icmp N <unk> */;
    /* if(v12) */
    if (N > 0) {
        goto v2;
    } else {
        goto v10;
    }
v2:;
    /* int v14 = add N <unk> */;
    /* int v16 = and N <unk> */;
    v16 = N & 3;
    /* bool v18 = icmp v16 v13 */;
    /* if(v18) */
    if (v16 == 0) {
        /* int v19 = phi v13 */
        v19 = 0;
        goto v6;
    } else {
        goto v3;
    }
v3:;
    /* int v20 = phi v13 */
    v20 = 0;
    /* int v21 = phi v16 */
    v21 = v16;
    goto v4;
v4:;
    /* long v22 = sext v20 */;
    /* float* v23 = getelementptr data v22 */;
    v23 = (&(data[v20]));
    /* float v24 = load v23 */;
    v24 = v23[0];
    /* float v25 = fadd v24 value */;
    v25 = v24 + value;
    /* void v26 = store v25 v23 */;
    v23[0] = v25;
    /* int v27 = add v20 <unk> */;
    v27 = v20 + 1;
    /* int v29 = add v21 v15 */;
    v29 = v21 + -1;
    /* bool v30 = icmp v29 v13 */;
    /* if(v30) */
    if (v29 == 0) {
        /* int v31 = phi v27 */
        v31 = v27;
        goto v5;
    } else {
        /* int v20 = phi v27 */
        v20 = v27;
        /* int v21 = phi v29 */
        v21 = v29;
        goto v4;
    }
v5:;
    /* int v19 = phi v31 */
    v19 = v31;
    goto v6;
v6:;
    /* bool v32 = icmp v14 v17 */;
    /* if(v32) */
    if (N + -1 < 3) {
        goto v9;
    } else {
        goto v7;
    }
v7:;
    /* int v33 = phi v19 */
    v33 = v19;
    goto v11;
v8:;
    goto v9;
v9:;
    goto v10;
v10:;
    return;
v11:;
    /* long v34 = sext v33 */;
    /* float* v35 = getelementptr data v34 */;
    v35 = (&(data[v33]));
    /* float v36 = load v35 */;
    v36 = v35[0];
    /* float v37 = fadd v36 value */;
    v37 = v36 + value;
    /* void v38 = store v37 v35 */;
    v35[0] = v37;
    /* int v39 = add v33 v28 */;
    /* long v40 = sext v39 */;
    /* float* v41 = getelementptr data v40 */;
    v41 = (&(data[v33 + 1]));
    /* float v42 = load v41 */;
    v42 = v41[0];
    /* float v43 = fadd v42 value */;
    v43 = v42 + value;
    /* void v44 = store v43 v41 */;
    v41[0] = v43;
    /* int v45 = add v33 <unk> */;
    /* long v47 = sext v45 */;
    /* float* v48 = getelementptr data v47 */;
    v48 = (&(data[v33 + 2]));
    /* float v49 = load v48 */;
    v49 = v48[0];
    /* float v50 = fadd v49 value */;
    v50 = v49 + value;
    /* void v51 = store v50 v48 */;
    v48[0] = v50;
    /* int v52 = add v33 v17 */;
    /* long v53 = sext v52 */;
    /* float* v54 = getelementptr data v53 */;
    v54 = (&(data[v33 + 3]));
    /* float v55 = load v54 */;
    v55 = v54[0];
    /* float v56 = fadd v55 value */;
    v56 = v55 + value;
    /* void v57 = store v56 v54 */;
    v54[0] = v56;
    /* int v58 = add v33 <unk> */;
    v58 = v33 + 4;
    /* bool v60 = icmp v58 N */;
    /* if(v60) */
    if (v58 == N) {
        goto v8;
    } else {
        /* int v33 = phi v58 */
        v33 = v58;
        goto v11;
    }
}
]
cuStreamCreate redirected new stream 0x1c39b20
Memory::newDeviceAlloc context=0x166e030 bytes=65536 memory=0x1c39200 clmem=0x1c39090
cuMemcpyHtoDAsync dst=640 src=0x1a5a690 bytes=512
found memory: 0x1c39200 fakepos=128 bytes=65536
cudaConfigureCall queue=0x1c38c50
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePfS_
building kernel _Z8getValuePfS_
cocl dump cl set
cl: [
kernel void _Z8getValuePfS_(global float* outdata, uint outdata_offset, global float* indata, uint indata_offset, local int *scratch);

kernel void _Z8getValuePfS_(global float* outdata, uint outdata_offset, global float* indata, uint indata_offset, local int *scratch) {
    indata += indata_offset;
    outdata += outdata_offset;

    float v2;
    float v3;

v1:;
    /* float v2 = load indata */;
    v2 = indata[0];
    /* float v3 = fadd v2 <unk> */;
    v3 = v2 + 3.0f;
    /* void v5 = store v3 outdata */;
    outdata[0] = v3;
    return;
}
]
cuStreamCreate redirected new stream 0x1043280
Memory::newDeviceAlloc context=0x7d5030 bytes=4096 memory=0xad5c60 clmem=0x10456d0
cuMemcpyHtoDAsync dst=4224 src=0x1047880 bytes=4096
found memory: 0xad5c60 fakepos=4224 bytes=4096
cudaConfigureCall queue=0x10432a0
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z16checkNullStructs8MyStruct
building kernel _Z16checkNullStructs8MyStruct
cocl dump cl set
cl: [struct MyStruct {
    global float* f0;
    global float* f1;
};
struct MyStruct_nopointers {
    int f0;
};

kernel void _Z16checkNullStructs8MyStruct(global struct MyStruct_nopointers* mystruct_nopointers, global float* mystruct_ptr0, uint mystruct_ptr0_offset, global float* mystruct_ptr1, uint mystruct_ptr1_offset, local int *scratch);

kernel void _Z16checkNullStructs8MyStruct(global struct MyStruct_nopointers* mystruct_nopointers, global float* mystruct_ptr0, uint mystruct_ptr0_offset, global float* mystruct_ptr1, uint mystruct_ptr1_offset, local int *scratch) {
    mystruct_ptr1 += mystruct_ptr1_offset;
    mystruct_ptr0 += mystruct_ptr0_offset;
struct MyStruct mystruct[1];
mystruct[0].f0 = 0;
mystruct[0].f1 = 0;
mystruct[0].f0 = mystruct_ptr0;
mystruct[0].f1 = mystruct_ptr1;

    float v7;
    global float* v11;
    global float* v4;

v1:;
    /* float** v2 = getelementptr mystruct <unk> <unk> */;
    /* float* v4 = load v2 */;
    v4 = (&(mystruct[0].f1))[0];
    /* bool v5 = icmp v4 <unk> */;
    /* float v7 = select v5 <unk> <unk> */;
    v7 = v4 == 0 ? 9.0f : 8.0f;
    /* float** v10 = getelementptr mystruct v3 <unk> */;
    /* float* v11 = load v10 */;
    v11 = (&(mystruct[0].f0))[0];
    /* void v12 = store v7 v11 */;
    v11[0] = v7;
    return;
}
]
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-gpu-arch=sm_30 --cuda-device-only -emit-llvm -O0 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_deviceside.h -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include test/cocl/offsetkernelargs.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/offsetkernelargs-device-noopt.ll
__internal__ build log:
"/tmp/OCL25099T1.cl", line 11: warning: label "v1" was declared but never
          referenced
  v1:;
  ^


 ... built
setKernelArgCharStar 0x480
found memory: 0x1c39200 fakepos=128 bytes=65536
setKernelArgCharStar 0x280
found memory: 0x1c39200 fakepos=128 bytes=65536
kernelGo queue=0x1c38c50
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cuMemcpyDtoHAsync queue=0x1c38c50 dst=0x1a5a890 src=1152 bytes=512
found memory: 0x1c39200 fakepos=128 bytes=65536
cuMemcpyDtoHAsync dst[0] 126.456
cudaStreamSynchronize queue=0x1c38c50
126.456
found memory: 0x1c39200 fakepos=128 bytes=65536
cuStreamDestroy_v2 redirected stream=0x1c39b20
__internal__ build log:
"/tmp/OCL25092T2.cl", line 24: warning: label "v1" was declared but never
          referenced
  v1:;
  ^


 ... built
setKernelArgStruct structsize=4
setKernelArgCharStar 0x1080
found memory: 0xad5c60 fakepos=4224 bytes=4096
setKernelArgCharStar 0
kernelGo queue=0x10432a0
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cuMemcpyDtoHAsync queue=0x10432a0 dst=0x1047880 src=4224 bytes=4096
found memory: 0xad5c60 fakepos=4224 bytes=4096
cuMemcpyDtoHAsync dst[0] 9
cudaStreamSynchronize queue=0x10432a0
9
found memory: 0xad5c60 fakepos=4224 bytes=4096
cuStreamDestroy_v2 redirected stream=0x1043280
cuStreamCreate redirected
cuStreamCreate current context=0x166e030
cuStreamCreate redirected new stream 0x1c39b20
Memory::newDeviceAlloc context=0x166e030 bytes=65536 memory=0x1e862b0 clmem=0x19798d0
cuMemcpyHtoDAsync dst=66176 src=0x1a5a690 bytes=128
found memory: 0x1e862b0 fakepos=65664 bytes=65536
cudaConfigureCall queue=0x1c38c50
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z12getValueCharPcS_
building kernel _Z12getValueCharPcS_
cocl dump cl set
cl: [
kernel void _Z12getValueCharPcS_(global char* outdata, uint outdata_offset, global char* indata, uint indata_offset, local int *scratch);

kernel void _Z12getValueCharPcS_(global char* outdata, uint outdata_offset, global char* indata, uint indata_offset, local int *scratch) {
    indata += indata_offset;
    outdata += outdata_offset;

    char v2;
    char v6;

v1:;
    /* char v2 = load indata */;
    v2 = indata[0];
    /* int v3 = zext v2 */;
    /* int v4 = add v3 <unk> */;
    /* char v6 = trunc v4 */;
    v6 = (char)(v2 + 3);
    /* void v7 = store v6 outdata */;
    outdata[0] = v6;
    return;
}
]
__internal__ build log:
"/tmp/OCL25106T1.cl", line 36: warning: goto statement may cause irreducible
          control flow
          goto v2;
          ^

"/tmp/OCL25106T1.cl", line 38: warning: goto statement may cause irreducible
          control flow
          goto v10;
          ^

"/tmp/OCL25106T1.cl", line 49: warning: goto statement may cause irreducible
          control flow
          goto v6;
          ^

"/tmp/OCL25106T1.cl", line 51: warning: goto statement may cause irreducible
          control flow
          goto v3;
          ^

"/tmp/OCL25106T1.cl", line 58: warning: goto statement may cause irreducible
          control flow
      goto v4;
      ^

"/tmp/OCL25106T1.cl", line 78: warning: goto statement may cause irreducible
          control flow
          goto v5;
          ^

"/tmp/OCL25106T1.cl", line 84: warning: goto statement may cause irreducible
          control flow
          goto v4;
          ^

"/tmp/OCL25106T1.cl", line 89: warning: goto statement may cause irreducible
          control flow
      goto v6;
      ^

"/tmp/OCL25106T1.cl", line 94: warning: goto statement may cause irreducible
          control flow
          goto v9;
          ^

"/tmp/OCL25106T1.cl", line 96: warning: goto statement may cause irreducible
          control flow
          goto v7;
          ^

"/tmp/OCL25106T1.cl", line 101: warning: goto statement may cause irreducible
          control flow
      goto v11;
      ^

"/tmp/OCL25106T1.cl", line 103: warning: goto statement may cause irreducible
          control flow
      goto v9;
      ^

"/tmp/OCL25106T1.cl", line 105: warning: goto statement may cause irreducible
          control flow
      goto v10;
      ^

"/tmp/OCL25106T1.cl", line 153: warning: goto statement may cause irreducible
          control flow
          goto v8;
          ^

"/tmp/OCL25106T1.cl", line 157: warning: goto statement may cause irreducible
          control flow
          goto v11;
          ^

"/tmp/OCL25106T1.cl", line 32: warning: label "v1" was declared but never
[ 82%] Built target run-testnullpointer
          referenced
  v1:;
  ^


 ... built
setKernelArgCharStar 0x80
found memory: 0x2234a00 fakepos=128 bytes=409600
setKernelArgInt32 102400
setKernelArgFloat 3
kernelGo queue=0x2234400
<<< global=dim3(102400,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
Scanning dependencies of target run-testmath
__internal__ build log:
"/tmp/OCL25099T2.cl", line 11: warning: label "v1" was declared but never
          referenced
  v1:;
  ^


 ... built
setKernelArgCharStar 0x10480
found memory: 0x1e862b0 fakepos=65664 bytes=65536
setKernelArgCharStar 0x10280
found memory: 0x1e862b0 fakepos=65664 bytes=65536
kernelGo queue=0x1c38c50
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
[ 84%] /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/bin/cocl test/cocl/testmath.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testmath --add_ir_to_cl
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-gpu-arch=sm_30 --cuda-device-only -emit-llvm -O0 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_deviceside.h -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include test/cocl/testmath.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testmath-device-noopt.ll
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
queued kernel 1
Event()
cuEventCreate redirected flags=1 new event=0x1dac360
cuEventRecord redirected event=0x1dac360 queue=0x2234400
cuStreamWaitEvent redirected queue=0x2234400 event=0x1dac360 flags=0
cudaConfigureCall queue=0x2234400
grid(3200, 1, 1)
block(32, 1, 1)
configureKernel name=_Z10longKernelPfif
setKernelArgCharStar 0x80
found memory: 0x2234a00 fakepos=128 bytes=409600
setKernelArgInt32 102400
setKernelArgFloat 3
kernelGo queue=0x2234400
<<< global=dim3(102400,1,1,), workgroupsize=dim3(32,1,1,)>>>
cuMemcpyDtoHAsync queue=0x1c38c50 dst=0x1a5a890 src=66688 bytes=128
workgroupSize=32
found memory: 0x1e862b0 fakepos=65664 bytes=65536
.. kernel queued
cuMemcpyDtoHAsync dst[0] -nan
cudaStreamSynchronize queue=0x1c38c50
F
found memory: 0x1e862b0 fakepos=65664 bytes=65536
cuStreamDestroy_v2 redirected stream=0x1c39b20
queued kernel 2
cudaStreamSynchronize queue=0x2234400
finished
cuEventDestroy redirected event=0x1dac360
~Event()
found memory: 0x2234a00 fakepos=128 bytes=409600
cuStreamDestroy_v2 redirected stream=0x22343e0
test2
cuStreamCreate redirected
cuStreamCreate current context=0x1a18030
cuStreamCreate redirected new stream 0x22343e0
call cumemalloc
Memory::newDeviceAlloc context=0x1a18030 bytes=409600 memory=0x2234a00 clmem=0x2234870
cumemalloc done
 123 123 123 123 123 123 123 123 123 123
calling  cuMemcpyHtoDAsync
cuMemcpyHtoDAsync dst=409728 src=0x7ffd1f0a3980 bytes=409600
found memory: 0x2234a00 fakepos=409728 bytes=409600
cuMemcpyHtoDAsync done
cudaConfigureCall queue=0x2234400
grid(3200, 1, 1)
block(32, 1, 1)
configureKernel name=_Z10longKernelPfif
setKernelArgCharStar 0x64080
found memory: 0x2234a00 fakepos=409728 bytes=409600
setKernelArgInt32 102400
setKernelArgFloat 3
kernelGo queue=0x2234400
<<< global=dim3(102400,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
queued kernel
cuMemcpyDtoHAsync queue=0x2234400 dst=0x7ffd1f0a3980 src=409728 bytes=409600
found memory: 0x2234a00 fakepos=409728 bytes=409600
cuMemcpyDtoHAsync dst[0] 822
queued async copy
cudaStreamSynchronize queue=0x2234400
 822 1305 1305 1467 1482 1755 1755 1839 2070 2667
found memory: 0x2234a00 fakepos=409728 bytes=409600
cuStreamDestroy_v2 redirected stream=0x22343e0
[ 84%] Built target run-singlebuffer
1 warning generated.
[ 84%] Built target run-teststream
+ /usr/lib/llvm-3.8/bin/opt -S -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/offsetkernelargs-device.ll /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/offsetkernelargs-device-noopt.ll
Scanning dependencies of target run-testevents
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-host-only -emit-llvm -O3 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_hostside.h test/cocl/offsetkernelargs.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/offsetkernelargs-hostraw.ll
Scanning dependencies of target run-testshfl
[ 85%] /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/bin/cocl test/cocl/testevents.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testevents --add_ir_to_cl
[ 86%] /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/bin/cocl test/cocl/testshfl.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testshfl --add_ir_to_cl
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-gpu-arch=sm_30 --cuda-device-only -emit-llvm -O0 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_deviceside.h -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include test/cocl/testevents.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testevents-device-noopt.ll
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-gpu-arch=sm_30 --cuda-device-only -emit-llvm -O0 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_deviceside.h -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include test/cocl/testshfl.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testshfl-device-noopt.ll
test/cocl/testmath.cu:55:11: warning: unused variable 'diff' [-Wunused-variable]
    float diff = std::abs(hostFloats1[0] - 140.296);
          ^
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
2 warnings generated.
+ /usr/lib/llvm-3.8/bin/opt -S -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testmath-device.ll /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testmath-device-noopt.ll
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-host-only -emit-llvm -O3 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_hostside.h test/cocl/testmath.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testmath-hostraw.ll
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
1 warning generated.
+ /usr/lib/llvm-3.8/bin/opt -S -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testevents-device.ll /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testevents-device-noopt.ll
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-host-only -emit-llvm -O3 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_hostside.h test/cocl/testevents.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testevents-hostraw.ll
2 warnings generated.
+ /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build/patch-hostside --hostrawfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/offsetkernelargs-hostraw.ll --devicellfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/offsetkernelargs-device.ll --hostpatchedfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/offsetkernelargs-hostpatched.ll
+ /usr/lib/llvm-3.8/bin/clang++ -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -DUSE_CLEW -c /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/offsetkernelargs-hostpatched.ll -O3 -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/offsetkernelargs.o
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
1 warning generated.
+ [ !  ]
+ g++ -Wl,-rpath,/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -Wl,-rpath,25132ORIGIN -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/offsetkernelargs /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/offsetkernelargs.o -L/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -lcocl -lclblast -leasycl -lclew -lpthread -L/usr/lib/llvm-3.8/lib -lLLVMLTO -lLLVMObjCARCOpts -lLLVMSymbolize -lLLVMDebugInfoPDB -lLLVMDebugInfoDWARF -lLLVMXCoreDisassembler -lLLVMXCoreCodeGen -lLLVMXCoreDesc -lLLVMXCoreInfo -lLLVMXCoreAsmPrinter -lLLVMSystemZDisassembler -lLLVMSystemZCodeGen -lLLVMSystemZAsmParser -lLLVMSystemZDesc -lLLVMSystemZInfo -lLLVMSystemZAsmPrinter -lLLVMSparcDisassembler -lLLVMSparcCodeGen -lLLVMSparcAsmParser -lLLVMSparcDesc -lLLVMSparcInfo -lLLVMSparcAsmPrinter -lLLVMPowerPCDisassembler -lLLVMPowerPCCodeGen -lLLVMPowerPCAsmParser -lLLVMPowerPCDesc -lLLVMPowerPCInfo -lLLVMPowerPCAsmPrinter -lLLVMNVPTXCodeGen -lLLVMNVPTXDesc -lLLVMNVPTXInfo -lLLVMNVPTXAsmPrinter -lLLVMMSP430CodeGen -lLLVMMSP430Desc -lLLVMMSP430Info -lLLVMMSP430AsmPrinter -lLLVMMipsDisassembler -lLLVMMipsCodeGen -lLLVMMipsAsmParser -lLLVMMipsDesc -lLLVMMipsInfo -lLLVMMipsAsmPrinter -lLLVMHexagonDisassembler -lLLVMHexagonCodeGen -lLLVMHexagonAsmParser -lLLVMHexagonDesc -lLLVMHexagonInfo -lLLVMCppBackendCodeGen -lLLVMCppBackendInfo -lLLVMBPFCodeGen -lLLVMBPFDesc -lLLVMBPFInfo -lLLVMBPFAsmPrinter -lLLVMARMDisassembler -lLLVMARMCodeGen -lLLVMARMAsmParser -lLLVMARMDesc -lLLVMARMInfo -lLLVMARMAsmPrinter -lLLVMAMDGPUCodeGen -lLLVMAMDGPUAsmParser -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMAMDGPUInfo -lLLVMAMDGPUAsmPrinter -lLLVMAArch64Disassembler -lLLVMAArch64CodeGen -lLLVMAArch64AsmParser -lLLVMAArch64Desc -lLLVMAArch64Info -lLLVMAArch64AsmPrinter -lLLVMAArch64Utils -lLLVMMIRParser -lLLVMLibDriver -lLLVMOption -lLLVMTableGen -lLLVMLineEditor -lLLVMX86Disassembler -lLLVMX86AsmParser -lLLVMX86CodeGen -lLLVMSelectionDAG -lLLVMAsmPrinter -lLLVMX86Desc -lLLVMMCDisassembler -lLLVMX86Info -lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCJIT -lLLVMPasses -lLLVMipo -lLLVMVectorize -lLLVMLinker -lLLVMIRReader -lLLVMAsmParser -lLLVMDebugInfoCodeView -lLLVMInterpreter -lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMInstrumentation -lLLVMProfileData -lLLVMBitWriter -lLLVMOrcJIT -lLLVMTransformUtils -lLLVMExecutionEngine -lLLVMTarget -lLLVMAnalysis -lLLVMRuntimeDyld -lLLVMObject -lLLVMMCParser -lLLVMBitReader -lLLVMMC -lLLVMCore -lLLVMSupport -lrt -ldl -ltinfo -lpthread -lz -lm
2 warnings generated.
+ /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build/patch-hostside --hostrawfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testmath-hostraw.ll --devicellfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testmath-device.ll --hostpatchedfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testmath-hostpatched.ll
+ /usr/lib/llvm-3.8/bin/clang++ -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -DUSE_CLEW -c /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testmath-hostpatched.ll -O3 -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testmath.o
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
1 warning generated.
+ [ !  ]
+ g++ -Wl,-rpath,/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -Wl,-rpath,25189ORIGIN -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testmath /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testmath.o -L/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -lcocl -lclblast -leasycl -lclew -lpthread -L/usr/lib/llvm-3.8/lib -lLLVMLTO -lLLVMObjCARCOpts -lLLVMSymbolize -lLLVMDebugInfoPDB -lLLVMDebugInfoDWARF -lLLVMXCoreDisassembler -lLLVMXCoreCodeGen -lLLVMXCoreDesc -lLLVMXCoreInfo -lLLVMXCoreAsmPrinter -lLLVMSystemZDisassembler -lLLVMSystemZCodeGen -lLLVMSystemZAsmParser -lLLVMSystemZDesc -lLLVMSystemZInfo -lLLVMSystemZAsmPrinter -lLLVMSparcDisassembler -lLLVMSparcCodeGen -lLLVMSparcAsmParser -lLLVMSparcDesc -lLLVMSparcInfo -lLLVMSparcAsmPrinter -lLLVMPowerPCDisassembler -lLLVMPowerPCCodeGen -lLLVMPowerPCAsmParser -lLLVMPowerPCDesc -lLLVMPowerPCInfo -lLLVMPowerPCAsmPrinter -lLLVMNVPTXCodeGen -lLLVMNVPTXDesc -lLLVMNVPTXInfo -lLLVMNVPTXAsmPrinter -lLLVMMSP430CodeGen -lLLVMMSP430Desc -lLLVMMSP430Info -lLLVMMSP430AsmPrinter -lLLVMMipsDisassembler -lLLVMMipsCodeGen -lLLVMMipsAsmParser -lLLVMMipsDesc -lLLVMMipsInfo -lLLVMMipsAsmPrinter -lLLVMHexagonDisassembler -lLLVMHexagonCodeGen -lLLVMHexagonAsmParser -lLLVMHexagonDesc -lLLVMHexagonInfo -lLLVMCppBackendCodeGen -lLLVMCppBackendInfo -lLLVMBPFCodeGen -lLLVMBPFDesc -lLLVMBPFInfo -lLLVMBPFAsmPrinter -lLLVMARMDisassembler -lLLVMARMCodeGen -lLLVMARMAsmParser -lLLVMARMDesc -lLLVMARMInfo -lLLVMARMAsmPrinter -lLLVMAMDGPUCodeGen -lLLVMAMDGPUAsmParser -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMAMDGPUInfo -lLLVMAMDGPUAsmPrinter -lLLVMAArch64Disassembler -lLLVMAArch64CodeGen -lLLVMAArch64AsmParser -lLLVMAArch64Desc -lLLVMAArch64Info -lLLVMAArch64AsmPrinter -lLLVMAArch64Utils -lLLVMMIRParser -lLLVMLibDriver -lLLVMOption -lLLVMTableGen -lLLVMLineEditor -lLLVMX86Disassembler -lLLVMX86AsmParser -lLLVMX86CodeGen -lLLVMSelectionDAG -lLLVMAsmPrinter -lLLVMX86Desc -lLLVMMCDisassembler -lLLVMX86Info -lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCJIT -lLLVMPasses -lLLVMipo -lLLVMVectorize -lLLVMLinker -lLLVMIRReader -lLLVMAsmParser -lLLVMDebugInfoCodeView -lLLVMInterpreter -lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMInstrumentation -lLLVMProfileData -lLLVMBitWriter -lLLVMOrcJIT -lLLVMTransformUtils -lLLVMExecutionEngine -lLLVMTarget -lLLVMAnalysis -lLLVMRuntimeDyld -lLLVMObject -lLLVMMCParser -lLLVMBitReader -lLLVMMC -lLLVMCore -lLLVMSupport -lrt -ldl -ltinfo -lpthread -lz -lm
test/cocl/testshfl.cu:13:9: warning: unused variable 'warpid' [-Wunused-variable]
    int warpid = tid % 32;  // assume warpsize 32.  Anyway, CUDA code uses warpsize 32.
        ^
2 warnings generated.
+ /usr/lib/llvm-3.8/bin/opt -S -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testshfl-device.ll /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testshfl-device-noopt.ll
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-host-only -emit-llvm -O3 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_hostside.h test/cocl/testshfl.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testshfl-hostraw.ll
cuStreamCreate redirected
cuStreamCreate current context=0
creating default context
Context() 0x24ec030
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
cuStreamCreate redirected
cuStreamCreate current context=0
creating default context
Context() 0x8dc030
2 warnings generated.
+ /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build/patch-hostside --hostrawfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testevents-hostraw.ll --devicellfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testevents-device.ll --hostpatchedfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testevents-hostpatched.ll
+ /usr/lib/llvm-3.8/bin/clang++ -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -DUSE_CLEW -c /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testevents-hostpatched.ll -O3 -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testevents.o
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
1 warning generated.
+ [ !  ]
+ g++ -Wl,-rpath,/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -Wl,-rpath,25254ORIGIN -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testevents /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testevents.o -L/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -lcocl -lclblast -leasycl -lclew -lpthread -L/usr/lib/llvm-3.8/lib -lLLVMLTO -lLLVMObjCARCOpts -lLLVMSymbolize -lLLVMDebugInfoPDB -lLLVMDebugInfoDWARF -lLLVMXCoreDisassembler -lLLVMXCoreCodeGen -lLLVMXCoreDesc -lLLVMXCoreInfo -lLLVMXCoreAsmPrinter -lLLVMSystemZDisassembler -lLLVMSystemZCodeGen -lLLVMSystemZAsmParser -lLLVMSystemZDesc -lLLVMSystemZInfo -lLLVMSystemZAsmPrinter -lLLVMSparcDisassembler -lLLVMSparcCodeGen -lLLVMSparcAsmParser -lLLVMSparcDesc -lLLVMSparcInfo -lLLVMSparcAsmPrinter -lLLVMPowerPCDisassembler -lLLVMPowerPCCodeGen -lLLVMPowerPCAsmParser -lLLVMPowerPCDesc -lLLVMPowerPCInfo -lLLVMPowerPCAsmPrinter -lLLVMNVPTXCodeGen -lLLVMNVPTXDesc -lLLVMNVPTXInfo -lLLVMNVPTXAsmPrinter -lLLVMMSP430CodeGen -lLLVMMSP430Desc -lLLVMMSP430Info -lLLVMMSP430AsmPrinter -lLLVMMipsDisassembler -lLLVMMipsCodeGen -lLLVMMipsAsmParser -lLLVMMipsDesc -lLLVMMipsInfo -lLLVMMipsAsmPrinter -lLLVMHexagonDisassembler -lLLVMHexagonCodeGen -lLLVMHexagonAsmParser -lLLVMHexagonDesc -lLLVMHexagonInfo -lLLVMCppBackendCodeGen -lLLVMCppBackendInfo -lLLVMBPFCodeGen -lLLVMBPFDesc -lLLVMBPFInfo -lLLVMBPFAsmPrinter -lLLVMARMDisassembler -lLLVMARMCodeGen -lLLVMARMAsmParser -lLLVMARMDesc -lLLVMARMInfo -lLLVMARMAsmPrinter -lLLVMAMDGPUCodeGen -lLLVMAMDGPUAsmParser -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMAMDGPUInfo -lLLVMAMDGPUAsmPrinter -lLLVMAArch64Disassembler -lLLVMAArch64CodeGen -lLLVMAArch64AsmParser -lLLVMAArch64Desc -lLLVMAArch64Info -lLLVMAArch64AsmPrinter -lLLVMAArch64Utils -lLLVMMIRParser -lLLVMLibDriver -lLLVMOption -lLLVMTableGen -lLLVMLineEditor -lLLVMX86Disassembler -lLLVMX86AsmParser -lLLVMX86CodeGen -lLLVMSelectionDAG -lLLVMAsmPrinter -lLLVMX86Desc -lLLVMMCDisassembler -lLLVMX86Info -lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCJIT -lLLVMPasses -lLLVMipo -lLLVMVectorize -lLLVMLinker -lLLVMIRReader -lLLVMAsmParser -lLLVMDebugInfoCodeView -lLLVMInterpreter -lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMInstrumentation -lLLVMProfileData -lLLVMBitWriter -lLLVMOrcJIT -lLLVMTransformUtils -lLLVMExecutionEngine -lLLVMTarget -lLLVMAnalysis -lLLVMRuntimeDyld -lLLVMObject -lLLVMMCParser -lLLVMBitReader -lLLVMMC -lLLVMCore -lLLVMSupport -lrt -ldl -ltinfo -lpthread -lz -lm
Using Advanced Micro Devices, Inc. , OpenCL platform: AMD Accelerated Parallel Processing
Using OpenCL device: Pitcairn
cuStreamCreate redirected new stream 0x28a7230
Memory::newDeviceAlloc context=0x24ec030 bytes=4096 memory=0x28a7800 clmem=0x28a7690
Memory::newDeviceAlloc context=0x24ec030 bytes=4096 memory=0x28a79e0 clmem=0x28a7830
cuMemcpyHtoDAsync dst=128 src=0x27f75a0 bytes=4096
found memory: 0x28a7800 fakepos=128 bytes=4096
cudaConfigureCall queue=0x28a7250
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePfS_
building kernel _Z8getValuePfS_
cocl dump cl set
cl: [
kernel void _Z8getValuePfS_(global float* outdata, uint outdata_offset, global float* indata, uint indata_offset, local int *scratch);

kernel void _Z8getValuePfS_(global float* outdata, uint outdata_offset, global float* indata, uint indata_offset, local int *scratch) {
    indata += indata_offset;
    outdata += outdata_offset;

    float v2;
    float v3;

v1:;
    /* float v2 = load indata */;
    v2 = indata[0];
    /* float v3 = fadd v2 <unk> */;
    v3 = v2 + 3.0f;
    /* void v5 = store v3 outdata */;
    outdata[0] = v3;
    return;
}
]
Using Advanced Micro Devices, Inc. , OpenCL platform: AMD Accelerated Parallel Processing
Using OpenCL device: Pitcairn
cuStreamCreate redirected new stream 0xc99090
Memory::newDeviceAlloc context=0x8dc030 bytes=4096 memory=0xe2b780 clmem=0xe2b610
cuMemcpyHtoDAsync dst=128 src=0xe2a5f0 bytes=4096
found memory: 0xe2b780 fakepos=128 bytes=4096
cudaConfigureCall queue=0xc990b0
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePf
building kernel _Z8getValuePf
__internal__ build log:
"/tmp/OCL25377T1.cl", line 11: warning: label "v1" was declared but never
          referenced
  v1:;
  ^


 ... built
setKernelArgCharStar 0x1180
found memory: 0x28a79e0 fakepos=4224 bytes=4096
setKernelArgCharStar 0x280
found memory: 0x28a7800 fakepos=128 bytes=4096
kernelGo queue=0x28a7250
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
cocl dump cl set
cl: [
kernel void _Z8getValuePf(global float* data, uint data_offset, local int *scratch);

kernel void _Z8getValuePf(global float* data, uint data_offset, local int *scratch) {
    data += data_offset;

    float v10;
    float v11;
    float v12;
    float v16;
    float v17;
    float v18;
    float v22;
    float v23;
    float v24;
    float v4;
    float v7;
    float v8;
    global float* v13;
    global float* v19;
    global float* v25;
    global float* v2;
    global float* v5;

v1:;
    /* float* v2 = getelementptr data <unk> */;
    v2 = (&(data[1]));
    /* float v4 = load v2 */;
    v4 = v2[0];
    /* float* v5 = getelementptr data <unk> */;
    v5 = (&(data[2]));
    /* float v7 = load v5 */;
    v7 = v5[0];
    /* float v8 = call v4 v7 <unk> */;
    v8 = pow(v4, v7);
    /* void v9 = store v8 data */;
    data[0] = v8;
    /* float v10 = load v2 */;
    v10 = v2[0];
    /* float v11 = load v5 */;
    v11 = v5[0];
    /* float v12 = call v10 v11 <unk> */;
    v12 = fmin(v10, v11);
    /* float* v13 = getelementptr data <unk> */;
    v13 = (&(data[4]));
    /* void v15 = store v12 v13 */;
    v13[0] = v12;
    /* float v16 = load v2 */;
    v16 = v2[0];
    /* float v17 = load v5 */;
    v17 = v5[0];
    /* float v18 = call v16 v17 <unk> */;
    v18 = fmax(v16, v17);
    /* float* v19 = getelementptr data <unk> */;
    v19 = (&(data[5]));
    /* void v21 = store v18 v19 */;
    v19[0] = v18;
    /* float v22 = load v2 */;
    v22 = v2[0];
    /* float v23 = load v5 */;
    v23 = v5[0];
    /* float v24 = call v22 v23 <unk> */;
    v24 = fmax(v22, v23);
    /* float* v25 = getelementptr data <unk> */;
    v25 = (&(data[6]));
    /* void v27 = store v24 v25 */;
    v25[0] = v24;
    return;
}
]
.. kernel queued
cuMemcpyDtoHAsync queue=0x28a7250 dst=0x2874750 src=4224 bytes=4096
found memory: 0x28a79e0 fakepos=4224 bytes=4096
cuMemcpyDtoHAsync dst[0] 0.0484016
cudaStreamSynchronize queue=0x28a7250
126.456
found memory: 0x28a7800 fakepos=128 bytes=4096
found memory: 0x28a79e0 fakepos=4224 bytes=4096
cuStreamDestroy_v2 redirected stream=0x28a7230
[ 86%] Built target run-offsetkernelargs
Scanning dependencies of target run-multigpu
[ 88%] /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/bin/cocl test/cocl/multigpu.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multigpu --add_ir_to_cl
__internal__ build log:
"/tmp/OCL25381T1.cl", line 25: warning: label "v1" was declared but never
          referenced
  v1:;
  ^


 ... built
setKernelArgCharStar 0x80
found memory: 0xe2b780 fakepos=128 bytes=4096
kernelGo queue=0xc990b0
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cuMemcpyDtoHAsync queue=0xc990b0 dst=0xe2a5f0 src=128 bytes=4096
found memory: 0xe2b780 fakepos=128 bytes=4096
cuMemcpyDtoHAsync dst[0] 140.296
cudaStreamSynchronize queue=0xc990b0
140.296
3
4.5
3
4.5
found memory: 0xe2b780 fakepos=128 bytes=4096
cuStreamDestroy_v2 redirected stream=0xc99090
[ 88%] Built target run-testmath
cuStreamCreate redirected
cuStreamCreate current context=0
creating default context
Context() 0xa9d030
Scanning dependencies of target run-properties
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-gpu-arch=sm_30 --cuda-device-only -emit-llvm -O0 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_deviceside.h -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include test/cocl/multigpu.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multigpu-device-noopt.ll
[ 89%] /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/bin/cocl test/cocl/properties.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/properties --add_ir_to_cl
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-gpu-arch=sm_30 --cuda-device-only -emit-llvm -O0 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_deviceside.h -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include test/cocl/properties.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/properties-device-noopt.ll
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
test/cocl/testshfl.cu:13:9: warning: unused variable 'warpid' [-Wunused-variable]
    int warpid = tid % 32;  // assume warpsize 32.  Anyway, CUDA code uses warpsize 32.
        ^
3 warnings generated.
+ /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build/patch-hostside --hostrawfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testshfl-hostraw.ll --devicellfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testshfl-device.ll --hostpatchedfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testshfl-hostpatched.ll
+ /usr/lib/llvm-3.8/bin/clang++ -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -DUSE_CLEW -c /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testshfl-hostpatched.ll -O3 -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testshfl.o
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
1 warning generated.
+ [ !  ]
+ g++ -Wl,-rpath,/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -Wl,-rpath,25301ORIGIN -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testshfl /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testshfl.o -L/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -lcocl -lclblast -leasycl -lclew -lpthread -L/usr/lib/llvm-3.8/lib -lLLVMLTO -lLLVMObjCARCOpts -lLLVMSymbolize -lLLVMDebugInfoPDB -lLLVMDebugInfoDWARF -lLLVMXCoreDisassembler -lLLVMXCoreCodeGen -lLLVMXCoreDesc -lLLVMXCoreInfo -lLLVMXCoreAsmPrinter -lLLVMSystemZDisassembler -lLLVMSystemZCodeGen -lLLVMSystemZAsmParser -lLLVMSystemZDesc -lLLVMSystemZInfo -lLLVMSystemZAsmPrinter -lLLVMSparcDisassembler -lLLVMSparcCodeGen -lLLVMSparcAsmParser -lLLVMSparcDesc -lLLVMSparcInfo -lLLVMSparcAsmPrinter -lLLVMPowerPCDisassembler -lLLVMPowerPCCodeGen -lLLVMPowerPCAsmParser -lLLVMPowerPCDesc -lLLVMPowerPCInfo -lLLVMPowerPCAsmPrinter -lLLVMNVPTXCodeGen -lLLVMNVPTXDesc -lLLVMNVPTXInfo -lLLVMNVPTXAsmPrinter -lLLVMMSP430CodeGen -lLLVMMSP430Desc -lLLVMMSP430Info -lLLVMMSP430AsmPrinter -lLLVMMipsDisassembler -lLLVMMipsCodeGen -lLLVMMipsAsmParser -lLLVMMipsDesc -lLLVMMipsInfo -lLLVMMipsAsmPrinter -lLLVMHexagonDisassembler -lLLVMHexagonCodeGen -lLLVMHexagonAsmParser -lLLVMHexagonDesc -lLLVMHexagonInfo -lLLVMCppBackendCodeGen -lLLVMCppBackendInfo -lLLVMBPFCodeGen -lLLVMBPFDesc -lLLVMBPFInfo -lLLVMBPFAsmPrinter -lLLVMARMDisassembler -lLLVMARMCodeGen -lLLVMARMAsmParser -lLLVMARMDesc -lLLVMARMInfo -lLLVMARMAsmPrinter -lLLVMAMDGPUCodeGen -lLLVMAMDGPUAsmParser -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMAMDGPUInfo -lLLVMAMDGPUAsmPrinter -lLLVMAArch64Disassembler -lLLVMAArch64CodeGen -lLLVMAArch64AsmParser -lLLVMAArch64Desc -lLLVMAArch64Info -lLLVMAArch64AsmPrinter -lLLVMAArch64Utils -lLLVMMIRParser -lLLVMLibDriver -lLLVMOption -lLLVMTableGen -lLLVMLineEditor -lLLVMX86Disassembler -lLLVMX86AsmParser -lLLVMX86CodeGen -lLLVMSelectionDAG -lLLVMAsmPrinter -lLLVMX86Desc -lLLVMMCDisassembler -lLLVMX86Info -lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCJIT -lLLVMPasses -lLLVMipo -lLLVMVectorize -lLLVMLinker -lLLVMIRReader -lLLVMAsmParser -lLLVMDebugInfoCodeView -lLLVMInterpreter -lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMInstrumentation -lLLVMProfileData -lLLVMBitWriter -lLLVMOrcJIT -lLLVMTransformUtils -lLLVMExecutionEngine -lLLVMTarget -lLLVMAnalysis -lLLVMRuntimeDyld -lLLVMObject -lLLVMMCParser -lLLVMBitReader -lLLVMMC -lLLVMCore -lLLVMSupport -lrt -ldl -ltinfo -lpthread -lz -lm
Using Advanced Micro Devices, Inc. , OpenCL platform: AMD Accelerated Parallel Processing
Using OpenCL device: Pitcairn
cuStreamCreate redirected new stream 0xe86280
Memory::newDeviceAlloc context=0xa9d030 bytes=409600 memory=0xe868a0 clmem=0xe86730
cuMemcpyHtoDAsync dst=128 src=0xdad910 bytes=409600
found memory: 0xe868a0 fakepos=128 bytes=409600
cudaConfigureCall using default_queue
cudaConfigureCall queue=0x12b16c0
grid(3200, 1, 1)
block(32, 1, 1)
configureKernel name=_Z10longKernelPfif
building kernel _Z10longKernelPfif
cocl dump cl set
cl: [
kernel void _Z10longKernelPfif(global float* data, uint data_offset, int N, float value, local int *scratch);

kernel void _Z10longKernelPfif(global float* data, uint data_offset, int N, float value, local int *scratch) {
    data += data_offset;

    float v24;
    float v25;
    float v36;
    float v37;
    float v42;
    float v43;
    float v49;
    float v50;
    float v55;
    float v56;
    global float* v23;
    global float* v35;
    global float* v41;
    global float* v48;
    global float* v54;
    int v16;
    int v19;
    int v20;
    int v21;
    int v27;
    int v29;
    int v31;
    int v33;
    int v58;

v1:;
    /* bool v12 = icmp N <unk> */;
    /* if(v12) */
    if (N > 0) {
        goto v2;
    } else {
        goto v10;
    }
v2:;
    /* int v14 = add N <unk> */;
    /* int v16 = and N <unk> */;
    v16 = N & 3;
    /* bool v18 = icmp v16 v13 */;
    /* if(v18) */
    if (v16 == 0) {
        /* int v19 = phi v13 */
        v19 = 0;
        goto v6;
    } else {
        goto v3;
    }
v3:;
    /* int v20 = phi v13 */
    v20 = 0;
    /* int v21 = phi v16 */
    v21 = v16;
    goto v4;
v4:;
    /* long v22 = sext v20 */;
    /* float* v23 = getelementptr data v22 */;
    v23 = (&(data[v20]));
    /* float v24 = load v23 */;
    v24 = v23[0];
    /* float v25 = fadd v24 value */;
    v25 = v24 + value;
    /* void v26 = store v25 v23 */;
    v23[0] = v25;
    /* int v27 = add v20 <unk> */;
    v27 = v20 + 1;
    /* int v29 = add v21 v15 */;
    v29 = v21 + -1;
    /* bool v30 = icmp v29 v13 */;
    /* if(v30) */
    if (v29 == 0) {
        /* int v31 = phi v27 */
        v31 = v27;
        goto v5;
    } else {
        /* int v20 = phi v27 */
        v20 = v27;
        /* int v21 = phi v29 */
        v21 = v29;
        goto v4;
    }
v5:;
    /* int v19 = phi v31 */
    v19 = v31;
    goto v6;
v6:;
    /* bool v32 = icmp v14 v17 */;
    /* if(v32) */
    if (N + -1 < 3) {
        goto v9;
    } else {
        goto v7;
    }
v7:;
    /* int v33 = phi v19 */
    v33 = v19;
    goto v11;
v8:;
    goto v9;
v9:;
    goto v10;
v10:;
    return;
v11:;
    /* long v34 = sext v33 */;
    /* float* v35 = getelementptr data v34 */;
    v35 = (&(data[v33]));
    /* float v36 = load v35 */;
    v36 = v35[0];
    /* float v37 = fadd v36 value */;
    v37 = v36 + value;
    /* void v38 = store v37 v35 */;
    v35[0] = v37;
    /* int v39 = add v33 v28 */;
    /* long v40 = sext v39 */;
    /* float* v41 = getelementptr data v40 */;
    v41 = (&(data[v33 + 1]));
    /* float v42 = load v41 */;
    v42 = v41[0];
    /* float v43 = fadd v42 value */;
    v43 = v42 + value;
    /* void v44 = store v43 v41 */;
    v41[0] = v43;
    /* int v45 = add v33 <unk> */;
    /* long v47 = sext v45 */;
    /* float* v48 = getelementptr data v47 */;
    v48 = (&(data[v33 + 2]));
    /* float v49 = load v48 */;
    v49 = v48[0];
    /* float v50 = fadd v49 value */;
    v50 = v49 + value;
    /* void v51 = store v50 v48 */;
    v48[0] = v50;
    /* int v52 = add v33 v17 */;
    /* long v53 = sext v52 */;
    /* float* v54 = getelementptr data v53 */;
    v54 = (&(data[v33 + 3]));
    /* float v55 = load v54 */;
    v55 = v54[0];
    /* float v56 = fadd v55 value */;
    v56 = v55 + value;
    /* void v57 = store v56 v54 */;
    v54[0] = v56;
    /* int v58 = add v33 <unk> */;
    v58 = v33 + 4;
    /* bool v60 = icmp v58 N */;
    /* if(v60) */
    if (v58 == N) {
        goto v8;
    } else {
        /* int v33 = phi v58 */
        v33 = v58;
        goto v11;
    }
}
]
__internal__ build log:
"/tmp/OCL25444T1.cl", line 36: warning: goto statement may cause irreducible
          control flow
          goto v2;
          ^

"/tmp/OCL25444T1.cl", line 38: warning: goto statement may cause irreducible
          control flow
          goto v10;
          ^

"/tmp/OCL25444T1.cl", line 49: warning: goto statement may cause irreducible
          control flow
          goto v6;
          ^

"/tmp/OCL25444T1.cl", line 51: warning: goto statement may cause irreducible
          control flow
          goto v3;
          ^

"/tmp/OCL25444T1.cl", line 58: warning: goto statement may cause irreducible
          control flow
      goto v4;
      ^

"/tmp/OCL25444T1.cl", line 78: warning: goto statement may cause irreducible
          control flow
          goto v5;
          ^

"/tmp/OCL25444T1.cl", line 84: warning: goto statement may cause irreducible
          control flow
          goto v4;
          ^

"/tmp/OCL25444T1.cl", line 89: warning: goto statement may cause irreducible
          control flow
      goto v6;
      ^

"/tmp/OCL25444T1.cl", line 94: warning: goto statement may cause irreducible
          control flow
          goto v9;
          ^

"/tmp/OCL25444T1.cl", line 96: warning: goto statement may cause irreducible
          control flow
          goto v7;
          ^

"/tmp/OCL25444T1.cl", line 101: warning: goto statement may cause irreducible
          control flow
      goto v11;
      ^

"/tmp/OCL25444T1.cl", line 103: warning: goto statement may cause irreducible
          control flow
      goto v9;
      ^

"/tmp/OCL25444T1.cl", line 105: warning: goto statement may cause irreducible
          control flow
      goto v10;
      ^

"/tmp/OCL25444T1.cl", line 153: warning: goto statement may cause irreducible
          control flow
          goto v8;
          ^

"/tmp/OCL25444T1.cl", line 157: warning: goto statement may cause irreducible
          control flow
          goto v11;
          ^

"/tmp/OCL25444T1.cl", line 32: warning: label "v1" was declared but never
          referenced
  v1:;
  ^


 ... built
setKernelArgCharStar 0x80
found memory: 0xe868a0 fakepos=128 bytes=409600
setKernelArgInt32 102400
setKernelArgFloat 3
kernelGo queue=0x12b16c0
<<< global=dim3(102400,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cuStreamCreate redirected
cuStreamCreate current context=0
creating default context
Context() 0xb24030
queued kernel x
cuCtxSynchronize redirected
finished
found memory: 0xe868a0 fakepos=128 bytes=409600
cuStreamDestroy_v2 redirected stream=0xe86280
[ 89%] Built target run-testevents
Scanning dependencies of target run-test_bitcast
[ 90%] /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/bin/cocl test/cocl/test_bitcast.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/test_bitcast --add_ir_to_cl
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-gpu-arch=sm_30 --cuda-device-only -emit-llvm -O0 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_deviceside.h -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include test/cocl/test_bitcast.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/test_bitcast-device-noopt.ll
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
test/cocl/multigpu.cu:86:22: warning: variable length arrays are a C99 feature [-Wvla-extension]
    pthread_t threads[ deviceCount ];
                     ^
2 warnings generated.
+ /usr/lib/llvm-3.8/bin/opt -S -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multigpu-device.ll /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multigpu-device-noopt.ll
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-host-only -emit-llvm -O3 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_hostside.h test/cocl/multigpu.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multigpu-hostraw.ll
test/cocl/properties.cu:12:11: warning: unused variable 'N' [-Wunused-const-variable]
const int N = 1024;
          ^
2 warnings generated.
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
+ /usr/lib/llvm-3.8/bin/opt -S -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/properties-device.ll /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/properties-device-noopt.ll
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-host-only -emit-llvm -O3 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_hostside.h test/cocl/properties.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/properties-hostraw.ll
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
Using Advanced Micro Devices, Inc. , OpenCL platform: AMD Accelerated Parallel Processing
Using OpenCL device: Pitcairn
cuStreamCreate redirected new stream 0x10e7e60
Memory::newDeviceAlloc context=0xb24030 bytes=4096 memory=0x10e8480 clmem=0x10e8310
cuMemcpyHtoDAsync dst=128 src=0x10e97a0 bytes=4096
found memory: 0x10e8480 fakepos=128 bytes=4096
cudaConfigureCall queue=0x10e7e80
grid(4, 1, 1)
block(128, 1, 1)
configureKernel name=_Z8getValuePf
building kernel _Z8getValuePf
cocl dump cl set
cl: [

inline float __shfl_down_3(local int *scratch, float v0, int v1, int v2) {
    // local float mem[1024];
    local float *mem = (local float *)scratch;
    int tid = get_local_id(0);
    int warpid = tid % 32;
    int warpstart = tid - warpid;
    mem[tid] = v0;
    //barrier(CLK_LOCAL_MEM_FENCE);
    int warpsrc = warpid + v1;
    warpsrc = warpsrc >= 32 ? warpid : warpsrc;
    return mem[warpstart + warpsrc];
}

kernel void _Z8getValuePf(global float* data, uint data_offset, local int *scratch);

kernel void _Z8getValuePf(global float* data, uint data_offset, local int *scratch) {
    data += data_offset;

    float v5;
    float v6;
    global float* v4;
    int v2;

v1:;
    /* int v2 = call <unk> */;
    v2 = get_local_id(0);
    /* long v3 = sext v2 */;
    /* float* v4 = getelementptr data v3 */;
    v4 = (&(data[v2]));
    /* float v5 = load v4 */;
    v5 = v4[0];
    /* float v6 = call v5 <unk> <unk> <unk> */;
    v6 = __shfl_down_3(scratch, v5, 1, 32);
    /* void v9 = store v6 v4 */;
    v4[0] = v6;
    return;
}
]
__internal__ build log:
"/tmp/OCL25521T1.cl", line 26: warning: label "v1" was declared but never
          referenced
  v1:;
  ^


 ... built
setKernelArgCharStar 0x80
found memory: 0x10e8480 fakepos=128 bytes=4096
kernelGo queue=0x10e7e80
<<< global=dim3(512,1,1,), workgroupsize=dim3(128,1,1,)>>>
workgroupSize=128
.. kernel queued
cuMemcpyDtoHAsync queue=0x10e7e80 dst=0x10e97a0 src=128 bytes=4096
found memory: 0x10e8480 fakepos=128 bytes=4096
cuMemcpyDtoHAsync dst[0] 1001
cudaStreamSynchronize queue=0x10e7e80
1001
1002
1003
1005
1006
found memory: 0x10e8480 fakepos=128 bytes=4096
cuStreamDestroy_v2 redirected stream=0x10e7e60
[ 90%] Built target run-testshfl
Scanning dependencies of target run-testblas
[ 92%] /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/bin/cocl test/cocl/testblas.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testblas --add_ir_to_cl
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-gpu-arch=sm_30 --cuda-device-only -emit-llvm -O0 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_deviceside.h -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include test/cocl/testblas.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testblas-device-noopt.ll
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
1 warning generated.
+ /usr/lib/llvm-3.8/bin/opt -S -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/test_bitcast-device.ll /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/test_bitcast-device-noopt.ll
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-host-only -emit-llvm -O3 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_hostside.h test/cocl/test_bitcast.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/test_bitcast-hostraw.ll
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
test/cocl/multigpu.cu:86:22: warning: variable length arrays are a C99 feature [-Wvla-extension]
    pthread_t threads[ deviceCount ];
                     ^
3 warnings generated.
+ /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build/patch-hostside --hostrawfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multigpu-hostraw.ll --devicellfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multigpu-device.ll --hostpatchedfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multigpu-hostpatched.ll
+ /usr/lib/llvm-3.8/bin/clang++ -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -DUSE_CLEW -c /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multigpu-hostpatched.ll -O3 -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multigpu.o
test/cocl/properties.cu:12:11: warning: unused variable 'N' [-Wunused-const-variable]
const int N = 1024;
          ^
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
3 warnings generated.
+ /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build/patch-hostside --hostrawfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/properties-hostraw.ll --devicellfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/properties-device.ll --hostpatchedfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/properties-hostpatched.ll
+ /usr/lib/llvm-3.8/bin/clang++ -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -DUSE_CLEW -c /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/properties-hostpatched.ll -O3 -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/properties.o
1 warning generated.
+ [ !  ]
+ g++ -Wl,-rpath,/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -Wl,-rpath,25406ORIGIN -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multigpu /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multigpu.o -L/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -lcocl -lclblast -leasycl -lclew -lpthread -L/usr/lib/llvm-3.8/lib -lLLVMLTO -lLLVMObjCARCOpts -lLLVMSymbolize -lLLVMDebugInfoPDB -lLLVMDebugInfoDWARF -lLLVMXCoreDisassembler -lLLVMXCoreCodeGen -lLLVMXCoreDesc -lLLVMXCoreInfo -lLLVMXCoreAsmPrinter -lLLVMSystemZDisassembler -lLLVMSystemZCodeGen -lLLVMSystemZAsmParser -lLLVMSystemZDesc -lLLVMSystemZInfo -lLLVMSystemZAsmPrinter -lLLVMSparcDisassembler -lLLVMSparcCodeGen -lLLVMSparcAsmParser -lLLVMSparcDesc -lLLVMSparcInfo -lLLVMSparcAsmPrinter -lLLVMPowerPCDisassembler -lLLVMPowerPCCodeGen -lLLVMPowerPCAsmParser -lLLVMPowerPCDesc -lLLVMPowerPCInfo -lLLVMPowerPCAsmPrinter -lLLVMNVPTXCodeGen -lLLVMNVPTXDesc -lLLVMNVPTXInfo -lLLVMNVPTXAsmPrinter -lLLVMMSP430CodeGen -lLLVMMSP430Desc -lLLVMMSP430Info -lLLVMMSP430AsmPrinter -lLLVMMipsDisassembler -lLLVMMipsCodeGen -lLLVMMipsAsmParser -lLLVMMipsDesc -lLLVMMipsInfo -lLLVMMipsAsmPrinter -lLLVMHexagonDisassembler -lLLVMHexagonCodeGen -lLLVMHexagonAsmParser -lLLVMHexagonDesc -lLLVMHexagonInfo -lLLVMCppBackendCodeGen -lLLVMCppBackendInfo -lLLVMBPFCodeGen -lLLVMBPFDesc -lLLVMBPFInfo -lLLVMBPFAsmPrinter -lLLVMARMDisassembler -lLLVMARMCodeGen -lLLVMARMAsmParser -lLLVMARMDesc -lLLVMARMInfo -lLLVMARMAsmPrinter -lLLVMAMDGPUCodeGen -lLLVMAMDGPUAsmParser -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMAMDGPUInfo -lLLVMAMDGPUAsmPrinter -lLLVMAArch64Disassembler -lLLVMAArch64CodeGen -lLLVMAArch64AsmParser -lLLVMAArch64Desc -lLLVMAArch64Info -lLLVMAArch64AsmPrinter -lLLVMAArch64Utils -lLLVMMIRParser -lLLVMLibDriver -lLLVMOption -lLLVMTableGen -lLLVMLineEditor -lLLVMX86Disassembler -lLLVMX86AsmParser -lLLVMX86CodeGen -lLLVMSelectionDAG -lLLVMAsmPrinter -lLLVMX86Desc -lLLVMMCDisassembler -lLLVMX86Info -lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCJIT -lLLVMPasses -lLLVMipo -lLLVMVectorize -lLLVMLinker -lLLVMIRReader -lLLVMAsmParser -lLLVMDebugInfoCodeView -lLLVMInterpreter -lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMInstrumentation -lLLVMProfileData -lLLVMBitWriter -lLLVMOrcJIT -lLLVMTransformUtils -lLLVMExecutionEngine -lLLVMTarget -lLLVMAnalysis -lLLVMRuntimeDyld -lLLVMObject -lLLVMMCParser -lLLVMBitReader -lLLVMMC -lLLVMCore -lLLVMSupport -lrt -ldl -ltinfo -lpthread -lz -lm
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
1 warning generated.
+ [ !  ]
+ g++ -Wl,-rpath,/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -Wl,-rpath,25462ORIGIN -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/properties /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/properties.o -L/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -lcocl -lclblast -leasycl -lclew -lpthread -L/usr/lib/llvm-3.8/lib -lLLVMLTO -lLLVMObjCARCOpts -lLLVMSymbolize -lLLVMDebugInfoPDB -lLLVMDebugInfoDWARF -lLLVMXCoreDisassembler -lLLVMXCoreCodeGen -lLLVMXCoreDesc -lLLVMXCoreInfo -lLLVMXCoreAsmPrinter -lLLVMSystemZDisassembler -lLLVMSystemZCodeGen -lLLVMSystemZAsmParser -lLLVMSystemZDesc -lLLVMSystemZInfo -lLLVMSystemZAsmPrinter -lLLVMSparcDisassembler -lLLVMSparcCodeGen -lLLVMSparcAsmParser -lLLVMSparcDesc -lLLVMSparcInfo -lLLVMSparcAsmPrinter -lLLVMPowerPCDisassembler -lLLVMPowerPCCodeGen -lLLVMPowerPCAsmParser -lLLVMPowerPCDesc -lLLVMPowerPCInfo -lLLVMPowerPCAsmPrinter -lLLVMNVPTXCodeGen -lLLVMNVPTXDesc -lLLVMNVPTXInfo -lLLVMNVPTXAsmPrinter -lLLVMMSP430CodeGen -lLLVMMSP430Desc -lLLVMMSP430Info -lLLVMMSP430AsmPrinter -lLLVMMipsDisassembler -lLLVMMipsCodeGen -lLLVMMipsAsmParser -lLLVMMipsDesc -lLLVMMipsInfo -lLLVMMipsAsmPrinter -lLLVMHexagonDisassembler -lLLVMHexagonCodeGen -lLLVMHexagonAsmParser -lLLVMHexagonDesc -lLLVMHexagonInfo -lLLVMCppBackendCodeGen -lLLVMCppBackendInfo -lLLVMBPFCodeGen -lLLVMBPFDesc -lLLVMBPFInfo -lLLVMBPFAsmPrinter -lLLVMARMDisassembler -lLLVMARMCodeGen -lLLVMARMAsmParser -lLLVMARMDesc -lLLVMARMInfo -lLLVMARMAsmPrinter -lLLVMAMDGPUCodeGen -lLLVMAMDGPUAsmParser -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMAMDGPUInfo -lLLVMAMDGPUAsmPrinter -lLLVMAArch64Disassembler -lLLVMAArch64CodeGen -lLLVMAArch64AsmParser -lLLVMAArch64Desc -lLLVMAArch64Info -lLLVMAArch64AsmPrinter -lLLVMAArch64Utils -lLLVMMIRParser -lLLVMLibDriver -lLLVMOption -lLLVMTableGen -lLLVMLineEditor -lLLVMX86Disassembler -lLLVMX86AsmParser -lLLVMX86CodeGen -lLLVMSelectionDAG -lLLVMAsmPrinter -lLLVMX86Desc -lLLVMMCDisassembler -lLLVMX86Info -lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCJIT -lLLVMPasses -lLLVMipo -lLLVMVectorize -lLLVMLinker -lLLVMIRReader -lLLVMAsmParser -lLLVMDebugInfoCodeView -lLLVMInterpreter -lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMInstrumentation -lLLVMProfileData -lLLVMBitWriter -lLLVMOrcJIT -lLLVMTransformUtils -lLLVMExecutionEngine -lLLVMTarget -lLLVMAnalysis -lLLVMRuntimeDyld -lLLVMObject -lLLVMMCParser -lLLVMBitReader -lLLVMMC -lLLVMCore -lLLVMSupport -lrt -ldl -ltinfo -lpthread -lz -lm
cudaGetDeviceCount
cudaGetDeviceProperties stub device=0
test/cocl/testblas.cu:12:9: warning: unused variable 'newrows' [-Wunused-variable]
    int newrows = cols;
        ^
2 warnings generated.
+ /usr/lib/llvm-3.8/bin/opt -S -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testblas-device.ll /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testblas-device-noopt.ll
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-host-only -emit-llvm -O3 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_hostside.h test/cocl/testblas.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testblas-hostraw.ll
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
2 warnings generated.
+ /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build/patch-hostside --hostrawfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/test_bitcast-hostraw.ll --devicellfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/test_bitcast-device.ll --hostpatchedfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/test_bitcast-hostpatched.ll
+ /usr/lib/llvm-3.8/bin/clang++ -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -DUSE_CLEW -c /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/test_bitcast-hostpatched.ll -O3 -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/test_bitcast.o
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
num platforms 1
checking platform id 0x7fcc76176a18
num devices 2
devices: 2
cuCtxCreate_v2 redirected device=0 flags=0
Context() 0x29f0020
Using Advanced Micro Devices, Inc. , OpenCL platform: AMD Accelerated Parallel Processing
Using OpenCL device: Pitcairn
cuCtxCreate_v2 new context=0x29f0020
created context 0x29f0020
cuCtxCreate_v2 redirected device=1 flags=0
Context() 0x29eef00
terminate called after throwing an instance of 'std::runtime_error'
  what():  Not enough OpenCL-enabled GPUs found to satisfy gpu index: 1
1 warning generated.
+ [ !  ]
+ g++ -Wl,-rpath,/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -Wl,-rpath,25532ORIGIN -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/test_bitcast /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/test_bitcast.o -L/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -lcocl -lclblast -leasycl -lclew -lpthread -L/usr/lib/llvm-3.8/lib -lLLVMLTO -lLLVMObjCARCOpts -lLLVMSymbolize -lLLVMDebugInfoPDB -lLLVMDebugInfoDWARF -lLLVMXCoreDisassembler -lLLVMXCoreCodeGen -lLLVMXCoreDesc -lLLVMXCoreInfo -lLLVMXCoreAsmPrinter -lLLVMSystemZDisassembler -lLLVMSystemZCodeGen -lLLVMSystemZAsmParser -lLLVMSystemZDesc -lLLVMSystemZInfo -lLLVMSystemZAsmPrinter -lLLVMSparcDisassembler -lLLVMSparcCodeGen -lLLVMSparcAsmParser -lLLVMSparcDesc -lLLVMSparcInfo -lLLVMSparcAsmPrinter -lLLVMPowerPCDisassembler -lLLVMPowerPCCodeGen -lLLVMPowerPCAsmParser -lLLVMPowerPCDesc -lLLVMPowerPCInfo -lLLVMPowerPCAsmPrinter -lLLVMNVPTXCodeGen -lLLVMNVPTXDesc -lLLVMNVPTXInfo -lLLVMNVPTXAsmPrinter -lLLVMMSP430CodeGen -lLLVMMSP430Desc -lLLVMMSP430Info -lLLVMMSP430AsmPrinter -lLLVMMipsDisassembler -lLLVMMipsCodeGen -lLLVMMipsAsmParser -lLLVMMipsDesc -lLLVMMipsInfo -lLLVMMipsAsmPrinter -lLLVMHexagonDisassembler -lLLVMHexagonCodeGen -lLLVMHexagonAsmParser -lLLVMHexagonDesc -lLLVMHexagonInfo -lLLVMCppBackendCodeGen -lLLVMCppBackendInfo -lLLVMBPFCodeGen -lLLVMBPFDesc -lLLVMBPFInfo -lLLVMBPFAsmPrinter -lLLVMARMDisassembler -lLLVMARMCodeGen -lLLVMARMAsmParser -lLLVMARMDesc -lLLVMARMInfo -lLLVMARMAsmPrinter -lLLVMAMDGPUCodeGen -lLLVMAMDGPUAsmParser -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMAMDGPUInfo -lLLVMAMDGPUAsmPrinter -lLLVMAArch64Disassembler -lLLVMAArch64CodeGen -lLLVMAArch64AsmParser -lLLVMAArch64Desc -lLLVMAArch64Info -lLLVMAArch64AsmPrinter -lLLVMAArch64Utils -lLLVMMIRParser -lLLVMLibDriver -lLLVMOption -lLLVMTableGen -lLLVMLineEditor -lLLVMX86Disassembler -lLLVMX86AsmParser -lLLVMX86CodeGen -lLLVMSelectionDAG -lLLVMAsmPrinter -lLLVMX86Desc -lLLVMMCDisassembler -lLLVMX86Info -lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCJIT -lLLVMPasses -lLLVMipo -lLLVMVectorize -lLLVMLinker -lLLVMIRReader -lLLVMAsmParser -lLLVMDebugInfoCodeView -lLLVMInterpreter -lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMInstrumentation -lLLVMProfileData -lLLVMBitWriter -lLLVMOrcJIT -lLLVMTransformUtils -lLLVMExecutionEngine -lLLVMTarget -lLLVMAnalysis -lLLVMRuntimeDyld -lLLVMObject -lLLVMMCParser -lLLVMBitReader -lLLVMMC -lLLVMCore -lLLVMSupport -lrt -ldl -ltinfo -lpthread -lz -lm
maxworkgroupsize 256
cuMemGetInfo redirected
free 1395373056 total 2103008640
[ 92%] Built target run-properties
Scanning dependencies of target run-multithreading
[ 92%] /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/bin/cocl test/cocl/multithreading.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multithreading --add_ir_to_cl
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-gpu-arch=sm_30 --cuda-device-only -emit-llvm -O0 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_deviceside.h -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include test/cocl/multithreading.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multithreading-device-noopt.ll
Aborted (core dumped)
CMakeFiles/run-multigpu.dir/build.make:57: recipe for target 'CMakeFiles/run-multigpu' failed
make[3]: *** [CMakeFiles/run-multigpu] Error 134
CMakeFiles/Makefile2:998: recipe for target 'CMakeFiles/run-multigpu.dir/all' failed
make[2]: *** [CMakeFiles/run-multigpu.dir/all] Error 2
make[2]: *** Waiting for unfinished jobs....
creating default context
Context() 0x244b030
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
Using Advanced Micro Devices, Inc. , OpenCL platform: AMD Accelerated Parallel Processing
Using OpenCL device: Pitcairn
Memory::newDeviceAlloc context=0x244b030 bytes=128 memory=0x2857a10 clmem=0x2cbe960
Memory::newDeviceAlloc context=0x244b030 bytes=128 memory=0x2cbec90 clmem=0x2cbeae0
Memory::newDeviceAlloc context=0x244b030 bytes=128 memory=0x2cbb490 clmem=0x2805a70
cudamempcy using opencl cudaMemcpyKind 222 count=128
found memory: 0x2857a10 fakepos=128 bytes=128
cudaConfigureCall using default_queue
cudaConfigureCall queue=0x28579f0
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8mykernelPiPfS_
building kernel _Z8mykernelPiPfS_
cocl dump cl set
cl: [
kernel void _Z8mykernelPiPfS_(global int* int1, uint int1_offset, global float* f1, uint f1_offset, global int* int2, uint int2_offset, local int *scratch);

kernel void _Z8mykernelPiPfS_(global int* int1, uint int1_offset, global float* f1, uint f1_offset, global int* int2, uint int2_offset, local int *scratch) {
    int2 += int2_offset;
    f1 += f1_offset;
    int1 += int1_offset;

    global int* v3;
    int v2;

v1:;
    /* int v2 = load int1 */;
    v2 = int1[0];
    /* int* v3 = bitcast f1 */;
    v3 = (global int*)f1;
    /* void v4 = store v2 v3 */;
    v3[0] = v2;
    /* void v5 = store v2 int2 */;
    int2[0] = v2;
    return;
}
]
__internal__ build log:
"/tmp/OCL25733T1.cl", line 12: warning: label "v1" was declared but never
          referenced
  v1:;
  ^


 ... built
setKernelArgCharStar 0x80
found memory: 0x2857a10 fakepos=128 bytes=128
setKernelArgCharStar 0x100
found memory: 0x2cbec90 fakepos=256 bytes=128
setKernelArgCharStar 0x180
found memory: 0x2cbb490 fakepos=384 bytes=128
kernelGo queue=0x28579f0
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cudamempcy using opencl cudaMemcpyKind 111 count=128
cudamemcpy device to host
found memory: 0x2cbec90 fakepos=256 bytes=128
cudamempcy using opencl cudaMemcpyKind 111 count=128
cudamemcpy device to host
found memory: 0x2cbb490 fakepos=384 bytes=128
this should NOT be 123, should be some weird float value, not even slightly close to 123 :
f1[0] 1.7236e-43
this SHOULD be 123 :
int2[0] 123
cudamempcy using opencl cudaMemcpyKind 222 count=128
found memory: 0x2857a10 fakepos=128 bytes=128
after copy to device
cudaConfigureCall using default_queue
cudaConfigureCall queue=0x28579f0
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z10inttofloatPfPi
building kernel _Z10inttofloatPfPi
cocl dump cl set
cl: [
kernel void _Z10inttofloatPfPi(global float* out, uint out_offset, global int* in, uint in_offset, local int *scratch);

kernel void _Z10inttofloatPfPi(global float* out, uint out_offset, global int* in, uint in_offset, local int *scratch) {
    in += in_offset;
    out += out_offset;

    global int* v3;
    int v2;

v1:;
    /* int v2 = load in */;
    v2 = in[0];
    /* int* v3 = bitcast out */;
    v3 = (global int*)out;
    /* void v4 = store v2 v3 */;
    v3[0] = v2;
    return;
}
]
__internal__ build log:
"/tmp/OCL25733T2.cl", line 11: warning: label "v1" was declared but never
          referenced
  v1:;
  ^


 ... built
setKernelArgCharStar 0x100
found memory: 0x2cbec90 fakepos=256 bytes=128
setKernelArgCharStar 0x80
found memory: 0x2857a10 fakepos=128 bytes=128
kernelGo queue=0x28579f0
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
after kernel call 2
cudamempcy using opencl cudaMemcpyKind 111 count=128
cudamemcpy device to host
found memory: 0x2cbec90 fakepos=256 bytes=128
f1[0]1.08881e-42
cudaConfigureCall using default_queue
cudaConfigureCall queue=0x28579f0
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z10floattointPiPf
building kernel _Z10floattointPiPf
cocl dump cl set
cl: [
kernel void _Z10floattointPiPf(global int* out, uint out_offset, global float* in, uint in_offset, local int *scratch);

kernel void _Z10floattointPiPf(global int* out, uint out_offset, global float* in, uint in_offset, local int *scratch) {
    in += in_offset;
    out += out_offset;

    int v3;

v1:;
    /* int* v2 = bitcast in */;
    /* int v3 = load v2 */;
    v3 = ((global int*)in)[0];
    /* void v4 = store v3 out */;
    out[0] = v3;
    return;
}
]
test/cocl/testblas.cu:12:9: warning: unused variable 'newrows' [-Wunused-variable]
    int newrows = cols;
        ^
__internal__ build log:
"/tmp/OCL25733T3.cl", line 10: warning: label "v1" was declared but never
          referenced
  v1:;
  ^


 ... built
setKernelArgCharStar 0x180
found memory: 0x2cbb490 fakepos=384 bytes=128
setKernelArgCharStar 0x100
found memory: 0x2cbec90 fakepos=256 bytes=128
kernelGo queue=0x28579f0
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
after kernel call 3
cudamempcy using opencl cudaMemcpyKind 111 count=128
cudamemcpy device to host
found memory: 0x2cbb490 fakepos=384 bytes=128
int2[0]777
3 warnings generated.
+ /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build/patch-hostside --hostrawfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testblas-hostraw.ll --devicellfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testblas-device.ll --hostpatchedfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testblas-hostpatched.ll
+ /usr/lib/llvm-3.8/bin/clang++ -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -DUSE_CLEW -c /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testblas-hostpatched.ll -O3 -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testblas.o
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
[ 92%] Built target run-test_bitcast
1 warning generated.
+ [ !  ]
+ g++ -Wl,-rpath,/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -Wl,-rpath,25597ORIGIN -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testblas /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/testblas.o -L/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -lcocl -lclblast -leasycl -lclew -lpthread -L/usr/lib/llvm-3.8/lib -lLLVMLTO -lLLVMObjCARCOpts -lLLVMSymbolize -lLLVMDebugInfoPDB -lLLVMDebugInfoDWARF -lLLVMXCoreDisassembler -lLLVMXCoreCodeGen -lLLVMXCoreDesc -lLLVMXCoreInfo -lLLVMXCoreAsmPrinter -lLLVMSystemZDisassembler -lLLVMSystemZCodeGen -lLLVMSystemZAsmParser -lLLVMSystemZDesc -lLLVMSystemZInfo -lLLVMSystemZAsmPrinter -lLLVMSparcDisassembler -lLLVMSparcCodeGen -lLLVMSparcAsmParser -lLLVMSparcDesc -lLLVMSparcInfo -lLLVMSparcAsmPrinter -lLLVMPowerPCDisassembler -lLLVMPowerPCCodeGen -lLLVMPowerPCAsmParser -lLLVMPowerPCDesc -lLLVMPowerPCInfo -lLLVMPowerPCAsmPrinter -lLLVMNVPTXCodeGen -lLLVMNVPTXDesc -lLLVMNVPTXInfo -lLLVMNVPTXAsmPrinter -lLLVMMSP430CodeGen -lLLVMMSP430Desc -lLLVMMSP430Info -lLLVMMSP430AsmPrinter -lLLVMMipsDisassembler -lLLVMMipsCodeGen -lLLVMMipsAsmParser -lLLVMMipsDesc -lLLVMMipsInfo -lLLVMMipsAsmPrinter -lLLVMHexagonDisassembler -lLLVMHexagonCodeGen -lLLVMHexagonAsmParser -lLLVMHexagonDesc -lLLVMHexagonInfo -lLLVMCppBackendCodeGen -lLLVMCppBackendInfo -lLLVMBPFCodeGen -lLLVMBPFDesc -lLLVMBPFInfo -lLLVMBPFAsmPrinter -lLLVMARMDisassembler -lLLVMARMCodeGen -lLLVMARMAsmParser -lLLVMARMDesc -lLLVMARMInfo -lLLVMARMAsmPrinter -lLLVMAMDGPUCodeGen -lLLVMAMDGPUAsmParser -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMAMDGPUInfo -lLLVMAMDGPUAsmPrinter -lLLVMAArch64Disassembler -lLLVMAArch64CodeGen -lLLVMAArch64AsmParser -lLLVMAArch64Desc -lLLVMAArch64Info -lLLVMAArch64AsmPrinter -lLLVMAArch64Utils -lLLVMMIRParser -lLLVMLibDriver -lLLVMOption -lLLVMTableGen -lLLVMLineEditor -lLLVMX86Disassembler -lLLVMX86AsmParser -lLLVMX86CodeGen -lLLVMSelectionDAG -lLLVMAsmPrinter -lLLVMX86Desc -lLLVMMCDisassembler -lLLVMX86Info -lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCJIT -lLLVMPasses -lLLVMipo -lLLVMVectorize -lLLVMLinker -lLLVMIRReader -lLLVMAsmParser -lLLVMDebugInfoCodeView -lLLVMInterpreter -lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMInstrumentation -lLLVMProfileData -lLLVMBitWriter -lLLVMOrcJIT -lLLVMTransformUtils -lLLVMExecutionEngine -lLLVMTarget -lLLVMAnalysis -lLLVMRuntimeDyld -lLLVMObject -lLLVMMCParser -lLLVMBitReader -lLLVMMC -lLLVMCore -lLLVMSupport -lrt -ldl -ltinfo -lpthread -lz -lm
1 warning generated.
+ /usr/lib/llvm-3.8/bin/opt -S -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multithreading-device.ll /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multithreading-device-noopt.ll
+ /usr/lib/llvm-3.8/bin/clang++ -DUSE_CLEW -std=c++11 -x cuda --cuda-host-only -emit-llvm -O3 -S -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/EasyCL -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL/thirdparty/clew/include -I/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/src/EasyCL -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/fake_funcs.h -include /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/include/cocl/cocl_hostside.h test/cocl/multithreading.cu -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multithreading-hostraw.ll
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
cuStreamCreate redirected
cuStreamCreate current context=0
creating default context
Context() 0xb89030
Using Advanced Micro Devices, Inc. , OpenCL platform: AMD Accelerated Parallel Processing
Using OpenCL device: Pitcairn
cuStreamCreate redirected new stream 0xe96f50
A:
 3 5
 5 8
 2 -1
B:
 3 5 4 1
 5 8 5 7
ATrans:
 3 5 2
 5 8 -1
BTrans:
 3 5
 5 8
 4 5
 1 7
Memory::newDeviceAlloc context=0xb89030 bytes=4120 memory=0x10d7c50 clmem=0x10d7ae0
Memory::newDeviceAlloc context=0xb89030 bytes=4128 memory=0x10d7df0 clmem=0x10d7c80
Memory::newDeviceAlloc context=0xb89030 bytes=4144 memory=0x10d8010 clmem=0x10d7e60
cuMemcpyHtoDAsync dst=128 src=0x7ffed72eac10 bytes=24
found memory: 0x10d7c50 fakepos=128 bytes=4120
cuMemcpyHtoDAsync dst=4352 src=0x7ffed72eabf0 bytes=32
found memory: 0x10d7df0 fakepos=4352 bytes=4128
found memory: 0x10d7c50 fakepos=128 bytes=4120
found memory: 0x10d7df0 fakepos=4352 bytes=4128
found memory: 0x10d8010 fakepos=8576 bytes=4144
cuMemcpyDtoHAsync queue=0xe96f70 dst=0x7ffed72eab90 src=8576 bytes=48
found memory: 0x10d8010 fakepos=8576 bytes=4144
cuMemcpyDtoHAsync dst[0] 34
cudaStreamSynchronize queue=0xe96f70
C trans:
 34 55 1
 55 89 2
 37 60 3
 38 61 -5
C:
 34 55 37 38
 55 89 60 61
 1 2 3 -5
C check:
 34 55 37 38
 55 89 60 61
 1 2 3 -5
found memory: 0x10d7c50 fakepos=128 bytes=4120
found memory: 0x10d7df0 fakepos=4352 bytes=4128
found memory: 0x10d8010 fakepos=8576 bytes=4144
cuStreamDestroy_v2 redirected stream=0xe96f50
finished testblas
2 warnings generated.
+ /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build/patch-hostside --hostrawfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multithreading-hostraw.ll --devicellfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multithreading-device.ll --hostpatchedfile /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multithreading-hostpatched.ll
+ /usr/lib/llvm-3.8/bin/clang++ -I/usr/lib/llvm-3.8/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/lib/llvm-3.8/include -std=c++11 -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wno-maybe-uninitialized -Wdelete-non-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -fexceptions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -DUSE_CLEW -c /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multithreading-hostpatched.ll -O3 -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multithreading.o
[ 92%] Built target run-testblas
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
1 warning generated.
+ [ !  ]
+ g++ -Wl,-rpath,/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -Wl,-rpath,25687ORIGIN -o /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multithreading /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/cocl/multithreading.o -L/media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl/build -lcocl -lclblast -leasycl -lclew -lpthread -L/usr/lib/llvm-3.8/lib -lLLVMLTO -lLLVMObjCARCOpts -lLLVMSymbolize -lLLVMDebugInfoPDB -lLLVMDebugInfoDWARF -lLLVMXCoreDisassembler -lLLVMXCoreCodeGen -lLLVMXCoreDesc -lLLVMXCoreInfo -lLLVMXCoreAsmPrinter -lLLVMSystemZDisassembler -lLLVMSystemZCodeGen -lLLVMSystemZAsmParser -lLLVMSystemZDesc -lLLVMSystemZInfo -lLLVMSystemZAsmPrinter -lLLVMSparcDisassembler -lLLVMSparcCodeGen -lLLVMSparcAsmParser -lLLVMSparcDesc -lLLVMSparcInfo -lLLVMSparcAsmPrinter -lLLVMPowerPCDisassembler -lLLVMPowerPCCodeGen -lLLVMPowerPCAsmParser -lLLVMPowerPCDesc -lLLVMPowerPCInfo -lLLVMPowerPCAsmPrinter -lLLVMNVPTXCodeGen -lLLVMNVPTXDesc -lLLVMNVPTXInfo -lLLVMNVPTXAsmPrinter -lLLVMMSP430CodeGen -lLLVMMSP430Desc -lLLVMMSP430Info -lLLVMMSP430AsmPrinter -lLLVMMipsDisassembler -lLLVMMipsCodeGen -lLLVMMipsAsmParser -lLLVMMipsDesc -lLLVMMipsInfo -lLLVMMipsAsmPrinter -lLLVMHexagonDisassembler -lLLVMHexagonCodeGen -lLLVMHexagonAsmParser -lLLVMHexagonDesc -lLLVMHexagonInfo -lLLVMCppBackendCodeGen -lLLVMCppBackendInfo -lLLVMBPFCodeGen -lLLVMBPFDesc -lLLVMBPFInfo -lLLVMBPFAsmPrinter -lLLVMARMDisassembler -lLLVMARMCodeGen -lLLVMARMAsmParser -lLLVMARMDesc -lLLVMARMInfo -lLLVMARMAsmPrinter -lLLVMAMDGPUCodeGen -lLLVMAMDGPUAsmParser -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMAMDGPUInfo -lLLVMAMDGPUAsmPrinter -lLLVMAArch64Disassembler -lLLVMAArch64CodeGen -lLLVMAArch64AsmParser -lLLVMAArch64Desc -lLLVMAArch64Info -lLLVMAArch64AsmPrinter -lLLVMAArch64Utils -lLLVMMIRParser -lLLVMLibDriver -lLLVMOption -lLLVMTableGen -lLLVMLineEditor -lLLVMX86Disassembler -lLLVMX86AsmParser -lLLVMX86CodeGen -lLLVMSelectionDAG -lLLVMAsmPrinter -lLLVMX86Desc -lLLVMMCDisassembler -lLLVMX86Info -lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCJIT -lLLVMPasses -lLLVMipo -lLLVMVectorize -lLLVMLinker -lLLVMIRReader -lLLVMAsmParser -lLLVMDebugInfoCodeView -lLLVMInterpreter -lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMInstrumentation -lLLVMProfileData -lLLVMBitWriter -lLLVMOrcJIT -lLLVMTransformUtils -lLLVMExecutionEngine -lLLVMTarget -lLLVMAnalysis -lLLVMRuntimeDyld -lLLVMObject -lLLVMMCParser -lLLVMBitReader -lLLVMMC -lLLVMCore -lLLVMSupport -lrt -ldl -ltinfo -lpthread -lz -lm
creaed threads
thread 0
cuStreamCreate redirected
cuStreamCreate current context=0
creating default context
Context() 0x7f859c0008e0
thread 1
cuStreamCreate redirected
cuStreamCreate current context=0
creating default context
Context() 0x7f85900008e0
thread 2
cuStreamCreate redirected
cuStreamCreate current context=0
creating default context
Context() 0x7f85880008e0
thread 3
cuStreamCreate redirected
cuStreamCreate current context=0
creating default context
Context() 0x7f85800008e0
Using Advanced Micro Devices, Inc. , OpenCL platform: AMD Accelerated Parallel Processing
Using OpenCL device: Pitcairn
Using Advanced Micro Devices, Inc. , OpenCL platform: AMD Accelerated Parallel Processing
Using OpenCL device: Pitcairn
Using Advanced Micro Devices, Inc. , OpenCL platform: AMD Accelerated Parallel Processing
Using OpenCL device: Pitcairn
cuStreamCreate redirected new stream 0x7f859c2b86c0
Memory::newDeviceAlloc context=0x7f859c0008e0 bytes=4096 memory=0x7f859c2b8ce0 clmem=0x7f859c2b8b70
cudaConfigureCall queue=0x7f859c2b86e0
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePfS_
building kernel _Z8getValuePfS_
cocl dump cl set
cl: [
kernel void _Z8getValuePfS_(global float* outdata, uint outdata_offset, global float* indata, uint indata_offset, local int *scratch);

kernel void _Z8getValuePfS_(global float* outdata, uint outdata_offset, global float* indata, uint indata_offset, local int *scratch) {
    indata += indata_offset;
    outdata += outdata_offset;

    float v4;

v1:;
    /* bool v2 = icmp indata <unk> */;
    /* float v4 = select v2 <unk> <unk> */;
    v4 = indata == 0 ? 3.0f : 2.0f;
    /* void v7 = store v4 outdata */;
    outdata[0] = v4;
    return;
}
]
Using Advanced Micro Devices, Inc. , OpenCL platform: AMD Accelerated Parallel Processing
Using OpenCL device: Pitcairn
cuStreamCreate redirected new stream 0x7f8590001810
Memory::newDeviceAlloc context=0x7f85900008e0 bytes=4096 memory=0x7f8590001290 clmem=0x7f8590002c80
cuStreamCreate redirected new stream 0x7f8588001810
Memory::newDeviceAlloc context=0x7f85880008e0 bytes=4096 memory=0x7f8588001290 clmem=0x7f8588002c80
cuStreamCreate redirected new stream 0x7f8580001810
Memory::newDeviceAlloc context=0x7f85800008e0 bytes=4096 memory=0x7f8580001290 clmem=0x7f8580002c80
__internal__ build log:
"/tmp/OCL25771T1.cl", line 10: warning: label "v1" was declared but never
          referenced
  v1:;
  ^


 ... built
setKernelArgCharStar 0x80
found memory: 0x7f859c2b8ce0 fakepos=128 bytes=4096
setKernelArgCharStar 0
kernelGo queue=0x7f859c2b86e0
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cudaConfigureCall queue=0x7f859c2b86e0
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePfS_
setKernelArgCharStar 0x80
found memory: 0x7f859c2b8ce0 fakepos=128 bytes=4096
setKernelArgCharStar 0
kernelGo queue=0x7f859c2b86e0
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cudaConfigureCall queue=0x7f859c2b86e0
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePfS_
setKernelArgCharStar 0x80
found memory: 0x7f859c2b8ce0 fakepos=128 bytes=4096
setKernelArgCharStar 0
kernelGo queue=0x7f859c2b86e0
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cudaConfigureCall queue=0x7f859c2b86e0
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePfS_
setKernelArgCharStar 0x80
found memory: 0x7f859c2b8ce0 fakepos=128 bytes=4096
setKernelArgCharStar 0
kernelGo queue=0x7f859c2b86e0
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cudaConfigureCall queue=0x7f8590001830
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePfS_
building kernel _Z8getValuePfS_
cocl dump cl set
cl: [
kernel void _Z8getValuePfS_(global float* outdata, uint outdata_offset, global float* indata, uint indata_offset, local int *scratch);

kernel void _Z8getValuePfS_(global float* outdata, uint outdata_offset, global float* indata, uint indata_offset, local int *scratch) {
    indata += indata_offset;
    outdata += outdata_offset;

    float v4;

v1:;
    /* bool v2 = icmp indata <unk> */;
    /* float v4 = select v2 <unk> <unk> */;
    v4 = indata == 0 ? 3.0f : 2.0f;
    /* void v7 = store v4 outdata */;
    outdata[0] = v4;
    return;
}
]
cudaStreamSynchronize queue=0x7f859c2b86e0
num kernels cached 1
num kernels calls 4
found memory: 0x7f859c2b8ce0 fakepos=128 bytes=4096
cuStreamDestroy_v2 redirected stream=0x7f859c2b86c0
joined thread 0
__internal__ build log:
"/tmp/OCL25771T2.cl", line 10: warning: label "v1" was declared but never
          referenced
  v1:;
  ^


 ... built
setKernelArgCharStar 0x80
found memory: 0x7f8590001290 fakepos=128 bytes=4096
setKernelArgCharStar 0
kernelGo queue=0x7f8590001830
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cudaConfigureCall queue=0x7f8590001830
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePfS_
setKernelArgCharStar 0x80
found memory: 0x7f8590001290 fakepos=128 bytes=4096
setKernelArgCharStar 0
kernelGo queue=0x7f8590001830
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cudaConfigureCall queue=0x7f8590001830
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePfS_
setKernelArgCharStar 0x80
found memory: 0x7f8590001290 fakepos=128 bytes=4096
setKernelArgCharStar 0
kernelGo queue=0x7f8590001830
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cudaConfigureCall queue=0x7f8590001830
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePfS_
setKernelArgCharStar 0x80
found memory: 0x7f8590001290 fakepos=128 bytes=4096
setKernelArgCharStar 0
kernelGo queue=0x7f8590001830
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cudaStreamSynchronize queue=0x7f8590001830
num kernels cached 1
num kernels calls 4
found memory: 0x7f8590001290 fakepos=128 bytes=4096
cudaConfigureCall queue=0x7f8580001830
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePfS_
cuStreamDestroy_v2 redirected stream=0x7f8590001810
building kernel _Z8getValuePfS_
cocl dump cl set
cl: [
kernel void _Z8getValuePfS_(global float* outdata, uint outdata_offset, global float* indata, uint indata_offset, local int *scratch);

kernel void _Z8getValuePfS_(global float* outdata, uint outdata_offset, global float* indata, uint indata_offset, local int *scratch) {
    indata += indata_offset;
    outdata += outdata_offset;

    float v4;

v1:;
    /* bool v2 = icmp indata <unk> */;
    /* float v4 = select v2 <unk> <unk> */;
    v4 = indata == 0 ? 3.0f : 2.0f;
    /* void v7 = store v4 outdata */;
    outdata[0] = v4;
    return;
}
]
joined thread 1
__internal__ build log:
"/tmp/OCL25771T3.cl", line 10: warning: label "v1" was declared but never
          referenced
  v1:;
  ^


 ... built
setKernelArgCharStar 0x80
found memory: 0x7f8580001290 fakepos=128 bytes=4096
setKernelArgCharStar 0
kernelGo queue=0x7f8580001830
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cudaConfigureCall queue=0x7f8580001830
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePfS_
setKernelArgCharStar 0x80
found memory: 0x7f8580001290 fakepos=128 bytes=4096
setKernelArgCharStar 0
kernelGo queue=0x7f8580001830
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cudaConfigureCall queue=0x7f8580001830
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePfS_
setKernelArgCharStar 0x80
found memory: 0x7f8580001290 fakepos=128 bytes=4096
setKernelArgCharStar 0
kernelGo queue=0x7f8580001830
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cudaConfigureCall queue=0x7f8580001830
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePfS_
setKernelArgCharStar 0x80
found memory: 0x7f8580001290 fakepos=128 bytes=4096
setKernelArgCharStar 0
kernelGo queue=0x7f8580001830
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cudaStreamSynchronize queue=0x7f8580001830
num kernels cached 1
num kernels calls 4
found memory: 0x7f8580001290 fakepos=128 bytes=4096
cudaConfigureCall queue=0x7f8588001830
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePfS_
cuStreamDestroy_v2 redirected stream=0x7f8580001810
building kernel _Z8getValuePfS_
cocl dump cl set
cl: [
kernel void _Z8getValuePfS_(global float* outdata, uint outdata_offset, global float* indata, uint indata_offset, local int *scratch);

kernel void _Z8getValuePfS_(global float* outdata, uint outdata_offset, global float* indata, uint indata_offset, local int *scratch) {
    indata += indata_offset;
    outdata += outdata_offset;

    float v4;

v1:;
    /* bool v2 = icmp indata <unk> */;
    /* float v4 = select v2 <unk> <unk> */;
    v4 = indata == 0 ? 3.0f : 2.0f;
    /* void v7 = store v4 outdata */;
    outdata[0] = v4;
    return;
}
]
__internal__ build log:
"/tmp/OCL25771T4.cl", line 10: warning: label "v1" was declared but never
          referenced
  v1:;
  ^


 ... built
setKernelArgCharStar 0x80
found memory: 0x7f8588001290 fakepos=128 bytes=4096
setKernelArgCharStar 0
kernelGo queue=0x7f8588001830
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cudaConfigureCall queue=0x7f8588001830
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePfS_
setKernelArgCharStar 0x80
found memory: 0x7f8588001290 fakepos=128 bytes=4096
setKernelArgCharStar 0
kernelGo queue=0x7f8588001830
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cudaConfigureCall queue=0x7f8588001830
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePfS_
setKernelArgCharStar 0x80
found memory: 0x7f8588001290 fakepos=128 bytes=4096
setKernelArgCharStar 0
kernelGo queue=0x7f8588001830
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cudaConfigureCall queue=0x7f8588001830
grid(1, 1, 1)
block(32, 1, 1)
configureKernel name=_Z8getValuePfS_
setKernelArgCharStar 0x80
found memory: 0x7f8588001290 fakepos=128 bytes=4096
setKernelArgCharStar 0
kernelGo queue=0x7f8588001830
<<< global=dim3(32,1,1,), workgroupsize=dim3(32,1,1,)>>>
workgroupSize=32
.. kernel queued
cudaStreamSynchronize queue=0x7f8588001830
num kernels cached 1
num kernels calls 4
found memory: 0x7f8588001290 fakepos=128 bytes=4096
cuStreamDestroy_v2 redirected stream=0x7f8588001810
joined thread 2
joined thread 3
[ 92%] Built target run-multithreading
CMakeFiles/Makefile2:208: recipe for target 'CMakeFiles/run-tests.dir/rule' failed
make[1]: *** [CMakeFiles/run-tests.dir/rule] Error 2
Makefile:201: recipe for target 'run-tests' failed
make: *** [run-tests] Error 2