Skip to content

Instantly share code, notes, and snippets.

@74th
Created May 3, 2018 13:01
Show Gist options
  • Star 10 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save 74th/31eacbbac6351649caa417b19231f09e to your computer and use it in GitHub Desktop.
Save 74th/31eacbbac6351649caa417b19231f09e to your computer and use it in GitHub Desktop.
TensorFlow v1.8.0 MacOS Nvidia GPU
diff --git a/tensorflow/core/framework/variant.h b/tensorflow/core/framework/variant.h
index c02391dae3..7f76609814 100644
--- a/tensorflow/core/framework/variant.h
+++ b/tensorflow/core/framework/variant.h
@@ -152,7 +152,8 @@ bool DecodeVariant(const string& buf, T* value);
//
class Variant {
public:
- constexpr Variant() noexcept = default;
+// constexpr Variant() noexcept = default;
+ Variant() noexcept = default;
Variant(const Variant& other)
: value_(other.is_empty() ? std::unique_ptr<ValueInterface>()
diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index a561d918bd..785e0ddf4e 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -69,7 +69,7 @@ __global__ void concat_variable_kernel(
IntType num_inputs = input_ptr_data.size;
// verbose declaration needed due to template
- extern __shared__ __align__(sizeof(T)) unsigned char smem[];
+ extern __shared__ unsigned char smem[];
IntType* smem_col_scan = reinterpret_cast<IntType*>(smem);
if (useSmem) {
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index 94989089ec..a2e3e8bc87 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -172,7 +172,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
const DepthwiseArgs args, const T* input, const T* filter, T* output) {
assert(CanLaunchDepthwiseConv2dGPUSmall(args));
// Holds block plus halo and filter data for blockDim.x depths.
- extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
+ extern __shared__ unsigned char shared_memory[];
T* const shared_data = reinterpret_cast<T*>(shared_memory);
const int num_batches = args.batch;
@@ -452,7 +452,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
const DepthwiseArgs args, const T* input, const T* filter, T* output) {
assert(CanLaunchDepthwiseConv2dGPUSmall(args));
// Holds block plus halo and filter data for blockDim.z depths.
- extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
+ extern __shared__ unsigned char shared_memory[];
T* const shared_data = reinterpret_cast<T*>(shared_memory);
const int num_batches = args.batch;
@@ -1118,7 +1118,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
const DepthwiseArgs args, const T* output, const T* input, T* filter) {
assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.z));
// Holds block plus halo and filter data for blockDim.x depths.
- extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
+ extern __shared__ unsigned char shared_memory[];
T* const shared_data = reinterpret_cast<T*>(shared_memory);
const int num_batches = args.batch;
@@ -1388,7 +1388,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
const DepthwiseArgs args, const T* output, const T* input, T* filter) {
assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.x));
// Holds block plus halo and filter data for blockDim.z depths.
- extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
+ extern __shared__ unsigned char shared_memory[];
T* const shared_data = reinterpret_cast<T*>(shared_memory);
const int num_batches = args.batch;
diff --git a/tensorflow/core/kernels/split_lib_gpu.cu.cc b/tensorflow/core/kernels/split_lib_gpu.cu.cc
index 393818730b..a7d9e02853 100644
--- a/tensorflow/core/kernels/split_lib_gpu.cu.cc
+++ b/tensorflow/core/kernels/split_lib_gpu.cu.cc
@@ -121,7 +121,7 @@ __global__ void split_v_kernel(const T* input_ptr,
int num_outputs = output_ptr_data.size;
// verbose declaration needed due to template
- extern __shared__ __align__(sizeof(T)) unsigned char smem[];
+ extern __shared__ unsigned char smem[];
IntType* smem_col_scan = reinterpret_cast<IntType*>(smem);
if (useSmem) {
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 48728ac131..268e4fe2e6 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -330,11 +330,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
tf_http_archive(
name = "protobuf_archive",
urls = [
- "https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
- "https://github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
+ "https://mirror.bazel.build/github.com/dtrebbien/protobuf/archive/50f552646ba1de79e07562b41f3999fe036b4fd0.tar.gz",
+ "https://github.com/dtrebbien/protobuf/archive/50f552646ba1de79e07562b41f3999fe036b4fd0.tar.gz",
],
- sha256 = "846d907acf472ae233ec0882ef3a2d24edbbe834b80c305e867ac65a1f2c59e3",
- strip_prefix = "protobuf-396336eb961b75f03b25824fe86cf6490fb75e3a",
+ sha256 = "eb16b33431b91fe8cee479575cee8de202f3626aaf00d9bf1783c6e62b4ffbc7",
+ strip_prefix = "protobuf-50f552646ba1de79e07562b41f3999fe036b4fd0",
)
# We need to import the protobuf library under the names com_google_protobuf
@74th
Copy link
Author

74th commented May 3, 2018

make and build nccl 1.3.4 https://github.com/74th/nccl

export CUDA_HOME=/usr/local/cuda
export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/extras/CUPTI/lib
export LD_LIBRARY_PATH=$DYLD_LIBRARY_PATH
export PATH=$DYLD_LIBRARY_PATH:$PATH
bazel build --config=cuda --config=opt --action_env PATH --action_env LD_LIBRARY_PATH --action_env DYLD_LIBRARY_PATH //tensorflow/tools/pip_package:build_pip_package
bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg
pip install /tmp/tensorflow_pkg/tensorflow-1.8.0-cp36-cp36m-macosx_10_13_x86_64.whl

I uploaded this package.

https://storage.googleapis.com/74thopen/tensorflow_osx/index.html

@desert0616
Copy link

desert0616 commented May 22, 2018

Hi @74th, thanks so much for your contributions.
I am getting the error "'third_party/nccl/nccl.h' file not found".
Do we need to copy nccl.h from https://github.com/74th/nccl to tensorflow/third_party/nccl/ ?

After that, the build will succeed. However, I am facing another issue: Symbol not found: _ncclAllReduce.

Did I do something wrong? Thanks.

@Orang-utan
Copy link

Hey @desert0616, I was able to replicate your issue. Were you able to solve it?

Thanks,

Daniel

@Orang-utan
Copy link

Hey all,

If you encounter the "Symbol not found: _ncclAllReduce" issue, I found a solution.

  1. Download file here: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/nccl/kernels/nccl_ops.cc
  2. Execute: gcc -c -fPIC nccl_ops.cc -o hello_world.o
  3. Execute: gcc hello_world.o -shared -o _nccl_ops.so
  4. Replace the existing _nccl_ops.so at path tensorflow/contrib/nccl/python/ops with the newly built one

@desert0616
Copy link

@Orang-utan Thanks so much. You saved my day!

@mbertini
Copy link

I'm on 10.13.5 and everything compiles with these steps but running on a GPU results in a segfault (signal 11) 3 times out of 4. I'm using a MBP late 2013 (GT 750M) , with CUDA 9.1 and CuDNN 7.0.5. The basic CUDA driver is 396.64 and the NVIDIA Web driver is 387.10.10.10.35.106. Setting to default macOS GPU driver results in a warning from CUDA that it doesn't work and the only way to solve that warning is using the Nvidia web driver.

I suppose that the segfault is due to the CUDA driver version, but I'd like to hear from some other guys...

Copy link

ghost commented Jun 21, 2018

Rather than download nccl.h and nccl_ops.cc separately, what worked for me was to copy nccl.h from within the Bazel temp directories to tensorflow/contrib/nccl/kernels. To find the temporary copy, cd into Bazel's temp directories and find the file:

cd "/private/var/tmp/_bazel_$USER"
find . -name 'nccl.h'

@xiaoyuin
Copy link

@mbertini Same here. Segmentation fault: 11. Have you found the solution?

@alvaromuir
Copy link

@Orang-utan this is a good work-around, thanks

@mbertini
Copy link

mbertini commented Jul 9, 2018

@xiaoyuin unfortunately I have not found any solution. I've just upgraded the CUDA driver to 396.148 (it is still not compatible with 10.13.5, requiring the use of the NVIDIA Web driver 387.10.10.10.35.106) but I still get signal 11. Let's see what happens with new NVIDIA drivers or with the next TensorFlow release, 1.9.0.

@mbertini
Copy link

I've just compiled 1.9.0 and it still segfaults (signal 11) in the same part of TensorFlow (reporting the last calls on the stack):

void tensorflow::gtl::InlinedVector<tensorflow::EventMgr::InUse, 4>::emplace_back<tensorflow::EventMgr::InUse const&>(tensorflow::EventMgr::InUse const&&&) + 173
tensorflow::EventMgr::PollEvents(bool, tensorflow::gtl::InlinedVector<tensorflow::EventMgr::InUse, 4>) + 300
tensorflow::EventMgr::ThenExecute(stream_executor::Stream*, std::__1::function<void ()>) + 194
tensorflow::GPUUtil::CopyCPUTensorToGPU(tensorflow::Tensor const*, tensorflow::DeviceContext const*, tensorflow::Device*, tensorflow::Tensor*, std::__1::function<void (tensorflow::Status const&)>)

@bennix
Copy link

bennix commented Jul 14, 2018

For those who have the segmentation fault 11 error: please check your clang version. The new anaconda3 installation has its own clang installed. Rename that clang to something else, use the Apple clang provided by Xcode, and rebuild TensorFlow; then the segmentation fault 11 error will be gone!

@mbertini
Copy link

@bennix I've used clang:

Apple LLVM version 9.0.0 (clang-900.0.39.2)
Target: x86_64-apple-darwin17.7.0
Thread model: posix
InstalledDir: /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin

that is clang provided with Xcode 9.2, the one supported by CUDA 9.1.
I've tried both 10.13.5 and 10.13.6.

Which clang/OSX version do you have ?

@iRonJ
Copy link

iRonJ commented Jul 21, 2018

Looks like something with nsync in libtensorflow

Termination Signal: Segmentation fault: 11
Termination Reason: Namespace SIGNAL, Code 0xb
Terminating Process: exc handler [0]

Thread 0:: Dispatch queue: com.apple.main-thread
0 libsystem_kernel.dylib 0x00007fff63af3a1e __psynch_cvwait + 10
1 libsystem_pthread.dylib 0x00007fff63cbc589 pthread_cond_wait + 732
2 libc++.1.dylib 0x00007fff618fbcb0 std::__1::condition_variable::wait(std::__1::unique_lock<std::__1::mutex>&) + 18
3 libtensorflow_framework.so 0x00000001091c8deb nsync::nsync_mu_semaphore_p_with_deadline(nsync::nsync_semaphore_s_*, timespec) + 283
4 libtensorflow_framework.so 0x00000001091c5637 nsync::nsync_cv_wait_with_deadline_generic(nsync::nsync_cv_s_*, void*, void (*)(void*), void (*)(void*), timespec, nsync::nsync_note_s_*) + 423
5 libtensorflow_framework.so 0x00000001091c5da1 nsync::nsync_cv_wait(nsync::nsync_cv_s_*, nsync::nsync_mu_s_*) + 49
6 _pywrap_tensorflow_internal.so 0x000000010eb9ceeb tensorflow::DirectSession::WaitForNotification(tensorflow::Notification*, long long) + 155
7 _pywrap_tensorflow_internal.so 0x000000010eb931f6 tensorflow::DirectSession::WaitForNotification(tensorflow::DirectSession::RunState*, tensorflow::CancellationManager*, long long) + 38
8 _pywrap_tensorflow_internal.so 0x000000010eb92ab7 tensorflow::DirectSession::RunInternal(long long, tensorflow::RunOptions const&, tensorflow::CallFrameInterface*, tensorflow::DirectSession::ExecutorsAndKeys*, tensorflow::RunMetadata*) + 2615
9 _pywrap_tensorflow_internal.so 0x000000010eb93c51 tensorflow::DirectSession::Run(tensorflow::RunOptions const&, std::__1::vector<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, tensorflow::Tensor>, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, tensorflow::Tensor> > > const&, std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > const&, std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > const&, std::__1::vector<tensorflow::Tensor, std::__1::allocator<tensorflow::Tensor> >*, tensorflow::RunMetadata*) + 1473

Got this trying to run: https://www.tensorflow.org/tutorials/estimators/cnn

@smoothdvd
Copy link

@iRonJ Did you fix this problem?

@TomHeaven
Copy link

I encountered the Symbol not found: _ncclAllReduce problem and can confirm that Orang-utan's solution worked for me.

@yu-fei
Copy link

yu-fei commented Aug 19, 2018

I had the segmentation fault 11 error. Finally I switched Xcode from 9.2 to 8.3.3 and it's OK now.

@JerinWan
Copy link

@yu-fei
I got Segmentation fault: 11 even with Xcode 8.3.3.
Was there any trick?

@antoniopioricciardi
Copy link

https://github.com/dtrebbien

This GitHub page, which is needed for the TensorFlow patch, is gone, so we cannot compile the patched TF. How can we solve this?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment