crmne/tensorflow140_mac1013_cuda.patch

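## tensorflow140_mac1013_cuda.patch
Removes the `__align__(sizeof(T))` qualifier from the `extern __shared__` byte-buffer declarations in three TensorFlow GPU kernel sources: `concat_lib_gpu_impl.cu.cc`, `depthwise_conv_op_gpu.cu.cc` (four kernels) and `split_lib_gpu.cu.cc`. No other lines are changed. Judging by the file name, the patch targets a TensorFlow 1.4.0 CUDA build on macOS 10.13 (to be confirmed).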
diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index 0f7adaf24a..355584456b 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -69,7 +69,7 @@ __global__ void concat_variable_kernel(
   IntType num_inputs = input_ptr_data.size;

   // verbose declaration needed due to template
-  extern __shared__ __align__(sizeof(T)) unsigned char smem[];
+  extern __shared__  unsigned char smem[];
   IntType* smem_col_scan = reinterpret_cast<IntType*>(smem);

   if (useSmem) {
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index ecfe51d599..3c2efb8a73 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -163,7 +163,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
     const DepthwiseArgs args, const T* input, const T* filter, T* output) {
   assert(CanLaunchDepthwiseConv2dGPUSmall(args));
   // Holds block plus halo and filter data for blockDim.x depths.
-  extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
+  extern __shared__ unsigned char shared_memory[];
   T* const shared_data = reinterpret_cast<T*>(shared_memory);

   const int batches = args.batch;
@@ -433,7 +433,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
     const DepthwiseArgs args, const T* input, const T* filter, T* output) {
   assert(CanLaunchDepthwiseConv2dGPUSmall(args));
   // Holds block plus halo and filter data for blockDim.z depths.
-  extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
+  extern __shared__ unsigned char shared_memory[];
   T* const shared_data = reinterpret_cast<T*>(shared_memory);

   const int batches = args.batch;
@@ -1051,7 +1051,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
     const DepthwiseArgs args, const T* output, const T* input, T* filter) {
   assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.z));
   // Holds block plus halo and filter data for blockDim.x depths.
-  extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
+  extern __shared__ unsigned char shared_memory[];
   T* const shared_data = reinterpret_cast<T*>(shared_memory);

   const int batches = args.batch;
@@ -1310,7 +1310,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
     const DepthwiseArgs args, const T* output, const T* input, T* filter) {
   assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.x));
   // Holds block plus halo and filter data for blockDim.z depths.
-  extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
+  extern __shared__ unsigned char shared_memory[];
   T* const shared_data = reinterpret_cast<T*>(shared_memory);

   const int batches = args.batch;
diff --git a/tensorflow/core/kernels/split_lib_gpu.cu.cc b/tensorflow/core/kernels/split_lib_gpu.cu.cc
index dd6fc6115f..6a72c9a2fc 100644
--- a/tensorflow/core/kernels/split_lib_gpu.cu.cc
+++ b/tensorflow/core/kernels/split_lib_gpu.cu.cc
@@ -119,7 +119,7 @@ __global__ void split_v_kernel(const T* input_ptr,
   int num_outputs = output_ptr_data.size;

   // verbose declaration needed due to template
-  extern __shared__ __align__(sizeof(T)) unsigned char smem[];
+  extern __shared__  unsigned char smem[];
   IntType* smem_col_scan = reinterpret_cast<IntType*>(smem);

   if (useSmem) {

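## tensorflow140_mac1013_cuda_libgompgcc7.patch
Replaces the `-lgomp` link option of the `cc_library` rule in `third_party/gpus/cuda/BUILD.tpl` with the library search path `-L/usr/local/lib/gcc/7`. Note that `-L` only adds a directory to the linker search path; it does not link any library by itself. The tab-indented lines that follow the `BUILD.tpl` hunk repeat the hunks of both patches with their original indentation collapsed; they are not valid unified-diff input.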
diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl
index b752734a08..3f33af2684 100644
--- a/third_party/gpus/cuda/BUILD.tpl
+++ b/third_party/gpus/cuda/BUILD.tpl
@@ -109,7 +109,7 @@ cc_library(
         ".",
         "cuda/include",
     ],
-    linkopts = ["-lgomp"],
+    linkopts = ["-L/usr/local/lib/gcc/7"],
     linkstatic = 1,
     visibility = ["//visibility:public"],
 )
	diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
	index 0f7adaf24a..355584456b 100644
	--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
	+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
	@@ -69,7 +69,7 @@ __global__ void concat_variable_kernel(
	IntType num_inputs = input_ptr_data.size;

	// verbose declaration needed due to template
	- extern __shared__ __align__(sizeof(T)) unsigned char smem[];
	+ extern __shared__ unsigned char smem[];
	IntType* smem_col_scan = reinterpret_cast<IntType*>(smem);

	if (useSmem) {
	diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
	index ecfe51d599..3c2efb8a73 100644
	--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
	+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
	@@ -163,7 +163,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
	const DepthwiseArgs args, const T* input, const T* filter, T* output) {
	assert(CanLaunchDepthwiseConv2dGPUSmall(args));
	// Holds block plus halo and filter data for blockDim.x depths.
	- extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
	+ extern __shared__ unsigned char shared_memory[];
	T* const shared_data = reinterpret_cast<T*>(shared_memory);

	const int batches = args.batch;
	@@ -433,7 +433,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
	const DepthwiseArgs args, const T* input, const T* filter, T* output) {
	assert(CanLaunchDepthwiseConv2dGPUSmall(args));
	// Holds block plus halo and filter data for blockDim.z depths.
	- extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
	+ extern __shared__ unsigned char shared_memory[];
	T* const shared_data = reinterpret_cast<T*>(shared_memory);

	const int batches = args.batch;
	@@ -1051,7 +1051,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
	const DepthwiseArgs args, const T* output, const T* input, T* filter) {
	assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.z));
	// Holds block plus halo and filter data for blockDim.x depths.
	- extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
	+ extern __shared__ unsigned char shared_memory[];
	T* const shared_data = reinterpret_cast<T*>(shared_memory);

	const int batches = args.batch;
	@@ -1310,7 +1310,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
	const DepthwiseArgs args, const T* output, const T* input, T* filter) {
	assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.x));
	// Holds block plus halo and filter data for blockDim.z depths.
	- extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
	+ extern __shared__ unsigned char shared_memory[];
	T* const shared_data = reinterpret_cast<T*>(shared_memory);

	const int batches = args.batch;
	diff --git a/tensorflow/core/kernels/split_lib_gpu.cu.cc b/tensorflow/core/kernels/split_lib_gpu.cu.cc
	index dd6fc6115f..6a72c9a2fc 100644
	--- a/tensorflow/core/kernels/split_lib_gpu.cu.cc
	+++ b/tensorflow/core/kernels/split_lib_gpu.cu.cc
	@@ -119,7 +119,7 @@ __global__ void split_v_kernel(const T* input_ptr,
	int num_outputs = output_ptr_data.size;

	// verbose declaration needed due to template
	- extern __shared__ __align__(sizeof(T)) unsigned char smem[];
	+ extern __shared__ unsigned char smem[];
	IntType* smem_col_scan = reinterpret_cast<IntType*>(smem);

	if (useSmem) {
	diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl
	index b752734a08..3f33af2684 100644
	--- a/third_party/gpus/cuda/BUILD.tpl
	+++ b/third_party/gpus/cuda/BUILD.tpl
	@@ -109,7 +109,7 @@ cc_library(
	".",
	"cuda/include",
	],
	- linkopts = ["-lgomp"],
	+ linkopts = ["-L/usr/local/lib/gcc/7"],
	linkstatic = 1,
	visibility = ["//visibility:public"],
	)