cgmb/0001-Guard-use-of-OpenMP-to-make-it-optional.patch

## 0001-Guard-use-of-OpenMP-to-make-it-optional.patch
From fd158ca247274b593ec59892385a1e66c96fb9a6 Mon Sep 17 00:00:00 2001
From: Cordell Bloor <Cordell.Bloor@amd.com>
Date: Thu, 13 Apr 2023 17:55:03 -0600
Subject: [PATCH] Guard use of OpenMP to make it optional

This change wraps the OpenMP includes and pragmas in the rocBLAS client
sources in #ifdef _OPENMP so that the clients can compile and run even when
OpenMP is not available.
---
 clients/common/blis_interface.cpp  |  4 +-
 clients/common/cblas_interface.cpp | 20 ++++++++++
 clients/include/rocblas_init.hpp   | 64 ++++++++++++++++++++++++++++++
 clients/include/utility.hpp        |  8 ++++
 clients/samples/example_openmp.cpp |  8 +++-
 5 files changed, 102 insertions(+), 2 deletions(-)

diff --git a/clients/common/blis_interface.cpp b/clients/common/blis_interface.cpp
index da7aef39..3327fc7d 100644
--- a/clients/common/blis_interface.cpp
+++ b/clients/common/blis_interface.cpp
@@ -1,5 +1,5 @@
 /* ************************************************************************
- * Copyright (C) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -21,7 +21,9 @@
  * ************************************************************************ */

 #include "blis.h"
+#ifdef _OPENMP
 #include "omp.h"
+#endif

 void setup_blis()
 {
diff --git a/clients/common/cblas_interface.cpp b/clients/common/cblas_interface.cpp
index b84831e0..2fe8a164 100644
--- a/clients/common/cblas_interface.cpp
+++ b/clients/common/cblas_interface.cpp
@@ -23,7 +23,9 @@
 #include "rocblas_vector.hpp"
 #include "utility.hpp"
 #include <bitset>
+#ifdef _OPENMP
 #include <omp.h>
+#endif

 /*
  * ===========================================================================
@@ -125,13 +127,17 @@ void cblas_scal(int64_t n, T alpha, U x, int64_t incx)

     if(incx == 1)
     {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
         for(int64_t i = 0; i < n; i++)
             x[i] = alpha * x[i];
     }
     else
     {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
         for(int64_t i = 0; i < n; i++)
             x[i * incx] = alpha * x[i * incx];
     }
@@ -484,7 +490,9 @@ void cblas_geam_helper(rocblas_operation transA,
     rocblas_int inc1_B = transB == rocblas_operation_none ? 1 : ldb;
     rocblas_int inc2_B = transB == rocblas_operation_none ? ldb : 1;

+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for(rocblas_int i = 0; i < M; i++)
     {
         for(rocblas_int j = 0; j < N; j++)
@@ -916,7 +924,9 @@ void cblas_geam_min_plus(rocblas_operation transA,
     bool TRANSA = transA != rocblas_operation_none;
     bool TRANSB = transB != rocblas_operation_none;

+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for(int n1 = 0; n1 < n; n1++)
     {
         for(int m1 = 0; m1 < m; m1++)
@@ -954,7 +964,9 @@ void cblas_geam_plus_min(rocblas_operation transA,
     bool TRANSA = transA != rocblas_operation_none;
     bool TRANSB = transB != rocblas_operation_none;

+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for(int n1 = 0; n1 < n; n1++)
     {
         for(int m1 = 0; m1 < m; m1++)
@@ -993,7 +1005,9 @@ void cblas_herkx(rocblas_fill      uplo,
     {
         if(uplo == rocblas_fill_upper)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(int j = 0; j < n; ++j)
             {
                 for(int i = 0; i <= j; i++)
@@ -1015,7 +1029,9 @@ void cblas_herkx(rocblas_fill      uplo,
         }
         else // lower
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(int j = 0; j < n; ++j)
             {
                 for(int i = j; i < n; i++)
@@ -1040,7 +1056,9 @@ void cblas_herkx(rocblas_fill      uplo,
     {
         if(uplo == rocblas_fill_upper)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(int j = 0; j < n; ++j)
             {
                 for(int i = 0; i <= j; i++)
@@ -1064,7 +1082,9 @@ void cblas_herkx(rocblas_fill      uplo,
         }
         else // lower
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(int j = 0; j < n; ++j)
             {
                 for(int i = j; i < n; i++)
diff --git a/clients/include/rocblas_init.hpp b/clients/include/rocblas_init.hpp
index d32ae57b..bf096c47 100644
--- a/clients/include/rocblas_init.hpp
+++ b/clients/include/rocblas_init.hpp
@@ -29,7 +29,9 @@
 #include "rocblas_random.hpp"
 #include <cinttypes>
 #include <iostream>
+#ifdef _OPENMP
 #include <omp.h>
+#endif
 #include <vector>

 //!
@@ -70,7 +72,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type,
     if(matrix_type == rocblas_client_general_matrix)
     {
         for(size_t b = 0; b < batch_count; b++)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                 {
@@ -81,7 +85,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type,
     else if(matrix_type == rocblas_client_triangular_matrix)
     {
         for(size_t b = 0; b < batch_count; b++)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                 {
@@ -107,7 +113,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type,

         if(matrix_type == rocblas_client_general_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                 {
@@ -117,7 +125,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type,
         }
         else if(matrix_type == rocblas_client_triangular_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                 {
@@ -136,7 +146,9 @@ void rocblas_init_vector_alternating_sign(T rand_gen(), T* x, rocblas_int N, roc
     if(incx < 0)
         x -= (N - 1) * incx;

+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for(rocblas_int j = 0; j < N; ++j)
     {
         auto value  = rand_gen();
@@ -162,7 +174,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
     if(matrix_type == rocblas_client_general_matrix)
     {
         for(size_t b = 0; b < batch_count; b++)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                     A[i + j * lda + b * stride] = rand_gen();
@@ -170,7 +184,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
     else if(matrix_type == rocblas_client_hermitian_matrix)
     {
         for(size_t b = 0; b < batch_count; ++b)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < N; ++i)
                 for(size_t j = 0; j <= i; ++j)
                 {
@@ -197,7 +213,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
     else if(matrix_type == rocblas_client_symmetric_matrix)
     {
         for(size_t b = 0; b < batch_count; ++b)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < N; ++i)
                 for(size_t j = 0; j <= i; ++j)
                 {
@@ -224,7 +242,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
     else if(matrix_type == rocblas_client_triangular_matrix)
     {
         for(size_t b = 0; b < batch_count; b++)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                 {
@@ -241,7 +261,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,

     else if(matrix_type == rocblas_client_diagonally_dominant_triangular_matrix)
     {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
         for(size_t i = 0; i < M; ++i)
             for(size_t j = 0; j < N; ++j)
             {
@@ -255,7 +277,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,

         if(uplo == 'U') // rocblas_fill_upper
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(int i = 0; i < N; i++)
             {
                 T abs_sum_off_diagonal_row
@@ -276,7 +300,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
         }
         else // rocblas_fill_lower
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(int j = 0; j < N; j++)
             {
                 T abs_sum_off_diagonal_row
@@ -313,14 +339,18 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
         auto  lda = hA.lda();
         if(matrix_type == rocblas_client_general_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                     A[i + j * lda] = rand_gen();
         }
         else if(matrix_type == rocblas_client_hermitian_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < N; ++i)
                 for(size_t j = 0; j <= i; ++j)
                 {
@@ -346,7 +376,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
         }
         else if(matrix_type == rocblas_client_symmetric_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < N; ++i)
                 for(size_t j = 0; j <= i; ++j)
                 {
@@ -372,7 +404,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
         }
         else if(matrix_type == rocblas_client_triangular_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                 {
@@ -389,7 +423,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,

         else if(matrix_type == rocblas_client_diagonally_dominant_triangular_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                 {
@@ -403,7 +439,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,

             if(uplo == 'U') // rocblas_fill_upper
             {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
                 for(int i = 0; i < N; i++)
                 {
                     T abs_sum_off_diagonal_row = T(
@@ -424,7 +462,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
             }
             else // rocblas_fill_lower
             {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
                 for(int j = 0; j < N; j++)
                 {
                     T abs_sum_off_diagonal_row = T(
@@ -457,7 +497,9 @@ void rocblas_init_vector(T rand_gen(), T* x, rocblas_int N, rocblas_stride incx)
     if(incx < 0)
         x -= (N - 1) * incx;

+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for(rocblas_int j = 0; j < N; ++j)
         x[j * incx] = rand_gen();
 }
@@ -480,7 +522,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
     if(matrix_type == rocblas_client_general_matrix)
     {
         for(size_t b = 0; b < batch_count; b++)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                     A[i + j * lda + b * stride] = T(seedReset ? cos(i + j * lda + b * stride)
@@ -489,7 +533,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
     else if(matrix_type == rocblas_client_hermitian_matrix)
     {
         for(size_t b = 0; b < batch_count; ++b)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < N; ++i)
                 for(size_t j = 0; j <= i; ++j)
                 {
@@ -518,7 +564,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
     else if(matrix_type == rocblas_client_symmetric_matrix)
     {
         for(size_t b = 0; b < batch_count; ++b)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < N; ++i)
                 for(size_t j = 0; j <= i; ++j)
                 {
@@ -546,7 +594,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
     else if(matrix_type == rocblas_client_triangular_matrix)
     {
         for(size_t b = 0; b < batch_count; b++)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                 {
@@ -577,14 +627,18 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,

         if(matrix_type == rocblas_client_general_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                     A[i + j * lda] = T(seedReset ? cos(i + j * lda) : sin(i + j * lda));
         }
         else if(matrix_type == rocblas_client_hermitian_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < N; ++i)
                 for(size_t j = 0; j <= i; ++j)
                 {
@@ -611,7 +665,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
         }
         else if(matrix_type == rocblas_client_symmetric_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < N; ++i)
                 for(size_t j = 0; j <= i; ++j)
                 {
@@ -637,7 +693,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
         }
         else if(matrix_type == rocblas_client_triangular_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                 {
@@ -661,7 +719,9 @@ void rocblas_init_vector_trig(T* x, rocblas_int N, rocblas_stride incx, bool see
     if(incx < 0)
         x -= (N - 1) * incx;

+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for(rocblas_int j = 0; j < N; ++j)
         x[j * incx] = T(seedReset ? cos(j * incx) : sin(j * incx));
 }
@@ -937,7 +997,9 @@ void rocblas_copy_matrix(const T* A,
     {
         size_t stride_offset_a = i_batch * stridea;
         size_t stride_offset_b = i_batch * strideb;
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
         for(size_t j = 0; j < N; ++j)
         {
             size_t offset_a = stride_offset_a + j * lda;
@@ -956,7 +1018,9 @@ void rocblas_copy_matrix(

     for(size_t i_batch = 0; i_batch < batch_count; i_batch++)
     {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
         for(size_t j = 0; j < N; ++j)
         {
             size_t offset_a = j * lda;
diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp
index 9db10c61..873494af 100644
--- a/clients/include/utility.hpp
+++ b/clients/include/utility.hpp
@@ -264,7 +264,9 @@ inline void regular_to_banded(bool upper, const T& h_A, T& h_AB, rocblas_int k)
     size_t      ldab = h_AB.lda();
     rocblas_int n    = h_AB.n();

+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for(rocblas_int batch_index = 0; batch_index < h_A.batch_count(); ++batch_index)
     {
         auto* A  = h_A[batch_index];
@@ -328,7 +330,9 @@ inline void banded_matrix_setup(bool upper, T& h_A, rocblas_int k)
 {
     rocblas_int n = h_A.n();

+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for(rocblas_int batch_index = 0; batch_index < h_A.batch_count(); ++batch_index)
     {
         auto* A = h_A[batch_index];
@@ -383,7 +387,9 @@ inline void regular_to_packed(bool upper, const T* A, T* AP, rocblas_int n)
 template <typename U>
 inline void regular_to_packed(bool upper, U& h_A, U& h_AP, rocblas_int n)
 {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for(rocblas_int batch_index = 0; batch_index < h_A.batch_count(); ++batch_index)
     {
         auto* AP    = h_AP[batch_index];
@@ -452,7 +458,9 @@ void make_unit_diagonal(rocblas_fill uplo, T& h_A)
     rocblas_int N   = h_A.n();
     size_t      lda = h_A.lda();

+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for(rocblas_int batch_index = 0; batch_index < h_A.batch_count(); ++batch_index)
     {
         auto* A = h_A[batch_index];
diff --git a/clients/samples/example_openmp.cpp b/clients/samples/example_openmp.cpp
index f62dae64..8cc5f2ea 100644
--- a/clients/samples/example_openmp.cpp
+++ b/clients/samples/example_openmp.cpp
@@ -1,5 +1,5 @@
 /* ************************************************************************
- * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -42,7 +42,9 @@
 #include <cstdlib>
 #include <hip/hip_runtime.h>
 #include <iostream>
+#ifdef _OPENMP
 #include <omp.h>
+#endif
 #include <vector>

 #define NUM_THREADS 4
@@ -100,7 +102,9 @@ int main()

 // 1st parallel rocblas routine call : scal x
 // spawn openmp threads
+#ifdef _OPENMP
 #pragma omp parallel private(thread_id)
+#endif
     {

         thread_id = omp_get_thread_num(); // thread_id from 0,...,NUM_THREADS-1
@@ -118,7 +122,9 @@ int main()

 // 2nd parallel rocblas routine call : copy x to y
 // spawn openmp threads
+#ifdef _OPENMP
 #pragma omp parallel private(thread_id)
+#endif
     {

         thread_id = omp_get_thread_num(); // thread_id from 0,...,NUM_THREADS-1
--
2.25.1
	From fd158ca247274b593ec59892385a1e66c96fb9a6 Mon Sep 17 00:00:00 2001
	From: Cordell Bloor <Cordell.Bloor@amd.com>
	Date: Thu, 13 Apr 2023 17:55:03 -0600
	Subject: [PATCH] Guard use of OpenMP to make it optional

	This change wraps the OpenMP includes and pragmas in the rocBLAS client
	sources in #ifdef _OPENMP so that the clients can compile and run even when
	OpenMP is not available.
	---
	clients/common/blis_interface.cpp \| 4 +-
	clients/common/cblas_interface.cpp \| 20 ++++++++++
	clients/include/rocblas_init.hpp \| 64 ++++++++++++++++++++++++++++++
	clients/include/utility.hpp \| 8 ++++
	clients/samples/example_openmp.cpp \| 8 +++-
	5 files changed, 102 insertions(+), 2 deletions(-)

	diff --git a/clients/common/blis_interface.cpp b/clients/common/blis_interface.cpp
	index da7aef39..3327fc7d 100644
	--- a/clients/common/blis_interface.cpp
	+++ b/clients/common/blis_interface.cpp
	@@ -1,5 +1,5 @@
	/* ************************************************************************
	- * Copyright (C) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
	+ * Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	@@ -21,7 +21,9 @@
	* ************************************************************************ */

	#include "blis.h"
	+#ifdef _OPENMP
	#include "omp.h"
	+#endif

	void setup_blis()
	{
	diff --git a/clients/common/cblas_interface.cpp b/clients/common/cblas_interface.cpp
	index b84831e0..2fe8a164 100644
	--- a/clients/common/cblas_interface.cpp
	+++ b/clients/common/cblas_interface.cpp
	@@ -23,7 +23,9 @@
	#include "rocblas_vector.hpp"
	#include "utility.hpp"
	#include <bitset>
	+#ifdef _OPENMP
	#include <omp.h>
	+#endif

	/*
	* ===========================================================================
	@@ -125,13 +127,17 @@ void cblas_scal(int64_t n, T alpha, U x, int64_t incx)

	if(incx == 1)
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(int64_t i = 0; i < n; i++)
	x[i] = alpha * x[i];
	}
	else
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(int64_t i = 0; i < n; i++)
	x[i * incx] = alpha * x[i * incx];
	}
	@@ -484,7 +490,9 @@ void cblas_geam_helper(rocblas_operation transA,
	rocblas_int inc1_B = transB == rocblas_operation_none ? 1 : ldb;
	rocblas_int inc2_B = transB == rocblas_operation_none ? ldb : 1;

	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(rocblas_int i = 0; i < M; i++)
	{
	for(rocblas_int j = 0; j < N; j++)
	@@ -916,7 +924,9 @@ void cblas_geam_min_plus(rocblas_operation transA,
	bool TRANSA = transA != rocblas_operation_none;
	bool TRANSB = transB != rocblas_operation_none;

	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(int n1 = 0; n1 < n; n1++)
	{
	for(int m1 = 0; m1 < m; m1++)
	@@ -954,7 +964,9 @@ void cblas_geam_plus_min(rocblas_operation transA,
	bool TRANSA = transA != rocblas_operation_none;
	bool TRANSB = transB != rocblas_operation_none;

	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(int n1 = 0; n1 < n; n1++)
	{
	for(int m1 = 0; m1 < m; m1++)
	@@ -993,7 +1005,9 @@ void cblas_herkx(rocblas_fill uplo,
	{
	if(uplo == rocblas_fill_upper)
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(int j = 0; j < n; ++j)
	{
	for(int i = 0; i <= j; i++)
	@@ -1015,7 +1029,9 @@ void cblas_herkx(rocblas_fill uplo,
	}
	else // lower
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(int j = 0; j < n; ++j)
	{
	for(int i = j; i < n; i++)
	@@ -1040,7 +1056,9 @@ void cblas_herkx(rocblas_fill uplo,
	{
	if(uplo == rocblas_fill_upper)
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(int j = 0; j < n; ++j)
	{
	for(int i = 0; i <= j; i++)
	@@ -1064,7 +1082,9 @@ void cblas_herkx(rocblas_fill uplo,
	}
	else // lower
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(int j = 0; j < n; ++j)
	{
	for(int i = j; i < n; i++)
	diff --git a/clients/include/rocblas_init.hpp b/clients/include/rocblas_init.hpp
	index d32ae57b..bf096c47 100644
	--- a/clients/include/rocblas_init.hpp
	+++ b/clients/include/rocblas_init.hpp
	@@ -29,7 +29,9 @@
	#include "rocblas_random.hpp"
	#include <cinttypes>
	#include <iostream>
	+#ifdef _OPENMP
	#include <omp.h>
	+#endif
	#include <vector>

	//!
	@@ -70,7 +72,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type,
	if(matrix_type == rocblas_client_general_matrix)
	{
	for(size_t b = 0; b < batch_count; b++)
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < M; ++i)
	for(size_t j = 0; j < N; ++j)
	{
	@@ -81,7 +85,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type,
	else if(matrix_type == rocblas_client_triangular_matrix)
	{
	for(size_t b = 0; b < batch_count; b++)
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < M; ++i)
	for(size_t j = 0; j < N; ++j)
	{
	@@ -107,7 +113,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type,

	if(matrix_type == rocblas_client_general_matrix)
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < M; ++i)
	for(size_t j = 0; j < N; ++j)
	{
	@@ -117,7 +125,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type,
	}
	else if(matrix_type == rocblas_client_triangular_matrix)
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < M; ++i)
	for(size_t j = 0; j < N; ++j)
	{
	@@ -136,7 +146,9 @@ void rocblas_init_vector_alternating_sign(T rand_gen(), T* x, rocblas_int N, roc
	if(incx < 0)
	x -= (N - 1) * incx;

	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(rocblas_int j = 0; j < N; ++j)
	{
	auto value = rand_gen();
	@@ -162,7 +174,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
	if(matrix_type == rocblas_client_general_matrix)
	{
	for(size_t b = 0; b < batch_count; b++)
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < M; ++i)
	for(size_t j = 0; j < N; ++j)
	A[i + j * lda + b * stride] = rand_gen();
	@@ -170,7 +184,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
	else if(matrix_type == rocblas_client_hermitian_matrix)
	{
	for(size_t b = 0; b < batch_count; ++b)
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < N; ++i)
	for(size_t j = 0; j <= i; ++j)
	{
	@@ -197,7 +213,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
	else if(matrix_type == rocblas_client_symmetric_matrix)
	{
	for(size_t b = 0; b < batch_count; ++b)
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < N; ++i)
	for(size_t j = 0; j <= i; ++j)
	{
	@@ -224,7 +242,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
	else if(matrix_type == rocblas_client_triangular_matrix)
	{
	for(size_t b = 0; b < batch_count; b++)
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < M; ++i)
	for(size_t j = 0; j < N; ++j)
	{
	@@ -241,7 +261,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,

	else if(matrix_type == rocblas_client_diagonally_dominant_triangular_matrix)
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < M; ++i)
	for(size_t j = 0; j < N; ++j)
	{
	@@ -255,7 +277,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,

	if(uplo == 'U') // rocblas_fill_upper
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(int i = 0; i < N; i++)
	{
	T abs_sum_off_diagonal_row
	@@ -276,7 +300,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
	}
	else // rocblas_fill_lower
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(int j = 0; j < N; j++)
	{
	T abs_sum_off_diagonal_row
	@@ -313,14 +339,18 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
	auto lda = hA.lda();
	if(matrix_type == rocblas_client_general_matrix)
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < M; ++i)
	for(size_t j = 0; j < N; ++j)
	A[i + j * lda] = rand_gen();
	}
	else if(matrix_type == rocblas_client_hermitian_matrix)
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < N; ++i)
	for(size_t j = 0; j <= i; ++j)
	{
	@@ -346,7 +376,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
	}
	else if(matrix_type == rocblas_client_symmetric_matrix)
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < N; ++i)
	for(size_t j = 0; j <= i; ++j)
	{
	@@ -372,7 +404,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
	}
	else if(matrix_type == rocblas_client_triangular_matrix)
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < M; ++i)
	for(size_t j = 0; j < N; ++j)
	{
	@@ -389,7 +423,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,

	else if(matrix_type == rocblas_client_diagonally_dominant_triangular_matrix)
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < M; ++i)
	for(size_t j = 0; j < N; ++j)
	{
	@@ -403,7 +439,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,

	if(uplo == 'U') // rocblas_fill_upper
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(int i = 0; i < N; i++)
	{
	T abs_sum_off_diagonal_row = T(
	@@ -424,7 +462,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
	}
	else // rocblas_fill_lower
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(int j = 0; j < N; j++)
	{
	T abs_sum_off_diagonal_row = T(
	@@ -457,7 +497,9 @@ void rocblas_init_vector(T rand_gen(), T* x, rocblas_int N, rocblas_stride incx)
	if(incx < 0)
	x -= (N - 1) * incx;

	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(rocblas_int j = 0; j < N; ++j)
	x[j * incx] = rand_gen();
	}
	@@ -480,7 +522,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
	if(matrix_type == rocblas_client_general_matrix)
	{
	for(size_t b = 0; b < batch_count; b++)
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < M; ++i)
	for(size_t j = 0; j < N; ++j)
	A[i + j * lda + b * stride] = T(seedReset ? cos(i + j * lda + b * stride)
	@@ -489,7 +533,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
	else if(matrix_type == rocblas_client_hermitian_matrix)
	{
	for(size_t b = 0; b < batch_count; ++b)
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < N; ++i)
	for(size_t j = 0; j <= i; ++j)
	{
	@@ -518,7 +564,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
	else if(matrix_type == rocblas_client_symmetric_matrix)
	{
	for(size_t b = 0; b < batch_count; ++b)
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < N; ++i)
	for(size_t j = 0; j <= i; ++j)
	{
	@@ -546,7 +594,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
	else if(matrix_type == rocblas_client_triangular_matrix)
	{
	for(size_t b = 0; b < batch_count; b++)
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < M; ++i)
	for(size_t j = 0; j < N; ++j)
	{
	@@ -577,14 +627,18 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,

	if(matrix_type == rocblas_client_general_matrix)
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < M; ++i)
	for(size_t j = 0; j < N; ++j)
	A[i + j * lda] = T(seedReset ? cos(i + j * lda) : sin(i + j * lda));
	}
	else if(matrix_type == rocblas_client_hermitian_matrix)
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < N; ++i)
	for(size_t j = 0; j <= i; ++j)
	{
	@@ -611,7 +665,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
	}
	else if(matrix_type == rocblas_client_symmetric_matrix)
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < N; ++i)
	for(size_t j = 0; j <= i; ++j)
	{
	@@ -637,7 +693,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
	}
	else if(matrix_type == rocblas_client_triangular_matrix)
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t i = 0; i < M; ++i)
	for(size_t j = 0; j < N; ++j)
	{
	@@ -661,7 +719,9 @@ void rocblas_init_vector_trig(T* x, rocblas_int N, rocblas_stride incx, bool see
	if(incx < 0)
	x -= (N - 1) * incx;

	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(rocblas_int j = 0; j < N; ++j)
	x[j * incx] = T(seedReset ? cos(j * incx) : sin(j * incx));
	}
	@@ -937,7 +997,9 @@ void rocblas_copy_matrix(const T* A,
	{
	size_t stride_offset_a = i_batch * stridea;
	size_t stride_offset_b = i_batch * strideb;
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t j = 0; j < N; ++j)
	{
	size_t offset_a = stride_offset_a + j * lda;
	@@ -956,7 +1018,9 @@ void rocblas_copy_matrix(

	for(size_t i_batch = 0; i_batch < batch_count; i_batch++)
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(size_t j = 0; j < N; ++j)
	{
	size_t offset_a = j * lda;
	diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp
	index 9db10c61..873494af 100644
	--- a/clients/include/utility.hpp
	+++ b/clients/include/utility.hpp
	@@ -264,7 +264,9 @@ inline void regular_to_banded(bool upper, const T& h_A, T& h_AB, rocblas_int k)
	size_t ldab = h_AB.lda();
	rocblas_int n = h_AB.n();

	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(rocblas_int batch_index = 0; batch_index < h_A.batch_count(); ++batch_index)
	{
	auto* A = h_A[batch_index];
	@@ -328,7 +330,9 @@ inline void banded_matrix_setup(bool upper, T& h_A, rocblas_int k)
	{
	rocblas_int n = h_A.n();

	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(rocblas_int batch_index = 0; batch_index < h_A.batch_count(); ++batch_index)
	{
	auto* A = h_A[batch_index];
	@@ -383,7 +387,9 @@ inline void regular_to_packed(bool upper, const T* A, T* AP, rocblas_int n)
	template <typename U>
	inline void regular_to_packed(bool upper, U& h_A, U& h_AP, rocblas_int n)
	{
	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(rocblas_int batch_index = 0; batch_index < h_A.batch_count(); ++batch_index)
	{
	auto* AP = h_AP[batch_index];
	@@ -452,7 +458,9 @@ void make_unit_diagonal(rocblas_fill uplo, T& h_A)
	rocblas_int N = h_A.n();
	size_t lda = h_A.lda();

	+#ifdef _OPENMP
	#pragma omp parallel for
	+#endif
	for(rocblas_int batch_index = 0; batch_index < h_A.batch_count(); ++batch_index)
	{
	auto* A = h_A[batch_index];
	diff --git a/clients/samples/example_openmp.cpp b/clients/samples/example_openmp.cpp
	index f62dae64..8cc5f2ea 100644
	--- a/clients/samples/example_openmp.cpp
	+++ b/clients/samples/example_openmp.cpp
	@@ -1,5 +1,5 @@
	/* ************************************************************************
	- * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
	+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	@@ -42,7 +42,9 @@
	#include <cstdlib>
	#include <hip/hip_runtime.h>
	#include <iostream>
	+#ifdef _OPENMP
	#include <omp.h>
	+#endif
	#include <vector>

	#define NUM_THREADS 4
	@@ -100,7 +102,9 @@ int main()

	// 1st parallel rocblas routine call : scal x
	// spawn openmp threads
	+#ifdef _OPENMP
	#pragma omp parallel private(thread_id)
	+#endif
	{

	thread_id = omp_get_thread_num(); // thread_id from 0,...,NUM_THREADS-1
	@@ -118,7 +122,9 @@ int main()

	// 2nd parallel rocblas routine call : copy x to y
	// spawn openmp threads
	+#ifdef _OPENMP
	#pragma omp parallel private(thread_id)
	+#endif
	{

	thread_id = omp_get_thread_num(); // thread_id from 0,...,NUM_THREADS-1
	--
	2.25.1