Created
April 25, 2023 16:05
-
-
Save cgmb/971a24d2c8df70aefc6d35efd6f7c582 to your computer and use it in GitHub Desktop.
Make OpenMP optional in rocBLAS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From fd158ca247274b593ec59892385a1e66c96fb9a6 Mon Sep 17 00:00:00 2001 | |
From: Cordell Bloor <Cordell.Bloor@amd.com> | |
Date: Thu, 13 Apr 2023 17:55:03 -0600 | |
Subject: [PATCH] Guard use of OpenMP to make it optional | |
This change allows the rocBLAS clients to compile and run even when OpenMP is | |
not available, by wrapping the OpenMP includes and pragmas in #ifdef _OPENMP | |
guards. | |
--- | |
clients/common/blis_interface.cpp | 4 +- | |
clients/common/cblas_interface.cpp | 20 ++++++++++ | |
clients/include/rocblas_init.hpp | 64 ++++++++++++++++++++++++++++++ | |
clients/include/utility.hpp | 8 ++++ | |
clients/samples/example_openmp.cpp | 8 +++- | |
5 files changed, 102 insertions(+), 2 deletions(-) | |
diff --git a/clients/common/blis_interface.cpp b/clients/common/blis_interface.cpp | |
index da7aef39..3327fc7d 100644 | |
--- a/clients/common/blis_interface.cpp | |
+++ b/clients/common/blis_interface.cpp | |
@@ -1,5 +1,5 @@ | |
/* ************************************************************************ | |
- * Copyright (C) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. | |
+ * Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. | |
* | |
* Permission is hereby granted, free of charge, to any person obtaining a copy | |
* of this software and associated documentation files (the "Software"), to deal | |
@@ -21,7 +21,9 @@ | |
* ************************************************************************ */ | |
#include "blis.h" | |
+#ifdef _OPENMP | |
#include "omp.h" | |
+#endif | |
void setup_blis() | |
{ | |
diff --git a/clients/common/cblas_interface.cpp b/clients/common/cblas_interface.cpp | |
index b84831e0..2fe8a164 100644 | |
--- a/clients/common/cblas_interface.cpp | |
+++ b/clients/common/cblas_interface.cpp | |
@@ -23,7 +23,9 @@ | |
#include "rocblas_vector.hpp" | |
#include "utility.hpp" | |
#include <bitset> | |
+#ifdef _OPENMP | |
#include <omp.h> | |
+#endif | |
/* | |
* =========================================================================== | |
@@ -125,13 +127,17 @@ void cblas_scal(int64_t n, T alpha, U x, int64_t incx) | |
if(incx == 1) | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(int64_t i = 0; i < n; i++) | |
x[i] = alpha * x[i]; | |
} | |
else | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(int64_t i = 0; i < n; i++) | |
x[i * incx] = alpha * x[i * incx]; | |
} | |
@@ -484,7 +490,9 @@ void cblas_geam_helper(rocblas_operation transA, | |
rocblas_int inc1_B = transB == rocblas_operation_none ? 1 : ldb; | |
rocblas_int inc2_B = transB == rocblas_operation_none ? ldb : 1; | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(rocblas_int i = 0; i < M; i++) | |
{ | |
for(rocblas_int j = 0; j < N; j++) | |
@@ -916,7 +924,9 @@ void cblas_geam_min_plus(rocblas_operation transA, | |
bool TRANSA = transA != rocblas_operation_none; | |
bool TRANSB = transB != rocblas_operation_none; | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(int n1 = 0; n1 < n; n1++) | |
{ | |
for(int m1 = 0; m1 < m; m1++) | |
@@ -954,7 +964,9 @@ void cblas_geam_plus_min(rocblas_operation transA, | |
bool TRANSA = transA != rocblas_operation_none; | |
bool TRANSB = transB != rocblas_operation_none; | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(int n1 = 0; n1 < n; n1++) | |
{ | |
for(int m1 = 0; m1 < m; m1++) | |
@@ -993,7 +1005,9 @@ void cblas_herkx(rocblas_fill uplo, | |
{ | |
if(uplo == rocblas_fill_upper) | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(int j = 0; j < n; ++j) | |
{ | |
for(int i = 0; i <= j; i++) | |
@@ -1015,7 +1029,9 @@ void cblas_herkx(rocblas_fill uplo, | |
} | |
else // lower | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(int j = 0; j < n; ++j) | |
{ | |
for(int i = j; i < n; i++) | |
@@ -1040,7 +1056,9 @@ void cblas_herkx(rocblas_fill uplo, | |
{ | |
if(uplo == rocblas_fill_upper) | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(int j = 0; j < n; ++j) | |
{ | |
for(int i = 0; i <= j; i++) | |
@@ -1064,7 +1082,9 @@ void cblas_herkx(rocblas_fill uplo, | |
} | |
else // lower | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(int j = 0; j < n; ++j) | |
{ | |
for(int i = j; i < n; i++) | |
diff --git a/clients/include/rocblas_init.hpp b/clients/include/rocblas_init.hpp | |
index d32ae57b..bf096c47 100644 | |
--- a/clients/include/rocblas_init.hpp | |
+++ b/clients/include/rocblas_init.hpp | |
@@ -29,7 +29,9 @@ | |
#include "rocblas_random.hpp" | |
#include <cinttypes> | |
#include <iostream> | |
+#ifdef _OPENMP | |
#include <omp.h> | |
+#endif | |
#include <vector> | |
//! | |
@@ -70,7 +72,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type, | |
if(matrix_type == rocblas_client_general_matrix) | |
{ | |
for(size_t b = 0; b < batch_count; b++) | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < M; ++i) | |
for(size_t j = 0; j < N; ++j) | |
{ | |
@@ -81,7 +85,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type, | |
else if(matrix_type == rocblas_client_triangular_matrix) | |
{ | |
for(size_t b = 0; b < batch_count; b++) | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < M; ++i) | |
for(size_t j = 0; j < N; ++j) | |
{ | |
@@ -107,7 +113,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type, | |
if(matrix_type == rocblas_client_general_matrix) | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < M; ++i) | |
for(size_t j = 0; j < N; ++j) | |
{ | |
@@ -117,7 +125,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type, | |
} | |
else if(matrix_type == rocblas_client_triangular_matrix) | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < M; ++i) | |
for(size_t j = 0; j < N; ++j) | |
{ | |
@@ -136,7 +146,9 @@ void rocblas_init_vector_alternating_sign(T rand_gen(), T* x, rocblas_int N, roc | |
if(incx < 0) | |
x -= (N - 1) * incx; | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(rocblas_int j = 0; j < N; ++j) | |
{ | |
auto value = rand_gen(); | |
@@ -162,7 +174,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type, | |
if(matrix_type == rocblas_client_general_matrix) | |
{ | |
for(size_t b = 0; b < batch_count; b++) | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < M; ++i) | |
for(size_t j = 0; j < N; ++j) | |
A[i + j * lda + b * stride] = rand_gen(); | |
@@ -170,7 +184,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type, | |
else if(matrix_type == rocblas_client_hermitian_matrix) | |
{ | |
for(size_t b = 0; b < batch_count; ++b) | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < N; ++i) | |
for(size_t j = 0; j <= i; ++j) | |
{ | |
@@ -197,7 +213,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type, | |
else if(matrix_type == rocblas_client_symmetric_matrix) | |
{ | |
for(size_t b = 0; b < batch_count; ++b) | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < N; ++i) | |
for(size_t j = 0; j <= i; ++j) | |
{ | |
@@ -224,7 +242,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type, | |
else if(matrix_type == rocblas_client_triangular_matrix) | |
{ | |
for(size_t b = 0; b < batch_count; b++) | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < M; ++i) | |
for(size_t j = 0; j < N; ++j) | |
{ | |
@@ -241,7 +261,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type, | |
else if(matrix_type == rocblas_client_diagonally_dominant_triangular_matrix) | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < M; ++i) | |
for(size_t j = 0; j < N; ++j) | |
{ | |
@@ -255,7 +277,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type, | |
if(uplo == 'U') // rocblas_fill_upper | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(int i = 0; i < N; i++) | |
{ | |
T abs_sum_off_diagonal_row | |
@@ -276,7 +300,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type, | |
} | |
else // rocblas_fill_lower | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(int j = 0; j < N; j++) | |
{ | |
T abs_sum_off_diagonal_row | |
@@ -313,14 +339,18 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type, | |
auto lda = hA.lda(); | |
if(matrix_type == rocblas_client_general_matrix) | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < M; ++i) | |
for(size_t j = 0; j < N; ++j) | |
A[i + j * lda] = rand_gen(); | |
} | |
else if(matrix_type == rocblas_client_hermitian_matrix) | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < N; ++i) | |
for(size_t j = 0; j <= i; ++j) | |
{ | |
@@ -346,7 +376,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type, | |
} | |
else if(matrix_type == rocblas_client_symmetric_matrix) | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < N; ++i) | |
for(size_t j = 0; j <= i; ++j) | |
{ | |
@@ -372,7 +404,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type, | |
} | |
else if(matrix_type == rocblas_client_triangular_matrix) | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < M; ++i) | |
for(size_t j = 0; j < N; ++j) | |
{ | |
@@ -389,7 +423,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type, | |
else if(matrix_type == rocblas_client_diagonally_dominant_triangular_matrix) | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < M; ++i) | |
for(size_t j = 0; j < N; ++j) | |
{ | |
@@ -403,7 +439,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type, | |
if(uplo == 'U') // rocblas_fill_upper | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(int i = 0; i < N; i++) | |
{ | |
T abs_sum_off_diagonal_row = T( | |
@@ -424,7 +462,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type, | |
} | |
else // rocblas_fill_lower | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(int j = 0; j < N; j++) | |
{ | |
T abs_sum_off_diagonal_row = T( | |
@@ -457,7 +497,9 @@ void rocblas_init_vector(T rand_gen(), T* x, rocblas_int N, rocblas_stride incx) | |
if(incx < 0) | |
x -= (N - 1) * incx; | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(rocblas_int j = 0; j < N; ++j) | |
x[j * incx] = rand_gen(); | |
} | |
@@ -480,7 +522,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type, | |
if(matrix_type == rocblas_client_general_matrix) | |
{ | |
for(size_t b = 0; b < batch_count; b++) | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < M; ++i) | |
for(size_t j = 0; j < N; ++j) | |
A[i + j * lda + b * stride] = T(seedReset ? cos(i + j * lda + b * stride) | |
@@ -489,7 +533,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type, | |
else if(matrix_type == rocblas_client_hermitian_matrix) | |
{ | |
for(size_t b = 0; b < batch_count; ++b) | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < N; ++i) | |
for(size_t j = 0; j <= i; ++j) | |
{ | |
@@ -518,7 +564,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type, | |
else if(matrix_type == rocblas_client_symmetric_matrix) | |
{ | |
for(size_t b = 0; b < batch_count; ++b) | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < N; ++i) | |
for(size_t j = 0; j <= i; ++j) | |
{ | |
@@ -546,7 +594,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type, | |
else if(matrix_type == rocblas_client_triangular_matrix) | |
{ | |
for(size_t b = 0; b < batch_count; b++) | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < M; ++i) | |
for(size_t j = 0; j < N; ++j) | |
{ | |
@@ -577,14 +627,18 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type, | |
if(matrix_type == rocblas_client_general_matrix) | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < M; ++i) | |
for(size_t j = 0; j < N; ++j) | |
A[i + j * lda] = T(seedReset ? cos(i + j * lda) : sin(i + j * lda)); | |
} | |
else if(matrix_type == rocblas_client_hermitian_matrix) | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < N; ++i) | |
for(size_t j = 0; j <= i; ++j) | |
{ | |
@@ -611,7 +665,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type, | |
} | |
else if(matrix_type == rocblas_client_symmetric_matrix) | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < N; ++i) | |
for(size_t j = 0; j <= i; ++j) | |
{ | |
@@ -637,7 +693,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type, | |
} | |
else if(matrix_type == rocblas_client_triangular_matrix) | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t i = 0; i < M; ++i) | |
for(size_t j = 0; j < N; ++j) | |
{ | |
@@ -661,7 +719,9 @@ void rocblas_init_vector_trig(T* x, rocblas_int N, rocblas_stride incx, bool see | |
if(incx < 0) | |
x -= (N - 1) * incx; | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(rocblas_int j = 0; j < N; ++j) | |
x[j * incx] = T(seedReset ? cos(j * incx) : sin(j * incx)); | |
} | |
@@ -937,7 +997,9 @@ void rocblas_copy_matrix(const T* A, | |
{ | |
size_t stride_offset_a = i_batch * stridea; | |
size_t stride_offset_b = i_batch * strideb; | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t j = 0; j < N; ++j) | |
{ | |
size_t offset_a = stride_offset_a + j * lda; | |
@@ -956,7 +1018,9 @@ void rocblas_copy_matrix( | |
for(size_t i_batch = 0; i_batch < batch_count; i_batch++) | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(size_t j = 0; j < N; ++j) | |
{ | |
size_t offset_a = j * lda; | |
diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp | |
index 9db10c61..873494af 100644 | |
--- a/clients/include/utility.hpp | |
+++ b/clients/include/utility.hpp | |
@@ -264,7 +264,9 @@ inline void regular_to_banded(bool upper, const T& h_A, T& h_AB, rocblas_int k) | |
size_t ldab = h_AB.lda(); | |
rocblas_int n = h_AB.n(); | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(rocblas_int batch_index = 0; batch_index < h_A.batch_count(); ++batch_index) | |
{ | |
auto* A = h_A[batch_index]; | |
@@ -328,7 +330,9 @@ inline void banded_matrix_setup(bool upper, T& h_A, rocblas_int k) | |
{ | |
rocblas_int n = h_A.n(); | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(rocblas_int batch_index = 0; batch_index < h_A.batch_count(); ++batch_index) | |
{ | |
auto* A = h_A[batch_index]; | |
@@ -383,7 +387,9 @@ inline void regular_to_packed(bool upper, const T* A, T* AP, rocblas_int n) | |
template <typename U> | |
inline void regular_to_packed(bool upper, U& h_A, U& h_AP, rocblas_int n) | |
{ | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(rocblas_int batch_index = 0; batch_index < h_A.batch_count(); ++batch_index) | |
{ | |
auto* AP = h_AP[batch_index]; | |
@@ -452,7 +458,9 @@ void make_unit_diagonal(rocblas_fill uplo, T& h_A) | |
rocblas_int N = h_A.n(); | |
size_t lda = h_A.lda(); | |
+#ifdef _OPENMP | |
#pragma omp parallel for | |
+#endif | |
for(rocblas_int batch_index = 0; batch_index < h_A.batch_count(); ++batch_index) | |
{ | |
auto* A = h_A[batch_index]; | |
diff --git a/clients/samples/example_openmp.cpp b/clients/samples/example_openmp.cpp | |
index f62dae64..8cc5f2ea 100644 | |
--- a/clients/samples/example_openmp.cpp | |
+++ b/clients/samples/example_openmp.cpp | |
@@ -1,5 +1,5 @@ | |
/* ************************************************************************ | |
- * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved. | |
+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved. | |
* | |
* Permission is hereby granted, free of charge, to any person obtaining a copy | |
* of this software and associated documentation files (the "Software"), to deal | |
@@ -42,7 +42,9 @@ | |
#include <cstdlib> | |
#include <hip/hip_runtime.h> | |
#include <iostream> | |
+#ifdef _OPENMP | |
#include <omp.h> | |
+#endif | |
#include <vector> | |
#define NUM_THREADS 4 | |
@@ -100,7 +102,9 @@ int main() | |
// 1st parallel rocblas routine call : scal x | |
// spawn openmp threads | |
+#ifdef _OPENMP | |
#pragma omp parallel private(thread_id) | |
+#endif | |
{ | |
thread_id = omp_get_thread_num(); // thread_id from 0,...,NUM_THREADS-1 | |
@@ -118,7 +122,9 @@ int main() | |
// 2nd parallel rocblas routine call : copy x to y | |
// spawn openmp threads | |
+#ifdef _OPENMP | |
#pragma omp parallel private(thread_id) | |
+#endif | |
{ | |
thread_id = omp_get_thread_num(); // thread_id from 0,...,NUM_THREADS-1 | |
-- | |
2.25.1 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment