Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save cgmb/971a24d2c8df70aefc6d35efd6f7c582 to your computer and use it in GitHub Desktop.
Make OpenMP optional in rocBLAS
From fd158ca247274b593ec59892385a1e66c96fb9a6 Mon Sep 17 00:00:00 2001
From: Cordell Bloor <Cordell.Bloor@amd.com>
Date: Thu, 13 Apr 2023 17:55:03 -0600
Subject: [PATCH] Guard use of OpenMP to make it optional
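This change allows the rocBLAS clients (tests, benchmarks and samples)
to compile and run even when OpenMP is not available, by wrapping the
OpenMP includes and pragmas in `#ifdef _OPENMP` guards.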
---
clients/common/blis_interface.cpp | 4 +-
clients/common/cblas_interface.cpp | 20 ++++++++++
clients/include/rocblas_init.hpp | 64 ++++++++++++++++++++++++++++++
clients/include/utility.hpp | 8 ++++
clients/samples/example_openmp.cpp | 8 +++-
5 files changed, 102 insertions(+), 2 deletions(-)
diff --git a/clients/common/blis_interface.cpp b/clients/common/blis_interface.cpp
index da7aef39..3327fc7d 100644
--- a/clients/common/blis_interface.cpp
+++ b/clients/common/blis_interface.cpp
@@ -1,5 +1,5 @@
/* ************************************************************************
- * Copyright (C) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -21,7 +21,9 @@
* ************************************************************************ */
#include "blis.h"
+#ifdef _OPENMP
#include "omp.h"
+#endif
void setup_blis()
{
diff --git a/clients/common/cblas_interface.cpp b/clients/common/cblas_interface.cpp
index b84831e0..2fe8a164 100644
--- a/clients/common/cblas_interface.cpp
+++ b/clients/common/cblas_interface.cpp
@@ -23,7 +23,9 @@
#include "rocblas_vector.hpp"
#include "utility.hpp"
#include <bitset>
+#ifdef _OPENMP
#include <omp.h>
+#endif
/*
* ===========================================================================
@@ -125,13 +127,17 @@ void cblas_scal(int64_t n, T alpha, U x, int64_t incx)
if(incx == 1)
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(int64_t i = 0; i < n; i++)
x[i] = alpha * x[i];
}
else
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(int64_t i = 0; i < n; i++)
x[i * incx] = alpha * x[i * incx];
}
@@ -484,7 +490,9 @@ void cblas_geam_helper(rocblas_operation transA,
rocblas_int inc1_B = transB == rocblas_operation_none ? 1 : ldb;
rocblas_int inc2_B = transB == rocblas_operation_none ? ldb : 1;
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(rocblas_int i = 0; i < M; i++)
{
for(rocblas_int j = 0; j < N; j++)
@@ -916,7 +924,9 @@ void cblas_geam_min_plus(rocblas_operation transA,
bool TRANSA = transA != rocblas_operation_none;
bool TRANSB = transB != rocblas_operation_none;
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(int n1 = 0; n1 < n; n1++)
{
for(int m1 = 0; m1 < m; m1++)
@@ -954,7 +964,9 @@ void cblas_geam_plus_min(rocblas_operation transA,
bool TRANSA = transA != rocblas_operation_none;
bool TRANSB = transB != rocblas_operation_none;
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(int n1 = 0; n1 < n; n1++)
{
for(int m1 = 0; m1 < m; m1++)
@@ -993,7 +1005,9 @@ void cblas_herkx(rocblas_fill uplo,
{
if(uplo == rocblas_fill_upper)
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(int j = 0; j < n; ++j)
{
for(int i = 0; i <= j; i++)
@@ -1015,7 +1029,9 @@ void cblas_herkx(rocblas_fill uplo,
}
else // lower
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(int j = 0; j < n; ++j)
{
for(int i = j; i < n; i++)
@@ -1040,7 +1056,9 @@ void cblas_herkx(rocblas_fill uplo,
{
if(uplo == rocblas_fill_upper)
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(int j = 0; j < n; ++j)
{
for(int i = 0; i <= j; i++)
@@ -1064,7 +1082,9 @@ void cblas_herkx(rocblas_fill uplo,
}
else // lower
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(int j = 0; j < n; ++j)
{
for(int i = j; i < n; i++)
diff --git a/clients/include/rocblas_init.hpp b/clients/include/rocblas_init.hpp
index d32ae57b..bf096c47 100644
--- a/clients/include/rocblas_init.hpp
+++ b/clients/include/rocblas_init.hpp
@@ -29,7 +29,9 @@
#include "rocblas_random.hpp"
#include <cinttypes>
#include <iostream>
+#ifdef _OPENMP
#include <omp.h>
+#endif
#include <vector>
//!
@@ -70,7 +72,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type,
if(matrix_type == rocblas_client_general_matrix)
{
for(size_t b = 0; b < batch_count; b++)
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < M; ++i)
for(size_t j = 0; j < N; ++j)
{
@@ -81,7 +85,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type,
else if(matrix_type == rocblas_client_triangular_matrix)
{
for(size_t b = 0; b < batch_count; b++)
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < M; ++i)
for(size_t j = 0; j < N; ++j)
{
@@ -107,7 +113,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type,
if(matrix_type == rocblas_client_general_matrix)
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < M; ++i)
for(size_t j = 0; j < N; ++j)
{
@@ -117,7 +125,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type,
}
else if(matrix_type == rocblas_client_triangular_matrix)
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < M; ++i)
for(size_t j = 0; j < N; ++j)
{
@@ -136,7 +146,9 @@ void rocblas_init_vector_alternating_sign(T rand_gen(), T* x, rocblas_int N, roc
if(incx < 0)
x -= (N - 1) * incx;
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(rocblas_int j = 0; j < N; ++j)
{
auto value = rand_gen();
@@ -162,7 +174,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
if(matrix_type == rocblas_client_general_matrix)
{
for(size_t b = 0; b < batch_count; b++)
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < M; ++i)
for(size_t j = 0; j < N; ++j)
A[i + j * lda + b * stride] = rand_gen();
@@ -170,7 +184,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
else if(matrix_type == rocblas_client_hermitian_matrix)
{
for(size_t b = 0; b < batch_count; ++b)
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < N; ++i)
for(size_t j = 0; j <= i; ++j)
{
@@ -197,7 +213,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
else if(matrix_type == rocblas_client_symmetric_matrix)
{
for(size_t b = 0; b < batch_count; ++b)
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < N; ++i)
for(size_t j = 0; j <= i; ++j)
{
@@ -224,7 +242,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
else if(matrix_type == rocblas_client_triangular_matrix)
{
for(size_t b = 0; b < batch_count; b++)
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < M; ++i)
for(size_t j = 0; j < N; ++j)
{
@@ -241,7 +261,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
else if(matrix_type == rocblas_client_diagonally_dominant_triangular_matrix)
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < M; ++i)
for(size_t j = 0; j < N; ++j)
{
@@ -255,7 +277,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
if(uplo == 'U') // rocblas_fill_upper
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(int i = 0; i < N; i++)
{
T abs_sum_off_diagonal_row
@@ -276,7 +300,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
}
else // rocblas_fill_lower
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(int j = 0; j < N; j++)
{
T abs_sum_off_diagonal_row
@@ -313,14 +339,18 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
auto lda = hA.lda();
if(matrix_type == rocblas_client_general_matrix)
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < M; ++i)
for(size_t j = 0; j < N; ++j)
A[i + j * lda] = rand_gen();
}
else if(matrix_type == rocblas_client_hermitian_matrix)
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < N; ++i)
for(size_t j = 0; j <= i; ++j)
{
@@ -346,7 +376,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
}
else if(matrix_type == rocblas_client_symmetric_matrix)
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < N; ++i)
for(size_t j = 0; j <= i; ++j)
{
@@ -372,7 +404,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
}
else if(matrix_type == rocblas_client_triangular_matrix)
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < M; ++i)
for(size_t j = 0; j < N; ++j)
{
@@ -389,7 +423,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
else if(matrix_type == rocblas_client_diagonally_dominant_triangular_matrix)
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < M; ++i)
for(size_t j = 0; j < N; ++j)
{
@@ -403,7 +439,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
if(uplo == 'U') // rocblas_fill_upper
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(int i = 0; i < N; i++)
{
T abs_sum_off_diagonal_row = T(
@@ -424,7 +462,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
}
else // rocblas_fill_lower
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(int j = 0; j < N; j++)
{
T abs_sum_off_diagonal_row = T(
@@ -457,7 +497,9 @@ void rocblas_init_vector(T rand_gen(), T* x, rocblas_int N, rocblas_stride incx)
if(incx < 0)
x -= (N - 1) * incx;
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(rocblas_int j = 0; j < N; ++j)
x[j * incx] = rand_gen();
}
@@ -480,7 +522,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
if(matrix_type == rocblas_client_general_matrix)
{
for(size_t b = 0; b < batch_count; b++)
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < M; ++i)
for(size_t j = 0; j < N; ++j)
A[i + j * lda + b * stride] = T(seedReset ? cos(i + j * lda + b * stride)
@@ -489,7 +533,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
else if(matrix_type == rocblas_client_hermitian_matrix)
{
for(size_t b = 0; b < batch_count; ++b)
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < N; ++i)
for(size_t j = 0; j <= i; ++j)
{
@@ -518,7 +564,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
else if(matrix_type == rocblas_client_symmetric_matrix)
{
for(size_t b = 0; b < batch_count; ++b)
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < N; ++i)
for(size_t j = 0; j <= i; ++j)
{
@@ -546,7 +594,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
else if(matrix_type == rocblas_client_triangular_matrix)
{
for(size_t b = 0; b < batch_count; b++)
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < M; ++i)
for(size_t j = 0; j < N; ++j)
{
@@ -577,14 +627,18 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
if(matrix_type == rocblas_client_general_matrix)
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < M; ++i)
for(size_t j = 0; j < N; ++j)
A[i + j * lda] = T(seedReset ? cos(i + j * lda) : sin(i + j * lda));
}
else if(matrix_type == rocblas_client_hermitian_matrix)
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < N; ++i)
for(size_t j = 0; j <= i; ++j)
{
@@ -611,7 +665,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
}
else if(matrix_type == rocblas_client_symmetric_matrix)
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < N; ++i)
for(size_t j = 0; j <= i; ++j)
{
@@ -637,7 +693,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
}
else if(matrix_type == rocblas_client_triangular_matrix)
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t i = 0; i < M; ++i)
for(size_t j = 0; j < N; ++j)
{
@@ -661,7 +719,9 @@ void rocblas_init_vector_trig(T* x, rocblas_int N, rocblas_stride incx, bool see
if(incx < 0)
x -= (N - 1) * incx;
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(rocblas_int j = 0; j < N; ++j)
x[j * incx] = T(seedReset ? cos(j * incx) : sin(j * incx));
}
@@ -937,7 +997,9 @@ void rocblas_copy_matrix(const T* A,
{
size_t stride_offset_a = i_batch * stridea;
size_t stride_offset_b = i_batch * strideb;
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t j = 0; j < N; ++j)
{
size_t offset_a = stride_offset_a + j * lda;
@@ -956,7 +1018,9 @@ void rocblas_copy_matrix(
for(size_t i_batch = 0; i_batch < batch_count; i_batch++)
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(size_t j = 0; j < N; ++j)
{
size_t offset_a = j * lda;
diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp
index 9db10c61..873494af 100644
--- a/clients/include/utility.hpp
+++ b/clients/include/utility.hpp
@@ -264,7 +264,9 @@ inline void regular_to_banded(bool upper, const T& h_A, T& h_AB, rocblas_int k)
size_t ldab = h_AB.lda();
rocblas_int n = h_AB.n();
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(rocblas_int batch_index = 0; batch_index < h_A.batch_count(); ++batch_index)
{
auto* A = h_A[batch_index];
@@ -328,7 +330,9 @@ inline void banded_matrix_setup(bool upper, T& h_A, rocblas_int k)
{
rocblas_int n = h_A.n();
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(rocblas_int batch_index = 0; batch_index < h_A.batch_count(); ++batch_index)
{
auto* A = h_A[batch_index];
@@ -383,7 +387,9 @@ inline void regular_to_packed(bool upper, const T* A, T* AP, rocblas_int n)
template <typename U>
inline void regular_to_packed(bool upper, U& h_A, U& h_AP, rocblas_int n)
{
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(rocblas_int batch_index = 0; batch_index < h_A.batch_count(); ++batch_index)
{
auto* AP = h_AP[batch_index];
@@ -452,7 +458,9 @@ void make_unit_diagonal(rocblas_fill uplo, T& h_A)
rocblas_int N = h_A.n();
size_t lda = h_A.lda();
+#ifdef _OPENMP
#pragma omp parallel for
+#endif
for(rocblas_int batch_index = 0; batch_index < h_A.batch_count(); ++batch_index)
{
auto* A = h_A[batch_index];
diff --git a/clients/samples/example_openmp.cpp b/clients/samples/example_openmp.cpp
index f62dae64..8cc5f2ea 100644
--- a/clients/samples/example_openmp.cpp
+++ b/clients/samples/example_openmp.cpp
@@ -1,5 +1,5 @@
/* ************************************************************************
- * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -42,7 +42,9 @@
#include <cstdlib>
#include <hip/hip_runtime.h>
#include <iostream>
+#ifdef _OPENMP
#include <omp.h>
+#endif
#include <vector>
#define NUM_THREADS 4
@@ -100,7 +102,9 @@ int main()
// 1st parallel rocblas routine call : scal x
// spawn openmp threads
+#ifdef _OPENMP
#pragma omp parallel private(thread_id)
+#endif
{
thread_id = omp_get_thread_num(); // thread_id from 0,...,NUM_THREADS-1
@@ -118,7 +122,9 @@ int main()
// 2nd parallel rocblas routine call : copy x to y
// spawn openmp threads
+#ifdef _OPENMP
#pragma omp parallel private(thread_id)
+#endif
{
thread_id = omp_get_thread_num(); // thread_id from 0,...,NUM_THREADS-1
--
2.25.1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment