Files
rocm-faster-whisper/patches/ct2_4.0.0_rocm.patch
scott 9dd6213831 Add ROCm build for faster-whisper with Wyoming endpoint
Multi-stage Dockerfile builds CTranslate2 v4.0.0 with HIP/ROCm support
targeting gfx1030/gfx1031 (RX 6000 series), then installs faster-whisper
and wyoming-faster-whisper on top for a Wyoming ASR server on port 10300.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-02 20:57:03 -04:00

1164 lines
42 KiB
Diff
Executable File

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1089106c..d47f504d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.7)
+cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
# Set policy for setting the MSVC runtime library for static MSVC builds
if(POLICY CMP0091)
@@ -42,12 +42,12 @@ else()
endif()
set(INTEL_ROOT ${INTEL_ROOT_DEFAULT} CACHE FILEPATH "Path to Intel root directory")
set(OPENMP_RUNTIME "INTEL" CACHE STRING "OpenMP runtime (INTEL, COMP, NONE)")
-
+set_property(CACHE OPENMP_RUNTIME PROPERTY STRINGS INTEL COMP NONE)
# Set Release build type by default to get sane performance.
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif(NOT CMAKE_BUILD_TYPE)
-
+set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS Debug Release RelWithDebInfo MinSizeRel)
# Set CXX flags.
set(CMAKE_CXX_STANDARD 17)
@@ -106,7 +106,7 @@ set(SOURCES
src/cpu/primitives.cc
src/decoding.cc
src/decoding_utils.cc
- src/devices.cc
+ #src/devices.cc
src/dtw.cc
src/encoder.cc
src/env.cc
@@ -120,13 +120,13 @@ set(SOURCES
src/layers/whisper.cc
src/logging.cc
src/models/language_model.cc
- src/models/model.cc
+ #src/models/model.cc
src/models/model_factory.cc
src/models/model_reader.cc
src/models/sequence_to_sequence.cc
src/models/transformer.cc
- src/models/wav2vec2.cc
- src/models/whisper.cc
+ #src/models/wav2vec2.cc
+ #src/models/whisper.cc
src/ops/activation.cc
src/ops/add.cc
src/ops/alibi_add.cc
@@ -184,11 +184,11 @@ set(SOURCES
src/random.cc
src/sampling.cc
src/scoring.cc
- src/storage_view.cc
+ #src/storage_view.cc
src/thread_pool.cc
src/translator.cc
- src/types.cc
- src/utils.cc
+ #src/types.cc
+ #src/utils.cc
src/vocabulary.cc
src/vocabulary_map.cc
)
@@ -416,10 +416,89 @@ if (WITH_RUY)
unset(CMAKE_POSITION_INDEPENDENT_CODE)
list(APPEND LIBRARIES ruy)
endif()
+set(GPU_RUNTIME "HIP" CACHE STRING "GPU_RUNTIME (HIP and CUDA)")
+set_property(CACHE GPU_RUNTIME PROPERTY STRINGS CUDA HIP)
+if(WIN32)
+ set(ROCM_ROOT "$ENV{HIP_PATH}" CACHE PATH "Root directory of the ROCm installation")
+else()
+ if (NOT DEFINED ROCM_PATH )
+ if (NOT DEFINED ENV{ROCM_PATH} )
+ set(ROCM_PATH "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
+ else()
+ set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Root directory of the ROCm installation")
+ endif()
+ endif()
+ set(CMAKE_MODULE_PATH ${ROCM_PATH}/lib/cmake)
+endif()
if (WITH_CUDA)
- find_package(CUDA 11.0 REQUIRED)
+ set(GPU_SOURCES
+ src/cuda/allocator.cc
+ src/cuda/primitives.cu
+ src/cuda/random.cu
+ src/cuda/utils.cc
+ src/ops/alibi_add_gpu.cu
+ src/ops/bias_add_gpu.cu
+ src/ops/concat_split_slide_gpu.cu
+ src/ops/conv1d_gpu.cu
+ src/ops/dequantize_gpu.cu
+ src/ops/gather_gpu.cu
+ src/ops/gumbel_max_gpu.cu
+ src/ops/layer_norm_gpu.cu
+ src/ops/mean_gpu.cu
+ src/ops/multinomial_gpu.cu
+ src/ops/rms_norm_gpu.cu
+ src/ops/rotary_gpu.cu
+ src/ops/softmax_gpu.cu
+ src/ops/tile_gpu.cu
+ src/ops/topk_gpu.cu
+ src/ops/topp_mask_gpu.cu
+ src/ops/quantize_gpu.cu
+ )
add_definitions(-DCT2_WITH_CUDA)
+ list(APPEND GPU_SOURCES
+ src/devices.cc
+ src/types.cc
+ src/storage_view.cc
+ src/utils.cc
+ src/models/model.cc
+ src/models/wav2vec2.cc
+ src/models/whisper.cc
+ )
+ if(GPU_RUNTIME STREQUAL "HIP")
+ enable_language(${GPU_RUNTIME})
+ set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
+ set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF)
+ set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON)
+ add_definitions(-D__HIP_PLATFORM_AMD__)
+ find_package(hipblas REQUIRED)
+ find_package(hip REQUIRED)
+
+ if(WITH_CUDNN)
+ find_package(miopen REQUIRED)
+ add_definitions(-DCT2_WITH_CUDNN)
+ endif()
+ # list(APPEND GPU_SOURCES src/devices.cc src/types.cc src/utils.cc src/models/model.cc src/models/whisper.cc)
+ add_library(${PROJECT_NAME} SHARED
+ ${SOURCES}
+ ${GPU_SOURCES}
+ )
+ set_source_files_properties(${GPU_SOURCES} PROPERTIES LANGUAGE ${GPU_RUNTIME})
+ set_source_files_properties(${SOURCES} PROPERTIES LANGUAGE CXX)
+ target_link_libraries(${PROJECT_NAME} PUBLIC roc::hipblas)
+ if(WITH_CUDNN)
+ # find_library(LIBRT NAMES "librt.so.1" PATHS /lib/x86_64-linux-gnu/ NO_DEFAULT_PATH )
+ find_package(miopen REQUIRED)
+ find_library(LIBRT rt)
+ if(LIBRT)
+ message(STATUS "Librt: " ${LIBRT})
+ target_link_libraries(${PROJECT_NAME} PUBLIC MIOpen ${LIBRT} )
+ else()
+ target_link_libraries(${PROJECT_NAME} PUBLIC MIOpen )
+ endif()
+ endif()
+ else()
+ find_package(CUDA 11.0 REQUIRED)
if(MSVC)
if(BUILD_SHARED_LIBS)
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler=/MD$<$<CONFIG:Debug>:d>")
@@ -501,31 +580,20 @@ if (WITH_CUDA)
set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE)
cuda_add_library(${PROJECT_NAME}
${SOURCES}
- src/cuda/allocator.cc
- src/cuda/primitives.cu
- src/cuda/random.cu
- src/cuda/utils.cc
- src/ops/alibi_add_gpu.cu
- src/ops/bias_add_gpu.cu
- src/ops/concat_split_slide_gpu.cu
- src/ops/conv1d_gpu.cu
- src/ops/dequantize_gpu.cu
- src/ops/gather_gpu.cu
- src/ops/gumbel_max_gpu.cu
- src/ops/layer_norm_gpu.cu
- src/ops/mean_gpu.cu
- src/ops/multinomial_gpu.cu
- src/ops/rms_norm_gpu.cu
- src/ops/rotary_gpu.cu
- src/ops/softmax_gpu.cu
- src/ops/tile_gpu.cu
- src/ops/topk_gpu.cu
- src/ops/topp_mask_gpu.cu
- src/ops/quantize_gpu.cu
+ ${GPU_SOURCES}
)
+ endif()
elseif(WITH_CUDNN)
message(FATAL_ERROR "WITH_CUDNN=ON requires WITH_CUDA=ON")
else()
+ list(APPEND SOURCES
+ src/devices.cc
+ src/types.cc
+ src/storage_view.cc
+ src/utils.cc
+ src/models/model.cc
+ src/models/wav2vec2.cc
+ src/models/whisper.cc)
add_library(${PROJECT_NAME} ${SOURCES})
endif()
diff --git a/cli/CMakeLists.txt b/cli/CMakeLists.txt
index 3311ad33..e1dfe246 100644
--- a/cli/CMakeLists.txt
+++ b/cli/CMakeLists.txt
@@ -10,9 +10,24 @@ add_executable(translator
target_include_directories(translator
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/cxxopts/include
)
-target_link_libraries(translator
- PRIVATE ${PROJECT_NAME}
-)
+if(WITH_CUDA)
+ if(GPU_RUNTIME STREQUAL "HIP")
+ find_package(hipblas REQUIRED)
+ find_package(miopen REQUIRED)
+ target_link_libraries(translator
+ PRIVATE ${PROJECT_NAME} MIOpen #roc::hipblas
+ )
+ else()
+ target_link_libraries(translator
+ PRIVATE ${PROJECT_NAME}
+ )
+ endif()
+
+else()
+ target_link_libraries(translator
+ PRIVATE ${PROJECT_NAME}
+ )
+endif()
set_target_properties(translator PROPERTIES OUTPUT_NAME ct2-translator)
diff --git a/src/cuda/allocator.cc b/src/cuda/allocator.cc
index 2311bd00..0ed42fd0 100644
--- a/src/cuda/allocator.cc
+++ b/src/cuda/allocator.cc
@@ -6,9 +6,13 @@
#include "ctranslate2/utils.h"
#include "cuda/utils.h"
#include "env.h"
-
-#include <cuda.h>
-#include <cub/util_allocator.cuh>
+#ifdef __HIP_PLATFORM_AMD__
+ #include <hip/hip_runtime.h>
+ #include <hipcub/util_allocator.hpp>
+#else
+ #include <cuda.h>
+ #include <cub/util_allocator.cuh>
+#endif
#include <spdlog/spdlog.h>
namespace ctranslate2 {
@@ -63,7 +67,7 @@ namespace ctranslate2 {
class CudaAsyncAllocator : public Allocator {
public:
void* allocate(size_t size, int device_index) override {
-#if CUDA_VERSION >= 11020
+#if (CUDA_VERSION >= 11020) || defined (__HIP_PLATFORM_AMD__)
int prev_device_index = -1;
if (device_index >= 0) {
CUDA_CHECK(cudaGetDevice(&prev_device_index));
@@ -86,7 +90,7 @@ namespace ctranslate2 {
}
void free(void* ptr, int device_index) override {
-#if CUDA_VERSION >= 11020
+#if (CUDA_VERSION >= 11020) || defined (__HIP_PLATFORM_AMD__)
int prev_device_index = -1;
if (device_index >= 0) {
CUDA_CHECK(cudaGetDevice(&prev_device_index));
@@ -107,12 +111,16 @@ namespace ctranslate2 {
};
static bool support_cuda_malloc_async() {
-#if CUDA_VERSION < 11020
+#if CUDA_VERSION < 11020 && !defined(__HIP_PLATFORM_AMD__)
return false;
#else
for (int i = 0; i < get_gpu_count(); ++i) {
int supported = 0;
+#ifdef __HIP_PLATFORM_AMD__
+ supported = 1;
+#else
cudaDeviceGetAttribute(&supported, cudaDevAttrMemoryPoolsSupported, i);
+#endif
if (!supported)
return false;
}
diff --git a/src/cuda/helpers.h b/src/cuda/helpers.h
index a34d5d89..62c54df1 100644
--- a/src/cuda/helpers.h
+++ b/src/cuda/helpers.h
@@ -2,21 +2,23 @@
#include <algorithm>
#include <limits>
-
-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
-
+#ifdef __HIP_PLATFORM_AMD__
+ #include<cuda2hip_macros.hpp>
+#else
+ #include <cuda_fp16.h>
+ #include <cuda_bf16.h>
+#endif
#include "ctranslate2/types.h"
#include "utils.h"
-#if !defined(__CUDACC__) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530
+#if !defined(__CUDACC__) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530 ||defined(__HIP_PLATFORM_AMD__)
# define CUDA_CAN_USE_HALF 1
#else
# define CUDA_CAN_USE_HALF 0
#endif
-#if defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+#if defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) || defined(__HIP_PLATFORM_AMD__)
# define CUDA_CAN_USE_BF16_MATH 1
#else
# define CUDA_CAN_USE_BF16_MATH 0
@@ -362,9 +364,11 @@ namespace ctranslate2 {
// The following kernels are adapted from:
// https://github.com/pytorch/pytorch/blob/40eff454ce5638fbff638a7f4502e29ffb9a2f0d/aten/src/ATen/native/cuda/SoftMax.cu
// They help define row-wise reduction where each block handles a single row.
-
-#define C10_WARP_SIZE 32
-
+#ifdef __HIP_PLATFORM_AMD__
+ #define C10_WARP_SIZE 64
+#else
+ #define C10_WARP_SIZE 32
+#endif
template <index_t ILP = 2>
inline dim3 get_block_size(index_t dim_size) {
index_t block_size = 1;
@@ -400,7 +404,9 @@ namespace ctranslate2 {
for (index_t i = 0; i < C10_WARP_SIZE; ++i) {
warpVal = r(warpVal, smem[lane * C10_WARP_SIZE + i]);
}
+#ifndef __HIP_PLATFORM_AMD__
__syncwarp(mask);
+#endif
smem[lane] = warpVal;
}
}
diff --git a/src/cuda/primitives.cu b/src/cuda/primitives.cu
index 149e10db..8fc2825f 100644
--- a/src/cuda/primitives.cu
+++ b/src/cuda/primitives.cu
@@ -1,7 +1,11 @@
#include "ctranslate2/primitives.h"
-
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
+#ifndef __HIP_PLATFORM_AMD__
+ #include <cuda_runtime.h>
+ #include <cublas_v2.h>
+#else
+ #include <hip/hip_runtime.h>
+ #include <hipblas/hipblas.h>
+#endif
#include <thrust/device_ptr.h>
#include "cuda/helpers.h"
@@ -240,7 +244,11 @@ namespace ctranslate2 {
};
template <typename T>
- __global__ void penalize_previous_tokens_kernel(T* scores,
+ __global__ void
+#ifdef __HIP_PLATFORM_AMD__
+ __launch_bounds__(64)
+#endif
+ penalize_previous_tokens_kernel(T* scores,
const T* previous_scores,
const int32_t* previous_ids,
float penalty,
@@ -265,7 +273,11 @@ namespace ctranslate2 {
dim_t batch_size,
dim_t length,
dim_t vocabulary_size) {
+#ifndef __HIP_PLATFORM_AMD__
dim3 block(32);
+#else
+ dim3 block(64);
+#endif
dim3 grid((batch_size * length + block.x - 1) / block.x);
penalize_previous_tokens_kernel<<<grid, block, 0, cuda::get_cuda_stream()>>>(
cuda::device_cast(scores),
@@ -478,12 +490,12 @@ namespace ctranslate2 {
const void* alpha_ptr = &alpha_h;
const void* beta_ptr = &beta_h;
- cudaDataType_t compute_type = CUDA_R_16F;
+ cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; //cudaDataType_t compute_type = CUDA_R_16F;
if (!cuda::use_true_fp16_gemm()) {
alpha_ptr = &alpha;
beta_ptr = &beta;
- compute_type = CUDA_R_32F;
+ compute_type = CUBLAS_COMPUTE_32F; //compute_type = CUDA_R_32F;
}
// cuBLAS assumes column-major storage, so swap a and b accordingly.
@@ -521,7 +533,7 @@ namespace ctranslate2 {
a, CUDA_R_16BF, lda,
&beta,
c, CUDA_R_16BF, ldc,
- CUDA_R_32F,
+ CUBLAS_COMPUTE_32F, //CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
@@ -549,7 +561,7 @@ namespace ctranslate2 {
a, CUDA_R_8I, lda,
&beta_i,
c, CUDA_R_32I, ldc,
- CUDA_R_32I,
+ CUBLAS_COMPUTE_32I, //CUDA_R_32I,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
@@ -591,12 +603,12 @@ namespace ctranslate2 {
const void* alpha_ptr = &alpha_h;
const void* beta_ptr = &beta_h;
- cudaDataType_t compute_type = CUDA_R_16F;
+ cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; //cudaDataType_t compute_type = CUDA_R_16F;
if (!cuda::use_true_fp16_gemm()) {
alpha_ptr = &alpha;
beta_ptr = &beta;
- compute_type = CUDA_R_32F;
+ compute_type = CUBLAS_COMPUTE_32F; //compute_type = CUDA_R_32F;
}
// cuBLAS assumes column-major storage, so swap a and b accordingly.
@@ -635,7 +647,7 @@ namespace ctranslate2 {
&beta,
c, CUDA_R_16BF, ldc, stridec,
batch_size,
- CUDA_R_32F,
+ CUBLAS_COMPUTE_32F, //CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
diff --git a/src/cuda/random.cu b/src/cuda/random.cu
index f016bb44..1d081303 100644
--- a/src/cuda/random.cu
+++ b/src/cuda/random.cu
@@ -22,7 +22,11 @@ namespace ctranslate2 {
ScopedCurandStates(size_t num_states)
: _allocator(get_allocator<Device::CUDA>())
{
+#ifdef __HIP_PLATFORM_AMD__
+ constexpr size_t num_init_threads = 64;
+#else
constexpr size_t num_init_threads = 32;
+#endif
const size_t blocks = ceil_divide(num_states, num_init_threads);
_num_states = blocks * num_init_threads;
_states = static_cast<curandState*>(_allocator.allocate(_num_states * sizeof (curandState)));
diff --git a/src/cuda/random.h b/src/cuda/random.h
index e12ae20f..20fe7436 100644
--- a/src/cuda/random.h
+++ b/src/cuda/random.h
@@ -1,7 +1,10 @@
#pragma once
-
-#include <curand_kernel.h>
-
+#ifdef __HIP_PLATFORM_AMD__
+ #include <hiprand/hiprand_kernel.h>
+ #include <cuda2hip_macros.hpp>
+#else
+ #include <curand_kernel.h>
+#endif
namespace ctranslate2 {
namespace cuda {
diff --git a/src/cuda/utils.h b/src/cuda/utils.h
index 29bc99a3..e14f1aab 100644
--- a/src/cuda/utils.h
+++ b/src/cuda/utils.h
@@ -2,12 +2,22 @@
#include <string>
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
+#ifdef __HIP_PLATFORM_AMD__
+ #include <hip/hip_runtime.h>
+ #include <hipblas/hipblas.h>
+ #include <cuda2hip_macros.hpp>
+#else
+ #include <cuda_runtime.h>
+ #include <cublas_v2.h>
+#endif
#include <thrust/execution_policy.h>
#ifdef CT2_WITH_CUDNN
-# include <cudnn.h>
+ #ifdef __HIP_PLATFORM_AMD__
+ #include <miopen/miopen.h>
+ #else
+ #include <cudnn.h>
+ #endif
#endif
#include "ctranslate2/types.h"
@@ -79,7 +89,10 @@ namespace ctranslate2 {
};
// Convenience macro to call Thrust functions with a default execution policy.
-#define THRUST_CALL(FUN, ...) FUN(thrust::cuda::par_nosync.on(ctranslate2::cuda::get_cuda_stream()), __VA_ARGS__)
-
+#ifdef __HIP_PLATFORM_AMD__
+ #define THRUST_CALL(FUN, ...) FUN(thrust::hip::par_nosync.on(ctranslate2::cuda::get_cuda_stream()), __VA_ARGS__)
+#else
+ #define THRUST_CALL(FUN, ...) FUN(thrust::cuda::par_nosync.on(ctranslate2::cuda::get_cuda_stream()), __VA_ARGS__)
+#endif
}
}
diff --git a/src/cuda2hip_macros.hpp b/src/cuda2hip_macros.hpp
new file mode 100644
index 00000000..48be99c6
--- /dev/null
+++ b/src/cuda2hip_macros.hpp
@@ -0,0 +1,138 @@
+#pragma once
+#ifdef __HIP_PLATFORM_AMD__
+ #include <hip/hip_runtime.h>
+ #include<hip/hip_bfloat16.h>
+ #define __nv_bfloat16 hip_bfloat16
+ #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
+ #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+ __device__ hip_bfloat16 hlog(const hip_bfloat16 h) {
+ return hip_bfloat16(__ocml_log_f32(float(h)));
+ }
+ __device__ hip_bfloat16 hsin(const hip_bfloat16 h) {
+ return hip_bfloat16(__ocml_sin_f32(float(h)));
+ }
+ __device__ hip_bfloat16 hcos(const hip_bfloat16 h) {
+ return hip_bfloat16(__ocml_cos_f32(float(h)));
+ }
+ __device__ hip_bfloat16 hexp(const hip_bfloat16 h) {
+ return hip_bfloat16(__ocml_exp_f32(float(h)));
+ }
+
+ __device__ hip_bfloat16 __habs(const hip_bfloat16 a) {
+ auto ret = a;
+ ret.data &= 0x7FFF;
+ return ret;
+ }
+ __device__ hip_bfloat16 __hmax(const hip_bfloat16 a, const hip_bfloat16 b) {
+ return hip_bfloat16(__ocml_fmax_f32(float(a), float(b)));
+ }
+ #define curandStatePhilox4_32_10_t hiprandStatePhilox4_32_10_t
+ #define cublasStatus_t hipblasStatus_t
+ #define cublasHandle_t hipblasHandle_t
+ #define curand_init hiprand_init
+ #define cudaDeviceProp hipDeviceProp_t
+ #define cudaDeviceSynchronize hipDeviceSynchronize
+ #define cudaErrorInsufficientDriver hipErrorInsufficientDriver
+ #define cudaErrorNoDevice hipErrorNoDevice
+ #define cudaError_t hipError_t
+ #define cudaEventCreate hipEventCreate
+ #define cudaEventElapsedTime hipEventElapsedTime
+ #define cudaEventRecord hipEventRecord
+ #define cudaEventSynchronize hipEventSynchronize
+ #define cudaEvent_t hipEvent_t
+ #define cudaFree hipFree
+ #define cudaGetDevice hipGetDevice
+ #define cudaGetDeviceCount hipGetDeviceCount
+ #define cudaGetDeviceProperties hipGetDeviceProperties
+ #define cudaGetErrorString hipGetErrorString
+ #define cudaGetLastError hipGetLastError
+ #define cudaLaunchKernelGGL hipLaunchKernelGGL
+ #define cudaMalloc hipMalloc
+ #define cudaMemcpy hipMemcpy
+ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+ #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+ #define cudaSuccess hipSuccess
+ #define cudaMallocHost hipHostMalloc
+ #define cudaStream_t hipStream_t
+ #define cudaStreamCreate hipStreamCreate
+ #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+ #define cudaStreamNonBlocking hipStreamNonBlocking
+ #define cudaStreamDestroy hipStreamDestroy
+ #define cudaSetDevice hipSetDevice
+ #define udaMemcpyToSymbol hipMemcpyToSymbol c
+ #define cudaMemcpyAsync hipMemcpyAsync
+ #define cudaFreeHost hipHostFree
+ #define cudaDeviceReset hipDeviceReset
+ #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+ #define cudaStreamSynchronize hipStreamSynchronize
+ #define cudaMallocAsync hipMallocAsync
+ #define cudaFreeAsync hipFreeAsync
+ #define cudaDeviceGetAttribute hipDeviceGetAttribute
+ #define cudaDevAttrMemoryPoolsSupported hipDeviceAttributeMemoryPoolsSupported
+ #define CUBLAS_OP_T HIPBLAS_OP_T
+ #define CUBLAS_OP_N HIPBLAS_OP_N
+ #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+ #define cublasSgemmStridedBatched hipblasSgemmStridedBatched
+ #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
+ #define cublasSgemm hipblasSgemm
+ #define cublasGemmEx hipblasGemmEx
+ #define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F //new added, HIPBLAS_R_16F deprecated
+ #define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F //new added
+ #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F //new added
+ #define CUBLAS_COMPUTE_32F_FAST_16BF HIPBLAS_COMPUTE_32F_FAST_16BF //new added
+ #define CUBLAS_COMPUTE_32I HIPBLAS_COMPUTE_32I //new added
+ #define CUDA_R_16F HIP_R_16F //HIPBLAS_R_16F deprecated
+ #define CUDA_R_32F HIP_R_32F //HIPBLAS_R_32F deprecated
+ #define CUDA_R_16B HIP_R_16BF //HIPBLAS_R_16B deprecated
+ #define CUDA_R_16BF HIP_R_16BF //HIPBLAS_R_16B deprecated
+ #define CUDA_R_32I HIP_R_32I //HIPBLAS_R_32I deprecated
+ #define CUDA_R_8I HIP_R_8I //HIPBLAS_R_8I deprecated
+ #define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
+ #define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
+ #define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
+ #define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
+ #define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
+ #define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
+ #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
+ #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
+ #define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN
+ #define cublasCreate hipblasCreate
+ #define cublasSetStream hipblasSetStream
+ #define cublasDestroy hipblasDestroy
+ #define cublasComputeType_t hipblasComputeType_t //hipblasDatatype_t & hipblasLtComputeType_t deprecated
+ #define cudaDataType_t hipDataType //hipblasDatatype_t deprecated, use hipDataType for rocm7
+ #define cub hipcub
+ #define cudaStreamDefault hipStreamDefault
+ #define curand_uniform hiprand_uniform
+//cudnn vs miopen
+ #define cudnnHandle_t miopenHandle_t
+ #define cudnnDataType_t miopenDataType_t
+ #define cudnnStatus_t miopenStatus_t
+ #define cudnnGetErrorString miopenGetErrorString
+ #define cudnnCreate miopenCreate
+ #define cudnnSetStream miopenSetStream
+ #define cudnnDestroy miopenDestroy
+ #define CUDNN_STATUS_SUCCESS miopenStatusSuccess
+ #define CUDNN_DATA_FLOAT miopenFloat
+ #define CUDNN_DATA_HALF miopenHalf
+ #define CUDNN_DATA_BFLOAT16 miopenBFloat16
+ #define CUDNN_DATA_BFLOAT16 miopenBFloat16
+ #define CUDNN_DATA_INT32 miopenInt32
+ #define CUDNN_DATA_INT8 miopenInt8
+ #define cudnnCreateTensorDescriptor miopenCreateTensorDescriptor
+ #define cudnnTensorDescriptor_t miopenTensorDescriptor_t
+ #define cudnnSetTensor4dDescriptor miopenSet4dTensorDescriptor
+ #define cudnnFilterDescriptor_t miopenTensorDescriptor_t
+ #define cudnnCreateFilterDescriptor miopenCreateTensorDescriptor
+ #define cudnnSetFilter4dDescriptor miopenSet4dTensorDescriptor
+ #define cudnnActivationDescriptor_t miopenActivationDescriptor_t
+ #define cudnnCreateActivationDescriptor miopenCreateActivationDescriptor
+ #define cudnnConvolutionDescriptor_t miopenConvolutionDescriptor_t
+ #define cudnnCreateConvolutionDescriptor miopenCreateConvolutionDescriptor
+ #define cudnnDestroyActivationDescriptor miopenDestroyActivationDescriptor
+ #define cudnnDestroyTensorDescriptor miopenDestroyTensorDescriptor
+ #define cudnnDestroyConvolutionDescriptor miopenDestroyConvolutionDescriptor
+ #define cudnnConvolutionBiasActivationForward miopenConvolutionBiasActivationForward
+ #define cudnnDestroyFilterDescriptor miopenDestroyTensorDescriptor
+#endif
+
diff --git a/src/ops/bias_add_gpu.cu b/src/ops/bias_add_gpu.cu
index 8f53bcf6..47222b02 100644
--- a/src/ops/bias_add_gpu.cu
+++ b/src/ops/bias_add_gpu.cu
@@ -7,9 +7,9 @@ namespace ctranslate2 {
namespace ops {
template <typename T, typename AddFunc, typename Epilogue>
- __global__ void bias_add_kernel(const T* value,
- const T* bias,
- T* output,
+ __global__ void bias_add_kernel(const T* __restrict__ value,
+ const T* __restrict__ bias,
+ T* __restrict__ output,
cuda::index_t depth,
const AddFunc& add_func,
const Epilogue& epilogue) {
diff --git a/src/ops/conv1d_gpu.cu b/src/ops/conv1d_gpu.cu
index 3b389358..be1fa170 100644
--- a/src/ops/conv1d_gpu.cu
+++ b/src/ops/conv1d_gpu.cu
@@ -30,40 +30,66 @@ namespace ctranslate2 {
cudnnTensorDescriptor_t input_desc;
CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc));
- CUDNN_CHECK(cudnnSetTensor4dDescriptor(input_desc, CUDNN_TENSOR_NCHW, data_type,
- batch_size, in_channels, 1, input_length));
+ CUDNN_CHECK(cudnnSetTensor4dDescriptor(input_desc,
+#ifndef __HIP_PLATFORM_AMD__
+ CUDNN_TENSOR_NCHW,
+#endif
+ data_type, batch_size, in_channels, 1, input_length));
cudnnTensorDescriptor_t output_desc;
CUDNN_CHECK(cudnnCreateTensorDescriptor(&output_desc));
- CUDNN_CHECK(cudnnSetTensor4dDescriptor(output_desc, CUDNN_TENSOR_NCHW, data_type,
- batch_size, out_channels, 1, output_length));
+ CUDNN_CHECK(cudnnSetTensor4dDescriptor(output_desc,
+#ifndef __HIP_PLATFORM_AMD__
+ CUDNN_TENSOR_NCHW,
+#endif
+ data_type, batch_size, out_channels, 1, output_length));
cudnnFilterDescriptor_t weight_desc;
CUDNN_CHECK(cudnnCreateFilterDescriptor(&weight_desc));
- CUDNN_CHECK(cudnnSetFilter4dDescriptor(weight_desc, data_type, CUDNN_TENSOR_NCHW,
- out_channels, in_channels, 1, kernel_size));
+ CUDNN_CHECK(cudnnSetFilter4dDescriptor(weight_desc, data_type,
+#ifndef __HIP_PLATFORM_AMD__
+ CUDNN_TENSOR_NCHW,
+#endif
+ out_channels, in_channels, 1, kernel_size));
cudnnConvolutionDescriptor_t conv_desc;
CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc));
+#ifndef __HIP_PLATFORM_AMD__
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
/*pad_h=*/0, /*pad_w=*/_padding,
/*stride_h=*/1, /*stride_w=*/_stride,
/*dilation_h=*/1, /*dilation_w=*/_dilation,
CUDNN_CROSS_CORRELATION,
CUDNN_DATA_FLOAT));
-
+#else
+ CUDNN_CHECK(miopenInitConvolutionDescriptor(conv_desc,
+ miopenConvolution,
+ /*pad_h=*/0, /*pad_w=*/_padding,
+ /*stride_h=*/1, /*stride_w=*/_stride,
+ /*dilation_h=*/1, /*dilation_w=*/_dilation) );
+#endif
+#ifndef __HIP_PLATFORM_AMD__
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, CUDNN_DEFAULT_MATH));
if (data_type == CUDNN_DATA_HALF)
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
-
+#endif
cudnnHandle_t handle = cuda::get_cudnn_handle();
+#ifndef __HIP_PLATFORM_AMD__
cudnnConvolutionFwdAlgo_t algo = (bias
? CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
: CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM);
+#else
+ miopenConvFwdAlgorithm_t algo = (bias
+ ? miopenConvolutionFwdAlgoGEMM
+ : miopenConvolutionFwdAlgoImplicitGEMM);
+#endif
+
size_t workspace_size = 0;
void* workspace = nullptr;
+#ifndef __HIP_PLATFORM_AMD__
+
CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle,
input_desc,
weight_desc,
@@ -74,23 +100,85 @@ namespace ctranslate2 {
if (workspace_size > 0)
workspace = get_allocator<Device::CUDA>().allocate(workspace_size);
-
+#else
+ std::size_t count;
+ CUDNN_CHECK(miopenConvolutionForwardGetSolutionCount(handle,
+ weight_desc,
+ input_desc,
+ conv_desc,
+ output_desc,
+ &count));
+ if(count <1){
+ std::cout<<"count: "<<count<<std::endl;
+ return;
+ }
+ auto solutions = std::vector<miopenConvSolution_t>(count);
+ CUDNN_CHECK(miopenConvolutionForwardGetSolution(handle,
+ weight_desc,
+ input_desc,
+ conv_desc,
+ output_desc,
+ count,
+ &count,
+ solutions.data()));
+ const miopenConvSolution_t* selected = &solutions.front();
+ CUDNN_CHECK(miopenConvolutionForwardGetSolutionWorkspaceSize(handle,
+ weight_desc,
+ input_desc,
+ conv_desc,
+ output_desc,
+ selected->solution_id,
+ &workspace_size));
+ if (workspace_size > 0){
+ workspace = get_allocator<Device::CUDA>().allocate(workspace_size);
+ }
+ CUDNN_CHECK(miopenConvolutionForwardCompileSolution(handle,
+ weight_desc,
+ input_desc,
+ conv_desc,
+ output_desc,
+ selected->solution_id));
+
+ CUDNN_CHECK(miopenConvolutionForwardImmediate(handle,
+ weight_desc,
+ weight.buffer(),
+ input_desc,
+ input.buffer(),
+ conv_desc,
+ output_desc,
+ output.buffer(),
+ workspace,
+ workspace_size,
+ selected->solution_id));
+#endif
float alpha = 1;
float beta = 0;
if (bias) {
cudnnTensorDescriptor_t bias_desc;
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc));
- CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc, CUDNN_TENSOR_NCHW, data_type,
- 1, out_channels, 1, 1));
+ CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc,
+#ifndef __HIP_PLATFORM_AMD__
+ CUDNN_TENSOR_NCHW,
+#endif
+ data_type, 1, out_channels, 1, 1));
+
cudnnActivationDescriptor_t activation_desc;
CUDNN_CHECK(cudnnCreateActivationDescriptor(&activation_desc));
+#ifndef __HIP_PLATFORM_AMD__
CUDNN_CHECK(cudnnSetActivationDescriptor(activation_desc,
CUDNN_ACTIVATION_IDENTITY,
CUDNN_NOT_PROPAGATE_NAN,
/*coef=*/0));
-
+#else
+ CUDNN_CHECK(miopenSetActivationDescriptor(activation_desc,
+ miopenActivationPASTHRU,
+ 0,
+ 0,
+ 0));
+#endif
+#ifndef __HIP_PLATFORM_AMD__
CUDNN_CHECK(cudnnConvolutionBiasActivationForward(handle,
&alpha,
input_desc,
@@ -109,11 +197,21 @@ namespace ctranslate2 {
activation_desc,
output_desc,
output.buffer()));
+#else
+ CUDNN_CHECK(miopenConvolutionForwardBias(handle,
+ &alpha,
+ bias_desc,
+ bias->buffer(),
+ &beta,
+ output_desc,
+ output.buffer()));
+#endif
CUDNN_CHECK(cudnnDestroyActivationDescriptor(activation_desc));
CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc));
} else {
+#ifndef __HIP_PLATFORM_AMD__
CUDNN_CHECK(cudnnConvolutionForward(handle,
&alpha,
input_desc,
@@ -127,6 +225,7 @@ namespace ctranslate2 {
&beta,
output_desc,
output.buffer()));
+#endif
}
if (workspace)
diff --git a/src/ops/dequantize_gpu.cu b/src/ops/dequantize_gpu.cu
index 241b3acd..99de40b9 100644
--- a/src/ops/dequantize_gpu.cu
+++ b/src/ops/dequantize_gpu.cu
@@ -4,6 +4,16 @@
namespace ctranslate2 {
namespace ops {
+#ifdef __HIP_PLATFORM_AMD__
+ template <typename InT, typename OutT>
+ struct dequantize_func {
+ __device__ __forceinline__
+ OutT operator()(float scale, InT x) const {
+ return OutT(__fdividef(float(x), scale));
+ }
+ };
+
+#else
template <typename InT, typename OutT>
struct dequantize_func {
@@ -12,7 +22,7 @@ namespace ctranslate2 {
return __fdividef(static_cast<float>(x), scale);
}
};
-
+#endif
template <Device D, typename InT, typename OutT>
void Dequantize::dequantize(const StorageView& input,
const StorageView& scale,
diff --git a/src/ops/gumbel_max_gpu.cu b/src/ops/gumbel_max_gpu.cu
index 160390c4..b50b5343 100644
--- a/src/ops/gumbel_max_gpu.cu
+++ b/src/ops/gumbel_max_gpu.cu
@@ -17,7 +17,11 @@ namespace ctranslate2 {
template <typename DataType, typename IndexType>
__device__ DataType operator()(DataType value, IndexType id) const {
const float z = -logf(curand_uniform(_states + id));
+#ifdef __HIP_PLATFORM_AMD__
+ return value + DataType(z);
+#else
return float(value) + z;
+#endif
}
private:
diff --git a/src/ops/layer_norm_gpu.cu b/src/ops/layer_norm_gpu.cu
index 8c644d87..7dfcae9a 100644
--- a/src/ops/layer_norm_gpu.cu
+++ b/src/ops/layer_norm_gpu.cu
@@ -139,14 +139,17 @@ namespace ctranslate2 {
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
-
-#include <cub/block/block_reduce.cuh>
-
+#ifndef __HIP_PLATFORM_AMD__
+ #include <cub/block/block_reduce.cuh>
+#else
+ #include<hipcub/hipcub.hpp>
+ #include <hipcub/block/block_reduce.hpp>
+#endif
namespace at {
namespace native {
template <typename T, typename SizeT>
- __global__ void LayerNormForwardCUDAKernel(SizeT N,
+ __global__ void __launch_bounds__(CUDA_NUM_THREADS) LayerNormForwardCUDAKernel(SizeT N,
float eps,
const T* X,
const T* gamma,
diff --git a/src/ops/mean_gpu.cu b/src/ops/mean_gpu.cu
index 5125924c..508d9706 100644
--- a/src/ops/mean_gpu.cu
+++ b/src/ops/mean_gpu.cu
@@ -1,7 +1,10 @@
#include "ctranslate2/ops/mean.h"
-
-#include <cub/block/block_reduce.cuh>
-
+#ifndef __HIP_PLATFORM_AMD__
+ #include <cub/block/block_reduce.cuh>
+#else
+#include <hipcub/hipcub.hpp>
+#include <hipcub/block/block_reduce.hpp>
+#endif
#include "type_dispatch.h"
#include "cuda/helpers.h"
@@ -11,7 +14,7 @@ namespace ctranslate2 {
constexpr dim_t num_threads = 256;
template <typename T, typename AccumT>
- __global__ void mean_kernel(const T* input,
+ __global__ void __launch_bounds__(num_threads) mean_kernel(const T* input,
const cuda::index_t outer_size,
const cuda::index_t axis_size,
const cuda::index_t inner_size,
diff --git a/src/ops/multinomial_gpu.cu b/src/ops/multinomial_gpu.cu
index 90f36377..e2425836 100644
--- a/src/ops/multinomial_gpu.cu
+++ b/src/ops/multinomial_gpu.cu
@@ -1,8 +1,12 @@
#include "ctranslate2/ops/multinomial.h"
-
-#include <cub/block/block_reduce.cuh>
-#include <cub/block/block_scan.cuh>
-
+#ifndef __HIP_PLATFORM_AMD__
+ #include <cub/block/block_reduce.cuh>
+ #include <cub/block/block_scan.cuh>
+#else
+ #include <hipcub/hipcub.hpp>
+ #include <hipcub/block/block_reduce.hpp>
+ #include <hipcub/block/block_scan.hpp>
+#endif
#include "cuda/helpers.h"
#include "cuda/random.h"
@@ -24,7 +28,7 @@ namespace ctranslate2 {
constexpr dim_t num_threads = 256;
template <typename In, typename Out>
- __global__ void multinomial_kernel(const In* probs,
+ __global__ void __launch_bounds__(num_threads) multinomial_kernel(const In* probs,
cuda::index_t class_size,
Out* output,
curandStatePhilox4_32_10_t* states) {
diff --git a/src/ops/rms_norm_gpu.cu b/src/ops/rms_norm_gpu.cu
index 41130cd9..f2734ede 100644
--- a/src/ops/rms_norm_gpu.cu
+++ b/src/ops/rms_norm_gpu.cu
@@ -1,7 +1,10 @@
#include "ctranslate2/ops/rms_norm.h"
-
-#include <cub/block/block_reduce.cuh>
-
+#ifndef __HIP_PLATFORM_AMD__
+ #include <cub/block/block_reduce.cuh>
+#else
+ #include <hipcub/hipcub.hpp>
+ #include <hipcub/block/block_reduce.hpp>
+#endif
#include "cuda/helpers.h"
#include "cuda/utils.h"
@@ -11,7 +14,7 @@ namespace ctranslate2 {
constexpr dim_t num_threads = 512;
template <typename T>
- __global__ void rms_norm_kernel(const T* input,
+ __global__ void __launch_bounds__(num_threads) rms_norm_kernel(const T* input,
const T* gamma,
T* output,
cuda::index_t depth,
diff --git a/src/ops/topk_gpu.cu b/src/ops/topk_gpu.cu
index ad010fb4..ffbe192e 100644
--- a/src/ops/topk_gpu.cu
+++ b/src/ops/topk_gpu.cu
@@ -115,9 +115,12 @@ namespace ctranslate2 {
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
-
-#include <cub/block/block_reduce.cuh>
-
+#ifndef __HIP_PLATFORM_AMD__
+ #include <cub/block/block_reduce.cuh>
+#else
+ #include <hipcub/hipcub.hpp>
+ #include <hipcub/block/block_reduce.hpp>
+#endif
namespace fastertransformer {
#define NOT_FOUND -1
diff --git a/src/ops/topp_mask_gpu.cu b/src/ops/topp_mask_gpu.cu
index a4e6cb6e..4d6661b0 100644
--- a/src/ops/topp_mask_gpu.cu
+++ b/src/ops/topp_mask_gpu.cu
@@ -1,6 +1,10 @@
#include "ctranslate2/ops/topp_mask.h"
-
-#include <cub/block/block_radix_sort.cuh>
+#ifndef __HIP_PLATFORM_AMD__
+ #include <cub/block/block_radix_sort.cuh>
+#else
+ #include <hipcub/hipcub.hpp>
+ #include <hipcub/block/block_radix_sort.hpp>
+#endif
#include "cuda/helpers.h"
@@ -10,7 +14,7 @@ namespace ctranslate2 {
constexpr dim_t num_threads = 256;
template <typename T, int ITEMS_PER_THREAD = 4>
- __global__ void topp_mask_kernel(const T* input,
+ __global__ void __launch_bounds__(num_threads) topp_mask_kernel(const T* input,
const T* probs,
T* output,
const float p,
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 283c49db..87906a7e 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -18,18 +18,45 @@ add_executable(ctranslate2_test
target_include_directories(ctranslate2_test PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/../src
)
-target_link_libraries(ctranslate2_test
- ${PROJECT_NAME}
- gtest_main
+if(WITH_CUDA)
+ if(GPU_RUNTIME STREQUAL "HIP")
+ find_package(hipblas REQUIRED)
+ find_package(miopen REQUIRED)
+ target_link_libraries(ctranslate2_test
+ ${PROJECT_NAME}
+ gtest_main # roc::hipblas
+ MIOpen
+ )
+ else()
+ target_link_libraries(ctranslate2_test
+ ${PROJECT_NAME}
+ gtest_main
+ )
+ endif()
+else()
+
+ target_link_libraries(ctranslate2_test
+ ${PROJECT_NAME}
+ gtest_main
)
+endif()
add_executable(benchmark_ops
benchmark_ops.cc
)
+target_include_directories(benchmark_ops PRIVATE
+ ${CMAKE_CURRENT_SOURCE_DIR}/../include
+ )
target_link_libraries(benchmark_ops
- ${PROJECT_NAME}
- )
+ ${PROJECT_NAME}
+ )
if(WITH_CUDA)
- target_link_libraries(benchmark_ops ${CUDA_LIBRARIES})
+ set_source_files_properties(benchmark_ops.cc PROPERTIES LANGUAGE ${GPU_RUNTIME})
+ if(GPU_RUNTIME STREQUAL "HIP")
+ find_package(hipblas REQUIRED)
+ target_link_libraries(benchmark_ops roc::hipblas)
+ else()
+ target_link_libraries(benchmark_ops ${CUDA_LIBRARIES})
+ endif()
endif()
diff --git a/tests/benchmark_utils.h b/tests/benchmark_utils.h
index 55ee77b0..c35da837 100644
--- a/tests/benchmark_utils.h
+++ b/tests/benchmark_utils.h
@@ -5,8 +5,13 @@
#include <vector>
#ifdef CT2_WITH_CUDA
-# include <cuda_runtime.h>
-# define SYNCHRONIZE cudaDeviceSynchronize()
+ #ifndef __HIP_PLATFORM_AMD__
+ #include <cuda_runtime.h>
+ #define SYNCHRONIZE cudaDeviceSynchronize()
+ #else
+ #include <hip/hip_runtime.h>
+ #define SYNCHRONIZE hipDeviceSynchronize()
+ #endif
#else
# define SYNCHRONIZE do {} while (false)
#endif