diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh
index 085304ac7c97..5659b487f838 100644
--- a/.ci/docker/common/install_rocm.sh
+++ b/.ci/docker/common/install_rocm.sh
@@ -6,9 +6,6 @@
 ver() { printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' '); }
 
-# Map ROCm version to AMDGPU version
-declare -A AMDGPU_VERSIONS=( ["5.0"]="21.50" ["5.1.1"]="22.10.1" ["5.2"]="22.20" )
-
 install_ubuntu() {
   apt-get update
   if [[ $UBUNTU_VERSION == 18.04 ]]; then
@@ -26,31 +23,14 @@ install_ubuntu() {
   apt-get install -y libc++1
   apt-get install -y libc++abi1
 
-  if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then
-    # Add amdgpu repository
-    UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
-    local amdgpu_baseurl
-    if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
-      amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
-    else
-      amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/ubuntu"
-    fi
-    echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
-  fi
-
-  ROCM_REPO="ubuntu"
-  if [[ $(ver $ROCM_VERSION) -lt $(ver 4.2) ]]; then
-    ROCM_REPO="xenial"
-  fi
-
-  if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
-    ROCM_REPO="${UBUNTU_VERSION_NAME}"
-  fi
+  # Add amdgpu repository
+  UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
+  echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
 
   # Add rocm repository
   wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
   local rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
-  echo "deb [arch=amd64] ${rocm_baseurl} ${ROCM_REPO} main" > /etc/apt/sources.list.d/rocm.list
+  echo "deb [arch=amd64] ${rocm_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/rocm.list
   apt-get update --allow-insecure-repositories
 
   DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
@@ -68,29 +48,18 @@ install_ubuntu() {
   # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5
   # search for all unversioned packages
   # if search fails it will abort this script; use true to avoid case where search fails
-  if [[ $(ver $ROCM_VERSION) -ge $(ver 5.5) ]]; then
-    MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
-    if [[ "x${MIOPENHIPGFX}" = x ]]; then
-      echo "miopen-hip-gfx package not available" && exit 1
-    else
-      DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
-    fi
+  MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
+  if [[ "x${MIOPENHIPGFX}" = x ]]; then
+    echo "miopen-hip-gfx package not available" && exit 1
   else
-    MIOPENKERNELS=$(apt-cache search --names-only miopenkernels | awk '{print $1}' | grep -F -v . || true)
-    if [[ "x${MIOPENKERNELS}" = x ]]; then
-      echo "miopenkernels package not available" && exit 1
-    else
-      DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENKERNELS}
-    fi
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
   fi
 
   # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
-  if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then
-    for kdb in /opt/rocm/share/miopen/db/*.kdb
-    do
-      sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
-    done
-  fi
+  for kdb in /opt/rocm/share/miopen/db/*.kdb
+  do
+    sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
+  done
 
   # Cleanup
   apt-get autoclean && apt-get clean
@@ -107,25 +76,19 @@ install_centos() {
   yum install -y epel-release
   yum install -y dkms kernel-headers-`uname -r` kernel-devel-`uname -r`
 
-  if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then
-    # Add amdgpu repository
-    local amdgpu_baseurl
-    if [[ $OS_VERSION == 9 ]]; then
-      amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/rhel/9.0/main/x86_64"
-    else
-      if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
-        amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/7.9/main/x86_64"
-      else
-        amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/rhel/7.9/main/x86_64"
-      fi
-    fi
-    echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo
-    echo "name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo
-    echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo
-    echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo
-    echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo
-    echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo
+  # Add amdgpu repository
+  local amdgpu_baseurl
+  if [[ $OS_VERSION == 9 ]]; then
+    amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/9.0/main/x86_64"
+  else
+    amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/7.9/main/x86_64"
   fi
+  echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo
+  echo "name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo
+  echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo
+  echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo
+  echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo
+  echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo
 
   local rocm_baseurl="http://repo.radeon.com/rocm/yum/${ROCM_VERSION}"
   echo "[ROCm]" > /etc/yum.repos.d/rocm.repo
@@ -147,29 +110,18 @@ install_centos() {
 
   # precompiled miopen kernels; search for all unversioned packages
   # if search fails it will abort this script; use true to avoid case where search fails
-  if [[ $(ver $ROCM_VERSION) -ge $(ver 5.5) ]]; then
-    MIOPENHIPGFX=$(yum -q search miopen-hip-gfx | grep miopen-hip-gfx | awk '{print $1}'| grep -F kdb. || true)
-    if [[ "x${MIOPENHIPGFX}" = x ]]; then
-      echo "miopen-hip-gfx package not available" && exit 1
-    else
-      yum install -y ${MIOPENHIPGFX}
-    fi
+  MIOPENHIPGFX=$(yum -q search miopen-hip-gfx | grep miopen-hip-gfx | awk '{print $1}'| grep -F kdb. || true)
+  if [[ "x${MIOPENHIPGFX}" = x ]]; then
+    echo "miopen-hip-gfx package not available" && exit 1
   else
-    MIOPENKERNELS=$(yum -q search miopenkernels | grep miopenkernels- | awk '{print $1}'| grep -F kdb. || true)
-    if [[ "x${MIOPENKERNELS}" = x ]]; then
-      echo "miopenkernels package not available" && exit 1
-    else
-      yum install -y ${MIOPENKERNELS}
-    fi
+    yum install -y ${MIOPENHIPGFX}
   fi
 
   # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
-  if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then
-    for kdb in /opt/rocm/share/miopen/db/*.kdb
-    do
-      sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
-    done
-  fi
+  for kdb in /opt/rocm/share/miopen/db/*.kdb
+  do
+    sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
+  done
 
   # Cleanup
   yum clean all
diff --git a/RELEASE.md b/RELEASE.md
index 5f95a8409b76..50da4cf44192 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -48,14 +48,14 @@
 
 Following is the Release Compatibility Matrix for PyTorch releases:
 
-| PyTorch version | Python | Stable CUDA | Experimental CUDA |
-| --- | --- | --- | --- |
-| 2.3 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 |
-| 2.2 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 |
-| 2.1 | >=3.8, <=3.11 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 |
-| 2.0 | >=3.8, <=3.11 | CUDA 11.7, CUDNN 8.5.0.96 | CUDA 11.8, CUDNN 8.7.0.84 |
-| 1.13 | >=3.7, <=3.10 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 |
-| 1.12 | >=3.7, <=3.10 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 |
+| PyTorch version | Python | Stable CUDA | Experimental CUDA | Stable ROCm |
+| --- | --- | --- | --- | --- |
+| 2.3 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 6.0 |
+| 2.2 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.7 |
+| 2.1 | >=3.8, <=3.11 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.6 |
+| 2.0 | >=3.8, <=3.11 | CUDA 11.7, CUDNN 8.5.0.96 | CUDA 11.8, CUDNN 8.7.0.84 | ROCm 5.4 |
+| 1.13 | >=3.7, <=3.10 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 | ROCm 5.2 |
+| 1.12 | >=3.7, <=3.10 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 | ROCm 5.0 |
 
 ## Release Cadence
diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp
index bfe6a02741ed..2502456e285b 100644
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@@ -14,9 +14,7 @@
 #include
 
 #ifdef USE_ROCM
-#if ROCM_VERSION >= 60000
 #include
-#endif
 // until hipblas has an API to accept flags, we must use rocblas here
 #include
 #include
@@ -236,57 +234,6 @@ namespace at::cuda::blas {
     CUDABLAS_NONNEGINT_CHECK(bgemm, num_batches); \
   } while (0)
 
-#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
-
-#if defined(USE_ROCM) && ROCM_VERSION >= 50700 && ROCM_VERSION < 60000
-// only for rocm 5.7 where we first supported hipblaslt, it was difficult
-// to hipify correctly without this change.
-#define hipDataType hipblasDatatype_t
-#endif
-
-// hipblaslt custom types were a temporary work-around
-#if defined(USE_ROCM) && ROCM_VERSION >= 60000 && defined(HIPBLASLT_CUSTOM_DATA_TYPE)
-hipblasltDatatype_t hipToLt(hipDataType type) {
-  switch (type) {
-    case HIP_R_32F: return HIPBLASLT_R_32F;
-    case HIP_R_64F: return HIPBLASLT_R_64F;
-    case HIP_R_16F: return HIPBLASLT_R_16F;
-    case HIP_R_8I: return HIPBLASLT_R_8I;
-    case HIP_C_32F: return HIPBLASLT_C_32F;
-    case HIP_C_64F: return HIPBLASLT_C_64F;
-    case HIP_C_16F: return HIPBLASLT_C_16F;
-    case HIP_C_8I: return HIPBLASLT_C_8I;
-    case HIP_R_8U: return HIPBLASLT_R_8U;
-    case HIP_C_8U: return HIPBLASLT_C_8U;
-    case HIP_R_32I: return HIPBLASLT_R_32I;
-    case HIP_C_32I: return HIPBLASLT_C_32I;
-    case HIP_R_32U: return HIPBLASLT_R_32U;
-    case HIP_C_32U: return HIPBLASLT_C_32U;
-    case HIP_R_16BF: return HIPBLASLT_R_16B;
-    case HIP_C_16BF: return HIPBLASLT_C_16B;
-    default: TORCH_CHECK(false, "unknown hipDataType");
-  }
-}
-#define HIPTOLT(type) hipToLt(type)
-#else
-#define HIPTOLT(type) type
-#endif
-
-#if defined(USE_ROCM) && ROCM_VERSION >= 60000 && defined(HIPBLASLT_CUSTOM_COMPUTE_TYPE)
-hipblasLtComputeType_t hipblasToLt(hipblasComputeType_t type) {
-  switch (type) {
-    case HIPBLAS_COMPUTE_32F: return HIPBLASLT_COMPUTE_F32;
-    case HIPBLAS_COMPUTE_32F_FAST_16F: return HIPBLASLT_COMPUTE_F32_FAST_F16;
-    case HIPBLAS_COMPUTE_32F_FAST_TF32: return HIPBLASLT_COMPUTE_F32_FAST_XF32;
-    case HIPBLAS_COMPUTE_64F: return HIPBLASLT_COMPUTE_F64;
-    case HIPBLAS_COMPUTE_32I: return HIPBLASLT_COMPUTE_I32;
-    default: TORCH_CHECK(false, "unknown hipblasComputeType_t");
-  }
-}
-#define HIPCOMPTOLT(type) hipblasToLt(type)
-#else
-#define HIPCOMPTOLT(type) type
-#endif
 
 namespace {
 // Following the pattern of CuSparseDescriptor
@@ -325,7 +272,7 @@ class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor<
       cudaDataType_t scale_type) {
     cublasLtMatmulDesc_t raw_descriptor = nullptr;
     TORCH_CUDABLAS_CHECK(
-        cublasLtMatmulDescCreate(&raw_descriptor, HIPCOMPTOLT(compute_type), HIPTOLT(scale_type)));
+        cublasLtMatmulDescCreate(&raw_descriptor, compute_type, scale_type));
     descriptor_.reset(raw_descriptor);
   }
   template
@@ -346,7 +293,7 @@ class CuBlasLtMatrixLayout : public CuBlasLtDescriptor<
       bool t = false) {
     cublasLtMatrixLayout_t raw_descriptor = nullptr;
     TORCH_CUDABLAS_CHECK(
-        cublasLtMatrixLayoutCreate(&raw_descriptor, HIPTOLT(type), t ? cols : rows, t ? rows : cols, ld));
+        cublasLtMatrixLayoutCreate(&raw_descriptor, type, t ? cols : rows, t ? rows : cols, ld));
     descriptor_.reset(raw_descriptor);
   }
   template
@@ -371,11 +318,9 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<
 };
 } // namespace
 
-#endif
 
 template
 inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
-#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 60000)
   cudaDataType_t abcType = CUDA_R_32F;
   cublasComputeType_t computeType = CUBLAS_COMPUTE_32F;
   cudaDataType_t scaleType = CUDA_R_32F;
@@ -506,9 +451,6 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
       computeType,
       " scaleType ",
       scaleType);
-#else
-  AT_ERROR("at::cuda::blas::bgemm_internal_cublaslt: not implemented for ", typeid(Dtype).name());
-#endif
 }
 
@@ -632,7 +574,7 @@ void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16))
   const float fbeta = beta;
   _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc);
 
-#if defined(USE_ROCM) && ROCM_VERSION >= 60000
+#if defined(USE_ROCM)
   auto compute_type = CUBLAS_COMPUTE_32F;
 #else
   auto compute_type = CUDA_R_32F;
@@ -1018,7 +960,7 @@ void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
     cublas_flags = static_cast(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
   }
 #endif
-#if defined(USE_ROCM) && ROCM_VERSION >= 60000
+#if defined(USE_ROCM)
   auto compute_type = CUBLAS_COMPUTE_32F;
 #else
   auto compute_type = CUDA_R_32F;
@@ -1235,7 +1177,6 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
   }
 }
 
-#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
 
 template
 void gemm_and_bias(
@@ -1260,13 +1201,9 @@ void gemm_and_bias(
   cublasComputeType_t computeType = CUBLAS_COMPUTE_32F;
   cudaDataType_t scaleType = CUDA_R_32F;
   if constexpr (std::is_same_v) {
-#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 60000)
     abcType = CUDA_R_64F;
     computeType = CUBLAS_COMPUTE_64F;
     scaleType = CUDA_R_64F;
-#else
-    TORCH_CHECK(false, "gemm_and_bias is only supported for double type on ROCm 6.0 and above");
-#endif
   } else if constexpr (std::is_same_v) {
 #ifndef USE_ROCM
     if (at::globalContext().allowTF32CuBLAS()) {
@@ -1473,7 +1410,7 @@ void scaled_gemm(
     ScalarType result_dtype,
     void* amax_ptr,
     bool use_fast_accum) {
-#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && ROCM_VERSION >= 60000)
+#if CUDA_VERSION >= 11080 || defined(USE_ROCM)
   const auto computeType = CUBLAS_COMPUTE_32F;
   const auto scaleType = CUDA_R_32F;
   const int8_t fastAccuMode = use_fast_accum ? 1 : 0;
@@ -1537,13 +1474,13 @@ if (isFloat8Type(result_dtype)) {
           hipblaslt_ext::GemmType::HIPBLASLT_GEMM,
           _cublasOpFromChar(transa),
           _cublasOpFromChar(transb),
-          HIPTOLT(ScalarTypeToCudaDataType(mat1_dtype)),
-          HIPTOLT(ScalarTypeToCudaDataType(mat2_dtype)),
+          ScalarTypeToCudaDataType(mat1_dtype),
+          ScalarTypeToCudaDataType(mat2_dtype),
           // C is nullptr and beta=0, so set to something reasonable. See above.
-          //HIPTOLT(ScalarTypeToCudaDataType(bias_dtype)),
-          HIPTOLT(ScalarTypeToCudaDataType(result_dtype)),
-          HIPTOLT(ScalarTypeToCudaDataType(result_dtype)),
-          HIPCOMPTOLT(CUBLAS_COMPUTE_32F),
+          //ScalarTypeToCudaDataType(bias_dtype),
+          ScalarTypeToCudaDataType(result_dtype),
+          ScalarTypeToCudaDataType(result_dtype),
+          CUBLAS_COMPUTE_32F,
           all_algos));
       if (all_algos.size() == 0) {
         TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED);
@@ -1620,7 +1557,7 @@ if (isFloat8Type(result_dtype)) {
       " scaleType ",
       scaleType);
   return;
-#endif // CUDA_VERSION >= 11080 || (defined(USE_ROCM) && ROCM_VERSION >= 60000)
+#endif // CUDA_VERSION >= 11080 || defined(USE_ROCM)
   TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above");
 }
 
@@ -1636,7 +1573,6 @@ void int8_gemm(
     int64_t mat2_ld,
     int32_t* result_ptr,
     int64_t result_ld) {
-#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 60000)
   cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
   cudaDataType_t scaleType = CUDA_R_32I;
 
@@ -1741,18 +1677,7 @@ void int8_gemm(
       computeType,
       " scaleType ",
       scaleType);
-#else
-  TORCH_CHECK(false, "int8_gemm is only supported for ROCm 6.0 and above");
-#endif // !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 60000)
 }
-#endif // !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
-
-// ROCm 5.6 hipblas matches the const Dtype *A API, but prior hipblas does not.
-#if defined(USE_ROCM) && ROCM_VERSION < 50600
-#define ROCM_CONST_BUG_CAST(Type, Input) const_cast(reinterpret_cast(Input))
-#else
-#define ROCM_CONST_BUG_CAST(Type, Input) reinterpret_cast(Input)
-#endif
 
 template <>
 void trsm(CUDABLAS_TRSM_ARGTYPES(float)) {
@@ -1777,7 +1702,7 @@ void trsm>(CUDABLAS_TRSM_ARGTYPES(c10::complex)) {
       m,
       n,
       reinterpret_cast(alpha),
-      ROCM_CONST_BUG_CAST(cuComplex*, A),
+      reinterpret_cast(A),
       lda,
       reinterpret_cast(B),
       ldb));
@@ -1794,7 +1719,7 @@ void trsm>(CUDABLAS_TRSM_ARGTYPES(c10::complex)) {
       m,
      n,
       reinterpret_cast(alpha),
-      ROCM_CONST_BUG_CAST(cuDoubleComplex*, A),
+      reinterpret_cast(A),
       lda,
       reinterpret_cast(B),
       ldb));
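Every guard deleted in CUDABlas.cpp compares ROCM_VERSION against literals such as 50700 and 60000. For reference, a minimal standalone sketch of that encoding convention (the helper name is ours, not PyTorch's): the macro packs major, minor, and patch as major*10000 + minor*100 + patch, so ROCm 5.7 encodes to 50700 and ROCm 6.0 to 60000, which is exactly what the removed ">= 60000" checks tested.

    #include <cstdio>

    // Sketch only: reproduces the version packing the removed #if guards rely on.
    constexpr int encode_rocm_version(int major, int minor, int patch) {
      return major * 10000 + minor * 100 + patch;
    }

    static_assert(encode_rocm_version(5, 7, 0) == 50700, "the old hipblaslt gate");
    static_assert(encode_rocm_version(6, 0, 0) == 60000, "the old hipBLAS-v2 gate");

    int main() {
      std::printf("ROCm 6.1.2 encodes to %d\n", encode_rocm_version(6, 1, 2));
      return 0;
    }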
diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h
index 24aad7678ec4..d418dc53af38 100644
--- a/aten/src/ATen/cuda/CUDABlas.h
+++ b/aten/src/ATen/cuda/CUDABlas.h
@@ -82,7 +82,6 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half));
 template <>
 void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));
 
-#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
 enum GEMMAndBiasActivationEpilogue {
   None,
   RELU,
@@ -143,7 +142,6 @@ void scaled_gemm(
     ScalarType result_dtype,
     void* amax_ptr,
     bool use_fast_accum);
-#endif
 
 #define CUDABLAS_BGEMM_ARGTYPES(Dtype) \
   char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type alpha, \
@@ -190,18 +188,10 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half));
 template <>
 void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16));
 
-#if defined(USE_ROCM) && ROCM_VERSION <= 50500
-// ROCm 5.6 hipblas matches the const Dtype *A API, but prior hipblas does not.
-#define CUDABLAS_TRSM_ARGTYPES(Dtype)                                     \
-  hipblasHandle_t handle, hipblasSideMode_t side, hipblasFillMode_t uplo, \
-  hipblasOperation_t trans, hipblasDiagType_t diag, int m, int n,         \
-  const Dtype *alpha, Dtype *A, int lda, Dtype *B, int ldb
-#else
 #define CUDABLAS_TRSM_ARGTYPES(Dtype)                                  \
   cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, \
   cublasOperation_t trans, cublasDiagType_t diag, int m, int n,        \
   const Dtype *alpha, const Dtype *A, int lda, Dtype *B, int ldb
-#endif
 
 template
 inline void trsm(CUDABLAS_TRSM_ARGTYPES(Dtype)) {
diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h
index 60d09dfaee16..f2b657ced51b 100644
--- a/aten/src/ATen/cuda/CUDAContextLight.h
+++ b/aten/src/ATen/cuda/CUDAContextLight.h
@@ -9,15 +9,13 @@
 
 // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also
 // added bf16 support
-#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
 #include
-#endif
 
 #ifdef CUDART_VERSION
 #include
 #endif
 
-#if defined(USE_ROCM) && ROCM_VERSION >= 50300
+#if defined(USE_ROCM)
 #include
 #endif
 
@@ -82,13 +80,11 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator();
 /* Handles */
 TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle();
 TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();
-#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
 TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();
-#endif
 
 TORCH_CUDA_CPP_API void clearCublasWorkspaces();
 
-#if defined(CUDART_VERSION) || defined(USE_ROCM) && ROCM_VERSION >= 50300
+#if defined(CUDART_VERSION) || defined(USE_ROCM)
 TORCH_CUDA_CPP_API cusolverDnHandle_t getCurrentCUDASolverDnHandle();
 #endif
diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp
index 436408f88a51..01d3d513c4eb 100644
--- a/aten/src/ATen/cuda/CUDAGraph.cpp
+++ b/aten/src/ATen/cuda/CUDAGraph.cpp
@@ -17,16 +17,11 @@ static bool _cuda_graphs_debug = false;
 constexpr int kSynchronizeBusyWaitMillis = 10;
 
 MempoolId_t graph_pool_handle() {
-#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   // uuid count starts at 1. 0 is reserved to mean "wasn't set by graph_pool_handle".
   static std::atomic uid{1};
   // Sets just the second value, to distinguish it from MempoolId_ts created from
   // cudaStreamGetCaptureInfo id_s in capture_begin.
   return {0, uid++};
-#else
-  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 or ROCM >= 5.3")
-  return {0, 0};
-#endif
 }
 
@@ -84,9 +79,6 @@ int CUDAGraph::num_pending_event_queries() {
 CUDAGraph::CUDAGraph()
   // CUDAStreams may not be default-constructed.
   : capture_stream_(at::cuda::getCurrentCUDAStream()) {
-#if (defined(USE_ROCM) && ROCM_VERSION < 50300)
-  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 or ROCM >= 5.3");
-#endif
 }
 
 void CUDAGraph::register_generator_state(
@@ -102,7 +94,6 @@ void CUDAGraph::register_generator_state(const at::Generator& generator) {
 }
 
 void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capture_mode) {
-#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   TORCH_CHECK(!has_graph_exec_,
               "This CUDAGraph instance already owns a captured graph. "
               "To capture a new graph, create a new instance.");
@@ -171,13 +162,9 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt
   TORCH_INTERNAL_ASSERT(status == cudaStreamCaptureStatus::cudaStreamCaptureStatusActive);
 
   TORCH_INTERNAL_ASSERT(id_ > 0);
-#else
-  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 or ROCM >= 5.3")
-#endif
 }
 
 void CUDAGraph::capture_end() {
-#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   auto stream = at::cuda::getCurrentCUDAStream();
 
   TORCH_CHECK(stream == capture_stream_,
@@ -245,13 +232,9 @@ void CUDAGraph::capture_end() {
   } else {
     TORCH_WARN("DEBUG: TORCH_CUDAGRAPHS_DEBUG_PATH detected. graph_ will not be freed until debug_dump is called.");
   }
-#else
-  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 or ROCM >= 5.3")
-#endif
 }
 
 void CUDAGraph::replay() {
-#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   TORCH_CHECK(has_graph_exec_,
               "Called CUDAGraph::replay without a preceding successful capture.");
 
@@ -273,22 +256,14 @@ void CUDAGraph::replay() {
     // The bug is fixed in CUDA 11.4+.
     AT_CUDA_CHECK(cudaDeviceSynchronize());
   }
-#else
-  TORCH_CHECK(false, "CUDA graphs is not yet supported on ROCM");
-#endif
 }
 
 void CUDAGraph::enable_debug_mode() {
-#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   _cuda_graphs_debug = true;
-#else
-  TORCH_CHECK(false, "CUDA graphs is not yet supported on ROCM");
-#endif
-
 }
 
 void CUDAGraph::debug_dump(const std::string& debug_path) {
-#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11030)|| (defined(USE_ROCM) && ROCM_VERSION >= 50600)
+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11030)|| defined(USE_ROCM)
   if (_cuda_graphs_debug) {
     TORCH_WARN("DEBUG: calling debug_dump()");
     if (has_graph_) {
@@ -305,7 +280,6 @@ void CUDAGraph::debug_dump(const std::string& debug_path) {
 }
 
 void CUDAGraph::reset() {
-#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   // I'd prefer these checks throw exceptions, not print warnings,
   // but the destructor calls reset(), and at least one CI build
   // refuses to compile with a throwing destructor.
@@ -337,19 +311,12 @@ void CUDAGraph::reset() {
     C10_CUDA_CHECK_WARN(cudaGraphExecDestroy(graph_exec_));
     has_graph_exec_ = false;
   }
-#else
-  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 or ROCM >= 5.3")
-#endif
 }
 
 // Returns an id another graph's capture_begin can use to share the same memory pool as this graph.
 MempoolId_t CUDAGraph::pool() {
-#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   TORCH_CHECK(has_graph_exec_,
               "Called CUDAGraph::pool() without a preceding successful capture.");
-#else
-  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 or ROCM >= 5.3")
-#endif
   return mempool_id_;
 }
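With the ROCm gate gone, graph_pool_handle() returns the user-pool id convention unconditionally. A compilable sketch of that convention, assuming MempoolId_t is a pair of 64-bit ids as in c10/cuda/CUDAGraphsC10Utils.h (the function name here is illustrative, not PyTorch API):

    #include <atomic>
    #include <cstdint>
    #include <utility>

    // Sketch: user-created handles fill only the second slot; capture ids from
    // cudaStreamGetCaptureInfo fill the first. 0 means "wasn't set".
    using MempoolId = std::pair<uint64_t, uint64_t>;

    MempoolId make_user_pool_handle() {
      static std::atomic<uint64_t> uid{1};  // count starts at 1; 0 is reserved
      return {0, uid++};
    }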
diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h
index 3acdad18b0ee..9494855de8e6 100644
--- a/aten/src/ATen/cuda/CUDAGraph.h
+++ b/aten/src/ATen/cuda/CUDAGraph.h
@@ -39,10 +39,8 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
   void debug_dump(const std::string& debug_path);
 
  protected:
-#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   cudaGraph_t graph_ = NULL;
   cudaGraphExec_t graph_exec_ = NULL;
-#endif
 
   static std::atomic pending_event_queries;
diff --git a/aten/src/ATen/cuda/CUDAGraphsUtils.cuh b/aten/src/ATen/cuda/CUDAGraphsUtils.cuh
index f3e27a7b159a..d70fc52a3c33 100644
--- a/aten/src/ATen/cuda/CUDAGraphsUtils.cuh
+++ b/aten/src/ATen/cuda/CUDAGraphsUtils.cuh
@@ -19,16 +19,12 @@ using CaptureStatus = c10::cuda::CaptureStatus;
 
 // Use this version where you don't want to create a CUDA context if none exists.
 inline CaptureStatus currentStreamCaptureStatus() {
-#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   // don't create a context if we don't have to
   if (c10::cuda::hasPrimaryContext(c10::cuda::current_device())) {
     return c10::cuda::currentStreamCaptureStatusMayInitCtx();
   } else {
     return CaptureStatus::None;
   }
-#else
-  return CaptureStatus::None;
-#endif
 }
 
 inline void assertNotCapturing(std::string attempt) {
diff --git a/aten/src/ATen/cuda/CUDASparse.h b/aten/src/ATen/cuda/CUDASparse.h
index 1052469ea7d8..736fbe4ae50d 100644
--- a/aten/src/ATen/cuda/CUDASparse.h
+++ b/aten/src/ATen/cuda/CUDASparse.h
@@ -68,8 +68,7 @@
 #endif
 
 // BSR triangular solve functions were added in hipSPARSE 1.11.2 (ROCm 4.5.0)
-#if defined(CUDART_VERSION) || \
-    (defined(USE_ROCM) && ROCM_VERSION >= 40500 )
+#if defined(CUDART_VERSION) || defined(USE_ROCM)
 #define AT_USE_HIPSPARSE_TRIANGULAR_SOLVE() 1
 #else
 #define AT_USE_HIPSPARSE_TRIANGULAR_SOLVE() 0
diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.h b/aten/src/ATen/cuda/CUDASparseDescriptors.h
index 9e3d50f34e77..a0c2cd21344d 100644
--- a/aten/src/ATen/cuda/CUDASparseDescriptors.h
+++ b/aten/src/ATen/cuda/CUDASparseDescriptors.h
@@ -273,7 +273,6 @@ class TORCH_CUDA_CPP_API CuSparseSpSMDescriptor
 };
 #endif
 
-#if (defined(USE_ROCM) && ROCM_VERSION >= 50200) || !defined(USE_ROCM)
 class TORCH_CUDA_CPP_API CuSparseSpGEMMDescriptor
     : public CuSparseDescriptor {
  public:
@@ -283,7 +282,6 @@ class TORCH_CUDA_CPP_API CuSparseSpGEMMDescriptor
     descriptor_.reset(raw_descriptor);
   }
 };
-#endif
 
 #endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()
diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp
index 6a03803720b5..8eac525b3695 100644
--- a/aten/src/ATen/cuda/CublasHandlePool.cpp
+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp
@@ -29,7 +29,7 @@ namespace at::cuda {
 
 namespace {
 
-#if defined(USE_ROCM) && ROCM_VERSION >= 50700
+#if defined(USE_ROCM)
 void createCublasLtHandle(cublasLtHandle_t *handle) {
   TORCH_CUDABLAS_CHECK(cublasLtCreate(handle));
 }
@@ -191,7 +191,6 @@ cublasHandle_t getCurrentCUDABlasHandle() {
   return handle;
 }
 
-#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
 cublasLtHandle_t getCurrentCUDABlasLtHandle() {
 #ifdef USE_ROCM
   c10::DeviceIndex device = 0;
@@ -218,6 +217,5 @@ cublasLtHandle_t getCurrentCUDABlasLtHandle() {
   return reinterpret_cast(getCurrentCUDABlasHandle());
 #endif
 }
-#endif
 
 } // namespace at::cuda
diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs.cu b/aten/src/ATen/cuda/cub-RadixSortPairs.cu
index cc7c96930010..0eefb0824e59 100644
--- a/aten/src/ATen/cuda/cub-RadixSortPairs.cu
+++ b/aten/src/ATen/cuda/cub-RadixSortPairs.cu
@@ -80,10 +80,6 @@ AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, AT_INSTANTIATE_SORT_PAIRS_8)
 AT_INSTANTIATE_SORT_PAIRS(uint16_t, 8)
 AT_INSTANTIATE_SORT_PAIRS(uint32_t, 8)
 AT_INSTANTIATE_SORT_PAIRS(uint64_t, 8)
-
-// BFloat16 Radix sort is supported from ROCm 4.5 onwards
-#if !AT_ROCM_ENABLED() || (AT_ROCM_ENABLED() && ROCM_VERSION >= 40500)
 AT_INSTANTIATE_SORT_PAIRS(c10::BFloat16, 8)
-#endif
 
 } // namespace at::cuda::cub::detail
diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh
index 062c365a4e1a..c5261534dd4e 100644
--- a/aten/src/ATen/cuda/cub.cuh
+++ b/aten/src/ATen/cuda/cub.cuh
@@ -51,8 +51,7 @@
 #define ROCM_HIPCUB(x) x
 #endif
 
-#if (!defined(USE_ROCM) && !CUB_SUPPORTS_NV_BFLOAT16()) || \
-    (defined(USE_ROCM) && ROCM_VERSION >= 40500)
+#if (!defined(USE_ROCM) && !CUB_SUPPORTS_NV_BFLOAT16()) || defined(USE_ROCM)
 
 #if !defined(USE_ROCM)
 namespace at_cuda_detail {
@@ -110,7 +109,7 @@ struct cuda_type {
   using type = __nv_bfloat16;
 };
 
-#elif (defined(USE_ROCM) && ROCM_VERSION >= 40500)
+#elif defined(USE_ROCM)
 
 template<>
 struct cuda_type {
@@ -234,7 +233,7 @@ constexpr int max_cub_size = std::numeric_limits::max() / 2 + 1; // 2**30
 // so split at int_max/2
 template
 inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT scan_op, int64_t num_items) {
-#if defined(USE_ROCM) && (ROCM_VERSION >= 50000)
+#if defined(USE_ROCM)
   //For ROCm, use hipCUB chained iterators
   CUB_WRAPPER(NO_ROCM(detail)::hipcub::DeviceScan::InclusiveScan,
       input,
@@ -301,7 +300,7 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
 
 template
 inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT scan_op, InitValueT init_value, int64_t num_items) {
-#if defined(USE_ROCM) && (ROCM_VERSION >= 50000)
+#if defined(USE_ROCM)
   //For ROCm, use hipCUB chained iterators
   CUB_WRAPPER(NO_ROCM(detail)::hipcub::DeviceScan::ExclusiveScan,
      input,
diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
index d3b80af2e859..dca69b024894 100644
--- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
@@ -151,11 +151,7 @@ bool CUDAHooks::isPinnedPtr(const void* data) const {
     return false;
   }
 #endif
-#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
   return attr.type == cudaMemoryTypeHost;
-#else
-  return attr.memoryType == cudaMemoryTypeHost;
-#endif
 }
 
 bool CUDAHooks::hasCUDA() const {
@@ -177,7 +173,7 @@ bool CUDAHooks::hasCuDNN() const {
 bool CUDAHooks::hasCuSOLVER() const {
 #if defined(CUDART_VERSION) && defined(CUSOLVER_VERSION)
   return true;
-#elif AT_ROCM_ENABLED() && defined(ROCM_VERSION) && ROCM_VERSION >= 50300
+#elif AT_ROCM_ENABLED()
   return true;
 #else
   return false;
@@ -187,7 +183,7 @@ bool CUDAHooks::hasCuSOLVER() const {
 bool CUDAHooks::hasCuBLASLt() const {
 #if defined(CUDART_VERSION)
   return true;
-#elif AT_ROCM_ENABLED() && defined(ROCM_VERSION) && ROCM_VERSION >= 50700
+#elif AT_ROCM_ENABLED()
   return true;
 #else
   return false;
diff --git a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h
index da1483aee72c..b26c2415af7b 100644
--- a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h
+++ b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h
@@ -24,64 +24,6 @@
 
 namespace at::cuda::tunable {
 
-#ifdef HIPBLASLT_HAS_GETINDEXFROMALGO
-#define GETINDEXFROMALGO(algo) hipblaslt_ext::getIndexFromAlgo(algo)
-#else
-static int getIndexFromAlgo(hipblasLtMatmulAlgo_t& algo) {
-  int* algo_ptr = (int*)algo.data;
-  if(*algo_ptr < 0) {
-    return -1;
-  }
-  return *algo_ptr;
-}
-#define GETINDEXFROMALGO(algo) getIndexFromAlgo(algo)
-#endif
-
-#ifdef HIPBLASLT_CUSTOM_COMPUTE_TYPE
-#define COMPUTE_TYPE_32 HIPBLASLT_COMPUTE_F32
-#else
-#define COMPUTE_TYPE_32 HIPBLAS_COMPUTE_32F
-#endif
-
-#ifdef HIPBLASLT_CUSTOM_DATA_TYPE
-
-template
-constexpr hipblasltDatatype_t HipBlasDataTypeFor();
-
-template <>
-constexpr hipblasltDatatype_t HipBlasDataTypeFor() {
-  return HIPBLASLT_R_32F;
-}
-
-template <>
-constexpr hipblasltDatatype_t HipBlasDataTypeFor() {
-  return HIPBLASLT_R_16F;
-}
-
-template <>
-constexpr hipblasltDatatype_t HipBlasDataTypeFor() {
-  return HIPBLASLT_R_16B;
-}
-
-template <>
-constexpr hipblasltDatatype_t HipBlasDataTypeFor() {
-  return HIPBLASLT_R_64F;
-}
-
-template <>
-constexpr hipblasltDatatype_t HipBlasDataTypeFor() {
-  return HIPBLASLT_R_8F_E4M3;
-}
-
-template <>
-constexpr hipblasltDatatype_t HipBlasDataTypeFor() {
-  return HIPBLASLT_R_8F_E5M3;
-}
-
-#define DATA_TYPE_R_32 HIPBLASLT_R_32F
-
-#else
-
 template
 constexpr hipblasDatatype_t HipBlasDataTypeFor();
 
@@ -115,14 +57,6 @@ constexpr hipblasDatatype_t HipBlasDataTypeFor() {
   return HIP_R_8F_E5M2_FNUZ;
 }
 
-#ifdef HIPBLAS_V2
-#define DATA_TYPE_R_32 HIP_R_32F
-#else
-#define DATA_TYPE_R_32 HIPBLAS_R_32F
-#endif
-
-#endif
-
 template
 int GetBatchFromParams(const GemmParams* params) {
   return 1;
@@ -439,7 +373,7 @@ class HipblasltGemmOp : public Callable {
           mat_c, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_c, sizeof(stride_c)));
     }
 
-    HipBlasLtMatmulDescriptor matmul(COMPUTE_TYPE_32, DATA_TYPE_R_32);
+    HipBlasLtMatmulDescriptor matmul(HIPBLAS_COMPUTE_32F, HIP_R_32F);
     matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSA, opa);
     matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSB, opb);
 
@@ -543,7 +477,7 @@ auto GetHipBlasLtTypeStringAndOps() {
       b_datatype,
       in_out_datatype,
      in_out_datatype,
-      COMPUTE_TYPE_32,
+      HIPBLAS_COMPUTE_32F,
      heuristic_result));
   TORCH_HIPBLASLT_CHECK(hipblasLtDestroy(handle));
 
@@ -551,14 +485,14 @@ auto GetHipBlasLtTypeStringAndOps() {
   std::sort(heuristic_result.begin(),
       heuristic_result.end(),
       [](hipblasLtMatmulHeuristicResult_t& a, hipblasLtMatmulHeuristicResult_t& b) {
-        return GETINDEXFROMALGO(a.algo) < GETINDEXFROMALGO(b.algo);
+        return hipblaslt_ext::getIndexFromAlgo(a.algo) < hipblaslt_ext::getIndexFromAlgo(b.algo);
       });
 
   int returned_algo_count = heuristic_result.size();
   std::vector>>> ret;
   for (int i = 0; i < returned_algo_count; i++) {
     auto algo = heuristic_result[i].algo;
-    int algo_index = GETINDEXFROMALGO(algo);
+    int algo_index = hipblaslt_ext::getIndexFromAlgo(algo);
     auto callable = std::make_unique>(algo);
     std::string type_string = c10::str(
         "Gemm_Hipblaslt_", _charFromhipblasOp(transa_outer), _charFromhipblasOp(transb_outer), "_", algo_index);
@@ -584,8 +518,5 @@ auto GetHipBlasLtScaledGemmTypeStringAndOps() {
 }
 
 #undef TORCH_HIPBLASLT_CHECK
-#undef GETINDEXFROMALGO
-#undef COMPUTE_TYPE_32
-#undef DATA_TYPE_R_32
 
 } // namespace at::cuda::tunable
diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h
index 3b5e7e0903c8..1eaf251caad7 100644
--- a/aten/src/ATen/cuda/tunable/TunableGemm.h
+++ b/aten/src/ATen/cuda/tunable/TunableGemm.h
@@ -11,9 +11,7 @@
 
 #include
 #ifdef USE_ROCM
-#if ROCM_VERSION >= 50700
 #include
-#endif
 #include
 #endif
 #include
@@ -220,7 +218,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> {
   }
 #endif
 
-#if defined(USE_ROCM) && ROCM_VERSION >= 50700
+#if defined(USE_ROCM)
     static const char *env = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
     if (env == nullptr || strcmp(env, "1") == 0) {
       // disallow tuning of hipblaslt with c10::complex
@@ -294,7 +292,7 @@ class GemmStridedBatchedTunableOp : public TunableOp
   }
 #endif
 
-#if defined(USE_ROCM) && ROCM_VERSION >= 50700
+#if defined(USE_ROCM)
    static const char *env = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
    if (env == nullptr || strcmp(env, "1") == 0) {
      // disallow tuning of hipblaslt with c10::complex
@@ -334,7 +332,7 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer>
 
   auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators();
 
-#if defined(USE_ROCM) && ROCM_VERSION >= 50700
+#if defined(USE_ROCM)
   for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) {
     this->RegisterOp(std::move(name), std::move(op));
  }
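The hipblaslt paths in TunableGemm.h above are now gated only by an environment variable rather than a ROCm version check. A small standalone sketch of that "enabled unless explicitly disabled" test, using the same variable name as the diff (the helper function is ours):

    #include <cstdlib>
    #include <cstring>

    // Mirrors the check in TunableGemm.h: hipblaslt tuning runs when the
    // variable is unset or set to "1"; any other value disables it.
    bool hipblaslt_tuning_enabled() {
      const char* env = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
      return env == nullptr || std::strcmp(env, "1") == 0;
    }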
diff --git a/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h
index ebd3e9f353b7..856e65a675f8 100644
--- a/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h
+++ b/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h
@@ -104,10 +104,6 @@ struct HIPGuardImplMasqueradingAsCUDA final : public c10::impl::DeviceGuardImplI
     int deviceCnt;
     hipError_t _err;
     _err = hipGetDeviceCount(&deviceCnt);
-#if defined(USE_ROCM) && (ROCM_VERSION < 50201)
-    if(_err == hipErrorInvalidDevice)
-        return 0;
-#endif
    if(_err != hipErrorNoDevice && _err != hipSuccess)
        C10_HIP_CHECK(_err);
    return deviceCnt;
diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp
index df6f47091642..9e76aad45f64 100644
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@@ -157,7 +157,6 @@ enum class Activation {
   GELU,
 };
 
-#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
 cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activation a) {
   switch (a) {
     case Activation::None:
@@ -171,7 +170,6 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa
       return cuda::blas::GEMMAndBiasActivationEpilogue::None;
   }
 }
-#endif
 
 static bool getDisableAddmmCudaLt() {
     static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT");
@@ -236,7 +234,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
   at::ScalarType scalar_type = self.scalar_type();
   c10::MaybeOwned self_;
   if (&result != &self) {
-#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || (defined(USE_ROCM) && (ROCM_VERSION >= 50700))
+#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM)
     // Strangely, if mat2 has only 1 row or column, we get
     // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic.
     // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]
@@ -283,7 +281,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
     }
     self__sizes = self_->sizes();
   } else {
-#if defined(USE_ROCM) && ROCM_VERSION >= 50700
+#if defined(USE_ROCM)
     useLtInterface = !disable_addmm_cuda_lt &&
         result.dim() == 2 && result.is_contiguous() &&
         isSupportedHipLtROCmArch(self.device().index()) &&
@@ -334,7 +332,6 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
 
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj());
 
-#if !defined(USE_ROCM) || (defined(USE_ROCM) && (ROCM_VERSION >= 50700))
   if (useLtInterface) {
 #if defined(USE_ROCM)
     AT_DISPATCH_FLOATING_TYPES_AND2(
@@ -398,7 +395,6 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
         });
 #endif
   } else
-#endif
   {
     AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
         at::ScalarType::Half,
@@ -770,7 +766,7 @@ Tensor& _int_mm_out_cuda(const Tensor& self, const Tensor& mat2, Tensor& result)
 
   TORCH_CHECK(result.is_contiguous(), "Expected result to be contiguous.");
 
-#if (!defined(USE_ROCM) && defined(CUDA_VERSION) && (CUDA_VERSION >= 11070)) || (defined(USE_ROCM) && (ROCM_VERSION >= 60000))
+#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11070)) || defined(USE_ROCM)
   cublasCommonArgs args(self, mat2, result);
 
   at::cuda::blas::int8_gemm(
@@ -910,7 +906,6 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
   at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]});
   at::native::resize_output(amax, {});
 
-#if !defined(USE_ROCM) || (defined(USE_ROCM) && (ROCM_VERSION >= 60000))
   cublasCommonArgs args(mat1, mat2, out);
   const auto out_dtype_ = args.result->scalar_type();
   TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt");
@@ -1016,11 +1011,8 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
         amax.data_ptr(),
         use_fast_accum);
   }
-#else
-  TORCH_CHECK(false, "_scaled_mm_out_cuda is not compiled for this platform.");
-#endif
 
-#if defined(USE_ROCM) && ROCM_VERSION >= 60000
+#if defined(USE_ROCM)
   // rocm's hipblaslt does not yet support amax, so calculate separately
   amax = at::max(at::abs(out.to(kFloat)));
 #endif
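The ROCm-only tail of _scaled_mm_out_cuda keeps computing amax separately because hipblaslt does not return it. The same fallback written as a standalone ATen helper (the function name is ours; it assumes out was already produced by the scaled GEMM and requires a libtorch build):

    #include <ATen/ATen.h>

    // Same expression as in the diff: absolute max of the result, reduced
    // in float so the fp8/half output cannot overflow during the reduction.
    at::Tensor amax_fallback(const at::Tensor& out) {
      return at::max(at::abs(out.to(at::kFloat)));
    }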
diff --git a/aten/src/ATen/native/cuda/SortStable.cu b/aten/src/ATen/native/cuda/SortStable.cu
index b80f6fdb4ea4..9e572a227fcd 100644
--- a/aten/src/ATen/native/cuda/SortStable.cu
+++ b/aten/src/ATen/native/cuda/SortStable.cu
@@ -233,18 +233,6 @@ void launch_stable_sort_kernel(
   TORCH_CHECK(nbatch > 0, "Cannot sort dimension of length ", nsort);
   int64_t* indices_ptr = indices.mutable_data_ptr();
 
-#if (defined(USE_ROCM) && ROCM_VERSION < 40500)
-  constexpr bool is_rocm_bf16_sort_unsupported = true;
-#else
-  constexpr bool is_rocm_bf16_sort_unsupported = false;
-#endif
-
-  if constexpr (is_rocm_bf16_sort_unsupported) {
-    if (self.scalar_type() == kBFloat16) {
-      TORCH_CHECK(false, "BFloat16 is not supported on ROCm < 4.5");
-    }
-  }
-
   AT_DISPATCH_ALL_TYPES_AND3(
       kBool, kHalf, kBFloat16, self.scalar_type(), "sort", [&] {
         const scalar_t* self_ptr = self.const_data_ptr();
diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp
index 06b095af4f66..643501f0cbcc 100644
--- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp
+++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp
@@ -328,7 +328,7 @@ inline static void svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Te
   // gesvd just knows how to handle m >= n, so in the other case we need to transpose A
   const auto not_A_H = A.size(-2) >= A.size(-1);
   Tensor Vcopy = V; // Shallow copy
-#ifdef ROCM_VERSION
+#ifdef USE_ROCM
   // Similar to the case in svd_magma(), experiments have shown Vh tensor is
   // not guaranteed to be column major on ROCM, we have to create a copy to
   // deal with this
@@ -347,7 +347,7 @@ inline static void svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Te
         infos,
         full_matrices,
         compute_uv, calculate_all_batches, batches);
   });
-#ifdef ROCM_VERSION
+#ifdef USE_ROCM
   if (!not_A_H) {
     V.copy_(Vcopy);
   }
@@ -661,7 +661,7 @@ void svd_cusolver(const Tensor& A,
   static const char* check_svd_doc = "Check doc at https://pytorch.org/docs/stable/generated/torch.linalg.svd.html";
 
   // The default heuristic is to use gesvdj driver
-#ifdef ROCM_VERSION
+#ifdef USE_ROCM
   const auto driver_v = c10::string_view("gesvdj");
 #else
   const auto driver_v = driver.value_or("gesvdj");
diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h
index 2cf2004e8e72..cca2e04941a5 100644
--- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h
+++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h
@@ -8,7 +8,7 @@
 #include
 #include
 
-#if (defined(CUDART_VERSION) && defined(CUSOLVER_VERSION)) || (defined(USE_ROCM) && ROCM_VERSION >= 50300)
+#if (defined(CUDART_VERSION) && defined(CUSOLVER_VERSION)) || defined(USE_ROCM)
 #define USE_LINALG_SOLVER
 #endif
diff --git a/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp b/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp
index 717cd1873f5f..7b068b5f5aec 100644
--- a/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp
+++ b/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp
@@ -4,7 +4,7 @@
 #include
 #include
 
-#if defined(CUDART_VERSION) || defined(ROCM_VERSION) && ROCM_VERSION >= 50300
+#if defined(CUDART_VERSION) || defined(USE_ROCM)
 
 namespace at::cuda::solver {
diff --git a/aten/src/ATen/native/cuda/linalg/CUDASolver.h b/aten/src/ATen/native/cuda/linalg/CUDASolver.h
index f628b6ba329d..b8901d1d6f5d 100644
--- a/aten/src/ATen/native/cuda/linalg/CUDASolver.h
+++ b/aten/src/ATen/native/cuda/linalg/CUDASolver.h
@@ -7,7 +7,7 @@
 #define USE_CUSOLVER_64_BIT
 #endif
 
-#if defined(CUDART_VERSION) || defined(ROCM_VERSION) && ROCM_VERSION >= 50300
+#if defined(CUDART_VERSION) || defined(USE_ROCM)
 
 namespace at {
 namespace cuda {
diff --git a/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp b/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp
index 3016897c66c5..e2027139ad31 100644
--- a/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp
+++ b/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp
@@ -1,7 +1,7 @@
 #include
 #include
 
-#if defined(CUDART_VERSION) || defined(ROCM_VERSION) && ROCM_VERSION >= 50300
+#if defined(CUDART_VERSION) || defined(USE_ROCM)
 
 namespace at::cuda {
 namespace {
diff --git a/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp b/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp
index ab61aed6fe96..e43df8e048e8 100644
--- a/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp
+++ b/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp
@@ -692,13 +692,6 @@ void spgemm(
     const Scalar& beta,
     const Scalar& alpha,
     const at::sparse_csr::SparseCsrTensor& C) {
-#if defined(USE_ROCM) && ROCM_VERSION < 50200
-  TORCH_CHECK(
-      false,
-      "Calling addmm with sparse GPU tensors requires compiling ",
-      "PyTorch with ROCm 5.2+. ",
-      "Please use PyTorch built with newer ROCm version.");
-#else
   // older versions of cusparse on Windows segfault for complex128 dtype
 #if defined(_WIN32) && defined(CUSPARSE_VERSION) && CUSPARSE_VERSION < 11400
   TORCH_CHECK(
@@ -834,7 +827,6 @@ void spgemm(
         CUSPARSE_SPGEMM_DEFAULT,
         spgemm_desc.descriptor()));
   });
-#endif
 }
 
 } // anonymous namespace
diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cpp b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cpp
index 02518574753f..f3aabc63e2a2 100644
--- a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cpp
+++ b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cpp
@@ -19,7 +19,7 @@
 #define IS_SPMM_AVAILABLE() 0
 #endif
 
-#if defined(USE_ROCM) && ROCM_VERSION >= 50200
+#if defined(USE_ROCM)
 #define IS_SPMM_HIP_AVAILABLE() 1
 #else
 #define IS_SPMM_HIP_AVAILABLE() 0
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index c472e82ce2f1..8af2c41dfab7 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -778,12 +778,9 @@ struct MempoolIdHash {
 };
 
 cudaError_t cudaMallocMaybeCapturing(void** p, size_t size) {
-#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   if (at::cuda::currentStreamCaptureStatusMayInitCtx() ==
       at::cuda::CaptureStatus::None) {
-#endif
     return C10_CUDA_ERROR_HANDLED(cudaMalloc(p, size));
-#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   } else {
     // It's ok to capture cudaMallocs, as long as we never cudaFree those
     // addresses before replay.
@@ -792,7 +789,6 @@ cudaError_t cudaMallocMaybeCapturing(void** p, size_t size) {
     at::cuda::CUDAStreamCaptureModeGuard g{cudaStreamCaptureModeRelaxed};
     return C10_CUDA_ERROR_HANDLED(cudaMalloc(p, size));
   }
-#endif
 }
 
 } // anonymous namespace
@@ -2183,7 +2179,6 @@ class DeviceCachingAllocator {
   }
 
   BlockPool& get_pool(size_t size, cudaStream_t stream) {
-#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
     // captures_underway is a conservative guess that the current stream may be
     // capturing. It's only non-empty if some thread has begun and not yet ended
     // a capture, so it's usually 0, and we can short-circuit
@@ -2201,7 +2196,6 @@ class DeviceCachingAllocator {
       }
     }
   }
-#endif
     if (size <= kSmallSize) {
       return small_blocks;
     } else {
diff --git a/c10/cuda/CUDAGraphsC10Utils.h b/c10/cuda/CUDAGraphsC10Utils.h
index 79a36d80cead..57d10d694cee 100644
--- a/c10/cuda/CUDAGraphsC10Utils.h
+++ b/c10/cuda/CUDAGraphsC10Utils.h
@@ -17,7 +17,6 @@ using MempoolId_t = std::pair;
 
 // RAII guard for "cudaStreamCaptureMode", a thread-local value
 // that controls the error-checking strictness of a capture.
-#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
 struct C10_CUDA_API CUDAStreamCaptureModeGuard {
   CUDAStreamCaptureModeGuard(cudaStreamCaptureMode desired)
       : strictness_(desired) {
@@ -30,9 +29,7 @@ struct C10_CUDA_API CUDAStreamCaptureModeGuard {
  private:
   cudaStreamCaptureMode strictness_;
 };
-#endif
 
-#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
 // Protects against enum cudaStreamCaptureStatus implementation changes.
 // Some compilers seem not to like static_assert without the messages.
 static_assert(
@@ -44,16 +41,11 @@ static_assert(
 static_assert(
     int(cudaStreamCaptureStatus::cudaStreamCaptureStatusInvalidated) == 2,
     "unexpected int(cudaStreamCaptureStatusInvalidated) value");
-#endif
 
 enum class CaptureStatus : int {
-#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   None = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusNone),
   Active = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusActive),
   Invalidated = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusInvalidated)
-#else
-  None = 0
-#endif
 };
 
 inline std::ostream& operator<<(std::ostream& os, CaptureStatus status) {
@@ -61,14 +53,12 @@ inline std::ostream& operator<<(std::ostream& os, CaptureStatus status) {
     case CaptureStatus::None:
       os << "cudaStreamCaptureStatusNone";
      break;
-#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
     case CaptureStatus::Active:
       os << "cudaStreamCaptureStatusActive";
       break;
     case CaptureStatus::Invalidated:
       os << "cudaStreamCaptureStatusInvalidated";
       break;
-#endif
     default:
       TORCH_INTERNAL_ASSERT(
           false, "Unknown CUDA graph CaptureStatus", int(status));
@@ -78,14 +68,10 @@ inline std::ostream& operator<<(std::ostream& os, CaptureStatus status) {
 
 // Use this version where you're sure a CUDA context exists already.
 inline CaptureStatus currentStreamCaptureStatusMayInitCtx() {
-#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   cudaStreamCaptureStatus is_capturing{cudaStreamCaptureStatusNone};
   C10_CUDA_CHECK(
       cudaStreamIsCapturing(c10::cuda::getCurrentCUDAStream(), &is_capturing));
   return CaptureStatus(is_capturing);
-#else
-  return CaptureStatus::None;
-#endif
 }
 
 } // namespace c10::cuda
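After this change, CaptureStatus and its stream operator behave identically on CUDA and ROCm builds. A hypothetical usage sketch (assumes a CUDA or ROCm build of PyTorch with a visible device; note the call may initialize a context, per the comment above the function):

    #include <c10/cuda/CUDAGraphsC10Utils.h>
    #include <iostream>

    int main() {
      auto status = c10::cuda::currentStreamCaptureStatusMayInitCtx();
      std::cout << "current stream capture status: " << status << '\n';
      return 0;
    }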
-#if defined(__ANDROID__) || defined(__APPLE__) || defined(__FreeBSD__) || \ - (defined(USE_ROCM) && ROCM_VERSION < 40100) +#if defined(__ANDROID__) || defined(__APPLE__) || defined(__FreeBSD__) // Those platforms do not support assert() #define CUDA_KERNEL_ASSERT(cond) #define SYCL_KERNEL_ASSERT(cond) diff --git a/c10/test/util/exception_test.cpp b/c10/test/util/exception_test.cpp index 0fc7abe982ee..c260e96961ad 100644 --- a/c10/test/util/exception_test.cpp +++ b/c10/test/util/exception_test.cpp @@ -34,8 +34,7 @@ TEST(ExceptionTest, TORCH_INTERNAL_ASSERT_DEBUG_ONLY) { } // On these platforms there's no assert -#if !defined(__ANDROID__) && !defined(__APPLE__) && \ - !(defined(USE_ROCM) && ROCM_VERSION < 40100) +#if !defined(__ANDROID__) && !defined(__APPLE__) TEST(ExceptionTest, CUDA_KERNEL_ASSERT) { // This function always throws even in NDEBUG mode ASSERT_DEATH_IF_SUPPORTED({ CUDA_KERNEL_ASSERT(false); }, "Assert"); diff --git a/caffe2/core/common_gpu.cc b/caffe2/core/common_gpu.cc index ad01c4af02fb..e5a26359d3f2 100644 --- a/caffe2/core/common_gpu.cc +++ b/caffe2/core/common_gpu.cc @@ -70,7 +70,7 @@ int GetGPUIDForPointer(const void* ptr) { // Otherwise, there must be no error CUDA_ENFORCE(err); - if (attr.CAFFE2_CUDA_PTRATTR_MEMTYPE == cudaMemoryTypeHost) { + if (attr.type == cudaMemoryTypeHost) { return -1; } diff --git a/caffe2/core/common_gpu.h b/caffe2/core/common_gpu.h index bbf25a513521..011f46264b19 100644 --- a/caffe2/core/common_gpu.h +++ b/caffe2/core/common_gpu.h @@ -86,12 +86,6 @@ namespace caffe2 { class TensorCoreEngine {}; #endif // USE_ROCM -#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) -#define CAFFE2_CUDA_PTRATTR_MEMTYPE type -#else -#define CAFFE2_CUDA_PTRATTR_MEMTYPE memoryType -#endif - /** * A runtime function to report the cuda version that Caffe2 is built with. */ diff --git a/caffe2/core/context_gpu_test.cc b/caffe2/core/context_gpu_test.cc index aba8193e59a1..9eb92b429ef0 100644 --- a/caffe2/core/context_gpu_test.cc +++ b/caffe2/core/context_gpu_test.cc @@ -48,7 +48,7 @@ TEST(CUDAContextTest, MemoryPoolAllocateDealloc) { EXPECT_NE(allocated, nullptr); cudaPointerAttributes attr; CUDA_ENFORCE(cudaPointerGetAttributes(&attr, allocated.get())); - EXPECT_EQ(attr.CAFFE2_CUDA_PTRATTR_MEMTYPE, cudaMemoryTypeDevice); + EXPECT_EQ(attr.type, cudaMemoryTypeDevice); EXPECT_EQ(attr.device, i); void* prev_allocated = allocated.get(); allocated.clear(); diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu index 98f090ab416b..e6dfbf85039f 100644 --- a/caffe2/utils/math_gpu.cu +++ b/caffe2/utils/math_gpu.cu @@ -620,11 +620,6 @@ CAFFE2_CUDA_EXPORT void Gemm( // It has more general hipblasGemmEx API which is more close to cublasGemmEx. 
// hipblasGemmEx does D = alpha*op( A )*op( B ) + beta*C, // whereas cublasSgemmEx does C = alpha*op( A )*op( B ) + beta*C -#if ROCM_VERSION >= 60000 && defined(HIPBLAS_V2) - auto compute_type = HIPBLAS_COMPUTE_32F; -#else - auto compute_type = HIPBLAS_R_32F; -#endif HIPBLAS_ENFORCE(hipblasGemmEx( context->hipblas_handle(), cu_trans_B, @@ -643,7 +638,7 @@ CAFFE2_CUDA_EXPORT void Gemm( C, HIPBLAS_R_16F, N, - compute_type, + HIPBLAS_COMPUTE_32F, HIPBLAS_GEMM_DEFAULT)); #else CUBLAS_ENFORCE(cublasSgemmEx( @@ -861,7 +856,7 @@ CAFFE2_CUDA_EXPORT void GemmBatched( thrust::device_vector C_device(C, C + batch_size); CUBLAS_ENFORCE(cublasSetPointerMode( context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); -#if defined(USE_ROCM) && ROCM_VERSION >= 60000 && defined(HIPBLAS_V2) +#if defined(USE_ROCM) auto compute_type = HIPBLAS_COMPUTE_32F; #else auto compute_type = CUDA_R_32F; @@ -957,7 +952,7 @@ CAFFE2_CUDA_EXPORT void GemmStridedBatched( if (math_type == TensorProto_DataType_FLOAT) { CUBLAS_ENFORCE(cublasSetPointerMode( context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); -#if defined(USE_ROCM) && ROCM_VERSION >= 60000 && defined(HIPBLAS_V2) +#if defined(USE_ROCM) auto compute_type = HIPBLAS_COMPUTE_32F; #else auto compute_type = CUDA_R_32F; @@ -1076,11 +1071,6 @@ CAFFE2_CUDA_EXPORT void Gemv( // It has more general hipblasGemmEx API which is more close to cublasGemmEx. // hipblasGemmEx does D = alpha*op( A )*op( B ) + beta*C, // whereas cublasSgemmEx does C = alpha*op( A )*op( B ) + beta*C -#if ROCM_VERSION >= 60000 && defined(HIPBLAS_V2) - auto compute_type = HIPBLAS_COMPUTE_32F; -#else - auto compute_type = HIPBLAS_R_32F; -#endif HIPBLAS_ENFORCE(hipblasGemmEx( context->hipblas_handle(), cu_trans_A, @@ -1099,7 +1089,7 @@ CAFFE2_CUDA_EXPORT void Gemv( y, HIPBLAS_R_16F, ldc, - compute_type, + HIPBLAS_COMPUTE_32F, HIPBLAS_GEMM_DEFAULT)); #else CUBLAS_ENFORCE(cublasSgemmEx( diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index e53eef3ec4a7..73dba4061dce 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1213,18 +1213,7 @@ if(USE_ROCM) list(APPEND HIP_CXX_FLAGS -DCAFFE2_USE_MIOPEN) list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) list(APPEND HIP_CXX_FLAGS -std=c++17) - if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "6.0.0") - list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) - endif() - if(HIPBLASLT_CUSTOM_DATA_TYPE) - list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_CUSTOM_DATA_TYPE) - endif() - if(HIPBLASLT_CUSTOM_COMPUTE_TYPE) - list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_CUSTOM_COMPUTE_TYPE) - endif() - if(HIPBLASLT_HAS_GETINDEXFROMALGO) - list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_HAS_GETINDEXFROMALGO) - endif() + list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) if(HIP_NEW_TYPE_ENUMS) list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) endif() @@ -1256,9 +1245,7 @@ if(USE_ROCM) set(Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS ${PYTORCH_HIP_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipcub_LIBRARIES} ${ROCM_HIPRTC_LIB} ${ROCM_ROCTX_LIB}) - if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.7.0") - list(APPEND Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS ${hipblaslt_LIBRARIES}) - endif() + list(APPEND Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS ${hipblaslt_LIBRARIES}) list(APPEND Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS roc::hipblas hip::hipfft hip::hiprand roc::hipsparse roc::hipsolver) @@ -1281,18 +1268,6 @@ if(USE_ROCM) endif() endif() -# ---[ ROCm -if(USE_ROCM AND ROCM_VERSION_DEV VERSION_LESS "5.2.0") - # We check again for USE_ROCM because it might have been set to OFF - # in the if above - include_directories(SYSTEM 
${HIP_PATH}/include) - include_directories(SYSTEM ${HIPBLAS_PATH}/include) - include_directories(SYSTEM ${HIPFFT_PATH}/include) - include_directories(SYSTEM ${HIPSPARSE_PATH}/include) - include_directories(SYSTEM ${HIPRAND_PATH}/include) - include_directories(SYSTEM ${THRUST_PATH}) -endif() - # ---[ NCCL if(USE_NCCL) if(NOT (USE_CUDA OR USE_ROCM)) diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake index f6ca263c5e5b..fa39156031ff 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake @@ -155,15 +155,9 @@ if(HIP_FOUND) find_package_and_print_version(hiprand REQUIRED) find_package_and_print_version(rocblas REQUIRED) find_package_and_print_version(hipblas REQUIRED) - if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.7.0") - find_package_and_print_version(hipblaslt REQUIRED) - endif() + find_package_and_print_version(hipblaslt REQUIRED) find_package_and_print_version(miopen REQUIRED) - if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "4.1.0") - find_package_and_print_version(hipfft REQUIRED) - else() - find_package_and_print_version(rocfft REQUIRED) - endif() + find_package_and_print_version(hipfft REQUIRED) find_package_and_print_version(hipsparse REQUIRED) find_package_and_print_version(rccl) find_package_and_print_version(rocprim REQUIRED) @@ -191,88 +185,6 @@ if(HIP_FOUND) # roctx is part of roctracer find_library(ROCM_ROCTX_LIB roctx64 HINTS ${ROCM_PATH}/lib) - if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.7.0") - # check whether hipblaslt is using its own datatype - set(file "${PROJECT_BINARY_DIR}/hipblaslt_test_data_type.cc") - file(WRITE ${file} "" - "#include \n" - "int main() {\n" - " hipblasltDatatype_t bar = HIPBLASLT_R_16F;\n" - " return 0;\n" - "}\n" - ) - - try_compile(hipblaslt_compile_result_custom_datatype ${PROJECT_RANDOM_BINARY_DIR} ${file} - CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${ROCM_INCLUDE_DIRS}" - COMPILE_DEFINITIONS -D__HIP_PLATFORM_AMD__ -D__HIP_PLATFORM_HCC__ - OUTPUT_VARIABLE hipblaslt_compile_output) - - if(hipblaslt_compile_result_custom_datatype) - set(HIPBLASLT_CUSTOM_DATA_TYPE ON) - #message("hipblaslt is using custom data type: ${hipblaslt_compile_output}") - message("hipblaslt is using custom data type") - else() - set(HIPBLASLT_CUSTOM_DATA_TYPE OFF) - #message("hipblaslt is NOT using custom data type: ${hipblaslt_compile_output}") - message("hipblaslt is NOT using custom data type") - endif() - - # check whether hipblaslt is using its own compute type - set(file "${PROJECT_BINARY_DIR}/hipblaslt_test_compute_type.cc") - file(WRITE ${file} "" - "#include \n" - "int main() {\n" - " hipblasLtComputeType_t baz = HIPBLASLT_COMPUTE_F32;\n" - " return 0;\n" - "}\n" - ) - - try_compile(hipblaslt_compile_result_custom_compute_type ${PROJECT_RANDOM_BINARY_DIR} ${file} - CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${ROCM_INCLUDE_DIRS}" - COMPILE_DEFINITIONS -D__HIP_PLATFORM_AMD__ -D__HIP_PLATFORM_HCC__ - OUTPUT_VARIABLE hipblaslt_compile_output) - - if(hipblaslt_compile_result_custom_compute_type) - set(HIPBLASLT_CUSTOM_COMPUTE_TYPE ON) - #message("hipblaslt is using custom compute type: ${hipblaslt_compile_output}") - message("hipblaslt is using custom compute type") - else() - set(HIPBLASLT_CUSTOM_COMPUTE_TYPE OFF) - #message("hipblaslt is NOT using custom compute type: ${hipblaslt_compile_output}") - message("hipblaslt is NOT using custom compute type") - endif() - - # check whether hipblaslt provides getIndexFromAlgo - set(file "${PROJECT_BINARY_DIR}/hipblaslt_test_getIndexFromAlgo.cc") - file(WRITE ${file} "" - "#include \n" - "#include \n" - "int 
diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp
index 0b53841aafe3..cfacbf0e3be7 100644
--- a/torch/csrc/autograd/engine.cpp
+++ b/torch/csrc/autograd/engine.cpp
@@ -1550,17 +1550,8 @@ void GraphTask::stash_current_streams() {
   caller_current_streams_.resize(num_devices);
   if (num_devices > 0) {
     for (c10::DeviceIndex idx = 0; idx < num_devices; idx++) {
-#if defined(USE_ROCM) && (ROCM_VERSION < 50000)
-      // If the build targets ROCM, stash streams for all visible devices
-      // unconditionally, to work around
-      // https://github.com/pytorch/pytorch/issues/59750.
-      // TODO: Remove ROCM-specific behavior when
-      // https://github.com/pytorch/pytorch/issues/59750 is fixed.
-      if (true) {
-#else
       if (at::globalContext().getAcceleratorHooksInterface().hasPrimaryContext(
               idx)) {
-#endif
         caller_current_streams_[idx] = guard.getStream({accelerator, idx});
       } else {
         caller_current_streams_[idx] = c10::nullopt;
diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp
index f9b29c38dce5..b4c91b624aaf 100644
--- a/torch/csrc/cuda/nccl.cpp
+++ b/torch/csrc/cuda/nccl.cpp
@@ -821,7 +821,7 @@ void all2all_single_equal_split(
   const auto* sendbuff = reinterpret_cast<const char*>(input.const_data_ptr());
   auto* recvbuff = reinterpret_cast<char*>(output.data_ptr());
   auto comm = to_nccl_comm(_comm);
-#if defined(USE_ROCM) && ROCM_VERSION >= 50000
+#if defined(USE_ROCM)
   NCCL_CHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
 #else
   NCCL_CHECK(ncclCommCount(comm, &numranks));
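`ncclAllToAll` is an RCCL extension; stock NCCL has no such entry point, which is why the CUDA path keeps the grouped send/recv emulation. A rough sketch of the two strategies (buffer typing, the element-size helper, and error handling are simplified assumptions, not the PyTorch implementation):

```cpp
#include <nccl.h>  // RCCL builds get the same symbols via its rccl header

// per_rank_count elements go to, and come from, every peer.
// elem_size stands in for a real dtype-size lookup (assumption).
void all_to_all_equal_split(
    const char* sendbuff, char* recvbuff, size_t per_rank_count,
    size_t elem_size, ncclDataType_t type, ncclComm_t comm,
    cudaStream_t stream) {  // hipify rewrites this to hipStream_t on ROCm
#if defined(USE_ROCM)
  // RCCL provides a fused all-to-all collective.
  ncclAllToAll(sendbuff, recvbuff, per_rank_count, type, comm, stream);
#else
  int nranks = 0;
  ncclCommCount(comm, &nranks);
  const size_t stride = per_rank_count * elem_size;
  ncclGroupStart();  // batch the point-to-point ops into one launch
  for (int peer = 0; peer < nranks; ++peer) {
    ncclSend(sendbuff + peer * stride, per_rank_count, type, peer, comm, stream);
    ncclRecv(recvbuff + peer * stride, per_rank_count, type, peer, comm, stream);
  }
  ncclGroupEnd();
#endif
}
```
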
diff --git a/torch/csrc/jit/codegen/fuser/codegen.cpp b/torch/csrc/jit/codegen/fuser/codegen.cpp
index c28ad2ba1ae0..10ddf2267b21 100644
--- a/torch/csrc/jit/codegen/fuser/codegen.cpp
+++ b/torch/csrc/jit/codegen/fuser/codegen.cpp
@@ -659,25 +659,6 @@ std::string generateKernel(
     env.s("RandInit", "");
   }

-  // HIP headers must be included until precompiled header feature is available
-  // clang-format off
-#if defined(USE_ROCM)
-#if ROCM_VERSION < 40200
-  if (use_cuda && has_half_tensor) {
-    env.s("RuntimeHeader", R"(
-#include <hip/hip_runtime.h>
-#include <hip/hip_fp16.h>
-)");
-  } else if (use_cuda) {
-    env.s("RuntimeHeader", R"(
-#include <hip/hip_runtime.h>
-)");
-  }
-#else
-  // Still need the key defined, but empty.
-  env.s("RuntimeHeader", R"()");
-#endif
-#endif
   // clang-format on

   // Instantiates the CUDA or CPU-specific templates
diff --git a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
index b1b05c4f60cd..d17da0cbaa21 100644
--- a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
+++ b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
@@ -128,9 +128,7 @@ FusedKernelCUDA::FusedKernelCUDA(
 #if defined(USE_ROCM)
   std::vector<const char*> args = {"--std=c++17"};
-#if ROCM_VERSION >= 40200
   args.push_back("-hip-pch");
-#endif
 #else
   const std::string compute = std::string("--gpu-architecture=") +
 #if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
@@ -190,16 +188,8 @@ FusedKernelCUDA::FusedKernelCUDA(
       nvrtc().cuModuleGetFunction(&function_, module_, name_.c_str()));

   // Computes max blocks
-#if defined(USE_ROCM) && ROCM_VERSION < 30500
-  // HIP function signature is not compatible yet
-  uint32_t max_blocks;
-  AT_CUDA_DRIVER_CHECK(nvrtc().hipOccupancyMaxActiveBlocksPerMultiprocessor(
-      &max_blocks, function_, 128, 0));
-  maxBlocks_ = max_blocks;
-#else
   AT_CUDA_DRIVER_CHECK(nvrtc().cuOccupancyMaxActiveBlocksPerMultiprocessor(
       &maxBlocks_, function_, 128, 0));
-#endif
   maxBlocks_ *= prop_->multiProcessorCount;

   // Resets device (end of hacked at::DeviceGuard)
diff --git a/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h b/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h
index 7a9cc3e6f870..0eb7299223aa 100644
--- a/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h
+++ b/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h
@@ -15,7 +15,6 @@ cases*/
 #if defined(USE_ROCM)
 static auto type_declarations_template = at::jit::CodeTemplate(R"(
-${RuntimeHeader}
 ${HalfHeader}
 ${BFloat16Header}
 ${RandHeader}
diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
index 2df89085b91c..07626232399e 100644
--- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
@@ -897,14 +897,6 @@ void CudaCodeGen::Initialize() {
   HalfChecker halfChecker(buffer_args());
   stmt_v->accept(&halfChecker);

-#if defined(USE_ROCM)
-#if ROCM_VERSION < 40200
-  os() << "#include <hip/hip_runtime.h>" << std::endl;
-  if (halfChecker.hasHalf()) {
-    os() << "#include <hip/hip_fp16.h>" << std::endl;
-  }
-#endif
-#endif
   os() << device_resource_string << shared_resource_string;

   if (has_random_) {
@@ -1319,9 +1311,7 @@ void CudaCodeGen::CompileToNVRTC(
 #if defined(USE_ROCM)
   std::vector<const char*> args = {"--std=c++17"};
-#if ROCM_VERSION >= 40200
   args.push_back("-hip-pch");
-#endif
 #else
   const std::string compute = std::string("--gpu-architecture=") +
 #if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
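Both fuser backends now pass `-hip-pch` unconditionally, since every supported ROCm (>= 4.2) ships the precompiled HIP header that replaces the manually injected `RuntimeHeader` includes. A minimal sketch of how such an option vector reaches hipRTC (program creation and kernel source are assumed to exist elsewhere):

```cpp
#include <hip/hiprtc.h>
#include <vector>

// Mirrors the patched argument list: C++17 plus the precompiled HIP header,
// which supplies hip_runtime/hip_fp16 declarations to the RTC unit.
hiprtcResult compile_fused_kernel(hiprtcProgram prog) {
  std::vector<const char*> args = {"--std=c++17", "-hip-pch"};
  return hiprtcCompileProgram(prog, static_cast<int>(args.size()), args.data());
}
```
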
diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index ee5584315e94..abeae6a4ff71 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -235,11 +235,9 @@ COMMON_HIP_FLAGS = [
     '-fPIC',
     '-D__HIP_PLATFORM_AMD__=1',
     '-DUSE_ROCM=1',
+    '-DHIPBLAS_V2',
 ]

-if ROCM_VERSION is not None and ROCM_VERSION >= (6, 0):
-    COMMON_HIP_FLAGS.append('-DHIPBLAS_V2')
-
 COMMON_HIPCC_FLAGS = [
     '-DCUDA_HAS_FP16=1',
     '-D__HIP_NO_HALF_OPERATORS__=1',
@@ -1083,8 +1081,7 @@ def CUDAExtension(name, sources, *args, **kwargs):
         libraries.append('torch_cpu')
         libraries.append('torch_python')
         if IS_HIP_EXTENSION:
-            assert ROCM_VERSION is not None
-            libraries.append('amdhip64' if ROCM_VERSION >= (3, 5) else 'hip_hcc')
+            libraries.append('amdhip64')
             libraries.append('c10_hip')
             libraries.append('torch_hip')
         else:
@@ -1907,9 +1904,8 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose, is_standalone):
             if CUDNN_HOME is not None:
                 extra_ldflags.append(f'-L{os.path.join(CUDNN_HOME, "lib64")}')
         elif IS_HIP_EXTENSION:
-            assert ROCM_VERSION is not None
             extra_ldflags.append(f'-L{_join_rocm_home("lib")}')
-            extra_ldflags.append('-lamdhip64' if ROCM_VERSION >= (3, 5) else '-lhip_hcc')
+            extra_ldflags.append('-lamdhip64')
     return extra_ldflags
diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py
index d07292a22b0a..3c84e1bff4c9 100644
--- a/torch/utils/hipify/cuda_to_hip_mappings.py
+++ b/torch/utils/hipify/cuda_to_hip_mappings.py
@@ -1,7 +1,4 @@
 import collections
-import os
-import re
-import subprocess

 from .constants import (API_BLAS, API_C10, API_CAFFE2, API_DRIVER, API_FFT,
                         API_PYTORCH, API_RAND, API_ROCTX, API_RTC, API_RUNTIME,
@@ -27,40 +24,6 @@ ROCm/HIP string, a type and API annotation and - optionally - an annotation if i
 supported in ROCm/HIP yet.
 """

-# We need to know the ROCm version so we can conditionalize some of the mappings later.
-# As of ROCm 5.0, the version is found in rocm_version.h header file under /opt/rocm/include.
-rocm_path = os.environ.get('ROCM_HOME') or os.environ.get('ROCM_PATH') or "/opt/rocm"
-try:
-    rocm_path = subprocess.check_output(["hipconfig", "--rocmpath"]).decode("utf-8")
-except subprocess.CalledProcessError:
-    print(f"Warning: hipconfig --rocmpath failed, assuming {rocm_path}")
-except (FileNotFoundError, PermissionError, NotADirectoryError):
-    # Do not print warning. This is okay. This file can also be imported for non-ROCm builds.
-    pass
-
-rocm_version = (0, 0, 0)
-rocm_version_h = f"{rocm_path}/include/rocm-core/rocm_version.h"
-if not os.path.isfile(rocm_version_h):
-    rocm_version_h = f"{rocm_path}/include/rocm_version.h"
-
-# The file could be missing due to 1) ROCm version < 5.2, or 2) no ROCm install.
-if os.path.isfile(rocm_version_h):
-    RE_MAJOR = re.compile(r"#define\s+ROCM_VERSION_MAJOR\s+(\d+)")
-    RE_MINOR = re.compile(r"#define\s+ROCM_VERSION_MINOR\s+(\d+)")
-    RE_PATCH = re.compile(r"#define\s+ROCM_VERSION_PATCH\s+(\d+)")
-    major, minor, patch = 0, 0, 0
-    for line in open(rocm_version_h):
-        match = RE_MAJOR.search(line)
-        if match:
-            major = int(match.group(1))
-        match = RE_MINOR.search(line)
-        if match:
-            minor = int(match.group(1))
-        match = RE_PATCH.search(line)
-        if match:
-            patch = int(match.group(1))
-    rocm_version = (major, minor, patch)
-
 # List of math functions that should be replaced inside device code only.
 MATH_TRANSPILATIONS = collections.OrderedDict(
     [
@@ -7304,20 +7267,19 @@ CUDA_IDENTIFIER_MAP = collections.OrderedDict(
         ),
         (
             "cublasComputeType_t",
-            ("hipblasComputeType_t" if rocm_version >= (6, 0, 0) else "hipblasLtComputeType_t",
-             CONV_MATH_FUNC, API_BLAS)
+            ("hipblasComputeType_t", CONV_MATH_FUNC, API_BLAS)
         ),
         (
             "CUBLAS_COMPUTE_32I",
-            ("HIPBLAS_COMPUTE_32I" if rocm_version >= (6, 0, 0) else "HIPBLASLT_COMPUTE_I32", CONV_MATH_FUNC, API_BLAS)
+            ("HIPBLAS_COMPUTE_32I", CONV_MATH_FUNC, API_BLAS)
         ),
         (
             "CUBLAS_COMPUTE_32F",
-            ("HIPBLAS_COMPUTE_32F" if rocm_version >= (6, 0, 0) else "HIPBLASLT_COMPUTE_F32", CONV_MATH_FUNC, API_BLAS)
+            ("HIPBLAS_COMPUTE_32F", CONV_MATH_FUNC, API_BLAS)
        ),
         (
             "CUBLAS_COMPUTE_64F",
-            ("HIPBLAS_COMPUTE_64F" if rocm_version >= (6, 0, 0) else "HIPBLASLT_COMPUTE_F64", CONV_MATH_FUNC, API_BLAS)
+            ("HIPBLAS_COMPUTE_64F", CONV_MATH_FUNC, API_BLAS)
         ),
         ("cublasLtEpilogue_t", ("hipblasLtEpilogue_t", CONV_MATH_FUNC, API_BLAS)),
         ("CUBLASLT_EPILOGUE_DEFAULT", ("HIPBLASLT_EPILOGUE_DEFAULT", CONV_MATH_FUNC, API_BLAS)),
@@ -7770,14 +7732,8 @@ CUDA_IDENTIFIER_MAP = collections.OrderedDict(
             HIP_UNSUPPORTED,
         ),
     ),
-    (
-        "cuComplex",
-        ("hipComplex" if rocm_version >= (6, 0, 0) else "hipblasComplex", CONV_TYPE, API_BLAS)
-    ),
-    (
-        "cuDoubleComplex",
-        ("hipDoubleComplex" if rocm_version >= (6, 0, 0) else "hipblasDoubleComplex", CONV_TYPE, API_BLAS),
-    ),
+    ("cuComplex", ("hipComplex", CONV_TYPE, API_BLAS)),
+    ("cuDoubleComplex", ("hipDoubleComplex", CONV_TYPE, API_BLAS)),
     ("cufftResult_t", ("hipfftResult_t", CONV_TYPE, API_FFT)),
     ("cufftResult", ("hipfftResult", CONV_TYPE, API_FFT)),
     ("CUFFT_SUCCESS", ("HIPFFT_SUCCESS", CONV_NUMERIC_LITERAL, API_FFT)),
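With the runtime version probe gone, hipify now maps the cuComplex family unconditionally onto HIP's own complex types rather than the legacy hipblasComplex wrappers. A small sketch of what the post-translation source looks like (the function itself is illustrative; the helpers shown are standard hip_complex.h functions):

```cpp
#include <hip/hip_complex.h>

// What `return cuCmul(z, make_cuDoubleComplex(s, 0.0));` becomes after hipify
// under the simplified, version-independent mapping table.
hipDoubleComplex scale(hipDoubleComplex z, double s) {
  return hipCmul(z, make_hipDoubleComplex(s, 0.0));
}
```
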