Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
[ROCm] Update to ROCm 3.1.1 (#35552)
Summary: Redux.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/35552
Differential Revision: D20701593
Pulled By: ezyang
fbshipit-source-id: 1946d1e8fb47d597da903bae5d355bf52a5f017f
Committed by: Facebook GitHub Bot
Parent: ff71a4192d
Commit: 835ee34e38
@@ -12,7 +12,7 @@ from dataclasses import dataclass

 DOCKER_IMAGE_PATH_BASE = "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/"

-DOCKER_IMAGE_VERSION = "345"
+DOCKER_IMAGE_VERSION = "373"


 @dataclass
@@ -2422,14 +2422,14 @@ workflows:
       requires:
         - setup
       build_environment: "caffe2-onnx-main-py3.6-clang7-ubuntu16.04-build"
-      docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:345"
+      docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:373"
   - caffe2_linux_test:
       name: caffe2_onnx_main_py3_6_clang7_ubuntu16_04_test
       requires:
         - setup
         - caffe2_onnx_main_py3_6_clang7_ubuntu16_04_build
       build_environment: "caffe2-onnx-main-py3.6-clang7-ubuntu16.04-test"
-      docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:345"
+      docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:373"
       resource_class: large
   - caffe2_linux_test:
       name: caffe2_onnx_ort1_py3_6_clang7_ubuntu16_04_test
@@ -2437,7 +2437,7 @@ workflows:
         - setup
         - caffe2_onnx_main_py3_6_clang7_ubuntu16_04_build
       build_environment: "caffe2-onnx-ort1-py3.6-clang7-ubuntu16.04-test"
-      docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:345"
+      docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:373"
       resource_class: large
   - caffe2_linux_test:
       name: caffe2_onnx_ort2_py3_6_clang7_ubuntu16_04_test
@@ -2445,7 +2445,7 @@ workflows:
         - setup
         - caffe2_onnx_main_py3_6_clang7_ubuntu16_04_build
       build_environment: "caffe2-onnx-ort2-py3.6-clang7-ubuntu16.04-test"
-      docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:345"
+      docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:373"
       resource_class: large
 # TODO: Refactor circleci/cimodel/data/binary_build_data.py to generate this file
 # instead of doing one offs here
@@ -6287,7 +6287,7 @@ workflows:
   - ecr_gc_job:
       name: ecr_gc_job_for_caffe2
       project: caffe2
-      tags_to_keep: "348,345,336,325,324,315,306,301,287,283,276,273,266,253,248,238,230,213"
+      tags_to_keep: "373,369,348,345,336,325,324,315,306,301,287,283,276,273,266,253,248,238,230,213"
   - ecr_gc_job:
       name: ecr_gc_job_for_translate
       project: translate
@@ -15,6 +15,8 @@ OS="ubuntu"
 DOCKERFILE="${OS}/Dockerfile"
 if [[ "$image" == *-cuda* ]]; then
   DOCKERFILE="${OS}-cuda/Dockerfile"
+elif [[ "$image" == *-rocm* ]]; then
+  DOCKERFILE="${OS}-rocm/Dockerfile"
 fi

 if [[ "$image" == *-trusty* ]]; then
@@ -176,6 +178,16 @@ case "$image" in
     DB=yes
     VISION=yes
     ;;
+  pytorch-linux-xenial-rocm-py3.6-clang7)
+    ANACONDA_PYTHON_VERSION=3.6
+    CLANG_VERSION=7
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    ROCM=yes
+    # newer cmake version required
+    CMAKE_VERSION=3.6.3
+    ;;
 esac

 # Set Jenkins UID and GID if running Jenkins
@@ -213,6 +225,7 @@ docker build \
        --build-arg "CMAKE_VERSION=${CMAKE_VERSION:-}" \
        --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
        --build-arg "KATEX=${KATEX:-}" \
+       --build-arg "ROCM=${ROCM:-}" \
        -f $(dirname ${DOCKERFILE})/Dockerfile \
        -t "$tmp_tag" \
        "$@" \
.circleci/docker/common/install_rocm.sh (new file, 75 lines)
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+set -ex
+
+install_ubuntu() {
+    apt-get update
+    apt-get install -y wget
+    apt-get install -y libopenblas-dev
+
+    # Need the libc++1 and libc++abi1 libraries to allow torch._C to load at runtime
+    apt-get install libc++1
+    apt-get install libc++abi1
+
+    DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/debian
+    # Add rocm repository
+    wget -qO - $DEB_ROCM_REPO/rocm.gpg.key | apt-key add -
+    echo "deb [arch=amd64] $DEB_ROCM_REPO xenial main" > /etc/apt/sources.list.d/rocm.list
+    apt-get update --allow-insecure-repositories
+
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
+                   rocm-dev \
+                   rocm-utils \
+                   rocfft \
+                   miopen-hip \
+                   rocblas \
+                   hipsparse \
+                   rocrand \
+                   hipcub \
+                   rocthrust \
+                   rccl \
+                   rocprofiler-dev \
+                   roctracer-dev
+}
+
+install_centos() {
+
+  yum update -y
+  yum install -y wget
+  yum install -y openblas-devel
+
+  yum install -y epel-release
+  yum install -y dkms kernel-headers-`uname -r` kernel-devel-`uname -r`
+
+  echo "[ROCm]" > /etc/yum.repos.d/rocm.repo
+  echo "name=ROCm" >> /etc/yum.repos.d/rocm.repo
+  echo "baseurl=http://repo.radeon.com/rocm/yum/rpm/" >> /etc/yum.repos.d/rocm.repo
+  echo "enabled=1" >> /etc/yum.repos.d/rocm.repo
+  echo "gpgcheck=0" >> /etc/yum.repos.d/rocm.repo
+
+  yum update -y
+
+  yum install -y \
+                   rocm-dev \
+                   rocm-utils \
+                   rocfft \
+                   miopen-hip \
+                   rocblas \
+                   hipsparse \
+                   rocrand \
+                   rccl \
+                   hipcub \
+                   rocthrust \
+                   rocprofiler-dev \
+                   roctracer-dev
+}
+
+# Install Python packages depending on the base OS
+if [ -f /etc/lsb-release ]; then
+  install_ubuntu
+elif [ -f /etc/os-release ]; then
+  install_centos
+else
+  echo "Unable to determine OS..."
+  exit 1
+fi
.circleci/docker/ubuntu-rocm/.gitignore (new file, vendored, 1 line)
@@ -0,0 +1 @@
+*.sh
.circleci/docker/ubuntu-rocm/Dockerfile (new file, 79 lines)
@@ -0,0 +1,79 @@
+ARG UBUNTU_VERSION
+
+FROM ubuntu:${UBUNTU_VERSION}
+
+ARG UBUNTU_VERSION
+
+ENV DEBIAN_FRONTEND noninteractive
+
+# Install common dependencies (so that this step can be cached separately)
+ARG EC2
+ADD ./common/install_base.sh install_base.sh
+RUN bash ./install_base.sh && rm install_base.sh
+
+# Install clang
+ARG LLVMDEV
+ARG CLANG_VERSION
+ADD ./common/install_clang.sh install_clang.sh
+RUN bash ./install_clang.sh && rm install_clang.sh
+
+# Install user
+ADD ./common/install_user.sh install_user.sh
+RUN bash ./install_user.sh && rm install_user.sh
+
+# Install conda
+ENV PATH /opt/conda/bin:$PATH
+ARG ANACONDA_PYTHON_VERSION
+ADD ./common/install_conda.sh install_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh
+
+# (optional) Install database packages like LMDB and LevelDB
+ARG DB
+ADD ./common/install_db.sh install_db.sh
+RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
+RUN rm install_db.sh
+ENV INSTALLED_DB ${DB}
+
+# (optional) Install vision packages like OpenCV and ffmpeg
+ARG VISION
+ADD ./common/install_vision.sh install_vision.sh
+RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
+RUN rm install_vision.sh
+ENV INSTALLED_VISION ${VISION}
+
+# Install rocm
+ARG ROCM
+ADD ./common/install_rocm.sh install_rocm.sh
+RUN bash ./install_rocm.sh
+RUN rm install_rocm.sh
+ENV PATH /opt/rocm/bin:$PATH
+ENV PATH /opt/rocm/hcc/bin:$PATH
+ENV PATH /opt/rocm/hip/bin:$PATH
+ENV PATH /opt/rocm/opencl/bin:$PATH
+ENV HIP_PLATFORM hcc
+ENV LANG C.UTF-8
+ENV LC_ALL C.UTF-8
+
+# (optional) Install non-default CMake version
+ARG CMAKE_VERSION
+ADD ./common/install_cmake.sh install_cmake.sh
+RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
+RUN rm install_cmake.sh
+
+# (optional) Install non-default Ninja version
+ARG NINJA_VERSION
+ADD ./common/install_ninja.sh install_ninja.sh
+RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi
+RUN rm install_ninja.sh
+
+# Install ccache/sccache (do this last, so we get priority in PATH)
+ADD ./common/install_cache.sh install_cache.sh
+ENV PATH /opt/cache/bin:$PATH
+RUN bash ./install_cache.sh && rm install_cache.sh
+
+# Include BUILD_ENVIRONMENT environment variable in image
+ARG BUILD_ENVIRONMENT
+ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}
+
+USER jenkins
+CMD ["bash"]
@@ -14,7 +14,7 @@
   - ecr_gc_job:
       name: ecr_gc_job_for_caffe2
       project: caffe2
-      tags_to_keep: "348,345,336,325,324,315,306,301,287,283,276,273,266,253,248,238,230,213"
+      tags_to_keep: "373,369,348,345,336,325,324,315,306,301,287,283,276,273,266,253,248,238,230,213"
   - ecr_gc_job:
       name: ecr_gc_job_for_translate
       project: translate
@@ -4,11 +4,24 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <cuda.h>

+#ifdef __HIP_PLATFORM_HCC__
+#include <hip/hip_version.h>
+#endif
+
 namespace at {
 namespace native {

 Scalar _local_scalar_dense_cuda(const Tensor& self) {
   Scalar r;
+#if HIP_VERSION >= 301
+  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
+    at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "_local_scalar_dense_cuda", [&] {
+        scalar_t value;
+        cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+        AT_CUDA_CHECK(hipMemcpyWithStream(&value, self.data_ptr<scalar_t>(), sizeof(scalar_t), cudaMemcpyDeviceToHost, stream));
+        r = Scalar(value);
+      });
+#else
   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
     at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "_local_scalar_dense_cuda", [&] {
         scalar_t value;
@@ -17,6 +30,7 @@ Scalar _local_scalar_dense_cuda(const Tensor& self) {
         AT_CUDA_CHECK(cudaStreamSynchronize(stream));
         r = Scalar(value);
       });
+#endif
   return r;
 }
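Context for this and the THC hunks below: ROCm 3.1 introduced hipMemcpyWithStream, which enqueues the copy on the given stream and blocks the host until that stream has drained, collapsing the older cudaMemcpyAsync + cudaStreamSynchronize pair into one call. The HIP_VERSION >= 301 guards key off the new <hip/hip_version.h> include. A minimal standalone HIP sketch of the pattern (the HIP_CHECK macro and buffer setup are illustrative, not PyTorch code):

#include <hip/hip_runtime.h>
#include <hip/hip_version.h>
#include <cstdio>
#include <cstdlib>

#define HIP_CHECK(expr)                                               \
  do {                                                                \
    hipError_t e_ = (expr);                                           \
    if (e_ != hipSuccess) {                                           \
      std::fprintf(stderr, "HIP error: %s\n", hipGetErrorString(e_)); \
      std::exit(1);                                                   \
    }                                                                 \
  } while (0)

int main() {
  float host = -1.0f;
  float* dev = nullptr;
  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));
  HIP_CHECK(hipMalloc(&dev, sizeof(float)));
  HIP_CHECK(hipMemsetAsync(dev, 0, sizeof(float), stream));
#if HIP_VERSION >= 301
  // ROCm 3.1+: one call. The copy is ordered after prior work on `stream`
  // and the host blocks until it completes -- no separate synchronize.
  HIP_CHECK(hipMemcpyWithStream(&host, dev, sizeof(float),
                                hipMemcpyDeviceToHost, stream));
#else
  // Older ROCm (and the CUDA analogue in the diff): enqueue the async copy,
  // then synchronize the whole stream before touching `host`.
  HIP_CHECK(hipMemcpyAsync(&host, dev, sizeof(float),
                           hipMemcpyDeviceToHost, stream));
  HIP_CHECK(hipStreamSynchronize(stream));
#endif
  std::printf("value = %f\n", host);  // prints 0.000000
  HIP_CHECK(hipFree(dev));
  HIP_CHECK(hipStreamDestroy(stream));
  return 0;
}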
@@ -9,6 +9,10 @@
 #include <ATen/native/cuda/Loops.cuh>
 #include <THC/THC.h>

+#ifdef __HIP_PLATFORM_HCC__
+#include <hip/hip_version.h>
+#endif
+
 namespace at {
 namespace native {

@@ -177,13 +181,17 @@ static void copy_kernel_cuda(TensorIterator& iter, bool non_blocking) {
   int64_t nbytes = iter.numel() * iter.element_size(0);
   CUDAStream stream = getCurrentCUDAStream();

-  AT_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, kind, stream));
-
   if (non_blocking) {
+    AT_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, kind, stream));
     void* ptr = (dst_device == kCPU ? dst : src);
     AT_CUDA_CHECK(THCCachingHostAllocator_recordEvent(ptr, stream));
   } else {
+#if HIP_VERSION >= 301
+    AT_CUDA_CHECK(hipMemcpyWithStream(dst, src, nbytes, kind, stream));
+#else
+    AT_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, kind, stream));
     AT_CUDA_CHECK(cudaStreamSynchronize(stream));
+#endif
   }
 }
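This copy_kernel_cuda hunk also moves the cudaMemcpyAsync into the non_blocking branch, where the host-side buffer is pinned memory from the caching host allocator and THCCachingHostAllocator_recordEvent notes when the buffer may be recycled. A standalone HIP sketch of that event bookkeeping, with illustrative names (PinnedBlock, async_d2h, block_reusable are not PyTorch APIs):

#include <hip/hip_runtime.h>
#include <cstdio>
#include <cstdlib>

#define HIP_CHECK(expr)                                               \
  do {                                                                \
    hipError_t e_ = (expr);                                           \
    if (e_ != hipSuccess) {                                           \
      std::fprintf(stderr, "HIP error: %s\n", hipGetErrorString(e_)); \
      std::exit(1);                                                   \
    }                                                                 \
  } while (0)

struct PinnedBlock {
  void* ptr = nullptr;
  hipEvent_t ready{};  // marks the last copy that used this block
};

// Non-blocking device-to-host copy: enqueue, then record the stream position.
static void async_d2h(PinnedBlock& block, const void* dev_src, size_t nbytes,
                      hipStream_t stream) {
  HIP_CHECK(hipMemcpyAsync(block.ptr, dev_src, nbytes,
                           hipMemcpyDeviceToHost, stream));
  HIP_CHECK(hipEventRecord(block.ready, stream));
}

// An allocator may hand the pinned block out again only once this is true.
static bool block_reusable(const PinnedBlock& block) {
  return hipEventQuery(block.ready) == hipSuccess;
}

int main() {
  PinnedBlock block;
  HIP_CHECK(hipHostMalloc(&block.ptr, sizeof(float)));  // pinned host memory
  HIP_CHECK(hipEventCreateWithFlags(&block.ready, hipEventDisableTiming));
  float* dev = nullptr;
  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));
  HIP_CHECK(hipMalloc(&dev, sizeof(float)));
  async_d2h(block, dev, sizeof(float), stream);
  while (!block_reusable(block)) { /* spin; a real allocator would defer */ }
  HIP_CHECK(hipFree(dev));
  HIP_CHECK(hipStreamDestroy(stream));
  HIP_CHECK(hipEventDestroy(block.ready));
  HIP_CHECK(hipHostFree(block.ptr));
  return 0;
}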
@@ -12,6 +12,10 @@
 #include <THC/THCReduceApplyUtils.cuh>
 #include <c10/macros/Macros.h>

+#ifdef __HIP_PLATFORM_HCC__
+#include <hip/hip_version.h>
+#endif
+
 // Size per each reduction block
 #define THC_REDUCE_ALL_BLOCK_SIZE 1024L

@@ -320,12 +324,20 @@ bool THC_reduceAll(THCState* state,
   // the host (synchronous!)
   if (!outOnDevice) {
     cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
+#if HIP_VERSION >= 301
+    THCudaCheck(hipMemcpyWithStream(out,
+                                    devOut,
+                                    sizeof(AccT),
+                                    cudaMemcpyDeviceToHost,
+                                    stream));
+#else
     THCudaCheck(cudaMemcpyAsync(out,
                                 devOut,
                                 sizeof(AccT),
                                 cudaMemcpyDeviceToHost,
                                 stream));
     THCudaCheck(cudaStreamSynchronize(stream));
+#endif
   }

   if (freeDevOut) {
@@ -5,6 +5,10 @@
 #include <c10/util/intrusive_ptr.h>
 #include <c10/util/typeid.h>

+#ifdef __HIP_PLATFORM_HCC__
+#include <hip/hip_version.h>
+#endif
+
 scalar_t* THCStorage_(data)(THCState *state, const THCStorage *self)
 {
   return self->data<scalar_t>();
@@ -24,10 +28,16 @@ void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, scalar
 {
   THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds");
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
+#if HIP_VERSION >= 301
+  THCudaCheck(hipMemcpyWithStream(THCStorage_(data)(state, self) + index, &value, sizeof(scalar_t),
+                                  cudaMemcpyHostToDevice,
+                                  stream));
+#else
   THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self) + index, &value, sizeof(scalar_t),
                               cudaMemcpyHostToDevice,
                               stream));
   THCudaCheck(cudaStreamSynchronize(stream));
+#endif
 }

 scalar_t THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index)
@@ -35,9 +45,14 @@ scalar_t THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t ind
   THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds");
   scalar_t value;
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
+#if HIP_VERSION >= 301
+  THCudaCheck(hipMemcpyWithStream(&value, THCStorage_(data)(state, self) + index, sizeof(scalar_t),
+                                  cudaMemcpyDeviceToHost, stream));
+#else
   THCudaCheck(cudaMemcpyAsync(&value, THCStorage_(data)(state, self) + index, sizeof(scalar_t),
                               cudaMemcpyDeviceToHost, stream));
   THCudaCheck(cudaStreamSynchronize(stream));
+#endif
   return value;
 }
@@ -2,16 +2,28 @@
 #define THC_GENERIC_FILE "THC/generic/THCStorageCopy.cpp"
 #else

+#ifdef __HIP_PLATFORM_HCC__
+#include <hip/hip_version.h>
+#endif
+
 void THCStorage_(copyCPU)(THCState *state, THCStorage *self, struct THStorage *src)
 {
   THArgCheck(self->numel() == src->numel(), 2, "size does not match");
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
+#if HIP_VERSION >= 301
+  THCudaCheck(hipMemcpyWithStream(THCStorage_(data)(state, self),
+                                  THStorage_(data)(src),
+                                  self->numel() * sizeof(scalar_t),
+                                  cudaMemcpyHostToDevice,
+                                  stream));
+#else
   THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self),
                               THStorage_(data)(src),
                               self->numel() * sizeof(scalar_t),
                               cudaMemcpyHostToDevice,
                               stream));
   THCudaCheck(cudaStreamSynchronize(stream));
+#endif
 }

 #define TH_CUDA_STORAGE_IMPLEMENT_COPY(TYPEC) \
@@ -40,12 +52,20 @@ void THStorage_(copyCuda)(THCState *state, THStorage *self, struct THCStorage *s
 {
   THArgCheck(self->numel() == src->numel(), 2, "size does not match");
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
+#if HIP_VERSION >= 301
+  THCudaCheck(hipMemcpyWithStream(THStorage_(data)(self),
+                                  THCStorage_(data)(state, src),
+                                  self->numel() * sizeof(scalar_t),
+                                  cudaMemcpyDeviceToHost,
+                                  stream));
+#else
   THCudaCheck(cudaMemcpyAsync(THStorage_(data)(self),
                               THCStorage_(data)(state, src),
                               self->numel() * sizeof(scalar_t),
                               cudaMemcpyDeviceToHost,
                               stream));
   THCudaCheck(cudaStreamSynchronize(stream));
+#endif
 }

 #define TH_CUDA_STORAGE_IMPLEMENT_COPYTO(TYPEC) \
@@ -6,6 +6,10 @@
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/utils/math.h"

+#ifdef __HIP_PLATFORM_HCC__
+#include <hip/hip_version.h>
+#endif
+
 namespace caffe2 {

 namespace {
@@ -33,7 +37,7 @@ __global__ void ReluCUDAKernel<half>(const int N, const half* X, half* Y) {
   const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
   if (i < N) {
     const half kZero = __float2half(0.0f);
-#if __CUDA_ARCH__ >= 530
+#if __CUDA_ARCH__ >= 530 || HIP_VERSION >= 300
     Y[i] = __hgt(__ldg(X + i), kZero) ? __ldg(X + i) : kZero;
 #else
     Y[i] = (__half2float(X[i]) > 0) ? X[i] : kZero;
@@ -46,7 +50,7 @@ __global__ void ReluCUDAKernel<half2>(const int N, const half2* X, half2* Y) {
   const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
   if (i < N) {
     const half2 kZero = __float2half2_rn(0.0f);
-#if __CUDA_ARCH__ >= 530
+#if __CUDA_ARCH__ >= 530 || HIP_VERSION >= 300
     Y[i] = __hmul2(__hgt2(__ldg(X + i), kZero), __ldg(X + i));
 #else
     const float2 xx = __half22float2(X[i]);
@@ -61,7 +65,7 @@ __global__ void
 ReluGradientCUDAKernel(const int N, const T* dY, const T* Y, T* dX) {
   const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
   if (i < N) {
-#if __CUDA_ARCH__ >= 350
+#if __CUDA_ARCH__ >= 350 || HIP_VERSION >= 300
     dX[i] = __ldg(Y + i) > T(0) ? __ldg(dY + i) : T(0);
 #else
     dX[i] = Y[i] > T(0) ? dY[i] : T(0);
@@ -78,7 +82,7 @@ __global__ void ReluGradientCUDAKernel<half>(
   const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
   if (i < N) {
     const half kZero = __float2half(0.0f);
-#if __CUDA_ARCH__ >= 530
+#if __CUDA_ARCH__ >= 530 || HIP_VERSION >= 300
     dX[i] = __hgt(__ldg(Y + i), kZero) ? __ldg(dY + i) : kZero;
 #else
     dX[i] = (__half2float(Y[i]) > 0) ? dY[i] : kZero;
@@ -95,7 +99,7 @@ __global__ void ReluGradientCUDAKernel<half2>(
   const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
   if (i < N) {
     const half2 kZero = __float2half2_rn(0.0f);
-#if __CUDA_ARCH__ >= 530
+#if __CUDA_ARCH__ >= 530 || HIP_VERSION >= 300
     dX[i] = __hmul2(__hgt2(__ldg(Y + i), kZero), __ldg(dY + i));
 #else
     const float2 dy = __half22float2(dY[i]);
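These relu_op.cu hunks widen the half-precision fast-path guards: on CUDA the __hgt/__hgt2 comparison intrinsics and __ldg read-only loads require compute capability 5.3 (3.5 for plain __ldg), while on HIP they are available whenever HIP_VERSION >= 300, i.e. ROCm 3.0+. A standalone kernel sketch of the guard pattern (mirrors the diff; not the Caffe2 operator itself, and a fixed block size stands in for CAFFE_CUDA_NUM_THREADS):

#include <cuda_fp16.h>

// ReLU over half values: fast path uses half comparison intrinsics and the
// read-only data cache; fallback promotes to float for the comparison.
__global__ void relu_half(const int n, const half* x, half* y) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    const half kZero = __float2half(0.0f);
#if __CUDA_ARCH__ >= 530 || HIP_VERSION >= 300
    // On nvcc, HIP_VERSION is undefined and evaluates to 0 here, so only the
    // __CUDA_ARCH__ test matters; on HIP, ROCm 3.0+ takes this branch.
    y[i] = __hgt(__ldg(x + i), kZero) ? __ldg(x + i) : kZero;
#else
    y[i] = (__half2float(x[i]) > 0.0f) ? x[i] : kZero;
#endif
  }
}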
@@ -1033,7 +1033,6 @@ if(USE_ROCM)
   list(APPEND HIP_CXX_FLAGS -DCUDA_HAS_FP16=1)
   list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_OPERATORS__=1)
   list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1)
-  list(APPEND HIP_CXX_FLAGS -DHIP_VERSION=${HIP_VERSION_MAJOR})
   list(APPEND HIP_CXX_FLAGS -Wno-macro-redefined)
   list(APPEND HIP_CXX_FLAGS -Wno-inconsistent-missing-override)
   list(APPEND HIP_CXX_FLAGS -Wno-exceptions)
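Both this hunk and the torch/lib/c10d one near the end drop the hand-rolled -DHIP_VERSION=${HIP_VERSION_MAJOR} define, presumably because it would clash with (and carry less information than) the HIP_VERSION now supplied by ROCm's own <hip/hip_version.h>; my reading of the ROCm 3.x headers is that it encodes major*100 + minor, making ROCm 3.1 exactly the 301 tested by the guards above. A small sketch showing how the macro behaves in HIP and non-HIP builds (the encoding claim is an assumption, flagged below):

#include <cstdio>

#ifdef __HIP_PLATFORM_HCC__
// Assumption: in ROCm 3.x, this header defines
// HIP_VERSION = HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR.
#include <hip/hip_version.h>
#endif

int main() {
#ifdef HIP_VERSION
  std::printf("HIP_VERSION=%d (major %d, minor %d)\n",
              HIP_VERSION, HIP_VERSION_MAJOR, HIP_VERSION_MINOR);
#else
  std::printf("Not a HIP build: HIP_VERSION is undefined, so guards like\n"
              "'#if HIP_VERSION >= 301' evaluate to false and compile out.\n");
#endif
  return 0;
}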
@@ -30,6 +30,7 @@ install_ubuntu() {
   apt-get update
   apt-get install -y --no-install-recommends \
           autoconf \
+          apt-transport-https \
           build-essential \
           ca-certificates \
           cmake \
@@ -158,7 +158,7 @@ pip install --no-cache-dir \
         protobuf \
         pytest \
         pyyaml \
-        scipy==0.19.1 \
+        scipy==1.1.0 \
         scikit-image \
         tabulate \
         virtualenv \
@@ -12048,7 +12048,7 @@ class TestTorchDeviceType(TestCase):
         _test((10,), 5, 4, win_sizes=(11,), expected_error=RuntimeError)
         _test((10,), 5, 4, win_sizes=(1, 1), expected_error=RuntimeError)

-    @skipCUDAIfRocm
+    @skipIfRocm
     def test_fft_input_modification(self, device):
         # FFT functions should not modify their input (gh-34551)
@@ -11,7 +11,6 @@ if(USE_CUDA)
 elseif(USE_ROCM)
   add_definitions(-DUSE_ROCM=1)
   add_definitions(-D__HIP_PLATFORM_HCC__=1)
-  add_definitions(-DHIP_VERSION=${HIP_VERSION_MAJOR})
 else()
   message(STATUS "Building c10d without CUDA/ROCm support")
 endif()