[ROCm] Update to ROCm 3.1.1 (#35552)

Summary:
Redux — a resubmission of the ROCm 3.1.1 update.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/35552

Differential Revision: D20701593

Pulled By: ezyang

fbshipit-source-id: 1946d1e8fb47d597da903bae5d355bf52a5f017f
Author: Johannes M Dieterich
Date: 2020-03-27 12:18:31 -07:00
Committed by: Facebook GitHub Bot

parent ff71a4192d
commit 835ee34e38
18 changed files with 258 additions and 18 deletions

View File

@@ -12,7 +12,7 @@ from dataclasses import dataclass
DOCKER_IMAGE_PATH_BASE = "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/"
-DOCKER_IMAGE_VERSION = "345"
+DOCKER_IMAGE_VERSION = "373"
@dataclass

View File

@@ -2422,14 +2422,14 @@ workflows:
requires:
- setup
build_environment: "caffe2-onnx-main-py3.6-clang7-ubuntu16.04-build"
-docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:345"
+docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:373"
- caffe2_linux_test:
name: caffe2_onnx_main_py3_6_clang7_ubuntu16_04_test
requires:
- setup
- caffe2_onnx_main_py3_6_clang7_ubuntu16_04_build
build_environment: "caffe2-onnx-main-py3.6-clang7-ubuntu16.04-test"
-docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:345"
+docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:373"
resource_class: large
- caffe2_linux_test:
name: caffe2_onnx_ort1_py3_6_clang7_ubuntu16_04_test
@@ -2437,7 +2437,7 @@ workflows:
- setup
- caffe2_onnx_main_py3_6_clang7_ubuntu16_04_build
build_environment: "caffe2-onnx-ort1-py3.6-clang7-ubuntu16.04-test"
-docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:345"
+docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:373"
resource_class: large
- caffe2_linux_test:
name: caffe2_onnx_ort2_py3_6_clang7_ubuntu16_04_test
@@ -2445,7 +2445,7 @@ workflows:
- setup
- caffe2_onnx_main_py3_6_clang7_ubuntu16_04_build
build_environment: "caffe2-onnx-ort2-py3.6-clang7-ubuntu16.04-test"
-docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:345"
+docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:373"
resource_class: large
# TODO: Refactor circleci/cimodel/data/binary_build_data.py to generate this file
# instead of doing one offs here
@@ -6287,7 +6287,7 @@ workflows:
- ecr_gc_job:
name: ecr_gc_job_for_caffe2
project: caffe2
-tags_to_keep: "348,345,336,325,324,315,306,301,287,283,276,273,266,253,248,238,230,213"
+tags_to_keep: "373,369,348,345,336,325,324,315,306,301,287,283,276,273,266,253,248,238,230,213"
- ecr_gc_job:
name: ecr_gc_job_for_translate
project: translate

View File

@@ -15,6 +15,8 @@ OS="ubuntu"
DOCKERFILE="${OS}/Dockerfile"
if [[ "$image" == *-cuda* ]]; then
DOCKERFILE="${OS}-cuda/Dockerfile"
+elif [[ "$image" == *-rocm* ]]; then
+DOCKERFILE="${OS}-rocm/Dockerfile"
fi
if [[ "$image" == *-trusty* ]]; then
@@ -176,6 +178,16 @@ case "$image" in
DB=yes
VISION=yes
;;
+pytorch-linux-xenial-rocm-py3.6-clang7)
+ANACONDA_PYTHON_VERSION=3.6
+CLANG_VERSION=7
+PROTOBUF=yes
+DB=yes
+VISION=yes
+ROCM=yes
+# newer cmake version required
+CMAKE_VERSION=3.6.3
+;;
esac
# Set Jenkins UID and GID if running Jenkins
@@ -213,6 +225,7 @@ docker build \
--build-arg "CMAKE_VERSION=${CMAKE_VERSION:-}" \
--build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
--build-arg "KATEX=${KATEX:-}" \
+--build-arg "ROCM=${ROCM:-}" \
-f $(dirname ${DOCKERFILE})/Dockerfile \
-t "$tmp_tag" \
"$@" \

View File

@@ -0,0 +1,75 @@
#!/bin/bash
set -ex
install_ubuntu() {
apt-get update
apt-get install -y wget
apt-get install -y libopenblas-dev
# Need the libc++1 and libc++abi1 libraries to allow torch._C to load at runtime
apt-get install -y libc++1
apt-get install -y libc++abi1
DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/debian
# Add rocm repository
wget -qO - $DEB_ROCM_REPO/rocm.gpg.key | apt-key add -
echo "deb [arch=amd64] $DEB_ROCM_REPO xenial main" > /etc/apt/sources.list.d/rocm.list
apt-get update --allow-insecure-repositories
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
rocm-dev \
rocm-utils \
rocfft \
miopen-hip \
rocblas \
hipsparse \
rocrand \
hipcub \
rocthrust \
rccl \
rocprofiler-dev \
roctracer-dev
}
install_centos() {
yum update -y
yum install -y wget
yum install -y openblas-devel
yum install -y epel-release
yum install -y dkms kernel-headers-`uname -r` kernel-devel-`uname -r`
echo "[ROCm]" > /etc/yum.repos.d/rocm.repo
echo "name=ROCm" >> /etc/yum.repos.d/rocm.repo
echo "baseurl=http://repo.radeon.com/rocm/yum/rpm/" >> /etc/yum.repos.d/rocm.repo
echo "enabled=1" >> /etc/yum.repos.d/rocm.repo
echo "gpgcheck=0" >> /etc/yum.repos.d/rocm.repo
yum update -y
yum install -y \
rocm-dev \
rocm-utils \
rocfft \
miopen-hip \
rocblas \
hipsparse \
rocrand \
rccl \
hipcub \
rocthrust \
rocprofiler-dev \
roctracer-dev
}
# Install ROCm packages depending on the base OS
if [ -f /etc/lsb-release ]; then
install_ubuntu
elif [ -f /etc/os-release ]; then
install_centos
else
echo "Unable to determine OS..."
exit 1
fi
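A quick sanity check after the script runs, as a sketch (assumption: hipconfig is pulled in by rocm-dev under /opt/rocm/bin):

/opt/rocm/bin/hipconfig --version   # expect 3.1.x after this update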

View File

@@ -0,0 +1 @@
*.sh

View File

@@ -0,0 +1,79 @@
ARG UBUNTU_VERSION
FROM ubuntu:${UBUNTU_VERSION}
ARG UBUNTU_VERSION
ENV DEBIAN_FRONTEND noninteractive
# Install common dependencies (so that this step can be cached separately)
ARG EC2
ADD ./common/install_base.sh install_base.sh
RUN bash ./install_base.sh && rm install_base.sh
# Install clang
ARG LLVMDEV
ARG CLANG_VERSION
ADD ./common/install_clang.sh install_clang.sh
RUN bash ./install_clang.sh && rm install_clang.sh
# Install user
ADD ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh
# Install conda
ENV PATH /opt/conda/bin:$PATH
ARG ANACONDA_PYTHON_VERSION
ADD ./common/install_conda.sh install_conda.sh
RUN bash ./install_conda.sh && rm install_conda.sh
# (optional) Install database packages like LMDB and LevelDB
ARG DB
ADD ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV and ffmpeg
ARG VISION
ADD ./common/install_vision.sh install_vision.sh
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
RUN rm install_vision.sh
ENV INSTALLED_VISION ${VISION}
# Install rocm
ARG ROCM
ADD ./common/install_rocm.sh install_rocm.sh
RUN bash ./install_rocm.sh
RUN rm install_rocm.sh
ENV PATH /opt/rocm/bin:$PATH
ENV PATH /opt/rocm/hcc/bin:$PATH
ENV PATH /opt/rocm/hip/bin:$PATH
ENV PATH /opt/rocm/opencl/bin:$PATH
ENV HIP_PLATFORM hcc
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
# (optional) Install non-default CMake version
ARG CMAKE_VERSION
ADD ./common/install_cmake.sh install_cmake.sh
RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
RUN rm install_cmake.sh
# (optional) Install non-default Ninja version
ARG NINJA_VERSION
ADD ./common/install_ninja.sh install_ninja.sh
RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi
RUN rm install_ninja.sh
# Install ccache/sccache (do this last, so we get priority in PATH)
ADD ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH
RUN bash ./install_cache.sh && rm install_cache.sh
# Include BUILD_ENVIRONMENT environment variable in image
ARG BUILD_ENVIRONMENT
ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}
USER jenkins
CMD ["bash"]
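build.sh drives this Dockerfile, but it can also be exercised directly. A hypothetical invocation from the directory holding ubuntu-rocm/ and common/, with arg values mirroring the pytorch-linux-xenial-rocm-py3.6-clang7 config added above:

docker build \
  --build-arg UBUNTU_VERSION=16.04 \
  --build-arg CLANG_VERSION=7 \
  --build-arg ANACONDA_PYTHON_VERSION=3.6 \
  --build-arg DB=yes \
  --build-arg VISION=yes \
  --build-arg ROCM=yes \
  --build-arg CMAKE_VERSION=3.6.3 \
  -f ubuntu-rocm/Dockerfile \
  -t pytorch-linux-xenial-rocm-py3.6-clang7 .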

View File

@@ -14,7 +14,7 @@
- ecr_gc_job:
name: ecr_gc_job_for_caffe2
project: caffe2
-tags_to_keep: "348,345,336,325,324,315,306,301,287,283,276,273,266,253,248,238,230,213"
+tags_to_keep: "373,369,348,345,336,325,324,315,306,301,287,283,276,273,266,253,248,238,230,213"
- ecr_gc_job:
name: ecr_gc_job_for_translate
project: translate

View File

@@ -4,11 +4,24 @@
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
+#ifdef __HIP_PLATFORM_HCC__
+#include <hip/hip_version.h>
+#endif
namespace at {
namespace native {
Scalar _local_scalar_dense_cuda(const Tensor& self) {
Scalar r;
+#if HIP_VERSION >= 301
+AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
+at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "_local_scalar_dense_cuda", [&] {
+scalar_t value;
+cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+AT_CUDA_CHECK(hipMemcpyWithStream(&value, self.data_ptr<scalar_t>(), sizeof(scalar_t), cudaMemcpyDeviceToHost, stream));
+r = Scalar(value);
+});
+#else
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "_local_scalar_dense_cuda", [&] {
scalar_t value;
@@ -17,6 +30,7 @@ Scalar _local_scalar_dense_cuda(const Tensor& self) {
AT_CUDA_CHECK(cudaStreamSynchronize(stream));
r = Scalar(value);
});
+#endif
return r;
}
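This hunk is the template for most of the source changes in this PR: where the old code paired cudaMemcpyAsync with cudaStreamSynchronize to get a stream-ordered blocking copy, HIP builds now call hipMemcpyWithStream, which does both in one call (the API was introduced with ROCm/HIP 3.1). HIP_VERSION comes from <hip/hip_version.h>; the guard value 301 lines up with HIP 3.1, assuming the macro encodes major*100 + minor. A minimal sketch of the pattern, error checks omitted:

// Sketch only, not the file's exact code; assumes the HIP runtime headers.
#include <hip/hip_runtime.h>
#include <hip/hip_version.h>

void copy_device_to_host_sync(void* dst, const void* src, size_t nbytes,
                              hipStream_t stream) {
#if HIP_VERSION >= 301
  // One call: orders the copy on `stream` and returns once it completes.
  hipMemcpyWithStream(dst, src, nbytes, hipMemcpyDeviceToHost, stream);
#else
  // Older HIP: emulate the blocking stream-ordered copy in two steps.
  hipMemcpyAsync(dst, src, nbytes, hipMemcpyDeviceToHost, stream);
  hipStreamSynchronize(stream);
#endif
}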

View File

@@ -9,6 +9,10 @@
#include <ATen/native/cuda/Loops.cuh>
#include <THC/THC.h>
+#ifdef __HIP_PLATFORM_HCC__
+#include <hip/hip_version.h>
+#endif
namespace at {
namespace native {
@@ -177,13 +181,17 @@ static void copy_kernel_cuda(TensorIterator& iter, bool non_blocking) {
int64_t nbytes = iter.numel() * iter.element_size(0);
CUDAStream stream = getCurrentCUDAStream();
-AT_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, kind, stream));
if (non_blocking) {
+AT_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, kind, stream));
void* ptr = (dst_device == kCPU ? dst : src);
AT_CUDA_CHECK(THCCachingHostAllocator_recordEvent(ptr, stream));
} else {
+#if HIP_VERSION >= 301
+AT_CUDA_CHECK(hipMemcpyWithStream(dst, src, nbytes, kind, stream));
+#else
+AT_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, kind, stream));
AT_CUDA_CHECK(cudaStreamSynchronize(stream));
+#endif
}
}

View File

@@ -12,6 +12,10 @@
#include <THC/THCReduceApplyUtils.cuh>
#include <c10/macros/Macros.h>
+#ifdef __HIP_PLATFORM_HCC__
+#include <hip/hip_version.h>
+#endif
// Size per each reduction block
#define THC_REDUCE_ALL_BLOCK_SIZE 1024L
@@ -320,12 +324,20 @@ bool THC_reduceAll(THCState* state,
// the host (synchronous!)
if (!outOnDevice) {
cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
+#if HIP_VERSION >= 301
+THCudaCheck(hipMemcpyWithStream(out,
+devOut,
+sizeof(AccT),
+cudaMemcpyDeviceToHost,
+stream));
+#else
THCudaCheck(cudaMemcpyAsync(out,
devOut,
sizeof(AccT),
cudaMemcpyDeviceToHost,
stream));
THCudaCheck(cudaStreamSynchronize(stream));
+#endif
}
if (freeDevOut) {

View File

@@ -5,6 +5,10 @@
#include <c10/util/intrusive_ptr.h>
#include <c10/util/typeid.h>
+#ifdef __HIP_PLATFORM_HCC__
+#include <hip/hip_version.h>
+#endif
scalar_t* THCStorage_(data)(THCState *state, const THCStorage *self)
{
return self->data<scalar_t>();
@@ -24,10 +28,16 @@ void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, scalar
{
THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds");
cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
+#if HIP_VERSION >= 301
+THCudaCheck(hipMemcpyWithStream(THCStorage_(data)(state, self) + index, &value, sizeof(scalar_t),
+cudaMemcpyHostToDevice,
+stream));
+#else
THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self) + index, &value, sizeof(scalar_t),
cudaMemcpyHostToDevice,
stream));
THCudaCheck(cudaStreamSynchronize(stream));
+#endif
}
scalar_t THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index)
@@ -35,9 +45,14 @@ scalar_t THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t ind
THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds");
scalar_t value;
cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
+#if HIP_VERSION >= 301
+THCudaCheck(hipMemcpyWithStream(&value, THCStorage_(data)(state, self) + index, sizeof(scalar_t),
+cudaMemcpyDeviceToHost, stream));
+#else
THCudaCheck(cudaMemcpyAsync(&value, THCStorage_(data)(state, self) + index, sizeof(scalar_t),
cudaMemcpyDeviceToHost, stream));
THCudaCheck(cudaStreamSynchronize(stream));
+#endif
return value;
}

View File

@@ -2,16 +2,28 @@
#define THC_GENERIC_FILE "THC/generic/THCStorageCopy.cpp"
#else
+#ifdef __HIP_PLATFORM_HCC__
+#include <hip/hip_version.h>
+#endif
void THCStorage_(copyCPU)(THCState *state, THCStorage *self, struct THStorage *src)
{
THArgCheck(self->numel() == src->numel(), 2, "size does not match");
cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
+#if HIP_VERSION >= 301
+THCudaCheck(hipMemcpyWithStream(THCStorage_(data)(state, self),
+THStorage_(data)(src),
+self->numel() * sizeof(scalar_t),
+cudaMemcpyHostToDevice,
+stream));
+#else
THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self),
THStorage_(data)(src),
self->numel() * sizeof(scalar_t),
cudaMemcpyHostToDevice,
stream));
THCudaCheck(cudaStreamSynchronize(stream));
+#endif
}
#define TH_CUDA_STORAGE_IMPLEMENT_COPY(TYPEC) \
@@ -40,12 +52,20 @@ void THStorage_(copyCuda)(THCState *state, THStorage *self, struct THCStorage *s
{
THArgCheck(self->numel() == src->numel(), 2, "size does not match");
cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
+#if HIP_VERSION >= 301
+THCudaCheck(hipMemcpyWithStream(THStorage_(data)(self),
+THCStorage_(data)(state, src),
+self->numel() * sizeof(scalar_t),
+cudaMemcpyDeviceToHost,
+stream));
+#else
THCudaCheck(cudaMemcpyAsync(THStorage_(data)(self),
THCStorage_(data)(state, src),
self->numel() * sizeof(scalar_t),
cudaMemcpyDeviceToHost,
stream));
THCudaCheck(cudaStreamSynchronize(stream));
+#endif
}
#define TH_CUDA_STORAGE_IMPLEMENT_COPYTO(TYPEC) \

View File

@@ -6,6 +6,10 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/math.h"
+#ifdef __HIP_PLATFORM_HCC__
+#include <hip/hip_version.h>
+#endif
namespace caffe2 {
namespace {
@@ -33,7 +37,7 @@ __global__ void ReluCUDAKernel<half>(const int N, const half* X, half* Y) {
const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
if (i < N) {
const half kZero = __float2half(0.0f);
-#if __CUDA_ARCH__ >= 530
+#if __CUDA_ARCH__ >= 530 || HIP_VERSION >= 300
Y[i] = __hgt(__ldg(X + i), kZero) ? __ldg(X + i) : kZero;
#else
Y[i] = (__half2float(X[i]) > 0) ? X[i] : kZero;
@@ -46,7 +50,7 @@ __global__ void ReluCUDAKernel<half2>(const int N, const half2* X, half2* Y) {
const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
if (i < N) {
const half2 kZero = __float2half2_rn(0.0f);
-#if __CUDA_ARCH__ >= 530
+#if __CUDA_ARCH__ >= 530 || HIP_VERSION >= 300
Y[i] = __hmul2(__hgt2(__ldg(X + i), kZero), __ldg(X + i));
#else
const float2 xx = __half22float2(X[i]);
@@ -61,7 +65,7 @@ __global__ void
ReluGradientCUDAKernel(const int N, const T* dY, const T* Y, T* dX) {
const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
if (i < N) {
-#if __CUDA_ARCH__ >= 350
+#if __CUDA_ARCH__ >= 350 || HIP_VERSION >= 300
dX[i] = __ldg(Y + i) > T(0) ? __ldg(dY + i) : T(0);
#else
dX[i] = Y[i] > T(0) ? dY[i] : T(0);
@@ -78,7 +82,7 @@ __global__ void ReluGradientCUDAKernel<half>(
const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
if (i < N) {
const half kZero = __float2half(0.0f);
-#if __CUDA_ARCH__ >= 530
+#if __CUDA_ARCH__ >= 530 || HIP_VERSION >= 300
dX[i] = __hgt(__ldg(Y + i), kZero) ? __ldg(dY + i) : kZero;
#else
dX[i] = (__half2float(Y[i]) > 0) ? dY[i] : kZero;
@@ -95,7 +99,7 @@ __global__ void ReluGradientCUDAKernel<half2>(
const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
if (i < N) {
const half2 kZero = __float2half2_rn(0.0f);
-#if __CUDA_ARCH__ >= 530
+#if __CUDA_ARCH__ >= 530 || HIP_VERSION >= 300
dX[i] = __hmul2(__hgt2(__ldg(Y + i), kZero), __ldg(dY + i));
#else
const float2 dy = __half22float2(dY[i]);
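All five guard changes in this file follow one rule: the fast paths using __ldg and the half/half2 intrinsics, previously reserved for CUDA compute capability 3.5/5.3 and up, are now also enabled when HIP_VERSION >= 300, i.e. ROCm 3.0 or newer. A sketch of the idiom as a hypothetical standalone kernel (assumption: on HIP the __CUDA_ARCH__ test does not select the fast path, so the HIP_VERSION clause is what turns it on):

// Sketch only; mirrors ReluCUDAKernel<half> above.
#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_fp16.h>
#include <hip/hip_version.h>
#else
#include <cuda_fp16.h>
#endif

__global__ void relu_half_kernel(const int N, const half* X, half* Y) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < N) {
    const half kZero = __float2half(0.0f);
#if __CUDA_ARCH__ >= 530 || HIP_VERSION >= 300
    Y[i] = __hgt(X[i], kZero) ? X[i] : kZero;           // native half compare
#else
    Y[i] = (__half2float(X[i]) > 0.0f) ? X[i] : kZero;  // float fallback
#endif
  }
}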

View File

@@ -1033,7 +1033,6 @@ if(USE_ROCM)
list(APPEND HIP_CXX_FLAGS -DCUDA_HAS_FP16=1)
list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_OPERATORS__=1)
list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1)
-list(APPEND HIP_CXX_FLAGS -DHIP_VERSION=${HIP_VERSION_MAJOR})
list(APPEND HIP_CXX_FLAGS -Wno-macro-redefined)
list(APPEND HIP_CXX_FLAGS -Wno-inconsistent-missing-override)
list(APPEND HIP_CXX_FLAGS -Wno-exceptions)
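Dropping the hand-rolled -DHIP_VERSION=${HIP_VERSION_MAJOR} define (here and in the c10d build below) follows from the header includes added above: <hip/hip_version.h> now supplies HIP_VERSION itself, and a major-only value such as 3 would both redefine that macro and be too coarse for guards like HIP_VERSION >= 301 that must distinguish minor releases.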

View File

@@ -30,6 +30,7 @@ install_ubuntu() {
apt-get update
apt-get install -y --no-install-recommends \
autoconf \
+apt-transport-https \
build-essential \
ca-certificates \
cmake \

View File

@@ -158,7 +158,7 @@ pip install --no-cache-dir \
protobuf \
pytest \
pyyaml \
-scipy==0.19.1 \
+scipy==1.1.0 \
scikit-image \
tabulate \
virtualenv \

View File

@@ -12048,7 +12048,7 @@ class TestTorchDeviceType(TestCase):
_test((10,), 5, 4, win_sizes=(11,), expected_error=RuntimeError)
_test((10,), 5, 4, win_sizes=(1, 1), expected_error=RuntimeError)
-@skipCUDAIfRocm
+@skipIfRocm
def test_fft_input_modification(self, device):
# FFT functions should not modify their input (gh-34551)
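An assumption about the decorator semantics, for context: @skipCUDAIfRocm only skips the CUDA-device instantiation of a device-generic test under ROCm, whereas @skipIfRocm disables the test entirely on ROCm builds — presumably the stronger skip this update needs.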

View File

@@ -11,7 +11,6 @@ if(USE_CUDA)
elseif(USE_ROCM)
add_definitions(-DUSE_ROCM=1)
add_definitions(-D__HIP_PLATFORM_HCC__=1)
-add_definitions(-DHIP_VERSION=${HIP_VERSION_MAJOR})
else()
message(STATUS "Building c10d without CUDA/ROCm support")
endif()