Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
[CI] Add basic CUDA 13.0 periodic test (#161013)
https://github.com/pytorch/pytorch/issues/159779
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161013
Approved by: https://github.com/atalman
Co-authored-by: Andrey Talman <atalman@fb.com>
Co-authored-by: Aidyn-A <31858918+Aidyn-A@users.noreply.github.com>
Committed by: PyTorch MergeBot
Parent: f532f99822
Commit: 303f514d5b
@@ -81,8 +81,8 @@ elif [[ "$image" == *riscv* ]]; then
   DOCKERFILE="ubuntu-cross-riscv/Dockerfile"
 fi

-_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
-_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
+_UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152
+_UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96
 if [[ "$image" == *rocm* ]]; then
   _UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6
   _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
@@ -114,6 +114,16 @@ case "$tag" in
     UCC_COMMIT=${_UCC_COMMIT}
     TRITON=yes
     ;;
+  pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)
+    CUDA_VERSION=13.0.0
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=11
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    TRITON=yes
+    ;;
   pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
     CUDA_VERSION=12.8.1
     ANACONDA_PYTHON_VERSION=3.10
@@ -44,8 +44,12 @@ function install_ucc() {
   ./autogen.sh

-  # We only run distributed tests on Tesla M60 and A10G
-  NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
+  if [[ -n "$CUDA_VERSION" && $CUDA_VERSION == 13* ]]; then
+    NVCC_GENCODE="-gencode=arch=compute_86,code=compute_86"
+  else
+    # We only run distributed tests on Tesla M60 and A10G
+    NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
+  fi

   if [[ -n "$ROCM_VERSION" ]]; then
     if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
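Background for the gencode change above: CUDA 13.0 removes offline compilation support for Maxwell (sm_5x), so passing -gencode=arch=compute_52 to a 13.x nvcc would fail outright; the branch keeps the Tesla M60 target only for older toolkits. The same toolkit version is visible to C++ code as the CUDA_VERSION macro (major * 1000 + minor * 10, so 13.0 becomes 13000), which is the encoding the source-level guards later in this commit rely on. A minimal sketch:

    // Minimal sketch of the version encoding used by the C++ guards in
    // this commit: CUDA_VERSION = major * 1000 + minor * 10, defined by
    // <cuda.h>, so any 13.x toolkit satisfies CUDA_VERSION >= 13000.
    #include <cstdio>
    #include <cuda.h>

    int main() {
      std::printf("toolkit %d.%d (CUDA_VERSION=%d)\n",
                  CUDA_VERSION / 1000, (CUDA_VERSION % 1000) / 10, CUDA_VERSION);
    #if CUDA_VERSION >= 13000
      std::printf("sm_52 is no longer a valid nvcc target\n");
    #endif
      return 0;
    }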
@@ -66,6 +66,7 @@ ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"
 # (optional) Install UCC
 ARG UCX_COMMIT
 ARG UCC_COMMIT
+ARG CUDA_VERSION
 ENV UCX_COMMIT $UCX_COMMIT
 ENV UCC_COMMIT $UCC_COMMIT
 ENV UCX_HOME /usr
.github/workflows/docker-builds.yml (vendored, 1 line changed)
@@ -50,6 +50,7 @@ jobs:
       runner: [linux.12xlarge]
       docker-image-name: [
         pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
+        pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
         pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
         pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
         pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks,
.github/workflows/periodic.yml (vendored, 32 lines changed)
@@ -170,6 +170,38 @@ jobs:
     test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }}
     secrets: inherit

+  linux-jammy-cuda13_0-py3_10-gcc11-build:
+    name: linux-jammy-cuda13.0-py3.10-gcc11
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      cuda-arch-list: 7.5
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
+      test-matrix: |
+        { include: [
+          { config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-cuda13_0-py3_10-gcc11-test:
+    name: linux-jammy-cuda13.0-py3.10-gcc11
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda13_0-py3_10-gcc11-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11
+      docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
+    secrets: inherit
+
   linux-jammy-rocm-py3_10-build:
     name: linux-jammy-rocm-py3.10
     uses: ./.github/workflows/_linux-build.yml
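A note on cuda-arch-list: 7.5 in the build job above: the linux.g4dn runners named in the matrix carry NVIDIA T4 GPUs (compute capability 7.5), so building that single architecture is presumably sufficient for this workflow. A standalone probe (illustrative file name check_sm.cu) to confirm what a runner actually exposes:

    // check_sm.cu (illustrative name): print the compute capability of
    // device 0. On a g4dn runner this is expected to report sm_75 (T4).
    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
      cudaDeviceProp prop{};
      if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        std::printf("no visible CUDA device\n");
        return 1;
      }
      std::printf("%s: sm_%d%d\n", prop.name, prop.major, prop.minor);
      return 0;
    }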
@@ -216,7 +216,7 @@ file(GLOB mem_eff_attention_cuda_cpp "native/transformers/cuda/mem_eff_attention
 if(USE_CUDA AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION))
   add_library(flash_attention OBJECT EXCLUDE_FROM_ALL ${flash_attention_cuda_kernels_cu} ${flash_attention_cuda_cpp})

-  target_include_directories(flash_attention PUBLIC
+  target_include_directories(flash_attention SYSTEM PUBLIC
     ${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc
     ${PROJECT_SOURCE_DIR}/third_party/flash-attention/include
     ${PROJECT_SOURCE_DIR}/third_party/cutlass/include
@@ -10,8 +10,13 @@ using namespace at::native::memory;

 constexpr int buffer_size = 1024;

+#if defined(CUDA_VERSION) && CUDA_VERSION < 13000
 __managed__ double4 buffer1[buffer_size];
 __managed__ double4 buffer2[buffer_size];
+#else
+__managed__ double4_16a buffer1[buffer_size];
+__managed__ double4_16a buffer2[buffer_size];
+#endif

 void reset_buffers() {
   for (int i = 0; i < buffer_size; i++) {
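Context for the buffer guard above: CUDA 13.0 raises the alignment of the 32-byte vector types such as double4 and adds explicitly aligned variants; double4_16a is the one that preserves the pre-13 16-byte layout this test's vectorized loads assume. A hedged sketch of that invariant, assuming double4_16a is available only in 13.x headers:

    // Hedged sketch: assert the 16-byte alignment the test relies on
    // under both toolkit generations. double4_16a is assumed to exist
    // only on CUDA 13+ toolkits, per the #if guard in the diff above.
    #include <cuda.h>
    #include <cuda_runtime.h>  // vector types such as double4

    #if defined(CUDA_VERSION) && CUDA_VERSION < 13000
    static_assert(alignof(double4) == 16, "pre-13 double4 is 16-byte aligned");
    #else
    static_assert(alignof(double4_16a) == 16, "_16a keeps the 16-byte layout");
    #endif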
@@ -61,11 +61,14 @@ void* get_symbol(const char* name, int version) {
 }
 #endif

+  // As of CUDA 13, this API is deprecated.
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 13000)
   // This falls back to the old API to try getting the symbol again.
   if (auto st = cudaGetDriverEntryPoint(name, &out, cudaEnableDefault, &qres);
       st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
     return out;
   }
+#endif

   // If the symbol cannot be resolved, report and return nullptr;
   // the caller is responsible for checking the pointer.
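The #if above compiles the deprecated cudaGetDriverEntryPoint fallback only for pre-13 toolkits. For reference, a sketch of the replacement path, assuming the cudaGetDriverEntryPointByVersion API the CUDA runtime has shipped since 12.5; names other than the API call are illustrative:

    // Sketch of the non-deprecated path: resolve a driver symbol against
    // an explicit driver version instead of the runtime's default.
    #include <cuda_runtime.h>

    void* get_symbol_by_version(const char* name, unsigned int version) {
      void* out = nullptr;
      cudaDriverEntryPointQueryResult qres{};
      if (auto st = cudaGetDriverEntryPointByVersion(
              name, &out, version, cudaEnableDefault, &qres);
          st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
        return out;
      }
      return nullptr;  // caller checks the pointer, as in the diff above
    }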
@@ -1063,7 +1063,7 @@ elseif(USE_CUDA)
       UNFUSE_FMA # Addressing issue #121558
     )
     target_sources(torch_cuda PRIVATE $<TARGET_OBJECTS:flash_attention>)
-    target_include_directories(torch_cuda PUBLIC
+    target_include_directories(torch_cuda SYSTEM PUBLIC
       $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc>
       $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/flash-attention/include>
       $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/cutlass/include>