[ROCm][Inductor][CK backend] Install rocm-composable-kernel python package on ROCm Linux CI docker images (#162288)

Reopened from #158747, which was reverted because the wheel cannot be built unless setuptools-scm is available from the PyTorch index URL.

We reconsider the original PR's idea of introducing CK as a PyTorch dependency on ROCm Linux and instead install the CK Python package in CI only, for two reasons: (1) rocm-composable-kernel depends on setuptools-scm, which in turn depends on tomli, and the existing index URLs would need to be modified to host the new packages; and (2) there is also a packaging [bug](https://github.com/pypa/setuptools/issues/3269#issuecomment-1254507377) in Ubuntu 22.04 which prevents correct dynamic version calculation with the default system pip.

Extras:

 ->   this PR reconsiders how the TORCHINDUCTOR_CK_DIR env variable is used; previously, this variable was used to point to the rocm-composable-kernel package installation path on the filesystem; now, the path is inferred by trying to import ck4inductor
 ->   the tests are updated to reflect this change
 ->   since in CI clang points to a bash script which invokes sccache, we cannot patch PATH to exclude sccache, so this logic is removed from the testing code
 ->   on gfx942, the scaled_mm test crashes during benchmarking when the benchmarking happens in the main process, and times out when it happens in a subprocess, so it is disabled

TBD: roll back rocm-mi300 workflow before merging

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162288
Approved by: https://github.com/jeffdaily
This commit is contained in:
Max Podkorytov
2025-09-10 19:33:40 +00:00
committed by PyTorch MergeBot
parent 5f630d28d7
commit ab0694f1c6
8 changed files with 39 additions and 36 deletions

View File

@ -56,9 +56,13 @@ ENV INSTALLED_VISION ${VISION}
# Install rocm
ARG ROCM_VERSION
RUN mkdir ci_commit_pins
COPY ./common/common_utils.sh common_utils.sh
COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt
COPY ./common/install_rocm.sh install_rocm.sh
RUN bash ./install_rocm.sh
RUN rm install_rocm.sh
RUN rm install_rocm.sh common_utils.sh
RUN rm -r ci_commit_pins
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
RUN rm install_rocm_magma.sh

View File

@ -0,0 +1 @@
7fe50dc3da2069d6645d9deb8c017a876472a977

View File

@ -2,6 +2,11 @@
set -ex
# for pip_install function
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
ROCM_COMPOSABLE_KERNEL_VERSION="$(cat $(dirname $0)/../ci_commit_pins/rocm-composable-kernel.txt)"
ver() {
printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ');
}
@ -113,6 +118,8 @@ EOF
rm -rf HIP clr
fi
pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION"
# Cleanup
apt-get autoclean && apt-get clean
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
@ -176,6 +183,8 @@ install_centos() {
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
done
pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION"
# Cleanup
yum clean all
rm -rf /var/cache/yum

View File

@ -52,9 +52,13 @@ ENV INSTALLED_VISION ${VISION}
# Install rocm
ARG ROCM_VERSION
RUN mkdir ci_commit_pins
COPY ./common/common_utils.sh common_utils.sh
COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt
COPY ./common/install_rocm.sh install_rocm.sh
RUN bash ./install_rocm.sh
RUN rm install_rocm.sh
RUN rm install_rocm.sh common_utils.sh
RUN rm -r ci_commit_pins
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
RUN rm install_rocm_magma.sh

View File

@ -70,4 +70,5 @@ jobs:
build-environment: linux-noble-rocm-py3.12-mi300
docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
tests-to-include: "inductor/test_ck_backend"
secrets: inherit

View File

@ -1,5 +1,4 @@
# Owner(s): ["module: inductor"]
import functools
import logging
import os
import unittest
@ -13,6 +12,7 @@ except ImportError:
import torch
from torch._inductor import config
from torch._inductor.test_case import run_tests, TestCase
from torch._inductor.utils import try_import_ck_lib
from torch.testing._internal.common_cuda import tf32_off
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
@ -32,20 +32,8 @@ if HAS_CUDA_AND_TRITON:
log = logging.getLogger(__name__)
@functools.lru_cache(None)
def _get_path_without_sccache() -> str:
"""
Get the PATH environment variable without sccache.
"""
path_envs = os.environ.get("PATH", "").split(":")
path_envs = [env for env in path_envs if "/opt/cache/bin" not in env]
return ":".join(path_envs)
_test_env = {
"PATH": _get_path_without_sccache(),
"DISABLE_SCCACHE": "1",
}
# patch env for tests if needed
_test_env = {}
@instantiate_parametrized_tests
@ -61,13 +49,10 @@ class TestCKBackend(TestCase):
)
torch.random.manual_seed(1234)
try:
import ck4inductor # @manual
self.ck_dir = os.path.dirname(ck4inductor.__file__)
os.environ["TORCHINDUCTOR_CK_DIR"] = self.ck_dir
except ImportError as e:
raise unittest.SkipTest("Composable Kernel library not installed") from e
self.ck_dir, _, _, _ = try_import_ck_lib()
if not self.ck_dir:
raise unittest.SkipTest("Composable Kernel library is not installed")
try:
os.environ["INDUCTOR_TEST_DISABLE_FRESH_CACHE"] = "1"
@ -288,6 +273,9 @@ class TestCKBackend(TestCase):
torch.testing.assert_close(Y_compiled, Y_eager)
@unittest.skip(
"FIXME(tenpercent): kernel compilation errors on gfx942 as of 09/01/25"
)
@unittest.skipIf(not torch.version.hip, "ROCM only")
@unittest.mock.patch.dict(os.environ, _test_env)
@parametrize("max_autotune_gemm_backends", ("CK", "ATen,Triton,CK"))

View File

@ -4,7 +4,7 @@ import os
from typing import Optional
from torch._inductor import config
from torch._inductor.utils import is_linux
from torch._inductor.utils import is_linux, try_import_ck_lib
log = logging.getLogger(__name__)
@ -18,18 +18,23 @@ def _rocm_include_paths(dst_file_ext: str) -> list[str]:
if config.rocm.rocm_home
else cpp_extension._join_rocm_home("include")
)
if not config.rocm.ck_dir:
log.warning("Unspecified Composable Kernel include dir")
if config.is_fbcode():
from libfb.py import parutil
ck_path = parutil.get_dir_path("composable-kernel-headers")
else:
if not config.rocm.ck_dir:
ck_dir, _, _, _ = try_import_ck_lib()
if not ck_dir:
log.warning("Unspecified Composable Kernel directory")
config.rocm.ck_dir = ck_dir
ck_path = config.rocm.ck_dir or cpp_extension._join_rocm_home(
"composable_kernel"
)
log.debug("Using ck path %s", ck_path)
ck_include = os.path.join(ck_path, "include")
ck_library_include = os.path.join(ck_path, "library", "include")

View File

@ -1985,16 +1985,7 @@ def use_ck_template(layout: Layout) -> bool:
log.warning("Please pip install Composable Kernel package")
return False
if config.is_fbcode():
config.rocm.ck_dir = ck_package_dirname
if not config.rocm.ck_dir:
log.warning("Please set TORCHINDUCTOR_CK_DIR env variable")
return False
if ck_package_dirname != config.rocm.ck_dir:
log.warning("Invalid path to CK library")
return False
config.rocm.ck_dir = ck_package_dirname
return True