From ab0694f1c6974ef82a05d9d2f964bffc8b3d47e8 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Wed, 10 Sep 2025 19:33:40 +0000 Subject: [PATCH] [ROCm][Inductor][CK backend] Install rocm-composable-kernel python package on ROCm Linux CI docker images (#162288) Reopened from #158747, which was reverted because the wheel cannot be built without setuptools-scm in the pytorch index URL. We reconsider the original PR idea of introducing CK as a pytorch dependency on ROCm Linux and install the CK python package in CI only -- since (1) rocm-composable-kernel depends on setuptools-scm, which depends on tomli, and the existing index URLs need to be modified to host the new packages and (2) there is also a packaging [bug](https://github.com/pypa/setuptools/issues/3269#issuecomment-1254507377) in Ubuntu 22.04 which prevents correct dynamic version calculation with the default system pip. Extras: -> this PR reconsiders how the TORCHINDUCTOR_CK_DIR env variable is used; previously, this var was used to point to the rocm-composable-kernel package installation path on the filesystem; now, the path is inferred by trying to import ck4inductor -> the tests are updated to reflect this change -> since in CI clang points to a bash script which invokes sccache, we cannot patch PATH to not contain sccache, so this logic is removed from the testing code -> the scaled_mm test crashes during benchmarking when the benchmarking happens in the main process, and times out when it happens in a subprocess, on gfx942, so it is disabled TBD: roll back rocm-mi300 workflow before merging Pull Request resolved: https://github.com/pytorch/pytorch/pull/162288 Approved by: https://github.com/jeffdaily --- .ci/docker/centos-rocm/Dockerfile | 6 +++- .../ci_commit_pins/rocm-composable-kernel.txt | 1 + .ci/docker/common/install_rocm.sh | 9 ++++++ .ci/docker/ubuntu-rocm/Dockerfile | 6 +++- .github/workflows/rocm-mi300.yml | 1 + test/inductor/test_ck_backend.py | 30 
++++++------------- .../_inductor/codegen/rocm/compile_command.py | 11 +++++-- torch/_inductor/utils.py | 11 +------ 8 files changed, 39 insertions(+), 36 deletions(-) create mode 100644 .ci/docker/ci_commit_pins/rocm-composable-kernel.txt diff --git a/.ci/docker/centos-rocm/Dockerfile b/.ci/docker/centos-rocm/Dockerfile index 8d1e7f5972b1..319765590fc0 100644 --- a/.ci/docker/centos-rocm/Dockerfile +++ b/.ci/docker/centos-rocm/Dockerfile @@ -56,9 +56,13 @@ ENV INSTALLED_VISION ${VISION} # Install rocm ARG ROCM_VERSION +RUN mkdir ci_commit_pins +COPY ./common/common_utils.sh common_utils.sh +COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt COPY ./common/install_rocm.sh install_rocm.sh RUN bash ./install_rocm.sh -RUN rm install_rocm.sh +RUN rm install_rocm.sh common_utils.sh +RUN rm -r ci_commit_pins COPY ./common/install_rocm_magma.sh install_rocm_magma.sh RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} RUN rm install_rocm_magma.sh diff --git a/.ci/docker/ci_commit_pins/rocm-composable-kernel.txt b/.ci/docker/ci_commit_pins/rocm-composable-kernel.txt new file mode 100644 index 000000000000..c45f46af95d0 --- /dev/null +++ b/.ci/docker/ci_commit_pins/rocm-composable-kernel.txt @@ -0,0 +1 @@ +7fe50dc3da2069d6645d9deb8c017a876472a977 diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh index 5d355276def7..a156670cb815 100644 --- a/.ci/docker/common/install_rocm.sh +++ b/.ci/docker/common/install_rocm.sh @@ -2,6 +2,11 @@ set -ex +# for pip_install function +source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" + +ROCM_COMPOSABLE_KERNEL_VERSION="$(cat $(dirname $0)/../ci_commit_pins/rocm-composable-kernel.txt)" + ver() { printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' 
' '); } @@ -113,6 +118,8 @@ EOF rm -rf HIP clr fi + pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION" + # Cleanup apt-get autoclean && apt-get clean rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* @@ -176,6 +183,8 @@ install_centos() { sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;" done + pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION" + # Cleanup yum clean all rm -rf /var/cache/yum diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile index 681f6fe75051..b517a990a057 100644 --- a/.ci/docker/ubuntu-rocm/Dockerfile +++ b/.ci/docker/ubuntu-rocm/Dockerfile @@ -52,9 +52,13 @@ ENV INSTALLED_VISION ${VISION} # Install rocm ARG ROCM_VERSION +RUN mkdir ci_commit_pins +COPY ./common/common_utils.sh common_utils.sh +COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt COPY ./common/install_rocm.sh install_rocm.sh RUN bash ./install_rocm.sh -RUN rm install_rocm.sh +RUN rm install_rocm.sh common_utils.sh +RUN rm -r ci_commit_pins COPY ./common/install_rocm_magma.sh install_rocm_magma.sh RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} RUN rm install_rocm_magma.sh diff --git a/.github/workflows/rocm-mi300.yml b/.github/workflows/rocm-mi300.yml index 7e3ba43bf984..8ffd58cb9811 100644 --- a/.github/workflows/rocm-mi300.yml +++ b/.github/workflows/rocm-mi300.yml @@ -70,4 +70,5 @@ jobs: build-environment: linux-noble-rocm-py3.12-mi300 docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }} test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }} + tests-to-include: "inductor/test_ck_backend" secrets: inherit diff --git a/test/inductor/test_ck_backend.py b/test/inductor/test_ck_backend.py index f73a47e45a57..079be79fcc9d 100644 --- a/test/inductor/test_ck_backend.py +++ b/test/inductor/test_ck_backend.py @@ -1,5 +1,4 @@ # Owner(s): ["module: inductor"] -import functools import logging 
import os import unittest @@ -13,6 +12,7 @@ except ImportError: import torch from torch._inductor import config from torch._inductor.test_case import run_tests, TestCase +from torch._inductor.utils import try_import_ck_lib from torch.testing._internal.common_cuda import tf32_off from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, @@ -32,20 +32,8 @@ if HAS_CUDA_AND_TRITON: log = logging.getLogger(__name__) -@functools.lru_cache(None) -def _get_path_without_sccache() -> str: - """ - Get the PATH environment variable without sccache. - """ - path_envs = os.environ.get("PATH", "").split(":") - path_envs = [env for env in path_envs if "/opt/cache/bin" not in env] - return ":".join(path_envs) - - -_test_env = { - "PATH": _get_path_without_sccache(), - "DISABLE_SCCACHE": "1", -} +# patch env for tests if needed +_test_env = {} @instantiate_parametrized_tests @@ -61,13 +49,10 @@ class TestCKBackend(TestCase): ) torch.random.manual_seed(1234) - try: - import ck4inductor # @manual - self.ck_dir = os.path.dirname(ck4inductor.__file__) - os.environ["TORCHINDUCTOR_CK_DIR"] = self.ck_dir - except ImportError as e: - raise unittest.SkipTest("Composable Kernel library not installed") from e + self.ck_dir, _, _, _ = try_import_ck_lib() + if not self.ck_dir: + raise unittest.SkipTest("Composable Kernel library is not installed") try: os.environ["INDUCTOR_TEST_DISABLE_FRESH_CACHE"] = "1" @@ -288,6 +273,9 @@ class TestCKBackend(TestCase): torch.testing.assert_close(Y_compiled, Y_eager) + @unittest.skip( + "FIXME(tenpercent): kernel compilation errors on gfx942 as of 09/01/25" + ) @unittest.skipIf(not torch.version.hip, "ROCM only") @unittest.mock.patch.dict(os.environ, _test_env) @parametrize("max_autotune_gemm_backends", ("CK", "ATen,Triton,CK")) diff --git a/torch/_inductor/codegen/rocm/compile_command.py b/torch/_inductor/codegen/rocm/compile_command.py index b9cae55102b6..aa935b14af23 100644 --- a/torch/_inductor/codegen/rocm/compile_command.py +++ 
b/torch/_inductor/codegen/rocm/compile_command.py @@ -4,7 +4,7 @@ import os from typing import Optional from torch._inductor import config -from torch._inductor.utils import is_linux +from torch._inductor.utils import is_linux, try_import_ck_lib log = logging.getLogger(__name__) @@ -18,18 +18,23 @@ def _rocm_include_paths(dst_file_ext: str) -> list[str]: if config.rocm.rocm_home else cpp_extension._join_rocm_home("include") ) - if not config.rocm.ck_dir: - log.warning("Unspecified Composable Kernel include dir") if config.is_fbcode(): from libfb.py import parutil ck_path = parutil.get_dir_path("composable-kernel-headers") else: + if not config.rocm.ck_dir: + ck_dir, _, _, _ = try_import_ck_lib() + if not ck_dir: + log.warning("Unspecified Composable Kernel directory") + config.rocm.ck_dir = ck_dir ck_path = config.rocm.ck_dir or cpp_extension._join_rocm_home( "composable_kernel" ) + log.debug("Using ck path %s", ck_path) + ck_include = os.path.join(ck_path, "include") ck_library_include = os.path.join(ck_path, "library", "include") diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py index abb850ea4cce..0b09f9a67a96 100644 --- a/torch/_inductor/utils.py +++ b/torch/_inductor/utils.py @@ -1985,16 +1985,7 @@ def use_ck_template(layout: Layout) -> bool: log.warning("Please pip install Composable Kernel package") return False - if config.is_fbcode(): - config.rocm.ck_dir = ck_package_dirname - - if not config.rocm.ck_dir: - log.warning("Please set TORCHINDUCTOR_CK_DIR env variable") - return False - - if ck_package_dirname != config.rocm.ck_dir: - log.warning("Invalid path to CK library") - return False + config.rocm.ck_dir = ck_package_dirname return True