Mirror of https://github.com/pytorch/pytorch.git, synced 2025-11-11 22:34:53 +08:00
Revert "[ROCm] Add ROCm AMDGPU support for inductor cpp codegen (#105141)"
This reverts commit 8ff00360a4daab7848307a9a0b1c81b1da873d0c. Reverted https://github.com/pytorch/pytorch/pull/105141 on behalf of https://github.com/DanilBaibak due to Break internal build ([comment](https://github.com/pytorch/pytorch/pull/105141#issuecomment-1715629007))
@@ -41,7 +41,7 @@ except unittest.SkipTest:
 
 
 RUN_CPU = HAS_CPU and not torch.backends.mps.is_available() and not IS_MACOS
-RUN_CUDA = HAS_CUDA and not TEST_WITH_ASAN
+RUN_CUDA = HAS_CUDA and not TEST_WITH_ASAN and not TEST_WITH_ROCM
 
 
 class CppWrapperTemplate:
@@ -91,37 +91,6 @@ test_failures_cuda_wrapper = {
     ),
 }
-
-if TEST_WITH_ROCM:
-    # Current skips for ROCm
-    rocm_exclude_list = [
-        "test_addmm",
-        "test_batch_norm_2d_2_cuda",
-        "test_bmm1_cuda",
-        "test_cat_cuda",
-        "test_convolution1_cuda",
-        "test_custom_op_cuda",
-        "test_foreach_cpp_wrapper",
-        "test_linear_relu",
-        "test_index_put_deterministic_fallback_cuda",
-        "test_index_tensor_cuda",
-        "test_multi_device_cuda",
-        "test_mm_plus_mm2",
-        "test_scaled_dot_product_efficient_attention_cuda",
-        "test_sum_dtype_cuda",
-        "test_transpose_cuda",
-        "test_index_tensor_cuda",
-    ]
-
-    # Create skip entries for both the cuda and cuda_dynamic_shapes variants
-    for test_name in rocm_exclude_list:
-        dynamic_shapes_test_name = f"{test_name}_dynamic_shapes"
-        test_failures_cuda_wrapper[test_name] = test_torchinductor.TestFailure(
-            ("cuda_wrapper",), is_skip=True
-        )
-        test_failures_cuda_wrapper[
-            dynamic_shapes_test_name
-        ] = test_torchinductor.TestFailure(("cuda_wrapper",), is_skip=True)
 
 
 def make_test_case(name, device, tests, condition=True, slow=False, func_inputs=None):
     test_name = f"{name}_{device}" if device else name
@@ -342,9 +311,7 @@ if RUN_CUDA:
     ]:
         make_test_case(item.name, item.device, item.tests)
 
-    test_torchinductor.copy_tests(
-        CudaWrapperTemplate, TestCudaWrapper, "cuda_wrapper", test_failures_cuda_wrapper
-    )
+    test_torchinductor.copy_tests(CudaWrapperTemplate, TestCudaWrapper, "cuda_wrapper")
 
     DynamicShapesCudaWrapperTemplate = (
         test_torchinductor_dynamic_shapes.make_dynamic_cls(CudaWrapperTemplate)
@@ -195,7 +195,6 @@ hipify_python.hipify(
     output_directory=out_dir,
     includes=includes,
     ignores=ignores,
-    extra_files=["torch/_inductor/codegen/wrapper.py"],
     out_of_place_only=args.out_of_place_only,
     hip_clang_launch=is_hip_clang(),
 )
@@ -327,10 +327,10 @@ def get_path(basename: str, extension: str, specified_dir: str = ""):
 
 
 def get_hash(content: Union[str, bytes], extra: str = "", hash_type: str = "code"):
-    assert hash_type in ["code", "cubin", "hsaco"], "Hash type not supported"
+    assert hash_type in ["code", "cubin"], "Hash type not supported"
     if hash_type == "code":
         return code_hash(content, extra)
-    if hash_type == "cubin" or "hsaco":
+    if hash_type == "cubin":
         return code_hash(repr(content))
 
 
@@ -820,13 +820,10 @@ def get_include_and_linking_paths(
             else:
                 macros = f"-D{macros}"
         if cuda:
-            if torch.version.hip is not None:
-                libs += ["c10_hip", "torch_hip"]
-            else:
-                if config.is_fbcode():
-                    libs += ["cuda"]
-                else:
-                    libs += ["c10_cuda", "cuda", "torch_cuda"]
+            if config.is_fbcode():
+                libs += ["cuda"]
+            else:
+                libs += ["c10_cuda", "cuda", "torch_cuda"]
     else:
         # Note - this is effectively a header only inclusion. Usage of some header files may result in
         # symbol not found, if those header files require a library.
@@ -962,19 +959,13 @@ class CudaKernelParamCache:
 
     @classmethod
     def set(cls, key, params, cubin):
-        bin_type = "cubin" if torch.version.hip is None else "hsaco"
         _, path = write(
             cubin,
-            bin_type,
-            hash_type=bin_type,
+            "cubin",
+            hash_type="cubin",
             specified_dir=config.aot_inductor.output_path,
         )
-
-        if torch.version.hip is None:
-            params["cubin_path"] = path
-        else:
-            params["hsaco_path"] = path
-
+        params["cubin_path"] = path
         cls.cache[key] = params
 
     @classmethod
@@ -1643,9 +1643,7 @@ class CudaWrapperCodeGen(CppWrapperCodeGen):
     def generate_load_kernel(self, name, params):
         mangled_name = params.get("mangled_name", None)
         assert mangled_name is not None, "missing mangled_name"
-        cubin_path = params.get(
-            "cubin_path" if torch.version.hip is None else "hsaco_path", None
-        )
+        cubin_path = params.get("cubin_path", None)
         assert os.path.exists(
             cubin_path
         ), "cubin file should already exist at this moment"
@@ -344,19 +344,7 @@ class CachingAutotuner(KernelInterface):
             "shared_mem": launcher.bin.shared,
             "stream": stream,
         }
-
-        if torch.version.hip is None:
-            CudaKernelParamCache.set(key, params, launcher.bin.asm["cubin"])
-        else:
-            # There is some divergence between CUDA and ROCm here.
-            # On ROCm's triton we only have the the path to the binary, not the binary itself.
-            # For ROCm we will copy the binary to the new location instead of writing to file
-            import pathlib
-
-            launcher.bin.asm["hsaco"] = pathlib.Path(
-                launcher.bin.asm["hsaco_path"]
-            ).read_bytes()
-            CudaKernelParamCache.set(key, params, launcher.bin.asm["hsaco"])
+        CudaKernelParamCache.set(key, params, launcher.bin.asm["cubin"])
 
     def coordinate_descent_tuning(self, launcher, *args, **kwargs):
         """
@@ -2235,10 +2235,6 @@ CUDA_IDENTIFIER_MAP = collections.OrderedDict(
             "CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES",
             ("hipFuncAttributeSharedSizeBytes", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED),
         ),
-        (
-            "CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES",
-            ("hipFuncAttributeMaxDynamicSharedMemorySize", CONV_TYPE, API_RUNTIME),
-        ),
         (
             "CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES",
             ("hipFuncAttributeConstSizeBytes", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED),
@@ -2920,7 +2916,6 @@ CUDA_IDENTIFIER_MAP = collections.OrderedDict(
         ("cuEventQuery", ("hipEventQuery", CONV_EVENT, API_DRIVER)),
         ("cuEventRecord", ("hipEventRecord", CONV_EVENT, API_DRIVER)),
         ("cuEventSynchronize", ("hipEventSynchronize", CONV_EVENT, API_DRIVER)),
-        ("cuFuncSetAttribute", ("hipFuncSetAttribute", CONV_EVENT, API_DRIVER)),
         (
             "cuFuncGetAttribute",
             ("hipFuncGetAttribute", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED),