[Inductor] Allocate another shard for testing cpp-wrapper JIT (#121310)

Summary: ABI-compatible mode for the cpp wrapper has not yet been turned on by default, so test it in a separate shard. More tests are expected to be added to this shard.

Differential Revision: [D54617287](https://our.internmc.facebook.com/intern/diff/D54617287)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121310
Approved by: https://github.com/chenyang78
ghstack dependencies: #121309
Author: Bin Bao
Date: 2024-03-06 17:57:15 -08:00
Committed by: PyTorch MergeBot
Parent: 7e598c0053
Commit: 0339f1ca82
5 changed files with 94 additions and 2 deletions
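
For a local reproduction of what the new shard runs, the following sketch mirrors the CI function added in this diff (run from a PyTorch source checkout; a CUDA-capable machine is assumed for the second test file):

export TORCHINDUCTOR_ABI_COMPATIBLE=1
# Stack allocation currently segfaults on CPU, so it is disabled for that run.
TORCHINDUCTOR_STACK_ALLOCATION=0 python test/run_test.py --include inductor/test_cpu_cpp_wrapper
python test/run_test.py --include inductor/test_cuda_cpp_wrapper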


@@ -323,6 +323,14 @@ test_inductor() {
  fi
}

test_inductor_cpp_wrapper_abi_compatible() {
  export TORCHINDUCTOR_ABI_COMPATIBLE=1
  echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1"
  # cpu stack allocation causes segfault and needs more investigation
  TORCHINDUCTOR_STACK_ALLOCATION=0 python test/run_test.py --include inductor/test_cpu_cpp_wrapper
  python test/run_test.py --include inductor/test_cuda_cpp_wrapper
}

# "Global" flags for inductor benchmarking controlled by TEST_CONFIG
# For example 'dynamic_aot_eager_torchbench' TEST_CONFIG means we run
# the benchmark script with '--dynamic-shapes --backend aot_eager --device cuda'

@@ -1173,6 +1181,9 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
    fi
    PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
  fi
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper_abi_compatible* ]]; then
  install_torchvision
  test_inductor_cpp_wrapper_abi_compatible
elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then
  install_torchvision
  test_inductor


@@ -65,6 +65,7 @@ jobs:
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_cpp_wrapper_abi_compatible", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}


@@ -72,6 +72,53 @@ test_failures_cpp_wrapper = {
    ),
}

if config.abi_compatible:
    xfail_list = [
        "test_bernoulli1_cpu",  # cpp fallback op naming issue
        "test_conv2d_binary_inplace_fusion_failed_cpu",
        "test_conv2d_binary_inplace_fusion_pass_cpu",
        "test_cumsum_cpu",
        "test_custom_op_cpu",  # needs custom op support
        "test_dtype_sympy_expr_cpu",
        "test_dynamic_qlinear_cpu",
        "test_dynamic_qlinear_qat_cpu",
        "test_index_put_deterministic_fallback_cpu",
        "test_lstm_packed_change_input_sizes_cpu",
        "test_profiler_mark_wrapper_call_cpu",
        "test_qconv2d_add_cpu",
        "test_qconv2d_add_relu_cpu",
        "test_qconv2d_cpu",
        "test_qconv2d_dequant_promotion_cpu",
        "test_qconv2d_maxpool2d_linear_dynamic_cpu",
        "test_qconv2d_relu_cpu",
        "test_qlinear_cpu",
        "test_qlinear_dequant_promotion_cpu",
        "test_qlinear_relu_cpu",
        "test_randint_cpu",
        "test_randn_with_dtype_and_device_cpu",
        "test_scatter5_cpu",
        "test_scatter6_cpu",
        "test_tensor2_cpu",
    ]
    for test_name in xfail_list:
        test_failures_cpp_wrapper[test_name] = test_torchinductor.TestFailure(
            ("cpp_wrapper",), is_skip=False
        )
        test_failures_cpp_wrapper[
            f"{test_name}_dynamic_shapes"
        ] = test_torchinductor.TestFailure(("cpp_wrapper",), is_skip=False)
    skip_list = [
        "test_linear1_cpu",  # segfault from double free
        "test_multihead_attention_cpu",
    ]
    for test_name in skip_list:
        test_failures_cpp_wrapper[test_name] = test_torchinductor.TestFailure(
            ("cpp_wrapper",), is_skip=True
        )
        test_failures_cpp_wrapper[
            f"{test_name}_dynamic_shapes"
        ] = test_torchinductor.TestFailure(("cpp_wrapper",), is_skip=True)


def make_test_case(
    name,

@@ -306,7 +353,12 @@ if RUN_CPU:
            item.code_string_count,
        )

    test_torchinductor.copy_tests(CppWrapperTemplate, TestCppWrapper, "cpp_wrapper")
    test_torchinductor.copy_tests(
        CppWrapperTemplate,
        TestCppWrapper,
        "cpp_wrapper",
        test_failures_cpp_wrapper,
    )

    DynamicShapesCppWrapperTemplate = (
        test_torchinductor_dynamic_shapes.make_dynamic_cls(CppWrapperTemplate)
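
The lists above register known ABI-compatible failures as expected failures (is_skip=False) or skips (is_skip=True), for both the static- and dynamic-shapes variant of each test. To check the status of an individual entry locally, something like the following should work, assuming the test file can be run directly the way other inductor test files are (the chosen test name is just one entry from the xfail list):

TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_STACK_ALLOCATION=0 \
  python test/inductor/test_cpu_cpp_wrapper.py -k test_custom_op_cpu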


@@ -93,6 +93,34 @@ if TEST_WITH_ROCM:
            dynamic_shapes_test_name
        ] = test_torchinductor.TestFailure(("cuda_wrapper",), is_skip=True)

if config.abi_compatible:
    xfail_list = [
        "test_bernoulli1_cuda",  # cpp fallback op naming issue
        "test_conv_backward_cuda",
        "test_custom_op_cuda",  # needs custom op support
        "test_index_put_deterministic_fallback_cuda",
        "test_profiler_mark_wrapper_call_cuda",
        "test_scaled_dot_product_attention_cuda_dynamic_shapes",
    ]
    for test_name in xfail_list:
        test_failures_cuda_wrapper[test_name] = test_torchinductor.TestFailure(
            ("cuda_wrapper",), is_skip=False
        )
        test_failures_cuda_wrapper[
            f"{test_name}_dynamic_shapes"
        ] = test_torchinductor.TestFailure(("cuda_wrapper",), is_skip=False)
    skip_list = [
        "test_multi_device_cuda",
        "test_linear1_cuda",  # segfault from double free
    ]
    for test_name in skip_list:
        test_failures_cuda_wrapper[test_name] = test_torchinductor.TestFailure(
            ("cuda_wrapper",), is_skip=True
        )
        test_failures_cuda_wrapper[
            f"{test_name}_dynamic_shapes"
        ] = test_torchinductor.TestFailure(("cuda_wrapper",), is_skip=True)


def make_test_case(
    name,


@@ -784,7 +784,7 @@ class CommonTemplate:
        _, code = run_and_get_code(fn, x, y)
        self.assertEqual(
            " ".join(code).count(
                "::view_dtype" if config.cpp_wrapper else "aten.view"
                "view_dtype" if config.cpp_wrapper else "aten.view"
            ),
            3,
        )
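
Relaxing the expected substring from "::view_dtype" to "view_dtype" lets the count match in both the regular and ABI-compatible cpp-wrapper code paths, which presumably emit the op under differently qualified names (the exact emitted form is not shown in this diff). To eyeball the generated wrapper code for this case under the new configuration, a sketch like the following could be used; the -k filter is a guess at the relevant test name, not taken from this diff:

TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_STACK_ALLOCATION=0 \
  python test/inductor/test_cpu_cpp_wrapper.py -k view_dtype -v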