[CI] Fix XPU CI failure (#138548)

# Motivation
Fix https://github.com/pytorch/pytorch/issues/138577.

# Solution
1. All UTs in `test/inductor/test_compiled_optimizers.py` are fixed by https://github.com/pytorch/pytorch/pull/134170
2. The UT in `test/inductor/test_pattern_matcher.py` was introduced by https://github.com/pytorch/pytorch/pull/138089; we skip this UT because the feature `max_autotune_gemm_backends:Triton` is unsupported on XPU.
3. We have a new impl related to `histc`, so we remove the expected failure from `test/inductor/test_torchinductor_opinfo.py`
4. We support `avg_pool3d` for `fp16` data type, so we remove the expected failure from `test/inductor/test_torchinductor_opinfo.py`
5. CUDA-biased code was introduced by https://github.com/pytorch/pytorch/issues/138472; we generalize it to `GPU_TYPE`.

# Additional Context
> Why update torch-xpu-ops commit pin here?

We have to update commit pin to avoid the build failure raised by the code change [C10_UNUSED](https://github.com/pytorch/pytorch/pull/138364).

> What features does the torch-xpu-ops update bring in?

1. Add some foreach ops, such as the foreach unary ops and `foreach_clamp_max`;
2. Add forward and backward for some pooling ops, such as `avg_pool3d` and `max_pool3d`;
3. Add some other ops, like `log_normal_`, `index_copy`, and `mode` etc;
4. fix build failure related to `C10_UNUSED`;

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138548
Approved by: https://github.com/malfet, https://github.com/EikanWang
This commit is contained in:
Yu, Guangye
2024-10-24 11:39:01 +00:00
committed by PyTorch MergeBot
parent dbf0fa811a
commit 0efa590d43
5 changed files with 4 additions and 8 deletions

View File

@ -121,6 +121,7 @@ KERNEL_COUNT_OVERRIDES = {
"test_adamw_amsgrad_capturable_cuda": 6,
"test_adamw_amsgrad_capturable_xpu": 6,
"test_adamw_tensor_lr_tensor_betas_amsgrad_capturable_cuda": 6,
"test_adamw_tensor_lr_tensor_betas_amsgrad_capturable_xpu": 6,
"test_adamw_tensor_lr_amsgrad_capturable_cuda": 6,
"test_adamw_tensor_lr_amsgrad_capturable_xpu": 6,
"test_adam_tensor_lr_amsgrad_capturable_cuda": 6,
@ -153,7 +154,6 @@ KERNEL_COUNT_OVERRIDES = {
"test_sgd_cuda": 4,
"test_sgd_cpu": 4,
"test_sgd_xpu": 4,
"test_rmsprop_tensor_lr_capturable_foreach_xpu": 4,
"test_adagrad_initial_accumulator_value_weight_decay_foreach_xpu": 2,
"test_adagrad_lr_decay_weight_decay_foreach_xpu": 2,
"test_adagrad_weight_decay_foreach_xpu": 2,
@ -167,14 +167,11 @@ KERNEL_COUNT_OVERRIDES = {
"test_asgd_tensor_lr_weight_decay_maximize_capturable_xpu": 8,
"test_nadam_tensor_lr_weight_decay_momentum_decay_decoupled_weight_decay_capturable_cuda": 6,
"test_nadam_tensor_lr_weight_decay_momentum_decay_decoupled_weight_decay_capturable_xpu": 9,
"test_nadam_tensor_lr_weight_decay_momentum_decay_decoupled_weight_decay_capturable_foreach_xpu": 3,
"test_radam_tensor_lr_capturable_weight_decay_decoupled_weight_decay_cuda": 6,
"test_radam_tensor_lr_capturable_weight_decay_decoupled_weight_decay_xpu": 6,
"test_radam_tensor_lr_capturable_weight_decay_decoupled_weight_decay_foreach_xpu": 3,
"test_sgd_tensor_lr_cpu": 2,
"test_sgd_tensor_lr_cuda": 2,
"test_sgd_tensor_lr_xpu": 2,
"test_sgd_tensor_lr_foreach_xpu": 2,
}
# also tracks currently supported optimizers

View File

@ -1234,6 +1234,7 @@ class TestPatternMatcher(TestCase):
# of search_fn).
self.assertTrue(pattern.pattern_eq(search_fn_pattern))
@skipIfXpu
@inductor_config.patch(
{
"triton.unique_kernel_names": "original_aten",

View File

@ -351,11 +351,9 @@ inductor_expected_failures_single_sample["xpu"] = {
"nn.functional.conv_transpose3d": {f32, f64},
# rrelu not supported on XPU now
"nn.functional.rrelu": {f16, f32, f64},
"histc": {i32, i64},
# not implemented for 'Half'
"nn.functional.multilabel_margin_loss": {f16},
"nn.functional.multi_margin_loss": {f16},
"nn.functional.avg_pool3d": {f16},
}

View File

@ -2158,7 +2158,7 @@ def forward(self, arg0_1, arg1_1):
n_elements = out.numel()
sin_kernel[(n_elements,)](x, out, n_elements)
x = torch.randn(65, device="cuda")
x = torch.randn(65, device=GPU_TYPE)
out = torch.empty_like(x)
out_compiled = torch.empty_like(x)
sin_triton_compiled = torch.compile(fullgraph=True)(sin_triton)

2
third_party/xpu.txt vendored
View File

@ -1 +1 @@
1d217ae491669b550b136ca16e91b85c4597cd66
b3d5d78c72eadc5140aef1f8e06844385e9a2d45