From d5b1d99f78b497ea615eb043d7df035ff8009acf Mon Sep 17 00:00:00 2001
From: Aleksei Nikiforov <aleksei.nikiforov@linux.ibm.com>
Date: Tue, 18 Mar 2025 16:09:35 +0000
Subject: [PATCH] Enable more nightly tests on s390x (#148452)

Also enable some tests which probably were accidentally disabled.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/148452
Approved by: https://github.com/seemethere, https://github.com/malfet
---
 .ci/docker/requirements-ci.txt                |   5 +-
 .github/scripts/s390x-ci/tests_list.py        |  97 ++++++
 test/dynamo/test_logging.py                   |   3 +
 test/export/test_converter.py                 |   6 +-
 test/functorch/test_aotdispatch.py            |   4 -
 test/functorch/test_ops.py                    |   7 -
 test/inductor/test_torchinductor.py           |   7 +-
 test/run_test.py                              | 308 +++++++++++++++---
 test/test_dataloader.py                       |   4 +
 test/test_ops_gradients.py                    |   2 -
 test/test_optim.py                            |   2 -
 torch/onnx/_internal/onnxruntime.py           |   2 +
 .../_internal/common_methods_invocations.py   |   3 +-
 13 files changed, 391 insertions(+), 59 deletions(-)
 create mode 100755 .github/scripts/s390x-ci/tests_list.py

diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
index d0680f687794..80d7f97b1d89 100644
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -41,11 +41,14 @@ fbscribelogger==0.1.7
 #Pinned versions: 0.1.6
 #test that import:
 
-flatbuffers==2.0
+flatbuffers==2.0 ; platform_machine != "s390x"
 #Description: cross platform serialization library
 #Pinned versions: 2.0
 #test that import:
 
+flatbuffers ; platform_machine == "s390x"
+#Description: cross platform serialization library; a newer (unpinned) version is required on s390x for the newer Python version
+
 hypothesis==5.35.1
 # Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136
 #Description: advanced library for generating parametrized tests
diff --git a/.github/scripts/s390x-ci/tests_list.py b/.github/scripts/s390x-ci/tests_list.py
new file mode 100755
index 000000000000..18e78f40e5a3
--- /dev/null
+++ b/.github/scripts/s390x-ci/tests_list.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+
+import os
+import re
+import sys
+
+
+sys.path.insert(1, os.path.join(sys.path[0], "..", "..", ".."))
+
+from tools.testing.discover_tests import TESTS
+
+
+skip_list = [
+    # these tests fail due to various reasons
+    "dynamo/test_misc",
+    "inductor/test_aot_inductor",
+    "inductor/test_cpu_repro",
+    "inductor/test_cpu_select_algorithm",
+    "inductor/test_aot_inductor_arrayref",
+    "inductor/test_torchinductor_codegen_dynamic_shapes",
+    "lazy/test_meta_kernel",
+    "onnx/test_utility_funs",
+    "profiler/test_profiler",
+    "test_ao_sparsity",
+    "test_cpp_extensions_open_device_registration",
+    "test_jit",
+    "test_metal",
+    "test_mps",
+    "dynamo/test_torchrec",
+    "inductor/test_aot_inductor_utils",
+    "inductor/test_coordinate_descent_tuner",
+    "test_jiterator",
+    # these tests run for a long time and also fail
+    "dynamo/test_dynamic_shapes",
+    "test_quantization",
+    "inductor/test_torchinductor",
+    "inductor/test_torchinductor_dynamic_shapes",
+    "inductor/test_torchinductor_opinfo",
+    "test_binary_ufuncs",
+    "test_unary_ufuncs",
+    # these tests fail when cuda is not available
+    "inductor/test_cudacodecache",
+    "inductor/test_inductor_utils",
+    "inductor/test_inplacing_pass",
+    "inductor/test_kernel_benchmark",
+    "inductor/test_max_autotune",
+    "inductor/test_move_constructors_to_cuda",
+    "inductor/test_multi_kernel",
+    "inductor/test_pattern_matcher",
+    "inductor/test_perf",
+    "inductor/test_select_algorithm",
+    "inductor/test_snode_runtime",
+    "inductor/test_triton_wrapper",
+    # these tests fail when mkldnn is not available
+    "inductor/test_custom_post_grad_passes",
+    "inductor/test_mkldnn_pattern_matcher",
+    # lacks quantization support
+    "onnx/test_models_quantized_onnxruntime",
+    "onnx/test_pytorch_onnx_onnxruntime",
+    # https://github.com/pytorch/pytorch/issues/102078
+    "test_decomp",
+    # https://github.com/pytorch/pytorch/issues/146698
+    "test_model_exports_to_core_aten",
+    # runs very long, skip for now
+    "inductor/test_layout_optim",
+    "test_fx",
+    # some false errors
+    "doctests",
+]
+
+skip_list_regex = [
+    # distributed tests fail randomly
+    "distributed/.*",
+]
+
+all_testfiles = sorted(TESTS)
+
+filtered_testfiles = []
+
+for filename in all_testfiles:
+    if filename in skip_list:
+        continue
+
+    regex_filtered = False
+
+    for regex_string in skip_list_regex:
+        if re.fullmatch(regex_string, filename):
+            regex_filtered = True
+            break
+
+    if regex_filtered:
+        continue
+
+    filtered_testfiles.append(filename)
+
+for filename in filtered_testfiles:
+    print('    "' + filename + '",')
diff --git a/test/dynamo/test_logging.py b/test/dynamo/test_logging.py
index f97ef5afaed5..43e5f3308a31 100644
--- a/test/dynamo/test_logging.py
+++ b/test/dynamo/test_logging.py
@@ -23,6 +23,7 @@ from torch.testing._internal.common_utils import (
     find_free_port,
     munge_exc,
     skipIfTorchDynamo,
+    xfailIfS390X,
 )
 from torch.testing._internal.inductor_utils import HAS_CUDA
 from torch.testing._internal.logging_utils import (
@@ -817,6 +818,8 @@ TRACE FX call mul from test_logging.py:N in fn (LoggingTests.test_trace_call_pre
             len([r for r in records if "return a + 1" in r.getMessage()]), 0
         )
 
+    # there are some additional deprecation warnings in stderr, probably due to newer dependencies used on s390x
+    @xfailIfS390X
     def test_logs_out(self):
         import tempfile
 
diff --git a/test/export/test_converter.py b/test/export/test_converter.py
index 83d8aa4e081e..953246be7a7b 100644
--- a/test/export/test_converter.py
+++ b/test/export/test_converter.py
@@ -10,7 +10,7 @@ from torch._dynamo.test_case import TestCase
 from torch._export.converter import TS2EPConverter
 from torch.export import ExportedProgram
 from torch.testing._internal.common_quantized import override_quantized_engine
-from torch.testing._internal.common_utils import IS_WINDOWS, run_tests
+from torch.testing._internal.common_utils import IS_WINDOWS, run_tests, xfailIfS390X
 from torch.testing._internal.torchbind_impls import (
     _empty_tensor_queue,
     init_torchbind_implementations,
@@ -1403,6 +1403,8 @@ class TestConverter(TestCase):
         IS_WINDOWS,
         "Windows does not support qnnpack",
     )
+    # qnnpack not supported on s390x
+    @xfailIfS390X
     def test_ts2ep_convert_quantized_model(self):
         class Standalone(torch.nn.Module):
             def __init__(self):
@@ -1446,6 +1448,8 @@ class TestConverter(TestCase):
             ep_out, _ = pytree.tree_flatten(ep.module()(*inp))
             self._check_tensor_list_equal(orig_out, ep_out)
 
+    # qnnpack not supported on s390x
+    @xfailIfS390X
     def test_ts2ep_convert_quantized_model_with_opcontext(self):
         class M(torch.nn.Module):
             def __init__(self, linear_op):
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index bcf99f6a6635..0b9347b53919 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -74,7 +74,6 @@ from torch.testing._internal.common_utils import (
     skipIfRocm,
     TestCase,
     xfail_inherited_tests,
-    xfailIfS390X,
     xfailIfTorchDynamo,
 )
 from torch.testing._internal.custom_tensor import ConstantExtraMetadataTensor
@@ -6681,7 +6680,6 @@ class TestEagerFusionOpInfo(AOTTestCase):
     def test_aot_autograd_exhaustive(self, device, dtype, op):
         _test_aot_autograd_helper(self, device, dtype, op)
 
-    @xfailIfS390X
     @ops(op_db + hop_db, allowed_dtypes=(torch.float,))
     @patch("functorch.compile.config.debug_assert", True)
     @skipOps(
@@ -6728,13 +6726,11 @@ symbolic_aot_autograd_module_failures = {
 
 
 class TestEagerFusionModuleInfo(AOTTestCase):
-    @xfailIfS390X
     @modules(module_db, allowed_dtypes=(torch.float,))
     @decorateForModules(unittest.expectedFailure, aot_autograd_module_failures)
     def test_aot_autograd_module_exhaustive(self, device, dtype, training, module_info):
         _test_aot_autograd_module_helper(self, device, dtype, training, module_info)
 
-    @xfailIfS390X
     @modules(module_db, allowed_dtypes=(torch.float,))
     @decorateForModules(
         unittest.expectedFailure,
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index 145905f08f22..8a0bf6ad40f5 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -55,7 +55,6 @@ from torch.testing._internal.common_utils import (
     TEST_WITH_ROCM,
     TestCase,
     unMarkDynamoStrictTest,
-    xfailIfS390X,
 )
 from torch.testing._internal.opinfo.core import SampleInput
 from torch.utils import _pytree as pytree
@@ -1031,12 +1030,6 @@ class TestOperators(TestCase):
                 xfail(
                     "unbind_copy"
                 ),  # Batching rule not implemented for aten::unbind_copy.int.
-                decorate("linalg.tensorsolve", decorator=xfailIfS390X),
-                decorate("nn.functional.max_pool1d", decorator=xfailIfS390X),
-                decorate("nn.functional.max_unpool2d", decorator=xfailIfS390X),
-                decorate(
-                    "nn.functional.multilabel_margin_loss", decorator=xfailIfS390X
-                ),
             }
         ),
     )
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 6eb8da343357..fde903c9fd42 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -1686,6 +1686,7 @@ class CommonTemplate:
 
         self.common(fn, (torch.randn(1024),))
 
+    @xfailIfS390X
     @config.patch(debug_index_asserts=False)
     @config.patch("cpp.enable_tiling_heuristics", False)
     def test_neg_index(self):
@@ -1995,7 +1996,6 @@ class CommonTemplate:
 
     @skip_if_gpu_halide
     @skipCPUIf(IS_MACOS, "fails on macos")
-    @xfailIfS390X
     def test_multilayer_var(self):
         def fn(a):
             return torch.var(a)
@@ -2015,7 +2015,7 @@ class CommonTemplate:
 
     @skipCPUIf(IS_MACOS, "fails on macos")
     @skip_if_halide  # accuracy 4.7% off
-    @xfailIfS390X
+    @xfailIfS390X  # accuracy failure
     def test_multilayer_var_lowp(self):
         def fn(a):
             return torch.var(a)
@@ -9695,7 +9695,6 @@ class CommonTemplate:
         "TODO: debug this with asan",
     )
     @skip_if_gpu_halide
-    @xfailIfS390X
     def test_tmp_not_defined_issue2(self):
         def forward(arg38_1, arg81_1, getitem_17, new_zeros_default_4):
             div_tensor_7 = torch.ops.aten.div.Tensor(getitem_17, arg81_1)
@@ -10904,8 +10903,8 @@ class CommonTemplate:
     # Calling div only torch.SymInt arguments is not yet supported.
     # To support this behavior, we need to allow const-propping tensors that store symint data.
     # For now, dynamo will explicitly graph break when it encounters user code with this behavior.
-    @xfailIfS390X
     @expectedFailureCodegenDynamic
+    @xfailIfS390X
     @skip_if_gpu_halide  # accuracy error
     def test_AllenaiLongformerBase_repro(self):
         def fn(query, scores, window_overlap):
diff --git a/test/run_test.py b/test/run_test.py
index 974e54004519..b342ffa6f531 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -189,30 +189,130 @@ ROCM_BLOCKLIST = [
 
 # whitelist of tests for s390x
 S390X_TESTLIST = [
-    "backends/xeon/test_launch.py",
-    "benchmark_utils/test_benchmark_utils.py",
-    "cpp/apply_utils_test",
-    "cpp/atest",
-    "cpp/basic",
-    "cpp/broadcast_test",
-    "cpp/cpu_generator_test",
+    "backends/xeon/test_launch",
+    "benchmark_utils/test_benchmark_utils",
+    "cpp/BackoffTest",
+    "cpp/CppSignature_test",
     "cpp/Dict_test",
     "cpp/Dimname_test",
+    "cpp/FileStoreTest",
+    "cpp/HashStoreTest",
+    "cpp/IListRef_test",
+    "cpp/KernelFunction_test",
+    "cpp/List_test",
+    "cpp/MaybeOwned_test",
+    "cpp/NamedTensor_test",
+    "cpp/ProcessGroupGlooTest",
+    "cpp/StorageUtils_test",
+    "cpp/TCPStoreTest",
+    "cpp/apply_utils_test",
+    "cpp/atest",
+    "cpp/backend_fallback_test",
+    "cpp/basic",
+    "cpp/broadcast_test",
+    "cpp/c10_ArrayRef_test",
+    "cpp/c10_Bitset_test",
+    "cpp/c10_CompileTimeFunctionPointer_test",
+    "cpp/c10_ConstexprCrc_test",
+    "cpp/c10_DeadlockDetection_test",
+    "cpp/c10_DeviceGuard_test",
+    "cpp/c10_Device_test",
+    "cpp/c10_DispatchKeySet_test",
+    "cpp/c10_Half_test",
+    "cpp/c10_InlineDeviceGuard_test",
+    "cpp/c10_InlineStreamGuard_test",
+    "cpp/c10_LeftRight_test",
+    "cpp/c10_Metaprogramming_test",
+    "cpp/c10_NetworkFlow_test",
+    "cpp/c10_Scalar_test",
+    "cpp/c10_SizesAndStrides_test",
+    "cpp/c10_StreamGuard_test",
+    "cpp/c10_SymInt_test",
+    "cpp/c10_Synchronized_test",
+    "cpp/c10_ThreadLocal_test",
+    "cpp/c10_TypeIndex_test",
+    "cpp/c10_TypeList_test",
+    "cpp/c10_TypeTraits_test",
+    "cpp/c10_accumulate_test",
+    "cpp/c10_bfloat16_test",
+    "cpp/c10_bit_cast_test",
+    "cpp/c10_complex_math_test",
+    "cpp/c10_complex_test",
+    "cpp/c10_cow_test",
+    "cpp/c10_error_test",
+    "cpp/c10_exception_test",
+    "cpp/c10_flags_test",
+    "cpp/c10_generic_math_test",
+    "cpp/c10_intrusive_ptr_test",
+    "cpp/c10_irange_test",
+    "cpp/c10_lazy_test",
+    "cpp/c10_logging_test",
+    "cpp/c10_optional_test",
+    "cpp/c10_ordered_preserving_dict_test",
+    "cpp/c10_registry_test",
+    "cpp/c10_small_vector_test",
+    "cpp/c10_ssize_test",
+    "cpp/c10_string_util_test",
+    "cpp/c10_string_view_test",
+    "cpp/c10_tempfile_test",
+    "cpp/c10_typeid_test",
+    "cpp/cpu_allocator_test",
+    "cpp/cpu_generator_test",
+    "cpp/cpu_profiling_allocator_test",
+    "cpp/cpu_rng_test",
+    "cpp/dispatch_key_set_test",
     "cpp/dlconvertor_test",
     "cpp/extension_backend_test",
+    "cpp/half_test",
+    "cpp/inline_container_test",
+    "cpp/ivalue_test",
+    "cpp/kernel_function_legacy_test",
+    "cpp/kernel_function_test",
+    "cpp/kernel_lambda_legacy_test",
+    "cpp/kernel_lambda_test",
+    "cpp/kernel_stackbased_test",
     "cpp/lazy_tensor_test",
     "cpp/legacy_vmap_test",
-    "cpp/NamedTensor_test",
+    "cpp/make_boxed_from_unboxed_functor_test",
+    "cpp/math_kernel_test",
+    "cpp/memory_format_test",
+    "cpp/memory_overlapping_test",
+    "cpp/mobile_memory_cleanup",
     "cpp/native_test",
+    "cpp/op_allowlist_test",
+    "cpp/op_registration_test",
+    "cpp/operator_name_test",
     "cpp/operators_test",
+    "cpp/packedtensoraccessor_test",
+    "cpp/pow_test",
+    "cpp/protoc-3",
+    "cpp/quantized_test",
+    "cpp/reduce_ops_test",
+    "cpp/reportMemoryUsage_test",
     "cpp/scalar_tensor_test",
     "cpp/scalar_test",
+    "cpp/stride_properties_test",
     "cpp/tensor_iterator_test",
     "cpp/test_api",
+    "cpp/test_cpp_rpc",
+    "cpp/test_dist_autograd",
+    "cpp/test_edge_op_registration",
+    "cpp/test_jit",
+    "cpp/test_lazy",
+    "cpp/test_parallel",
+    "cpp/test_tensorexpr",
+    "cpp/thread_init_test",
+    "cpp/type_ptr_test",
+    "cpp/type_test",
     "cpp/undefined_tensor_test",
+    "cpp/vec_test_all_types_DEFAULT",
+    "cpp/verify_api_visibility",
+    "cpp/weakref_test",
     "cpp/wrapdim_test",
+    "cpp/xla_tensor_test",
+    "cpp_extensions/open_registration_extension/test/test_openreg",
     "distributions/test_constraints",
-    "doctests",
+    "distributions/test_distributions",
     "dynamo/test_activation_checkpointing",
     "dynamo/test_after_aot",
     "dynamo/test_aot_autograd",
@@ -220,10 +320,12 @@ S390X_TESTLIST = [
     "dynamo/test_autograd_function",
     "dynamo/test_backends",
     "dynamo/test_backward_higher_order_ops",
+    "dynamo/test_base_hop",
     "dynamo/test_base_output",
     "dynamo/test_bytecode_utils",
     "dynamo/test_callback",
     "dynamo/test_compile",
+    "dynamo/test_compiler_bisector",
     "dynamo/test_comptime",
     "dynamo/test_config",
     "dynamo/test_ctx_manager",
@@ -232,53 +334,71 @@ S390X_TESTLIST = [
     "dynamo/test_debug_utils",
     "dynamo/test_decorators",
     "dynamo/test_deviceguard",
+    "dynamo/test_dicts",
+    "dynamo/test_error_messages",
+    "dynamo/test_exc",
+    "dynamo/test_exceptions",
     "dynamo/test_export",
     "dynamo/test_export_mutations",
+    "dynamo/test_flat_apply",
     "dynamo/test_frame_init",
+    "dynamo/test_functions",
     "dynamo/test_fx_passes_pre_grad",
+    "dynamo/test_generator",
     "dynamo/test_global",
+    "dynamo/test_graph_deduplication",
+    "dynamo/test_graph_region_tracker",
     "dynamo/test_guard_manager",
     "dynamo/test_higher_order_ops",
     "dynamo/test_hooks",
     "dynamo/test_input_attr_tracking",
     "dynamo/test_interop",
     "dynamo/test_logging",
+    "dynamo/test_metrics_context",
     "dynamo/test_minifier",
     "dynamo/test_model_output",
     "dynamo/test_modes",
     "dynamo/test_modules",
     "dynamo/test_nops",
     "dynamo/test_optimizers",
+    "dynamo/test_pgo",
     "dynamo/test_pre_dispatch",
     "dynamo/test_profiler",
     "dynamo/test_python_autograd",
-    "dynamo/test_recompiles",
+    "dynamo/test_python_dispatcher",
     "dynamo/test_recompile_ux",
+    "dynamo/test_recompiles",
     "dynamo/test_reconstruct",
     "dynamo/test_reorder_logs",
     "dynamo/test_repros",
     "dynamo/test_resume",
     "dynamo/test_sdpa",
+    "dynamo/test_skip_guard_eval_unsafe",
     "dynamo/test_skip_non_tensor",
     "dynamo/test_sources",
     "dynamo/test_structured_trace",
     "dynamo/test_subclasses",
     "dynamo/test_subgraphs",
-    "dynamo/test_torchrec",
+    "dynamo/test_sys",
+    "dynamo/test_trace_rules",
     "dynamo/test_unspec",
     "dynamo/test_utils",
     "dynamo/test_verify_correctness",
     "dynamo/test_view",
+    "export/test_converter",
+    "export/test_cpp_serdes",
     "export/test_db",
+    "export/test_draft_export",
     "export/test_experimental",
     "export/test_export",
+    "export/test_export_legacy",
     "export/test_export_nonstrict",
     "export/test_export_training_ir_to_run_decomp",
     "export/test_functionalized_assertions",
     "export/test_hop",
     "export/test_lift_unlift",
-    "export/test_passes",
     "export/test_pass_infra",
+    "export/test_passes",
     "export/test_retraceability",
     "export/test_schema",
     "export/test_serdes",
@@ -292,30 +412,49 @@ S390X_TESTLIST = [
     "export/test_unflatten_training_ir",
     "export/test_verifier",
     "functorch/test_ac",
+    "functorch/test_ac_knapsack",
+    "functorch/test_ac_logging",
+    "functorch/test_aotdispatch",
     "functorch/test_control_flow",
+    "functorch/test_dims",
     "functorch/test_eager_transforms",
     "functorch/test_logging",
+    "functorch/test_memory_efficient_fusion",
     "functorch/test_minifier",
-    "higher_order_ops/test_with_effects.py",
+    "functorch/test_ops",
+    "functorch/test_parsing",
+    "functorch/test_rearrange",
+    "functorch/test_vmap",
+    "functorch/test_vmap_registrations",
+    "higher_order_ops/test_invoke_quant",
+    "higher_order_ops/test_invoke_subgraph",
+    "higher_order_ops/test_with_effects",
+    "inductor/test_aot_inductor_custom_ops",
+    "inductor/test_aot_inductor_package",
+    "inductor/test_async_compile",
     "inductor/test_auto_functionalize",
     "inductor/test_autoheuristic",
     "inductor/test_b2b_gemm",
+    "inductor/test_benchmark_fusion",
     "inductor/test_benchmarking",
+    "inductor/test_binary_folding",
+    "inductor/test_block_analysis",
     "inductor/test_ck_backend",
     "inductor/test_codecache",
     "inductor/test_codegen_triton",
     "inductor/test_combo_kernels",
+    "inductor/test_compile_subprocess",
+    "inductor/test_compile_worker",
     "inductor/test_compiled_autograd",
     "inductor/test_compiled_optimizers",
-    "inductor/test_compile_worker",
     "inductor/test_config",
     "inductor/test_control_flow",
-    "inductor/test_coordinate_descent_tuner",
+    "inductor/test_cooperative_reductions",
     "inductor/test_cpp_wrapper_hipify",
     "inductor/test_cpu_cpp_wrapper",
+    "inductor/test_cuda_repro",
     "inductor/test_cudagraph_trees",
     "inductor/test_cudagraph_trees_expandable_segments",
-    "inductor/test_cuda_repro",
     "inductor/test_custom_lowering",
     "inductor/test_cutlass_backend",
     "inductor/test_debug_trace",
@@ -329,31 +468,48 @@ S390X_TESTLIST = [
     "inductor/test_flex_decoding",
     "inductor/test_foreach",
     "inductor/test_fp8",
+    "inductor/test_fused_attention",
+    "inductor/test_fuzzer",
     "inductor/test_fx_fusion",
+    "inductor/test_gpu_cpp_wrapper",
     "inductor/test_graph_transform_observer",
     "inductor/test_group_batch_fusion",
-    "inductor/test_gpu_cpp_wrapper",
     "inductor/test_halide",
     "inductor/test_indexing",
+    "inductor/test_inductor_annotations",
     "inductor/test_inductor_freezing",
+    "inductor/test_inplace_padding",
     "inductor/test_loop_ordering",
     "inductor/test_memory",
     "inductor/test_memory_planning",
     "inductor/test_metrics",
     "inductor/test_minifier",
     "inductor/test_minifier_isolate",
+    "inductor/test_minifier_utils",
     "inductor/test_mmdecomp",
-    "inductor/test_padding",
+    "inductor/test_mps_basic",
+    "inductor/test_online_softmax",
+    "inductor/test_op_completeness",
+    "inductor/test_op_dtype_prop",
+    "inductor/test_ordered_set",
     "inductor/test_pad_mm",
+    "inductor/test_padding",
     "inductor/test_profiler",
+    "inductor/test_provenance_tracing",
     "inductor/test_scatter_optimization",
     "inductor/test_smoke",
+    "inductor/test_split_cat_fx_aten_passes",
+    "inductor/test_split_cat_fx_passes",
     "inductor/test_standalone_compile",
     "inductor/test_torchbind",
+    "inductor/test_torchinductor_codegen_config_overrides",
+    "inductor/test_torchinductor_strided_blocks",
     "inductor/test_triton_cpu_backend",
     "inductor/test_triton_extension_backend",
     "inductor/test_triton_heuristics",
     "inductor/test_triton_kernels",
+    "inductor/test_triton_syntax",
+    "inductor/test_unbacked_symints",
     "inductor/test_utils",
     "inductor/test_xpu_basic",
     "lazy/test_bindings",
@@ -364,25 +520,59 @@ S390X_TESTLIST = [
     "lazy/test_reuse_ir",
     "lazy/test_step_closures",
     "lazy/test_ts_opinfo",
-    "nn/test_convolution.py",
-    "nn/test_dropout.py",
-    "nn/test_embedding.py",
-    "nn/test_init.py",
-    "nn/test_lazy_modules.py",
-    "nn/test_load_state_dict.py",
-    "nn/test_module_hooks.py",
-    "nn/test_multihead_attention.py",
-    "nn/test_packed_sequence.py",
-    "nn/test_parametrization.py",
-    "nn/test_pooling.py",
-    "nn/test_pruning.py",
+    "nn/test_convolution",
+    "nn/test_dropout",
+    "nn/test_embedding",
+    "nn/test_init",
+    "nn/test_lazy_modules",
+    "nn/test_load_state_dict",
+    "nn/test_module_hooks",
+    "nn/test_multihead_attention",
+    "nn/test_packed_sequence",
+    "nn/test_parametrization",
+    "nn/test_pooling",
+    "nn/test_pruning",
+    "onnx/dynamo/test_dynamo_with_onnxruntime_backend",
+    "onnx/exporter/test_api",
+    "onnx/exporter/test_building",
+    "onnx/exporter/test_capture_strategies",
+    "onnx/exporter/test_core",
+    "onnx/exporter/test_dynamic_shapes",
+    "onnx/exporter/test_hf_models_e2e",
+    "onnx/exporter/test_ir_passes",
+    "onnx/exporter/test_small_models_e2e",
+    "onnx/exporter/test_tensors",
+    "onnx/exporter/test_verification",
+    "onnx/internal/test_diagnostics",
+    "onnx/internal/test_registraion",
+    "onnx/test_autograd_funs",
+    "onnx/test_custom_ops",
+    "onnx/test_fx_passes",
+    "onnx/test_fx_type_promotion",
+    "onnx/test_lazy_import",
+    "onnx/test_models_onnxruntime",
+    "onnx/test_onnx_opset",
+    "onnx/test_onnxscript_no_runtime",
+    "onnx/test_onnxscript_runtime",
+    "onnx/test_op_consistency",
+    "onnx/test_pytorch_jit_onnx",
+    "onnx/test_pytorch_onnx_no_runtime",
+    "onnx/test_pytorch_onnx_shape_inference",
+    "onnx/test_symbolic_helper",
+    "onnx/test_verification",
+    "onnx/torchlib/test_ops",
     "optim/test_lrscheduler",
+    "optim/test_optim",
     "optim/test_swa_utils",
     "profiler/test_cpp_thread",
     "profiler/test_execution_trace",
+    "profiler/test_kineto",
     "profiler/test_memory_profiler",
+    "profiler/test_profiler_tree",
     "profiler/test_record_function",
     "profiler/test_torch_tidy",
+    "test_accelerator",
+    "test_appending_byte_serializer",
     "test_autocast",
     "test_autograd",
     "test_autograd_fallback",
@@ -390,6 +580,7 @@ S390X_TESTLIST = [
     "test_autoload_disable",
     "test_autoload_enable",
     "test_bundled_inputs",
+    "test_ci_sanity_check_fail",
     "test_comparison_utils",
     "test_compile_benchmark_util",
     "test_complex",
@@ -408,63 +599,102 @@ S390X_TESTLIST = [
     "test_cuda_sanitizer",
     "test_cuda_trace",
     "test_custom_ops",
+    "test_dataloader",
     "test_datapipe",
     "test_deploy",
     "test_dispatch",
     "test_dlpack",
     "test_dynamic_shapes",
     "test_expanded_weights",
+    "test_extension_utils",
     "test_fake_tensor",
     "test_file_check",
     "test_flop_counter",
+    "test_foreach",
+    "test_function_schema",
+    "test_functional_autograd_benchmark",
+    "test_functional_optim",
     "test_functionalization",
     "test_functionalization_of_rng_ops",
-    "test_functional_optim",
-    "test_function_schema",
     "test_futures",
+    "test_fx_experimental",
+    "test_fx_passes",
+    "test_fx_reinplace_pass",
+    "test_hop_infra",
     "test_hub",
     "test_import_stats",
     "test_indexing",
     "test_itt",
+    "test_jit_autocast",
+    "test_jit_disabled",
+    "test_jit_fuser_legacy",
+    "test_jit_fuser_te",
+    "test_jit_legacy",
+    "test_jit_llga_fuser",
+    "test_jit_profiling",
     "test_legacy_vmap",
+    "test_license",
+    "test_linalg",
     "test_logging",
     "test_masked",
     "test_maskedtensor",
     "test_matmul_cuda",
+    "test_meta",
+    "test_mkl_verbose",
     "test_mkldnn",
     "test_mkldnn_fusion",
     "test_mkldnn_verbose",
-    "test_mkl_verbose",
     "test_mobile_optimizer",
     "test_module_tracker",
+    "test_modules",
     "test_monitor",
+    "test_multiprocessing",
+    "test_multiprocessing_spawn",
+    "test_namedtensor",
     "test_namedtuple_return_api",
+    "test_native_functions",
     "test_native_mha",
     "test_nestedtensor",
+    "test_nn",
     "test_numba_integration",
     "test_numpy_interop",
     "test_openmp",
+    "test_ops",
+    "test_ops_fwd_gradients",
+    "test_ops_gradients",
+    "test_ops_jit",
+    "test_optim",
     "test_out_dtype_op",
     "test_overrides",
     "test_package",
     "test_per_overload_api",
     "test_prims",
+    "test_proxy_tensor",
     "test_pruning_op",
+    "test_public_bindings",
     "test_python_dispatch",
+    "test_pytree",
+    "test_reductions",
     "test_scatter_gather_ops",
+    "test_schema_check",
     "test_segment_reductions",
     "test_serialization",
     "test_set_default_mobile_cpu_allocator",
     "test_shape_ops",
     "test_show_pickle",
     "test_sort_and_select",
+    "test_sparse",
+    "test_sparse_csr",
+    "test_sparse_semi_structured",
     "test_spectral_ops",
     "test_stateless",
     "test_subclass",
-    "test_tensorboard",
+    "test_sympy_utils",
     "test_tensor_creation_ops",
+    "test_tensorboard",
     "test_tensorexpr",
     "test_tensorexpr_pybind",
+    "test_testing",
     "test_torch",
     "test_transformers",
     "test_transformers_privateuse1",
@@ -473,21 +703,25 @@ S390X_TESTLIST = [
     "test_type_promotion",
     "test_typing",
     "test_utils",
+    "test_utils_config_module",
+    "test_utils_filelock",
     "test_view_ops",
     "test_vulkan",
     "test_weak",
     "test_xnnpack_integration",
+    "test_xpu",
     "torch_np/numpy_tests/core/test_dlpack",
     "torch_np/numpy_tests/core/test_dtype",
     "torch_np/numpy_tests/core/test_einsum",
     "torch_np/numpy_tests/core/test_getlimits",
     "torch_np/numpy_tests/core/test_indexing",
+    "torch_np/numpy_tests/core/test_multiarray",
     "torch_np/numpy_tests/core/test_numeric",
     "torch_np/numpy_tests/core/test_numerictypes",
     "torch_np/numpy_tests/core/test_scalar_ctors",
+    "torch_np/numpy_tests/core/test_scalar_methods",
     "torch_np/numpy_tests/core/test_scalarinherit",
     "torch_np/numpy_tests/core/test_scalarmath",
-    "torch_np/numpy_tests/core/test_scalar_methods",
     "torch_np/numpy_tests/core/test_shape_base",
     "torch_np/numpy_tests/fft/test_helper",
     "torch_np/numpy_tests/fft/test_pocketfft",
@@ -511,8 +745,8 @@ S390X_TESTLIST = [
     "torch_np/test_scalars_0D_arrays",
     "torch_np/test_ufuncs_basic",
     "torch_np/test_unary_ufuncs",
-    "xpu/test_conv.py",
-    "xpu/test_gemm.py",
+    "xpu/test_conv",
+    "xpu/test_gemm",
 ]
 
 XPU_BLOCKLIST = [
diff --git a/test/test_dataloader.py b/test/test_dataloader.py
index 5c0708893579..ef92b4f1b82d 100644
--- a/test/test_dataloader.py
+++ b/test/test_dataloader.py
@@ -25,6 +25,7 @@ from torch.testing._internal.common_device_type import instantiate_device_type_t
 from torch.testing._internal.common_utils import (
     IS_CI,
     IS_JETSON,
+    IS_S390X,
     IS_SANDCASTLE,
     IS_WINDOWS,
     load_tests,
@@ -1384,6 +1385,9 @@ except RuntimeError as e:
     # This case pass on Intel GPU, but currently expected failure on other device,
     # please don't forget to remove this skip when remove the xfailIfLinux.
     @skipIfXpu
+    # This case also passes on s390x, so skip it there to avoid an unexpected success.
+    # Please don't forget to remove this skip when removing xfailIfLinux.
+    @unittest.skipIf(IS_S390X, "Unexpectedly succeeds on s390x")
     # https://github.com/pytorch/pytorch/issues/128551
     @xfailIfLinux
     def test_segfault(self):
diff --git a/test/test_ops_gradients.py b/test/test_ops_gradients.py
index a0adc4ce3972..4dfedc458520 100644
--- a/test/test_ops_gradients.py
+++ b/test/test_ops_gradients.py
@@ -14,7 +14,6 @@ from torch.testing._internal.common_utils import (
     TestCase,
     TestGradients,
     unMarkDynamoStrictTest,
-    xfailIfS390X,
 )
 from torch.testing._internal.custom_op_db import custom_op_db
 from torch.testing._internal.hop_db import hop_db
@@ -29,7 +28,6 @@ _gradcheck_ops = partial(
 @unMarkDynamoStrictTest
 class TestBwdGradients(TestGradients):
     # Tests that gradients are computed correctly
-    @xfailIfS390X
     @_gradcheck_ops(op_db + hop_db + custom_op_db)
     def test_fn_grad(self, device, dtype, op):
         # This is verified by test_dtypes in test_ops.py
diff --git a/test/test_optim.py b/test/test_optim.py
index 88b0ab78e4ac..9e2556683562 100644
--- a/test/test_optim.py
+++ b/test/test_optim.py
@@ -45,7 +45,6 @@ from torch.testing._internal.common_utils import (
     run_tests,
     TEST_WITH_TORCHDYNAMO,
     TestCase,
-    xfailIfS390X,
 )
 
 
@@ -591,7 +590,6 @@ class TestOptimRenewed(TestCase):
             self.assertEqual(complex_steps, real_steps)
 
     @skipMPS
-    @xfailIfS390X
     @optims([o for o in optim_db if o.supports_complex], dtypes=[torch.complex64])
     def test_complex_2d(self, device, dtype, optim_info):
         optim_cls = optim_info.optim_cls
diff --git a/torch/onnx/_internal/onnxruntime.py b/torch/onnx/_internal/onnxruntime.py
index 85f0cfe0d31f..1eb37f361203 100644
--- a/torch/onnx/_internal/onnxruntime.py
+++ b/torch/onnx/_internal/onnxruntime.py
@@ -496,6 +496,8 @@ def _run_onnx_session_with_ortvaluevector(
         _nvtx_range_pop()
         return pth_outputs
     else:
+        import onnxruntime.training
+
         # Profile the two ORT-to-PyTorch type casts below
         _nvtx_range_push("after run_with_ortvaluevector")
         # Map ORTValue to torch.Tensor.
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index ec0775cad3f0..95d717b3be36 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -36,7 +36,7 @@ from torch.testing._internal.common_cuda import (
 )
 from torch.testing._internal.common_utils import (
     make_fullrank_matrices_with_distinct_singular_values,
-    TEST_WITH_ROCM, IS_FBCODE, IS_WINDOWS, IS_MACOS, TEST_SCIPY,
+    TEST_WITH_ROCM, IS_FBCODE, IS_WINDOWS, IS_MACOS, IS_S390X, TEST_SCIPY,
     torch_to_numpy_dtype_dict, numpy_to_torch_dtype, TEST_WITH_ASAN,
     GRADCHECK_NONDET_TOL, slowTest, TEST_WITH_SLOW,
     TEST_WITH_TORCHINDUCTOR
@@ -23172,6 +23172,7 @@ python_ref_db = [
                 "test_python_ref",
                 dtypes=(torch.bfloat16,),
                 device_type="cpu",
+                active_if=not IS_S390X,
             ),
         ),
     ),