From ce109b3f79d47618c37d11fa7066d05e9158f803 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 16 Oct 2025 13:14:17 -0700 Subject: [PATCH 001/123] Add `torch.backends.mkldnn.is_acl_available()` method (#165678) That tells whether or not PyTorch was compiled with Arm Compute Library Pull Request resolved: https://github.com/pytorch/pytorch/pull/165678 Approved by: https://github.com/Skylion007, https://github.com/atalman, https://github.com/albanD ghstack dependencies: #165583, #165584, #165676 --- torch/_C/__init__.pyi.in | 1 + torch/backends/mkldnn/__init__.py | 5 +++++ torch/csrc/Module.cpp | 2 ++ 3 files changed, 8 insertions(+) diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index c7e2c608ab53..244200216ec9 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -1442,6 +1442,7 @@ _has_cuda: _bool _has_magma: _bool _has_xpu: _bool _has_mkldnn: _bool +_has_mkldnn_acl: _bool _has_cudnn: _bool _has_cusparselt: _bool has_spectral: _bool diff --git a/torch/backends/mkldnn/__init__.py b/torch/backends/mkldnn/__init__.py index ae76a9f20c46..a98c2cb64dfc 100644 --- a/torch/backends/mkldnn/__init__.py +++ b/torch/backends/mkldnn/__init__.py @@ -19,6 +19,11 @@ def is_available(): return torch._C._has_mkldnn +def is_acl_available(): + r"""Return whether PyTorch is built with MKL-DNN + ACL support.""" + return torch._C._has_mkldnn_acl + + VERBOSE_OFF = 0 VERBOSE_ON = 1 VERBOSE_ON_CREATION = 2 diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index 4f99fa40bc6c..4a864daa8c12 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -2701,6 +2701,8 @@ Call this whenever a new thread is created in order to propagate values from ASSERT_TRUE(set_module_attr("_has_xpu", has_xpu)); ASSERT_TRUE( set_module_attr("_has_mkldnn", at::hasMKLDNN() ? Py_True : Py_False)); + ASSERT_TRUE(set_module_attr( + "_has_mkldnn_acl", AT_MKLDNN_ACL_ENABLED() ? 
Py_True : Py_False)); ASSERT_TRUE(set_module_attr("_GLIBCXX_USE_CXX11_ABI", Py_True)); From 556fc09a9f67f24ca5591ec049c5d0c347c5f62a Mon Sep 17 00:00:00 2001 From: Pian Pawakapan Date: Mon, 13 Oct 2025 16:20:49 -0700 Subject: [PATCH 002/123] [DebugMode][1/N] refactor logs into _DebugCalls (#165376) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165376 Approved by: https://github.com/SherlockNoMad --- torch/utils/_debug_mode.py | 113 +++++++++++++++++++++++-------------- 1 file changed, 71 insertions(+), 42 deletions(-) diff --git a/torch/utils/_debug_mode.py b/torch/utils/_debug_mode.py index 7f7de2b7334f..29b74aab5ee3 100644 --- a/torch/utils/_debug_mode.py +++ b/torch/utils/_debug_mode.py @@ -77,33 +77,66 @@ def _arg_to_str(arg, attributes) -> str: return str(arg) -def _op_to_str(op, attributes, *args, **kwargs) -> str: - if op == REDISTRIBUTE_FUNC: - if len(args) == 2: - args_str = f"{_arg_to_str(args[0], attributes)}, trace: {args[1]}" - elif len(args) == 3: - _args = [_arg_to_str(arg, attributes) for arg in args] - args_str = f"{_args[0]}, {_args[1]} -> {_args[2]}" +class _DebugCall: + """Base class for tracking operator calls in DebugMode""" + + def __init__(self, call_depth: int): + self.call_depth = call_depth + + def render(self, attributes: list[str]) -> str: + raise NotImplementedError("Subclasses must implement string render()") + + +class _OpCall(_DebugCall): + """Normal operator call""" + + def __init__(self, op, args: tuple, kwargs: dict, call_depth: int): + super().__init__(call_depth) + self.op = op + self.args = args + self.kwargs = kwargs + + def render(self, attributes: list[str]) -> str: + args_str = ", ".join(_arg_to_str(arg, attributes) for arg in self.args) + + if self.kwargs: + kwargs_str = ", " + ", ".join( + f"{k}={_arg_to_str(v, attributes)}" for k, v in self.kwargs.items() + ) else: - raise RuntimeError(f"Unsupported args for {REDISTRIBUTE_FUNC}: {args}") - else: - args_str = ", ".join(_arg_to_str(arg, attributes) for arg in args) + kwargs_str = "" - if kwargs: - kwargs_str = ", " + ", ".join( - f"{k}={_arg_to_str(v, attributes)}" for k, v in kwargs.items() - ) - else: - kwargs_str = "" + if isinstance(self.op, torch._ops.OpOverload): + op_name = self.op.__qualname__ + elif hasattr(self.op, "__module__") and hasattr(self.op, "__name__"): + op_name = f"{self.op.__module__}.{self.op.__name__}" + else: + op_name = str(self.op) - if isinstance(op, torch._ops.OpOverload): - op_name = op.__qualname__ - elif hasattr(op, "__module__") and hasattr(op, "__name__"): - op_name = f"{op.__module__}.{op.__name__}" - else: - op_name = str(op) + return f"{op_name}({args_str}{kwargs_str})" - return f"{op_name}({args_str}{kwargs_str})" + +class _RedistributeCall(_DebugCall): + """Redistribute call from DTensor dispatch""" + + def __init__( + self, arg, src_placement, dst_placement, transform_info_str, call_depth + ): + super().__init__(call_depth) + self.arg = arg + self.src_placement = src_placement + self.dst_placement = dst_placement + self.transform_info_str = transform_info_str + + def render(self, attributes: list[str]) -> str: + arg_str = f"{_arg_to_str(self.arg, attributes)}" + if self.transform_info_str is not None: # prioritize over src/dst placements + placement_str = f"trace: {self.transform_info_str}" + else: + src_placement_str = _arg_to_str(self.src_placement, attributes) + dst_placement_str = _arg_to_str(self.dst_placement, attributes) + placement_str = f"{src_placement_str} -> {dst_placement_str}" + return f"{REDISTRIBUTE_FUNC}({arg_str}, 
{placement_str})" class DebugMode(TorchDispatchMode): @@ -138,7 +171,7 @@ class DebugMode(TorchDispatchMode): if kwargs is None: kwargs = {} - self.operators.append((func, args, kwargs, self.call_depth)) + self.operators.append(_OpCall(func, args, kwargs, self.call_depth)) try: self.call_depth += 1 @@ -152,17 +185,19 @@ class DebugMode(TorchDispatchMode): # Record the operation with its call depth if torch.distributed.tensor.DTensor in types: - self.operators.append((func, args, kwargs, self.call_depth)) + self.operators.append(_OpCall(func, args, kwargs, self.call_depth)) return NotImplemented elif FakeTensor in types or isinstance( _get_current_dispatch_mode(), FakeTensorMode ): if self.record_faketensor: if func != torch.ops.prim.device.default: - self.operators.append((func, args, kwargs, self.call_depth + 1)) + self.operators.append( + _OpCall(func, args, kwargs, self.call_depth + 1) + ) elif len(types) == 0: if self.record_realtensor: - self.operators.append((func, args, kwargs, self.call_depth + 1)) + self.operators.append(_OpCall(func, args, kwargs, self.call_depth + 1)) result = func(*args, **kwargs) @@ -187,23 +222,19 @@ class DebugMode(TorchDispatchMode): @contextlib.contextmanager def record_redistribute_calls( self, - arg_idx, + arg, src_placement, dst_placement, transform_info_str: Optional[str] = None, ): try: - arg_list = ( - [arg_idx, transform_info_str] - if transform_info_str - else [arg_idx, src_placement, dst_placement] - ) self.operators.append( - ( - REDISTRIBUTE_FUNC, - arg_list, - {}, - self.call_depth + 1, + _RedistributeCall( + arg, + src_placement=src_placement, + dst_placement=dst_placement, + transform_info_str=transform_info_str, + call_depth=self.call_depth + 1, ) ) self.call_depth += 1 @@ -215,10 +246,8 @@ class DebugMode(TorchDispatchMode): with torch._C.DisableTorchFunction(): result = "" result += "\n".join( - " " - + " " * depth - + _op_to_str(op, self.record_tensor_attributes, *args, **kwargs) - for op, args, kwargs, depth in self.operators + " " + " " * op.call_depth + op.render(self.record_tensor_attributes) + for op in self.operators ) return result From 5b3ea758951558e7d9f681ae784acb57eaa07910 Mon Sep 17 00:00:00 2001 From: Shivam Raikundalia Date: Thu, 16 Oct 2025 22:54:27 +0000 Subject: [PATCH 003/123] [Mem Snapshot] Add Metadata Field (#165490) Summary: The implementation adds the ability to: Set custom metadata strings that will be attached to all subsequent allocations Clear or change the metadata at any point View the metadata in memory snapshots via _dump_snapshot() Test Plan: Added test in test_cuda.py and check manually in snapshot to see that metadata was added. 
Differential Revision: D84654933 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165490 Approved by: https://github.com/yushangdi --- c10/cuda/CUDACachingAllocator.cpp | 27 +++++++++++++++++++++++++- c10/cuda/CUDACachingAllocator.h | 19 ++++++++++++++++-- test/test_cuda.py | 22 +++++++++++++++++++++ torch/_C/__init__.pyi.in | 2 ++ torch/csrc/cuda/Module.cpp | 10 ++++++++++ torch/csrc/cuda/memory_snapshot.cpp | 2 ++ torch/cuda/memory.py | 30 +++++++++++++++++++++++++++++ 7 files changed, 109 insertions(+), 3 deletions(-) diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 48413e7a6f34..25058f87264f 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -1260,6 +1260,9 @@ class DeviceCachingAllocator { // thread local compile context for each device static thread_local std::stack compile_context; + // thread local user metadata for annotating allocations + static thread_local std::string user_metadata; + public: // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) explicit DeviceCachingAllocator(c10::DeviceIndex id) @@ -1302,6 +1305,14 @@ class DeviceCachingAllocator { } } + void setUserMetadata(const std::string& metadata) { + user_metadata = metadata; + } + + std::string getUserMetadata() { + return user_metadata; + } + bool checkPoolLiveAllocations( MempoolId_t mempool_id, const std::unordered_set& expected_live_allocations) const { @@ -3682,7 +3693,8 @@ class DeviceCachingAllocator { mempool_id, getApproximateTime(), record_context_ >= RecordContext::ALLOC ? std::move(context) : nullptr, - compile_string); + compile_string, + user_metadata); // Callbacks should not include any Pytorch call for (const auto& cb : trace_trackers_) { @@ -3737,6 +3749,7 @@ static void uncached_delete(void* ptr) { static void local_raw_delete(void* ptr); thread_local std::stack DeviceCachingAllocator::compile_context; +thread_local std::string DeviceCachingAllocator::user_metadata; #ifdef __cpp_lib_hardware_interference_size using std::hardware_destructive_interference_size; #else @@ -3934,6 +3947,18 @@ class NativeCachingAllocator : public CUDAAllocator { device_allocator[device]->popCompileContext(); } + void setUserMetadata(const std::string& metadata) override { + c10::DeviceIndex device = 0; + C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); + device_allocator[device]->setUserMetadata(metadata); + } + + std::string getUserMetadata() override { + c10::DeviceIndex device = 0; + C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); + return device_allocator[device]->getUserMetadata(); + } + bool isHistoryEnabled() override { c10::DeviceIndex device = 0; C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h index 89274c9f9946..fbe5dab18e0a 100644 --- a/c10/cuda/CUDACachingAllocator.h +++ b/c10/cuda/CUDACachingAllocator.h @@ -118,7 +118,8 @@ struct TraceEntry { MempoolId_t mempool, approx_time_t time, std::shared_ptr context = nullptr, - std::string compile_context = "") + std::string compile_context = "", + std::string user_metadata = "") : action_(action), device_(device), addr_(addr), @@ -126,7 +127,8 @@ struct TraceEntry { stream_(stream), size_(size), mempool_(std::move(mempool)), - compile_context_(std::move(compile_context)) { + compile_context_(std::move(compile_context)), + user_metadata_(std::move(user_metadata)) { time_.approx_t_ = time; } Action action_; @@ -138,6 +140,7 @@ struct TraceEntry { MempoolId_t mempool_; trace_time_ time_{}; 
std::string compile_context_; + std::string user_metadata_; }; // Calls made by record_function will save annotations @@ -297,6 +300,10 @@ class CUDAAllocator : public DeviceAllocator { const std::vector>& /*md*/) {} virtual void pushCompileContext(std::string& md) {} virtual void popCompileContext() {} + virtual void setUserMetadata(const std::string& metadata) {} + virtual std::string getUserMetadata() { + return ""; + } virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0; // Attached AllocatorTraceTracker callbacks will be called while the @@ -536,6 +543,14 @@ inline void enablePeerAccess( get()->enablePeerAccess(dev, dev_to_access); } +inline void setUserMetadata(const std::string& metadata) { + get()->setUserMetadata(metadata); +} + +inline std::string getUserMetadata() { + return get()->getUserMetadata(); +} + } // namespace c10::cuda::CUDACachingAllocator namespace c10::cuda { diff --git a/test/test_cuda.py b/test/test_cuda.py index 667bccd82c24..05302ad97661 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -4378,6 +4378,28 @@ class TestCudaMallocAsync(TestCase): finally: torch.cuda.memory._record_memory_history(None) + @unittest.skipIf( + TEST_CUDAMALLOCASYNC, "setContextRecorder not supported by CUDAMallocAsync" + ) + @requiresCppContext + def test_memory_plots_metadata(self): + for context in ["alloc", "all", "state"]: + try: + torch._C._cuda_clearCublasWorkspaces() + torch.cuda.memory.empty_cache() + torch.cuda.memory._set_memory_metadata("metadata test") + torch.cuda.memory._record_memory_history(context="all") + x = torch.rand(3, 4, device="cuda") + del x + torch.cuda.memory.empty_cache() + torch.cuda.memory._set_memory_metadata("") + + ss = torch.cuda.memory._snapshot() + for event in ss["device_traces"][0]: + self.assertTrue(event["user_metadata"] == "metadata test") + finally: + torch.cuda.memory._record_memory_history(None) + @unittest.skipIf( TEST_CUDAMALLOCASYNC, "setContextRecorder not supported by CUDAMallocAsync" ) diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 244200216ec9..b99fd3f2b80a 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -2081,6 +2081,8 @@ def _cuda_hostMemoryStats() -> dict[str, Any]: ... def _cuda_resetAccumulatedHostMemoryStats() -> None: ... def _cuda_resetPeakHostMemoryStats() -> None: ... def _cuda_memorySnapshot(mempool_id: tuple[_int, _int] | None) -> dict[str, Any]: ... +def _cuda_setMemoryMetadata(metadata: str) -> None: ... +def _cuda_getMemoryMetadata() -> str: ... 
def _cuda_record_memory_history_legacy( enabled: _bool, record_context: _bool, diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index 0950192457d6..32ade3680980 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -765,6 +765,7 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* arg) { py::str frames_s = "frames"; py::str time_us_s = "time_us"; py::str compile_context_s = "compile_context"; + py::str user_metadata_s = "user_metadata"; py::list empty_frames; std::vector to_gather_frames; @@ -882,6 +883,7 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* arg) { trace_entry[stream_s] = int64_t(te.stream_); trace_entry[time_us_s] = te.time_.t_; trace_entry[compile_context_s] = te.compile_context_; + trace_entry[user_metadata_s] = te.user_metadata_; trace.append(trace_entry); } traces.append(trace); @@ -1137,6 +1139,14 @@ static void registerCudaDeviceProperties(PyObject* module) { return c10::cuda::CUDACachingAllocator::isHistoryEnabled(); }); + m.def("_cuda_setMemoryMetadata", [](const std::string& metadata) { + c10::cuda::CUDACachingAllocator::setUserMetadata(metadata); + }); + + m.def("_cuda_getMemoryMetadata", []() { + return c10::cuda::CUDACachingAllocator::getUserMetadata(); + }); + m.def("_cuda_get_conv_benchmark_empty_cache", []() { return at::native::_cudnn_get_conv_benchmark_empty_cache(); }); diff --git a/torch/csrc/cuda/memory_snapshot.cpp b/torch/csrc/cuda/memory_snapshot.cpp index d4382aa8cb32..830159d0a919 100644 --- a/torch/csrc/cuda/memory_snapshot.cpp +++ b/torch/csrc/cuda/memory_snapshot.cpp @@ -311,6 +311,7 @@ std::string _memory_snapshot_pickled() { IValue is_expandable_s = "is_expandable"; IValue time_us_s = "time_us"; IValue compile_contexts_s = "compile_context"; + IValue user_metadata_s = "user_metadata"; auto empty_frames = new_list(); @@ -428,6 +429,7 @@ std::string _memory_snapshot_pickled() { trace_entry.insert(size_s, (int64_t)te.size_); trace_entry.insert(stream_s, int64_t(te.stream_)); trace_entry.insert(compile_contexts_s, te.compile_context_); + trace_entry.insert(user_metadata_s, te.user_metadata_); if (te.context_) { auto sc = getFromContext(te.context_); frame_tracebacks.push_back(sc); diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py index 5eeaf3a8253f..e4b125eb4258 100644 --- a/torch/cuda/memory.py +++ b/torch/cuda/memory.py @@ -1063,6 +1063,36 @@ def _dump_snapshot(filename="dump_snapshot.pickle"): pickle.dump(s, f) +def _set_memory_metadata(metadata: str): + """ + Set custom metadata that will be attached to all subsequent CUDA memory allocations. + + This metadata will be recorded in the memory snapshot for all allocations made + after this call until the metadata is cleared or changed. + + Args: + metadata (str): Custom metadata string to attach to allocations. + Pass an empty string to clear the metadata. + + Example: + >>> torch.cuda.memory._set_memory_metadata("training_phase") + >>> # All allocations here will have "training_phase" metadata + >>> x = torch.randn(100, 100, device="cuda") + >>> torch.cuda.memory._set_memory_metadata("") # Clear metadata + """ + torch._C._cuda_setMemoryMetadata(metadata) + + +def _get_memory_metadata() -> str: + """ + Get the current custom metadata that is being attached to CUDA memory allocations. + + Returns: + str: The current metadata string, or empty string if no metadata is set. 
+ """ + return torch._C._cuda_getMemoryMetadata() + + def _save_segment_usage(filename="output.svg", snapshot=None): if snapshot is None: snapshot = _snapshot() From 98a488c9aaadd4b137b7a63dad31543aee75c454 Mon Sep 17 00:00:00 2001 From: Colin L Reliability Rice Date: Thu, 16 Oct 2025 23:05:31 +0000 Subject: [PATCH 004/123] Start recording inductor provenance (#162669) Summary: This stores information on where fx graphs come from, which makes it significantly easier to debug. One outstanding question 1) I only stored the kernel stack traces, do we also want the node mappings? Test Plan: I wrote a explicit logging test which makes a module, fx traces it, compiles it, and makes sure the logging infomration shows up. ``` clr@devvm17763 ~/fbsource/fbcode/caffe2/test/dynamo % buck2 test @//mode/opt fbcode//caffe2/test/dynamo:test_dynamo -- test_utils File changed: fbsource//xplat/caffe2/test/dynamo/test_utils.py File changed: fbcode//caffe2/test/dynamo/test_utils.py Buck UI: https://www.internalfb.com/buck2/528dea32-2416-4a62-a1ec-39f3c0efdd2e Test UI: https://www.internalfb.com/intern/testinfra/testrun/13229324015574003 Network: Up: 0B Down: 0B Executing actions. Remaining 0/2 Command: test. Time elapsed: 17.3s Tests finished: Pass 16. Fail 0. Fatal 0. Skip 0. Build failure 0 ``` Rollback Plan: Differential Revision: D82037582 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162669 Approved by: https://github.com/yushangdi --- test/dynamo/test_utils.py | 23 +++++++++++++++++++++++ torch/_dynamo/utils.py | 1 + torch/_inductor/codecache.py | 11 ++++++++++- torch/_inductor/compile_fx.py | 3 +++ 4 files changed, 37 insertions(+), 1 deletion(-) diff --git a/test/dynamo/test_utils.py b/test/dynamo/test_utils.py index 1708da900056..8dec23534eff 100644 --- a/test/dynamo/test_utils.py +++ b/test/dynamo/test_utils.py @@ -510,6 +510,7 @@ class TestDynamoTimed(TestCase): raw = dataclasses.asdict(compilation_events[0]) del raw["feature_usage"] del raw["ir_count"] + del raw["inductor_provenance"] del raw["param_numel"] del raw["param_bytes"] del raw["param_count"] @@ -694,6 +695,7 @@ class TestDynamoTimed(TestCase): raw = dataclasses.asdict(compilation_events[1]) del raw["feature_usage"] del raw["ir_count"] + del raw["inductor_provenance"] del raw["guard_latency_us"] del raw["param_numel"] del raw["param_bytes"] @@ -911,6 +913,27 @@ class TestDynamoTimed(TestCase): compilation_events = [arg[0][0] for arg in log_event.call_args_list] self.assertEqual(compilation_events[0].ir_count, second) + @dynamo_config.patch( + { + "log_compilation_metrics": True, + } + ) + @inductor_config.patch( + {"trace.enabled": True, "trace.provenance_tracking_level": 1}, + ) + def test_inductor_provenance(self): + module = torch.nn.Linear(6, 66) + graph_module = torch.fx.symbolic_trace(module) + + compilation_events = [] + with mock.patch("torch._dynamo.utils.log_compilation_event") as log_event: + torch.compile(graph_module)(torch.randn(6, 6)) + compilation_events = [arg[0][0] for arg in log_event.call_args_list] + self.assertEqual( + compilation_events[0].inductor_provenance, + {'{"extern_kernels.addmm:1": []}'}, + ) + @dynamo_config.patch({"log_compilation_metrics": True}) @inductor_config.patch({"force_disable_caches": True}) def test_dynamic_shape_feature_use(self): diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py index 5e426d53e267..08bfe58aacba 100644 --- a/torch/_dynamo/utils.py +++ b/torch/_dynamo/utils.py @@ -1376,6 +1376,7 @@ class CompilationMetrics: recompile_user_contexts: Optional[set[str]] 
= None inline_inbuilt_nn_modules_candidate: Optional[bool] = False pytorch_version: Optional[str] = None + inductor_provenance: Optional[str] = None @classmethod def create(cls, metrics: dict[str, Any]) -> CompilationMetrics: diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py index 08b6b263272c..5cc178db2fc3 100644 --- a/torch/_inductor/codecache.py +++ b/torch/_inductor/codecache.py @@ -42,7 +42,12 @@ import torch.distributed as dist from torch import SymInt, Tensor from torch._dynamo.device_interface import get_interface_for_device from torch._dynamo.exc import SkipFrame -from torch._dynamo.utils import CompileEventLogger, counters, dynamo_timed +from torch._dynamo.utils import ( + CompileEventLogger, + counters, + dynamo_timed, + get_metrics_context, +) from torch._inductor import config, exc, metrics from torch._inductor.codegen.common import ( custom_backend_codegen_configs, @@ -1281,6 +1286,10 @@ class FxGraphCache(GuardedCache[CompiledFxGraph]): }, payload_fn=lambda: graph.inductor_provenance_stack_traces_str, ) + if get_metrics_context().in_progress(): + get_metrics_context().add_to_set( + "inductor_provenance", graph.inductor_provenance_stack_traces_str + ) return graph, cache_info @staticmethod diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py index 7947e9cb8445..6153daac47c8 100644 --- a/torch/_inductor/compile_fx.py +++ b/torch/_inductor/compile_fx.py @@ -1544,6 +1544,9 @@ class _InProcessFxCompile(FxCompile): }, payload_fn=lambda: inductor_kernel_stack_trace_str, ) + get_metrics_context().add_to_set( + "inductor_provenance", inductor_kernel_stack_trace_str + ) node_runtimes = None if inductor_metrics_log.isEnabledFor(logging.INFO): From d2c82bafb7086a1dd109a0a6407ca7fed27337f4 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Thu, 16 Oct 2025 23:08:26 +0000 Subject: [PATCH 005/123] Revert "158232 Fix autocast cache incorrectly retaining no_grad state (#165068)" This reverts commit 5daef30b26b794d237fbbc399c1d47ec0380200a. Reverted https://github.com/pytorch/pytorch/pull/165068 on behalf of https://github.com/jeffdaily due to This broke ROCm CI. test/test_transformers.py::TestTransformersCUDA::test_transformerencoder_fastpath_use_torchscript_False_enable_nested_tensor_True_use_autocast_True_d_model_256_cuda [GH job link](https://github.com/pytorch/pytorch/actions/runs/18572589089/job/52952074008) [HUD commit link](https://hud.pytorch.org/pytorch/pytorch/commit/5daef30b26b794d237fbbc399c1d47ec0380200a) ([comment](https://github.com/pytorch/pytorch/pull/165068#issuecomment-3413184445)) --- aten/src/ATen/autocast_mode.cpp | 35 +------- test/test_autocast.py | 137 -------------------------------- 2 files changed, 4 insertions(+), 168 deletions(-) diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index b15fb9910afc..e3424cc4cb8e 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -2,7 +2,6 @@ #include #include -#include #include namespace at::autocast { @@ -37,29 +36,10 @@ namespace { using weakref_type = c10::weak_intrusive_ptr; using val_type = std::tuple; -// We maintain separate caches for gradient-enabled and gradient-disabled modes. -// This ensures that tensors cached in torch.no_grad() (with requires_grad=False) -// are not incorrectly reused in gradient-enabled contexts. -// This fixes issue #158232 while maintaining optimal performance for both modes. 
-static ska::flat_hash_map& get_cached_casts_grad_enabled() { - static ska::flat_hash_map cached_casts_grad_enabled; - return cached_casts_grad_enabled; +ska::flat_hash_map& get_cached_casts() { + static ska::flat_hash_map cached_casts; + return cached_casts; } - -static ska::flat_hash_map& get_cached_casts_grad_disabled() { - static ska::flat_hash_map cached_casts_grad_disabled; - return cached_casts_grad_disabled; -} - -// Helper function to get the appropriate cache based on current gradient mode. -// This allows us to cache tensors separately for grad-enabled and grad-disabled contexts, -// preventing incorrect cache hits when gradient mode changes. -static ska::flat_hash_map& get_cached_casts() { - return at::GradMode::is_enabled() ? - get_cached_casts_grad_enabled() : - get_cached_casts_grad_disabled(); -} - std::mutex cached_casts_mutex; @@ -106,9 +86,7 @@ thread_local bool cache_enabled = true; void clear_cache() { const std::lock_guard lock(cached_casts_mutex); - // Clear both caches to ensure consistent behavior regardless of current gradient mode - get_cached_casts_grad_enabled().clear(); - get_cached_casts_grad_disabled().clear(); + get_cached_casts().clear(); } int increment_nesting() { @@ -143,11 +121,6 @@ Tensor cached_cast(at::ScalarType to_type, const Tensor& arg, DeviceType device_ if (is_eligible(arg, device_type) && (arg.scalar_type() != to_type)) { // Heuristic: Do what Apex does, and cache lower_precision_fp casts of fp32 model weights (leaves). // See cached_casts declaration above for detailed strategy. - // - // We maintain separate caches for gradient-enabled and gradient-disabled modes - // (see get_cached_casts() above). This ensures correctness when mixing torch.no_grad() - // with torch.autocast(), while maintaining optimal performance for both training and inference. - // This fixes issue #158232 without any performance regression. bool can_try_cache = (to_type == get_lower_precision_fp_from_device_type(device_type) && arg.scalar_type() == at::kFloat && arg.requires_grad() && arg.is_leaf() && !arg.is_view() && cache_enabled && diff --git a/test/test_autocast.py b/test/test_autocast.py index d1c5f525b8d8..19e05dd0a9d1 100644 --- a/test/test_autocast.py +++ b/test/test_autocast.py @@ -384,143 +384,6 @@ class TestTorchAutocast(TestCase): with self.assertRaisesRegex(expected_exception=ValueError, expected_regex=msg): torch.autocast(device_type=dev) - @skipIfTorchDynamo() - def test_autocast_nograd_caching_issue_158232(self): - """ - Regression test for issue #158232: autocast + no_grad incompatibility - - When torch.no_grad() is nested inside torch.autocast(), the autocast cache - must not cache tensors created in the no_grad context, because they lack - gradient tracking. If cached, subsequent operations in gradient-enabled mode - would incorrectly use the no-gradient cached version. 
- - Before fix: RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn - After fix: Should work correctly - """ - model = torch.nn.Linear(2, 2) - inp = torch.randn(8, 2) - - with torch.autocast("cpu", dtype=torch.bfloat16, enabled=True): - # First forward pass in no_grad context (e.g., shape inference) - with torch.no_grad(): - out1 = model(inp) - self.assertFalse( - out1.requires_grad, "Output in no_grad should not require grad" - ) - - # Second forward pass with gradients enabled (e.g., training) - out2 = model(inp) - self.assertTrue( - out2.requires_grad, - "Output should require gradients after exiting no_grad", - ) - self.assertIsNotNone( - out2.grad_fn, "Output should have grad_fn after exiting no_grad" - ) - - # Backward pass should work - loss = out2.mean() - loss.backward() - - # Verify gradients were computed - self.assertIsNotNone(model.weight.grad) - self.assertIsNotNone(model.bias.grad) - - @skipIfTorchDynamo() - def test_autocast_inference_mode_interaction(self): - """ - Test that autocast works correctly with torch.inference_mode() - - InferenceMode is a stricter version of no_grad that provides additional - performance optimizations. Verify it doesn't break with autocast. - """ - model = torch.nn.Linear(2, 2) - inp = torch.randn(8, 2) - - # Test 1: inference_mode inside autocast - with torch.autocast("cpu", dtype=torch.bfloat16, enabled=True): - with torch.inference_mode(): - out1 = model(inp) - self.assertFalse(out1.requires_grad) - self.assertEqual(out1.dtype, torch.bfloat16) - - # After exiting inference_mode, gradients should work - out2 = model(inp) - self.assertTrue(out2.requires_grad) - out2.mean().backward() - - # Test 2: autocast inside inference_mode - with torch.inference_mode(): - with torch.autocast("cpu", dtype=torch.bfloat16, enabled=True): - out = model(inp) - self.assertFalse(out.requires_grad) - self.assertEqual(out.dtype, torch.bfloat16) - - def test_autocast_caching_still_works_with_gradients(self): - """ - Verify that autocast caching still functions correctly when gradients ARE enabled. - - This test ensures the fix for #158232 didn't break normal caching behavior. - We can't directly observe cache hits, but we verify that repeated operations - with gradients enabled work correctly. - """ - model = torch.nn.Linear(2, 2) - inp = torch.randn(8, 2) - - with torch.autocast("cpu", dtype=torch.bfloat16, enabled=True): - # Multiple forward passes with gradients enabled - out1 = model(inp) - out2 = model(inp) - out3 = model(inp) - - # All should have gradients - self.assertTrue(out1.requires_grad) - self.assertTrue(out2.requires_grad) - self.assertTrue(out3.requires_grad) - - # All should have grad_fn - self.assertIsNotNone(out1.grad_fn) - self.assertIsNotNone(out2.grad_fn) - self.assertIsNotNone(out3.grad_fn) - - # Backward should work on all - out1.mean().backward(retain_graph=True) - out2.mean().backward(retain_graph=True) - out3.mean().backward() - - @skipIfTorchDynamo() - def test_autocast_mixed_grad_contexts(self): - """ - Test complex nesting of gradient contexts within autocast. - - This ensures the gradient mode check works correctly across - multiple transitions between gradient-enabled and disabled states. 
- """ - model = torch.nn.Linear(2, 2) - inp = torch.randn(8, 2) - - with torch.autocast("cpu", dtype=torch.bfloat16, enabled=True): - # Pass 1: no_grad - with torch.no_grad(): - out1 = model(inp) - self.assertFalse(out1.requires_grad) - - # Pass 2: gradients enabled - out2 = model(inp) - self.assertTrue(out2.requires_grad) - - # Pass 3: no_grad again - with torch.no_grad(): - out3 = model(inp) - self.assertFalse(out3.requires_grad) - - # Pass 4: gradients enabled again - out4 = model(inp) - self.assertTrue(out4.requires_grad) - - # Backward on gradient-enabled outputs - (out2.mean() + out4.mean()).backward() - if __name__ == "__main__": run_tests() From e0fe37fa687a39e42ddeeb5c03986ffd5c40e662 Mon Sep 17 00:00:00 2001 From: Kurt Mohler Date: Mon, 13 Oct 2025 19:12:26 -0500 Subject: [PATCH 006/123] [MPS] Move `torch.cat` impl to Metal (#165373) After this change, all of the cases tested in [this performance measurement script](https://github.com/kurtamohler/pytorch-perf-test-scripts/blob/10de64c5ac8008e9f2015a1277451da81e5b6dff/cat/perf0.py) take either roughly the same runtime or less. Before: ``` idx: cpu time, mps time, speedup, op, args, kwargs ----------------------------------------- 0: 0.000857 ms, 0.016098 ms, 0.05, cat, [[tensor(shape[5, 5]), tensor(shape[5, 5])]], {'dim': -1} 1: 0.000858 ms, 0.014861 ms, 0.06, cat, [[tensor(shape[5, 5]), tensor(shape[5, 5])]], {'dim': 1} 2: 0.000806 ms, 0.015145 ms, 0.05, cat, [[tensor(shape[10, 5]), tensor(shape[5, 5])]], {'dim': 0} 3: 0.000829 ms, 0.015355 ms, 0.05, cat, [[tensor(shape[1, 2, 3]), tensor(shape[1, 2, 3])]], {'dim': -2} 4: 0.000591 ms, 0.000582 ms, 1.02, cat, [[tensor(shape[0]), tensor(shape[0])]], {'dim': 0} 5: 0.001076 ms, 0.022387 ms, 0.05, cat, [[tensor(shape[0]), tensor(shape[5, 5])]], {'dim': 1} 6: 0.000708 ms, 0.022300 ms, 0.03, cat, [[tensor(shape[0, 5]), tensor(shape[5, 5])]], {'dim': 0} 7: 0.000640 ms, 0.014367 ms, 0.04, cat, [[tensor(shape[1]), tensor(shape[1])]], {} 8: 0.000777 ms, 0.027506 ms, 0.03, cat, [[tensor(shape[2, 2, 2, 2])], 1], {} 9: 0.003383 ms, 0.269277 ms, 0.01, cat, "[[tensor(shape[3, 1, 2]), tensor(shape[3, 2, 2]), tensor(shape[3, 3, 2]), tensor(shape[3, 1, 2]), te...", {'dim': 1} 10: 0.526138 ms, 0.650852 ms, 0.81, cat, "[[tensor(shape[3, 1, 2]), tensor(shape[3, 2, 2]), tensor(shape[3, 3, 2]), tensor(shape[3, 1, 2]), te...", {'dim': 1} 11: 0.444091 ms, 0.628630 ms, 0.71, cat, "[[tensor(shape[1, 3, 2]), tensor(shape[2, 3, 2]), tensor(shape[3, 3, 2]), tensor(shape[1, 3, 2]), te...", {'dim': 0} 12: 2.011870 ms, 0.989525 ms, 2.03, cat, [[tensor(shape[1000000, 3, 2]), tensor(shape[1000000, 3, 2])]], {'dim': 0} 13: 3.100653 ms, 0.948178 ms, 3.27, cat, [[tensor(shape[3, 1000000, 2]), tensor(shape[3, 1000000, 2])]], {'dim': 1} 14: 3.112174 ms, 0.954174 ms, 3.26, cat, [[tensor(shape[3, 2, 1000000]), tensor(shape[3, 2, 1000000])]], {'dim': 2} ``` After: ``` idx: cpu time, mps time, speedup, op, args, kwargs ----------------------------------------- 0: 0.000790 ms, 0.013111 ms, 0.06, cat, [[tensor(shape[5, 5]), tensor(shape[5, 5])]], {'dim': -1} 1: 0.000800 ms, 0.014419 ms, 0.06, cat, [[tensor(shape[5, 5]), tensor(shape[5, 5])]], {'dim': 1} 2: 0.000748 ms, 0.010019 ms, 0.07, cat, [[tensor(shape[10, 5]), tensor(shape[5, 5])]], {'dim': 0} 3: 0.000767 ms, 0.010063 ms, 0.08, cat, [[tensor(shape[1, 2, 3]), tensor(shape[1, 2, 3])]], {'dim': -2} 4: 0.000591 ms, 0.000591 ms, 1.00, cat, [[tensor(shape[0]), tensor(shape[0])]], {'dim': 0} 5: 0.001220 ms, 0.009763 ms, 0.12, cat, [[tensor(shape[0]), tensor(shape[5, 5])]], 
{'dim': 1} 6: 0.000739 ms, 0.006203 ms, 0.12, cat, [[tensor(shape[0, 5]), tensor(shape[5, 5])]], {'dim': 0} 7: 0.000647 ms, 0.009905 ms, 0.07, cat, [[tensor(shape[1]), tensor(shape[1])]], {} 8: 0.000753 ms, 0.007818 ms, 0.10, cat, [[tensor(shape[2, 2, 2, 2])], 1], {} 9: 0.003823 ms, 0.192723 ms, 0.02, cat, "[[tensor(shape[3, 1, 2]), tensor(shape[3, 2, 2]), tensor(shape[3, 3, 2]), tensor(shape[3, 1, 2]), te...", {'dim': 1} 10: 0.576564 ms, 0.733920 ms, 0.79, cat, "[[tensor(shape[3, 1, 2]), tensor(shape[3, 2, 2]), tensor(shape[3, 3, 2]), tensor(shape[3, 1, 2]), te...", {'dim': 1} 11: 0.462957 ms, 0.692799 ms, 0.67, cat, "[[tensor(shape[1, 3, 2]), tensor(shape[2, 3, 2]), tensor(shape[3, 3, 2]), tensor(shape[1, 3, 2]), te...", {'dim': 0} 12: 2.017181 ms, 0.968345 ms, 2.08, cat, [[tensor(shape[1000000, 3, 2]), tensor(shape[1000000, 3, 2])]], {'dim': 0} 13: 3.203508 ms, 0.986382 ms, 3.25, cat, [[tensor(shape[3, 1000000, 2]), tensor(shape[3, 1000000, 2])]], {'dim': 1} 14: 3.181249 ms, 1.007773 ms, 3.16, cat, [[tensor(shape[3, 2, 1000000]), tensor(shape[3, 2, 1000000])]], {'dim': 2} ``` Fixes #165350 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165373 Approved by: https://github.com/kulinseth, https://github.com/malfet --- aten/src/ATen/native/mps/kernels/Shape.h | 8 +- aten/src/ATen/native/mps/kernels/Shape.metal | 86 ++++++------ aten/src/ATen/native/mps/operations/Shape.mm | 138 +++++-------------- 3 files changed, 85 insertions(+), 147 deletions(-) diff --git a/aten/src/ATen/native/mps/kernels/Shape.h b/aten/src/ATen/native/mps/kernels/Shape.h index bfa76e24a659..dcbc3226e923 100644 --- a/aten/src/ATen/native/mps/kernels/Shape.h +++ b/aten/src/ATen/native/mps/kernels/Shape.h @@ -1,16 +1,16 @@ #pragma once #include -template -struct CatLargeSharedParams { +template +struct CatSharedParams { int32_t ndim; int32_t cat_dim; ::c10::metal::array output_strides; ::c10::metal::array output_sizes; }; -template -struct CatLargeInputParams { +template +struct CatInputParams { idx_type_t cat_dim_offset; idx_type_t input_element_offset; ::c10::metal::array input_strides; diff --git a/aten/src/ATen/native/mps/kernels/Shape.metal b/aten/src/ATen/native/mps/kernels/Shape.metal index d45077e89298..44cf6f1e8d56 100644 --- a/aten/src/ATen/native/mps/kernels/Shape.metal +++ b/aten/src/ATen/native/mps/kernels/Shape.metal @@ -6,12 +6,12 @@ using namespace metal; using namespace c10::metal; -template -kernel void cat_large( +template +kernel void cat( constant T_in* input [[buffer(0)]], device T_out* output [[buffer(1)]], - constant CatLargeSharedParams<>& shared_params [[buffer(2)]], - constant CatLargeInputParams<>& input_params [[buffer(3)]], + constant CatSharedParams& shared_params [[buffer(2)]], + constant CatInputParams& input_params [[buffer(3)]], uint tid [[thread_position_in_grid]]) { auto ndim = shared_params.ndim; auto cat_dim = shared_params.cat_dim; @@ -23,9 +23,9 @@ kernel void cat_large( constant auto& input_strides = input_params.input_strides; constant auto& input_sizes = input_params.input_sizes; - auto input_element_idx = static_cast(tid) + input_element_offset; - int64_t input_offset = 0; - int64_t output_offset = 0; + auto input_element_idx = static_cast(tid) + input_element_offset; + I input_offset = 0; + I output_offset = 0; for (auto dim = ndim - 1; dim >= 0; dim--) { auto dim_size = input_sizes[dim]; @@ -42,41 +42,45 @@ kernel void cat_large( output[output_offset] = static_cast(input[input_offset]); } -#define REGISTER_CAT_LARGE_OP(T_in, T_out) \ - template 
[[host_name("cat_large_" #T_in "_" #T_out)]] \ - kernel void cat_large( \ - constant T_in * input [[buffer(0)]], \ - device T_out * output [[buffer(1)]], \ - constant CatLargeSharedParams<> & shared_params [[buffer(2)]], \ - constant CatLargeInputParams<> & input_params [[buffer(3)]], \ +#define REGISTER_CAT_OP(I, T_in, T_out) \ + template [[host_name("cat_" #I "_" #T_in "_" #T_out)]] \ + kernel void cat( \ + constant T_in * input [[buffer(0)]], \ + device T_out * output [[buffer(1)]], \ + constant CatSharedParams & shared_params [[buffer(2)]], \ + constant CatInputParams & input_params [[buffer(3)]], \ uint tid [[thread_position_in_grid]]); -#define REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(T_out) \ - REGISTER_CAT_LARGE_OP(float, T_out); \ - REGISTER_CAT_LARGE_OP(half, T_out); \ - REGISTER_CAT_LARGE_OP(bfloat, T_out); \ - REGISTER_CAT_LARGE_OP(int, T_out); \ - REGISTER_CAT_LARGE_OP(uint, T_out); \ - REGISTER_CAT_LARGE_OP(long, T_out); \ - REGISTER_CAT_LARGE_OP(ulong, T_out); \ - REGISTER_CAT_LARGE_OP(short, T_out); \ - REGISTER_CAT_LARGE_OP(ushort, T_out); \ - REGISTER_CAT_LARGE_OP(char, T_out); \ - REGISTER_CAT_LARGE_OP(uchar, T_out); \ - REGISTER_CAT_LARGE_OP(bool, T_out); +#define REGISTER_CAT_OP_ALL_INPUT_TYPES(I, T_out) \ + REGISTER_CAT_OP(I, float, T_out); \ + REGISTER_CAT_OP(I, half, T_out); \ + REGISTER_CAT_OP(I, bfloat, T_out); \ + REGISTER_CAT_OP(I, int, T_out); \ + REGISTER_CAT_OP(I, uint, T_out); \ + REGISTER_CAT_OP(I, long, T_out); \ + REGISTER_CAT_OP(I, ulong, T_out); \ + REGISTER_CAT_OP(I, short, T_out); \ + REGISTER_CAT_OP(I, ushort, T_out); \ + REGISTER_CAT_OP(I, char, T_out); \ + REGISTER_CAT_OP(I, uchar, T_out); \ + REGISTER_CAT_OP(I, bool, T_out); -REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(float); -REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(half); -REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(bfloat); -REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(int); -REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(uint); -REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(long); -REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(ulong); -REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(short); -REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(ushort); -REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(char); -REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(uchar); -REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(bool); +#define REGISTER_CAT_FOR_INDEX_TYPE(I) \ + REGISTER_CAT_OP_ALL_INPUT_TYPES(I, float); \ + REGISTER_CAT_OP_ALL_INPUT_TYPES(I, half); \ + REGISTER_CAT_OP_ALL_INPUT_TYPES(I, bfloat); \ + REGISTER_CAT_OP_ALL_INPUT_TYPES(I, int); \ + REGISTER_CAT_OP_ALL_INPUT_TYPES(I, uint); \ + REGISTER_CAT_OP_ALL_INPUT_TYPES(I, long); \ + REGISTER_CAT_OP_ALL_INPUT_TYPES(I, ulong); \ + REGISTER_CAT_OP_ALL_INPUT_TYPES(I, short); \ + REGISTER_CAT_OP_ALL_INPUT_TYPES(I, ushort); \ + REGISTER_CAT_OP_ALL_INPUT_TYPES(I, char); \ + REGISTER_CAT_OP_ALL_INPUT_TYPES(I, uchar); \ + REGISTER_CAT_OP_ALL_INPUT_TYPES(I, bool); \ + \ + REGISTER_CAT_OP(I, float2, float2); \ + REGISTER_CAT_OP(I, half2, half2); -REGISTER_CAT_LARGE_OP(float2, float2); -REGISTER_CAT_LARGE_OP(half2, half2); +REGISTER_CAT_FOR_INDEX_TYPE(int64_t); +REGISTER_CAT_FOR_INDEX_TYPE(int32_t); diff --git a/aten/src/ATen/native/mps/operations/Shape.mm b/aten/src/ATen/native/mps/operations/Shape.mm index 3947419c117d..973bef036d56 100644 --- a/aten/src/ATen/native/mps/operations/Shape.mm +++ b/aten/src/ATen/native/mps/operations/Shape.mm @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -69,29 +70,40 @@ static void check_shape_except_dim(const Tensor& first, const Tensor& second, in } } -// This implementation of cat is 
used only if one of the inputs or the output is -// too large to use MPSGraph. +template +std::string get_type_str(); + +template <> +std::string get_type_str() { + return "int64_t"; +} + +template <> +std::string get_type_str() { + return "int32_t"; +} + // NOTE: `output` is expected to already have the correct size. -static void cat_out_large_tensor_mps(const ITensorListRef& inputs, int64_t dimension, const Tensor& output) { - CatLargeSharedParams shared_params; +template +static void cat_out_mps_impl(const ITensorListRef& inputs, int64_t dimension, const Tensor& output) { + CatSharedParams shared_params; shared_params.ndim = output.dim(); shared_params.cat_dim = dimension; for (const auto dim : c10::irange(output.dim())) { - shared_params.output_strides[dim] = output.stride(dim); - shared_params.output_sizes[dim] = output.size(dim); + shared_params.output_strides[dim] = safe_downcast(output.stride(dim)); + shared_params.output_sizes[dim] = safe_downcast(output.size(dim)); } - int64_t cat_dim_offset = 0; + idx_type_t cat_dim_offset = 0; size_t input_idx = 0; MPSStream* stream = getCurrentMPSStream(); - // Launch a separate kernels for each input. This will produce some overhead, - // but that should be relatively minimal since at least one of the inputs is - // very large. In order to launch only one kernel to process all inputs, we - // would have to copy all the input tensor data into a packed buffer, which - // would not be ideal. + // Launch a separate kernels for each input. This will produce some overhead. + // In order to launch only one kernel to process all inputs, we would have to + // copy all the input tensor data into a packed buffer, which would not be + // ideal. for (const Tensor& input : inputs) { if (input.numel() == 0) { continue; @@ -104,21 +116,23 @@ static void cat_out_large_tensor_mps(const ITensorListRef& inputs, int64_t dimen for (int64_t numel_remaining = input.numel(); numel_remaining > 0; numel_remaining -= max_num_threads) { auto num_threads = std::min(max_num_threads, numel_remaining); - CatLargeInputParams input_params; + CatInputParams input_params; - input_params.cat_dim_offset = cat_dim_offset; - input_params.input_element_offset = input.numel() - numel_remaining; + input_params.cat_dim_offset = safe_downcast(cat_dim_offset); + input_params.input_element_offset = safe_downcast(input.numel() - numel_remaining); for (const auto dim : c10::irange(input.dim())) { - input_params.input_strides[dim] = input.stride(dim); - input_params.input_sizes[dim] = input.size(dim); + input_params.input_strides[dim] = safe_downcast(input.stride(dim)); + input_params.input_sizes[dim] = safe_downcast(input.size(dim)); } dispatch_sync_with_rethrow(stream->queue(), ^() { @autoreleasepool { id computeEncoder = stream->commandEncoder(); - auto pipeline_state = lib.getPipelineStateForFunc( - fmt::format("cat_large_{}_{}", scalarToMetalTypeString(input), scalarToMetalTypeString(output))); + auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("cat_{}_{}_{}", + get_type_str(), + scalarToMetalTypeString(input), + scalarToMetalTypeString(output))); getMPSProfiler().beginProfileKernel(pipeline_state, "cat", {input}); [computeEncoder setComputePipelineState:pipeline_state]; mtl_setArgs(computeEncoder, input, output, shared_params, input_params); @@ -294,13 +308,6 @@ TORCH_IMPL_FUNC(cat_out_mps) " and out is on ", out.device()); - // TODO: For better performance by eliminating input tensor gathering and post transpose, - // TODO: it is better to keep the out tensor's memory 
format. - // TODO: dimension needs to be recomputed as: - // TODO: dim = 0 --> dim = 0; dim = 1 or 2 --> dim = out.dim()- dim; otherwise dim = dim-1 - if (needsGather(out)) { - out.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous); - } std::vector size(notSkippedTensor.sizes().vec()); // Compute size of the result in the cat dimension @@ -331,82 +338,9 @@ TORCH_IMPL_FUNC(cat_out_mps) has_large_tensor |= isTooLargeForMPSGraph(out); if (has_large_tensor) { - return mps::cat_out_large_tensor_mps(materialized_inputs, dimension, out); - } - - struct CachedGraph : public MPSCachedGraph { - CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} - std::vector inputTensors_; - MPSGraphTensor* outputTensor_ = nil; - }; - - @autoreleasepool { - std::string key = "cat_out_mps:" + std::to_string(dimension) + ":" + - (memory_format == MemoryFormat::ChannelsLast ? "NHWC" : "NCHW"); - if (!all_same_dtype) { - key += getTensorsStringKey(input_tensors, true, all_same_sizes_and_stride); - } else { - key += ":" + getMPSTypeString(input_tensors[0].scalar_type(), true) + ":" + std::to_string(inputs.size()); - } - for (auto idx : skipped_tensor_indices) { - key += "," + std::to_string(idx); - } - - auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - auto len_tensor_array = inputs.size() - skipped_tensor_indices.size(); - std::vector castInputTensors(len_tensor_array); - newCachedGraph->inputTensors_.reserve(len_tensor_array); - - for (const auto idx : c10::irange(len_tensor_array)) { - const Tensor& tensor = input_tensors[idx]; - auto scalar_type = getMPSScalarType(tensor.scalar_type()); - if (tensor.scalar_type() == kBool) { - scalar_type = MPSDataTypeInt8; - } - newCachedGraph->inputTensors_[idx] = mpsGraphUnrankedPlaceHolder(mpsGraph, scalar_type); - if (tensor.scalar_type() != out_dtype) { - castInputTensors[idx] = [mpsGraph castTensor:newCachedGraph->inputTensors_[idx] - toType:getMPSDataType(out_dtype) - name:@"castInput"]; - } else { - castInputTensors[idx] = newCachedGraph->inputTensors_[idx]; - } - } - - auto inputTensorsArray = [NSArray arrayWithObjects:castInputTensors.data() count:len_tensor_array]; - MPSGraphTensor* outputTensor = [mpsGraph concatTensors:inputTensorsArray - dimension:dimension // Maybe convert this from int64_t -> int32 - name:nil]; - if (getMPSDataType(out_dtype) == MPSDataTypeBool) { - outputTensor = [mpsGraph castTensor:outputTensor toType:MPSDataTypeBool name:@"outputTensor"]; - } - newCachedGraph->outputTensor_ = outputTensor; - }); - - std::vector inputPlaceholders; - int i = 0; - int t_idx = 0; - for (const Tensor& tensor : materialized_inputs) { - if (std::find(skipped_tensor_indices.begin(), skipped_tensor_indices.end(), i) == skipped_tensor_indices.end()) { - auto scalar_type = getMPSScalarType(tensor.scalar_type()); - if (tensor.scalar_type() == kBool) { - scalar_type = MPSDataTypeInt8; - } - inputPlaceholders.emplace_back(cachedGraph->inputTensors_[t_idx], tensor, nullptr, true, scalar_type); - t_idx++; - } - i++; - } - - auto outputDataType = getMPSScalarType(out.scalar_type()); - Placeholder outputPlaceholder = - Placeholder(cachedGraph->outputTensor_, out, /*mpsShape=*/nil, /*gatherTensorData=*/false, outputDataType); - - NSMutableDictionary* feeds = [[NSMutableDictionary new] autorelease]; - for (auto& inputPlaceholder : inputPlaceholders) { - feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); - } - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, 
outputPlaceholder); + return mps::cat_out_mps_impl(materialized_inputs, dimension, out); + } else { + return mps::cat_out_mps_impl(materialized_inputs, dimension, out); } } From 470e2f61c3b2083e8d895b6aae5ede198bba5696 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 17 Oct 2025 00:06:40 +0000 Subject: [PATCH 007/123] Revert "[Fix] Use sys.executable instead of hardcoded python (#165633)" This reverts commit 37f3ba274a8ccebc6b3409f52cf068a8b23617d4. Reverted https://github.com/pytorch/pytorch/pull/165633 on behalf of https://github.com/malfet due to Looks like it broke test_collect_callgrind in slow workflows, see https://hud.pytorch.org/hud/pytorch/pytorch/e0fe37fa687a39e42ddeeb5c03986ffd5c40e662/1?per_page=50&name_filter=slow&mergeEphemeralLF=true ([comment](https://github.com/pytorch/pytorch/pull/165633#issuecomment-3413290813)) --- torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py b/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py index 3788a44e062c..e80416482271 100644 --- a/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py +++ b/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py @@ -640,7 +640,7 @@ class _ValgrindWrapper: stat_log=stat_log, bindings=self._bindings_module)) - run_loop_cmd = [sys.executable, script_file] + run_loop_cmd = ["python", script_file] else: if collect_baseline: raise AssertionError("collect_baseline must be False for non-Python timers") From b2953f5643c6627d2bd0ceb9d2ccb32e2545c549 Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Fri, 17 Oct 2025 00:09:49 +0000 Subject: [PATCH 008/123] [9/N] Apply ruff UP035 rule (#165515) This is follow-up of #165214 to continue applying ruff UP035 rule to the code base. 
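The typical rewrite applied throughout this diff looks as follows (a representative sketch rather than an exhaustive list of the touched modules):

```python
# Before (flagged by ruff UP035: Callable should be imported from collections.abc, not typing):
from typing import Any, Callable, Optional

# After:
from collections.abc import Callable
from typing import Any, Optional

# Where Callable is only needed for type annotations, the import is moved under
# TYPE_CHECKING so it carries no runtime cost:
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from collections.abc import Callable
```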
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165515 Approved by: https://github.com/Lucaskabela --- benchmarks/dynamo/cachebench.py | 2 +- benchmarks/dynamo/genai_layers/utils.py | 3 ++- benchmarks/dynamo/torchao_backend.py | 3 ++- .../functional_autograd_benchmark.py | 3 ++- benchmarks/functional_autograd_benchmark/utils.py | 3 ++- benchmarks/gpt_fast/common.py | 3 ++- benchmarks/inductor_backends/cutlass.py | 3 ++- benchmarks/transformer/attention_bias_benchmarks.py | 3 ++- benchmarks/transformer/score_mod.py | 3 ++- benchmarks/transformer/sdpa.py | 2 +- functorch/dim/__init__.py | 4 ++-- functorch/dim/_wrap.py | 6 +++++- functorch/dim/wrap_type.py | 3 ++- functorch/einops/rearrange.py | 4 ++-- tools/autograd/context.py | 2 +- tools/autograd/gen_python_functions.py | 4 ++-- tools/autograd/gen_variable_type.py | 4 ++-- tools/flight_recorder/components/fr_logger.py | 3 ++- tools/github/github_utils.py | 6 +++++- tools/linter/adapters/docstring_linter.py | 4 ++-- tools/linter/adapters/no_workflows_on_fork.py | 6 +++++- tools/nightly.py | 3 ++- tools/stats/import_test_stats.py | 6 +++++- tools/stats/upload_external_contrib_stats.py | 6 +++++- tools/stats/upload_stats_lib.py | 6 +++++- tools/testing/target_determination/heuristics/filepath.py | 6 +++++- tools/testing/test_selections.py | 4 ++-- 27 files changed, 72 insertions(+), 33 deletions(-) diff --git a/benchmarks/dynamo/cachebench.py b/benchmarks/dynamo/cachebench.py index 9244612b5aeb..c4d79a1b12ce 100644 --- a/benchmarks/dynamo/cachebench.py +++ b/benchmarks/dynamo/cachebench.py @@ -6,7 +6,7 @@ import os import subprocess import sys import tempfile -from typing import Callable +from collections.abc import Callable from torch._inductor.utils import fresh_cache diff --git a/benchmarks/dynamo/genai_layers/utils.py b/benchmarks/dynamo/genai_layers/utils.py index 749b9cea2032..2db2d7300df5 100644 --- a/benchmarks/dynamo/genai_layers/utils.py +++ b/benchmarks/dynamo/genai_layers/utils.py @@ -1,7 +1,8 @@ import os from collections import defaultdict +from collections.abc import Callable from dataclasses import dataclass -from typing import Any, Callable, Optional +from typing import Any, Optional import matplotlib.pyplot as plt diff --git a/benchmarks/dynamo/torchao_backend.py b/benchmarks/dynamo/torchao_backend.py index 96e1c4569274..6b4204db7b36 100644 --- a/benchmarks/dynamo/torchao_backend.py +++ b/benchmarks/dynamo/torchao_backend.py @@ -1,4 +1,5 @@ -from typing import Any, Callable +from collections.abc import Callable +from typing import Any import torch diff --git a/benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py b/benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py index a974eb8ae5ca..9d5772c4f124 100644 --- a/benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py +++ b/benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py @@ -1,7 +1,8 @@ import time from argparse import ArgumentParser from collections import defaultdict -from typing import Any, Callable, NamedTuple +from collections.abc import Callable +from typing import Any, NamedTuple import torch from torch.autograd import functional diff --git a/benchmarks/functional_autograd_benchmark/utils.py b/benchmarks/functional_autograd_benchmark/utils.py index 46f0061cd3fe..8efc0bdcddd1 100644 --- a/benchmarks/functional_autograd_benchmark/utils.py +++ b/benchmarks/functional_autograd_benchmark/utils.py @@ -1,5 +1,6 @@ from collections import defaultdict -from typing import 
Callable, Optional, Union +from collections.abc import Callable +from typing import Optional, Union import torch from torch import nn, Tensor diff --git a/benchmarks/gpt_fast/common.py b/benchmarks/gpt_fast/common.py index 5d9fc7c4aa6b..4cbd0bd0f2dc 100644 --- a/benchmarks/gpt_fast/common.py +++ b/benchmarks/gpt_fast/common.py @@ -1,5 +1,6 @@ import dataclasses -from typing import Callable, Optional +from collections.abc import Callable +from typing import Optional all_experiments: dict[str, Callable] = {} diff --git a/benchmarks/inductor_backends/cutlass.py b/benchmarks/inductor_backends/cutlass.py index 7141872ec3c4..b2ed506302ae 100644 --- a/benchmarks/inductor_backends/cutlass.py +++ b/benchmarks/inductor_backends/cutlass.py @@ -9,8 +9,9 @@ import logging import time from abc import abstractmethod from collections import defaultdict +from collections.abc import Callable from dataclasses import asdict, dataclass, field -from typing import Any, Callable, Optional +from typing import Any, Optional from tabulate import tabulate from tqdm import tqdm diff --git a/benchmarks/transformer/attention_bias_benchmarks.py b/benchmarks/transformer/attention_bias_benchmarks.py index 2154e11237e9..f6bf45063309 100644 --- a/benchmarks/transformer/attention_bias_benchmarks.py +++ b/benchmarks/transformer/attention_bias_benchmarks.py @@ -1,7 +1,8 @@ import itertools +from collections.abc import Callable from dataclasses import asdict, dataclass from functools import partial -from typing import Callable, Union +from typing import Union import numpy as np from tabulate import tabulate diff --git a/benchmarks/transformer/score_mod.py b/benchmarks/transformer/score_mod.py index 4be4a1e7c46c..f812ede7f635 100644 --- a/benchmarks/transformer/score_mod.py +++ b/benchmarks/transformer/score_mod.py @@ -3,10 +3,11 @@ import csv import itertools import random from collections import defaultdict +from collections.abc import Callable from contextlib import nullcontext from dataclasses import asdict, dataclass from functools import partial -from typing import Callable, Optional, Union +from typing import Optional, Union import numpy as np from tabulate import tabulate diff --git a/benchmarks/transformer/sdpa.py b/benchmarks/transformer/sdpa.py index 2eca4bf06b44..b4bc77bafdd6 100644 --- a/benchmarks/transformer/sdpa.py +++ b/benchmarks/transformer/sdpa.py @@ -1,8 +1,8 @@ import itertools from collections import defaultdict +from collections.abc import Callable from contextlib import nullcontext from dataclasses import asdict, dataclass -from typing import Callable from tabulate import tabulate from tqdm import tqdm diff --git a/functorch/dim/__init__.py b/functorch/dim/__init__.py index 1d7a4307c310..df9ca766e28f 100644 --- a/functorch/dim/__init__.py +++ b/functorch/dim/__init__.py @@ -3,11 +3,11 @@ from __future__ import annotations import dis import inspect import sys -from typing import Any, Callable, Optional, TYPE_CHECKING, Union +from typing import Any, Optional, TYPE_CHECKING, Union if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import Callable, Sequence import torch from torch.utils._pytree import tree_flatten, tree_map, tree_unflatten diff --git a/functorch/dim/_wrap.py b/functorch/dim/_wrap.py index 4b359f6a1d58..3c3a12b54ceb 100644 --- a/functorch/dim/_wrap.py +++ b/functorch/dim/_wrap.py @@ -5,7 +5,7 @@ Python implementation of function wrapping functionality for functorch.dim. 
from __future__ import annotations import functools -from typing import Any, Callable, Optional +from typing import Any, Optional, TYPE_CHECKING import torch from torch.utils._pytree import tree_map @@ -15,6 +15,10 @@ from ._enable_all_layers import EnableAllLayers from ._tensor_info import TensorInfo +if TYPE_CHECKING: + from collections.abc import Callable + + def handle_from_tensor(tensor: torch.Tensor) -> torch.Tensor: """Handle tensor conversion for torch function integration.""" return tensor diff --git a/functorch/dim/wrap_type.py b/functorch/dim/wrap_type.py index cf4a195f3c74..5020e756ce6c 100644 --- a/functorch/dim/wrap_type.py +++ b/functorch/dim/wrap_type.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. import functools +from collections.abc import Callable from types import ( BuiltinMethodType, FunctionType, @@ -12,7 +13,7 @@ from types import ( MethodDescriptorType, WrapperDescriptorType, ) -from typing import Any, Callable +from typing import Any FUNC_TYPES = ( diff --git a/functorch/einops/rearrange.py b/functorch/einops/rearrange.py index 473a43816668..21e3bfaad4d8 100644 --- a/functorch/einops/rearrange.py +++ b/functorch/einops/rearrange.py @@ -1,7 +1,7 @@ from __future__ import annotations import functools -from typing import Callable, TYPE_CHECKING, Union +from typing import TYPE_CHECKING, Union import torch from functorch.dim import dims # noqa: F401 @@ -16,7 +16,7 @@ from ._parsing import ( if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import Callable, Sequence __all__ = ["rearrange"] diff --git a/tools/autograd/context.py b/tools/autograd/context.py index 146cf571d304..0ed4b2ee4d01 100644 --- a/tools/autograd/context.py +++ b/tools/autograd/context.py @@ -1,5 +1,5 @@ import functools -from typing import Callable +from collections.abc import Callable from torchgen.api.autograd import NativeFunctionWithDifferentiabilityInfo as NFWDI from torchgen.context import native_function_manager diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 5a003cadf6b3..af25d55ef38d 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -36,7 +36,7 @@ from __future__ import annotations import itertools import re from collections import defaultdict -from typing import Callable, TYPE_CHECKING +from typing import TYPE_CHECKING import yaml @@ -77,7 +77,7 @@ from .gen_trace_type import should_trace if TYPE_CHECKING: - from collections.abc import Iterable, Sequence + from collections.abc import Callable, Iterable, Sequence # diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index ed5a6e6cf398..5ce3b06af145 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -29,7 +29,7 @@ from __future__ import annotations import re -from typing import Callable, TYPE_CHECKING +from typing import TYPE_CHECKING from torchgen.api import cpp from torchgen.api.autograd import ( @@ -106,7 +106,7 @@ from .gen_trace_type import ( if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import Callable, Sequence # We don't set or modify grad_fn on these methods. 
Generally, they return diff --git a/tools/flight_recorder/components/fr_logger.py b/tools/flight_recorder/components/fr_logger.py index 9574df97437b..49d878bf4559 100644 --- a/tools/flight_recorder/components/fr_logger.py +++ b/tools/flight_recorder/components/fr_logger.py @@ -5,7 +5,8 @@ # LICENSE file in the root directory of this source tree. import logging -from typing import Any, Callable, Optional +from collections.abc import Callable +from typing import Any, Optional class FlightRecorderLogger: diff --git a/tools/github/github_utils.py b/tools/github/github_utils.py index 6442a0644282..dc078fe29fad 100644 --- a/tools/github/github_utils.py +++ b/tools/github/github_utils.py @@ -4,12 +4,16 @@ from __future__ import annotations import json import os -from typing import Any, Callable, cast +from typing import Any, cast, TYPE_CHECKING from urllib.error import HTTPError from urllib.parse import quote from urllib.request import Request, urlopen +if TYPE_CHECKING: + from collections.abc import Callable + + def gh_fetch_url_and_headers( url: str, *, diff --git a/tools/linter/adapters/docstring_linter.py b/tools/linter/adapters/docstring_linter.py index 477bfe7d9a80..ce891bedcf99 100644 --- a/tools/linter/adapters/docstring_linter.py +++ b/tools/linter/adapters/docstring_linter.py @@ -5,7 +5,7 @@ import json import sys from functools import cached_property from pathlib import Path -from typing import Any, Callable, TYPE_CHECKING +from typing import Any, TYPE_CHECKING _FILE = Path(__file__).absolute() @@ -18,7 +18,7 @@ else: import _linter if TYPE_CHECKING: - from collections.abc import Iterator, Sequence + from collections.abc import Callable, Iterator, Sequence GRANDFATHER_LIST = _FILE.parent / "docstring_linter-grandfather.json" diff --git a/tools/linter/adapters/no_workflows_on_fork.py b/tools/linter/adapters/no_workflows_on_fork.py index 81e11a47f67b..02efd5f6f62a 100644 --- a/tools/linter/adapters/no_workflows_on_fork.py +++ b/tools/linter/adapters/no_workflows_on_fork.py @@ -22,11 +22,15 @@ import os import re from enum import Enum from pathlib import Path -from typing import Any, Callable, NamedTuple, Optional +from typing import Any, NamedTuple, Optional, TYPE_CHECKING from yaml import load +if TYPE_CHECKING: + from collections.abc import Callable + + # Safely load fast C Yaml loader/dumper if they are available try: from yaml import CSafeLoader as Loader diff --git a/tools/nightly.py b/tools/nightly.py index 6361d7da67ce..ab60c71ae9b7 100755 --- a/tools/nightly.py +++ b/tools/nightly.py @@ -65,10 +65,11 @@ import textwrap import time import uuid from ast import literal_eval +from collections.abc import Callable from datetime import datetime from pathlib import Path from platform import system as platform_system -from typing import Any, Callable, cast, NamedTuple, TYPE_CHECKING, TypeVar +from typing import Any, cast, NamedTuple, TYPE_CHECKING, TypeVar if TYPE_CHECKING: diff --git a/tools/stats/import_test_stats.py b/tools/stats/import_test_stats.py index 8fb6be57e97d..a7c661340d13 100644 --- a/tools/stats/import_test_stats.py +++ b/tools/stats/import_test_stats.py @@ -7,10 +7,14 @@ import json import os import shutil from pathlib import Path -from typing import Any, Callable, cast +from typing import Any, cast, TYPE_CHECKING from urllib.request import urlopen +if TYPE_CHECKING: + from collections.abc import Callable + + REPO_ROOT = Path(__file__).resolve().parents[2] diff --git a/tools/stats/upload_external_contrib_stats.py b/tools/stats/upload_external_contrib_stats.py index 
6de0e4952143..ab31cf645cd5 100644 --- a/tools/stats/upload_external_contrib_stats.py +++ b/tools/stats/upload_external_contrib_stats.py @@ -6,13 +6,17 @@ import json import os import time import urllib.parse -from typing import Any, Callable, cast +from typing import Any, cast, TYPE_CHECKING from urllib.error import HTTPError from urllib.request import Request, urlopen from tools.stats.upload_stats_lib import upload_to_s3 +if TYPE_CHECKING: + from collections.abc import Callable + + FILTER_OUT_USERS = { "pytorchmergebot", "facebook-github-bot", diff --git a/tools/stats/upload_stats_lib.py b/tools/stats/upload_stats_lib.py index 3ef60171acf6..34548b80d76b 100644 --- a/tools/stats/upload_stats_lib.py +++ b/tools/stats/upload_stats_lib.py @@ -9,12 +9,16 @@ import time import zipfile from functools import lru_cache from pathlib import Path -from typing import Any, Callable, cast, Optional +from typing import Any, cast, Optional, TYPE_CHECKING import boto3 # type: ignore[import] import requests +if TYPE_CHECKING: + from collections.abc import Callable + + PYTORCH_REPO = "https://api.github.com/repos/pytorch/pytorch" diff --git a/tools/testing/target_determination/heuristics/filepath.py b/tools/testing/target_determination/heuristics/filepath.py index e9bdd920b4ce..9cd4ccd862a4 100644 --- a/tools/testing/target_determination/heuristics/filepath.py +++ b/tools/testing/target_determination/heuristics/filepath.py @@ -3,7 +3,7 @@ from __future__ import annotations from collections import defaultdict from functools import lru_cache from pathlib import Path -from typing import Any, Callable +from typing import Any, TYPE_CHECKING from warnings import warn from tools.testing.target_determination.heuristics.interface import ( @@ -17,6 +17,10 @@ from tools.testing.target_determination.heuristics.utils import ( from tools.testing.test_run import TestRun +if TYPE_CHECKING: + from collections.abc import Callable + + REPO_ROOT = Path(__file__).parents[3] keyword_synonyms: dict[str, list[str]] = { diff --git a/tools/testing/test_selections.py b/tools/testing/test_selections.py index 9493e35f97d7..4a5fbb6a836b 100644 --- a/tools/testing/test_selections.py +++ b/tools/testing/test_selections.py @@ -4,7 +4,7 @@ import math import os import subprocess from pathlib import Path -from typing import Callable, TYPE_CHECKING +from typing import TYPE_CHECKING from tools.stats.import_test_stats import get_disabled_tests from tools.testing.test_run import ShardedTest, TestRun @@ -19,7 +19,7 @@ except ImportError: if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import Callable, Sequence REPO_ROOT = Path(__file__).resolve().parents[2] From 5b2afe4c5dc87786ca65bf22ca9a78f7c21a33a4 Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Fri, 17 Oct 2025 00:40:07 +0000 Subject: [PATCH 009/123] Turn some const variables into constexpr in C++ code (#165401) This PR checks the C++ code and turns some const variables into constexpr. 
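The pattern is mechanical throughout; as a rough standalone sketch of the before/after shape, using the SELU constant from Activation.cpp below as the example (simplified into its own translation unit, not taken verbatim from any hunk):

    // Before: a namespace-scope `const double`; for floating-point types a
    // plain `const` is not usable in constant expressions.
    static const double SELU_SCALE_BEFORE = 1.0507009873554804934193349852946;

    // After: `constexpr` guarantees compile-time evaluation of the initializer
    // and lets the value be used wherever a constant expression is required.
    static constexpr double SELU_SCALE = 1.0507009873554804934193349852946;
    static_assert(SELU_SCALE > 1.0, "usable in static_assert once constexpr");

    int main() { return 0; }
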
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165401 Approved by: https://github.com/Skylion007 --- aten/src/ATen/core/PhiloxRNGEngine.h | 8 ++-- aten/src/ATen/cuda/CUDAGeneratorImpl.cpp | 12 ++--- aten/src/ATen/native/Activation.cpp | 4 +- aten/src/ATen/native/BlasKernel.cpp | 4 +- aten/src/ATen/native/Distributions.h | 4 +- aten/src/ATen/native/Math.h | 6 +-- aten/src/ATen/native/Normalization.cpp | 2 +- aten/src/ATen/native/cpu/UpSampleKernel.cpp | 6 +-- aten/src/ATen/native/cuda/DilatedMaxPool2d.cu | 2 +- aten/src/ATen/native/cuda/Embedding.cu | 4 +- aten/src/ATen/native/cuda/IGammaKernel.cu | 46 +++++++++---------- aten/src/ATen/native/cuda/Math.cuh | 8 ++-- aten/src/ATen/native/cuda/UpSample.cuh | 4 +- aten/src/ATen/native/mkldnn/Matmul.cpp | 2 +- .../cpu/kernels/QuantizedOpKernels.cpp | 2 +- .../src/ATen/native/quantized/cpu/qlinear.cpp | 2 +- .../ATen/native/quantized/cpu/qsoftmax.cpp | 4 +- .../epilogue_thread_apply_logsumexp.h | 6 +-- aten/src/ATen/test/pow_test.cpp | 20 ++++---- aten/src/ATen/xpu/XPUGeneratorImpl.cpp | 12 ++--- 20 files changed, 79 insertions(+), 79 deletions(-) diff --git a/aten/src/ATen/core/PhiloxRNGEngine.h b/aten/src/ATen/core/PhiloxRNGEngine.h index 413055d3fad6..e8bac545933c 100644 --- a/aten/src/ATen/core/PhiloxRNGEngine.h +++ b/aten/src/ATen/core/PhiloxRNGEngine.h @@ -229,10 +229,10 @@ private: } - static const uint32_t kPhilox10A = 0x9E3779B9; - static const uint32_t kPhilox10B = 0xBB67AE85; - static const uint32_t kPhiloxSA = 0xD2511F53; - static const uint32_t kPhiloxSB = 0xCD9E8D57; + static constexpr uint32_t kPhilox10A = 0x9E3779B9; + static constexpr uint32_t kPhilox10B = 0xBB67AE85; + static constexpr uint32_t kPhiloxSA = 0xD2511F53; + static constexpr uint32_t kPhiloxSB = 0xCD9E8D57; }; typedef philox_engine Philox4_32; diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp index 9f7c9ba881e9..2e387fbc264d 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp @@ -325,9 +325,9 @@ uint64_t CUDAGeneratorImpl::seed() { */ c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { // The RNG state comprises the seed, and an offset used for Philox. - static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(int64_t); - static const size_t total_size = seed_size + offset_size; + constexpr size_t seed_size = sizeof(uint64_t); + constexpr size_t offset_size = sizeof(int64_t); + constexpr size_t total_size = seed_size + offset_size; auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt); auto rng_state = state_tensor.data_ptr(); @@ -346,9 +346,9 @@ c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { * and size of the internal state. 
*/ void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { - static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(int64_t); - static const size_t total_size = seed_size + offset_size; + constexpr size_t seed_size = sizeof(uint64_t); + constexpr size_t offset_size = sizeof(int64_t); + constexpr size_t total_size = seed_size + offset_size; detail::check_rng_state(new_state); diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index 861c51f16097..c164120a1f3c 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -240,8 +240,8 @@ TORCH_META_FUNC(gelu_backward) ( namespace at::native { -static const double SELU_ALPHA = 1.6732632423543772848170429916717; -static const double SELU_SCALE = 1.0507009873554804934193349852946; +static constexpr double SELU_ALPHA = 1.6732632423543772848170429916717; +static constexpr double SELU_SCALE = 1.0507009873554804934193349852946; DEFINE_DISPATCH(elu_stub); DEFINE_DISPATCH(elu_backward_stub); diff --git a/aten/src/ATen/native/BlasKernel.cpp b/aten/src/ATen/native/BlasKernel.cpp index a77604c535c1..b476ca3cff8f 100644 --- a/aten/src/ATen/native/BlasKernel.cpp +++ b/aten/src/ATen/native/BlasKernel.cpp @@ -286,7 +286,7 @@ template void scal_fast_path(int *n, scalar_t *a, scalar_t *x, int *in #if AT_BUILD_WITH_BLAS() template <> bool scal_use_fast_path(int64_t n, int64_t incx) { - auto intmax = std::numeric_limits::max(); + auto constexpr intmax = std::numeric_limits::max(); return n <= intmax && incx <= intmax; } @@ -315,7 +315,7 @@ bool gemv_use_fast_path( int64_t incx, [[maybe_unused]] float beta, int64_t incy) { - auto intmax = std::numeric_limits::max(); + auto constexpr intmax = std::numeric_limits::max(); return (m <= intmax) && (n <= intmax) && (lda <= intmax) && (incx > 0) && (incx <= intmax) && (incy > 0) && (incy <= intmax); } diff --git a/aten/src/ATen/native/Distributions.h b/aten/src/ATen/native/Distributions.h index 1c9db44aebb0..ab7d82dbeab4 100644 --- a/aten/src/ATen/native/Distributions.h +++ b/aten/src/ATen/native/Distributions.h @@ -127,7 +127,7 @@ C10_DEVICE scalar_t sample_gamma(scalar_t alpha, BaseSampler C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) { - const static scalar_t kTailValues[] = { + constexpr static scalar_t kTailValues[] = { 0.0810614667953272, 0.0413406959554092, 0.0276779256849983, @@ -139,7 +139,7 @@ C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) { 0.00925546218271273, 0.00833056343336287 }; - if (k <= 9) { + if (k <= sizeof(kTailValues)/sizeof(scalar_t)) { return kTailValues[static_cast(k)]; } scalar_t kp1sq = (k + 1) * (k + 1); diff --git a/aten/src/ATen/native/Math.h b/aten/src/ATen/native/Math.h index b261da5fe54e..4677542706f6 100644 --- a/aten/src/ATen/native/Math.h +++ b/aten/src/ATen/native/Math.h @@ -581,7 +581,7 @@ scalar_t ratevl(scalar_t x, const scalar_t num[], int64_t M, template static scalar_t lanczos_sum_expg_scaled(scalar_t x) { // lanczos approximation - static const scalar_t lanczos_sum_expg_scaled_num[13] = { + static constexpr scalar_t lanczos_sum_expg_scaled_num[13] = { 0.006061842346248906525783753964555936883222, 0.5098416655656676188125178644804694509993, 19.51992788247617482847860966235652136208, @@ -596,7 +596,7 @@ static scalar_t lanczos_sum_expg_scaled(scalar_t x) { 103794043.1163445451906271053616070238554, 56906521.91347156388090791033559122686859 }; - static const scalar_t lanczos_sum_expg_scaled_denom[13] = { + static constexpr scalar_t 
lanczos_sum_expg_scaled_denom[13] = { 1., 66., 1925., @@ -712,7 +712,7 @@ static scalar_t _igamc_helper_series(scalar_t a, scalar_t x) { template static scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) { // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1] - static const scalar_t d[25][25] = + static constexpr scalar_t d[25][25] = {{-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2, 1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4, 3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6, diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 86941806d307..72526162d133 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -62,7 +62,7 @@ #include #include -static const int MIOPEN_DIM_MAX = 5; +static constexpr int MIOPEN_DIM_MAX = 5; namespace at::meta { diff --git a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp index bd421aad111d..e59e5985bf7f 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp @@ -1038,7 +1038,7 @@ struct HelperInterpNearest : public HelperInterpBase { // We keep this structure for BC and consider as deprecated. // See HelperInterpNearestExact as replacement - static const int interp_size = 1; + static constexpr int interp_size = 1; static inline void init_indices_weights( at::ScalarType output_type, @@ -1155,7 +1155,7 @@ struct HelperInterpNearestExact : public HelperInterpNearest { struct HelperInterpLinear : public HelperInterpBase { - static const int interp_size = 2; + static constexpr int interp_size = 2; // Compute indices and weights for each interpolated dimension // indices_weights = { @@ -1275,7 +1275,7 @@ struct HelperInterpLinear : public HelperInterpBase { struct HelperInterpCubic : public HelperInterpBase { - static const int interp_size = 4; + static constexpr int interp_size = 4; // Compute indices and weights for each interpolated dimension // indices_weights = { diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index edb502688860..344906a2a4df 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -249,7 +249,7 @@ __global__ void max_pool_forward_nhwc( } -static const int BLOCK_THREADS = 256; +static constexpr int BLOCK_THREADS = 256; template #if defined (USE_ROCM) diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index 602dfd6e5288..adc300a5a9ef 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -36,9 +36,9 @@ namespace at::native { namespace { #if defined(USE_ROCM) -static const int BLOCKDIMY = 16; +static constexpr int BLOCKDIMY = 16; #else -static const int BLOCKDIMY = 32; +static constexpr int BLOCKDIMY = 32; #endif template diff --git a/aten/src/ATen/native/cuda/IGammaKernel.cu b/aten/src/ATen/native/cuda/IGammaKernel.cu index 624f080d9f6e..73db6272be9e 100644 --- a/aten/src/ATen/native/cuda/IGammaKernel.cu +++ b/aten/src/ATen/native/cuda/IGammaKernel.cu @@ -82,7 +82,7 @@ __host__ __device__ scalar_t lanczos_sum_expg_scaled(scalar_t x) { // lanczos approximation using accscalar_t = at::acc_type; - static const accscalar_t lanczos_sum_expg_scaled_num[13] = { + constexpr accscalar_t lanczos_sum_expg_scaled_num[13] = { 0.006061842346248906525783753964555936883222, 
0.5098416655656676188125178644804694509993, 19.51992788247617482847860966235652136208, @@ -97,7 +97,7 @@ __host__ __device__ scalar_t lanczos_sum_expg_scaled(scalar_t x) { 103794043.1163445451906271053616070238554, 56906521.91347156388090791033559122686859 }; - static const accscalar_t lanczos_sum_expg_scaled_denom[13] = { + constexpr accscalar_t lanczos_sum_expg_scaled_denom[13] = { 1., 66., 1925., @@ -126,10 +126,10 @@ __host__ __device__ scalar_t _igam_helper_fac(scalar_t a, scalar_t x) { using accscalar_t = at::acc_type; accscalar_t ax, fac, res, num, numfac; - static const accscalar_t MAXLOG = std::is_same_v ? + constexpr accscalar_t MAXLOG = std::is_same_v ? 7.09782712893383996843E2 : 88.72283905206835; - static const accscalar_t EXP1 = 2.718281828459045; - static const accscalar_t lanczos_g = 6.024680040776729583740234375; + constexpr accscalar_t EXP1 = 2.718281828459045; + constexpr accscalar_t lanczos_g = 6.024680040776729583740234375; if (::fabs(a - x) > 0.4 * ::fabs(a)) { ax = a * ::log(x) - x - ::lgamma(a); @@ -158,9 +158,9 @@ __host__ __device__ scalar_t _igam_helper_series(scalar_t a, scalar_t x) { // Compute igam using DLMF 8.11.4. [igam1] using accscalar_t = at::acc_type; - static const accscalar_t MACHEP = std::is_same_v ? + constexpr accscalar_t MACHEP = std::is_same_v ? 1.11022302462515654042E-16 : 5.9604644775390625E-8; - static const int MAXITER = 2000; + constexpr int MAXITER = 2000; int i; accscalar_t ans, ax, c, r; @@ -196,8 +196,8 @@ __host__ __device__ scalar_t _igamc_helper_series(scalar_t a, scalar_t x) { accscalar_t fac = 1; accscalar_t sum = 0; accscalar_t term, logx; - static const int MAXITER = 2000; - static const accscalar_t MACHEP = std::is_same_v ? + constexpr int MAXITER = 2000; + constexpr accscalar_t MACHEP = std::is_same_v ? 
1.11022302462515654042E-16 : 5.9604644775390625E-8; for (n = 1; n < MAXITER; n++) { @@ -219,7 +219,7 @@ __host__ __device__ scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1] using accscalar_t = at::acc_type; - static const accscalar_t d[25][25] = + constexpr accscalar_t d[25][25] = {{-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2, 1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4, 3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6, 8.296711340953086e-7, -1.7665952736826079e-7, 6.7078535434014986e-9, 1.0261809784240308e-8, -4.3820360184533532e-9, 9.1476995822367902e-10, -2.551419399494625e-11, -5.8307721325504251e-11, 2.4361948020667416e-11, -5.0276692801141756e-12, 1.1004392031956135e-13, 3.3717632624009854e-13, -1.3923887224181621e-13, 2.8534893807047443e-14, -5.1391118342425726e-16, -1.9752288294349443e-15}, {-1.8518518518518519e-3, -3.4722222222222222e-3, 2.6455026455026455e-3, -9.9022633744855967e-4, 2.0576131687242798e-4, -4.0187757201646091e-7, -1.8098550334489978e-5, 7.6491609160811101e-6, -1.6120900894563446e-6, 4.6471278028074343e-9, 1.378633446915721e-7, -5.752545603517705e-8, 1.1951628599778147e-8, -1.7543241719747648e-11, -1.0091543710600413e-9, 4.1627929918425826e-10, -8.5639070264929806e-11, 6.0672151016047586e-14, 7.1624989648114854e-12, -2.9331866437714371e-12, 5.9966963656836887e-13, -2.1671786527323314e-16, -4.9783399723692616e-14, 2.0291628823713425e-14, -4.13125571381061e-15}, {4.1335978835978836e-3, -2.6813271604938272e-3, 7.7160493827160494e-4, 2.0093878600823045e-6, -1.0736653226365161e-4, 5.2923448829120125e-5, -1.2760635188618728e-5, 3.4235787340961381e-8, 1.3721957309062933e-6, -6.298992138380055e-7, 1.4280614206064242e-7, -2.0477098421990866e-10, -1.4092529910867521e-8, 6.228974084922022e-9, -1.3670488396617113e-9, 9.4283561590146782e-13, 1.2872252400089318e-10, -5.5645956134363321e-11, 1.1975935546366981e-11, -4.1689782251838635e-15, -1.0940640427884594e-12, 4.6622399463901357e-13, -9.905105763906906e-14, 1.8931876768373515e-17, 8.8592218725911273e-15}, @@ -248,7 +248,7 @@ __host__ __device__ scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t int k, n, sgn; int maxpow = 0; - static const accscalar_t MACHEP = std::is_same_v ? + constexpr accscalar_t MACHEP = std::is_same_v ? 1.11022302462515654042E-16 : 5.9604644775390625E-8; accscalar_t lambda = x / a; accscalar_t sigma = (x - a) / a; @@ -314,12 +314,12 @@ __host__ __device__ scalar_t _igamc_helper_continued_fraction(scalar_t a, scalar int i; accscalar_t ans, ax, c, yc, r, t, y, z; accscalar_t pk, pkm1, pkm2, qk, qkm1, qkm2; - static const int MAXITER = 2000; - static const accscalar_t MACHEP = std::is_same_v ? + constexpr int MAXITER = 2000; + constexpr accscalar_t MACHEP = std::is_same_v ? 1.11022302462515654042E-16 : 5.9604644775390625E-8; - static const accscalar_t BIG = std::is_same_v ? + constexpr accscalar_t BIG = std::is_same_v ? 4.503599627370496e15 : 16777216.; - static const accscalar_t BIGINV = std::is_same_v ? + constexpr accscalar_t BIGINV = std::is_same_v ? 
2.22044604925031308085e-16 : 5.9604644775390625E-8; ax = _igam_helper_fac(a, x); @@ -385,10 +385,10 @@ __noinline__ __host__ __device__ scalar_t calc_igammac(scalar_t a, scalar_t x) { using accscalar_t = at::acc_type; accscalar_t absxma_a; - static const accscalar_t SMALL = 20.0; - static const accscalar_t LARGE = 200.0; - static const accscalar_t SMALLRATIO = 0.3; - static const accscalar_t LARGERATIO = 4.5; + constexpr accscalar_t SMALL = 20.0; + constexpr accscalar_t LARGE = 200.0; + constexpr accscalar_t SMALLRATIO = 0.3; + constexpr accscalar_t LARGERATIO = 4.5; if ((x < 0) || (a < 0)) { // out of defined-region of the function @@ -467,10 +467,10 @@ __noinline__ __host__ __device__ scalar_t calc_igamma(scalar_t a, scalar_t x) { using accscalar_t = at::acc_type; accscalar_t absxma_a; - static const accscalar_t SMALL = 20.0; - static const accscalar_t LARGE = 200.0; - static const accscalar_t SMALLRATIO = 0.3; - static const accscalar_t LARGERATIO = 4.5; + constexpr accscalar_t SMALL = 20.0; + constexpr accscalar_t LARGE = 200.0; + constexpr accscalar_t SMALLRATIO = 0.3; + constexpr accscalar_t LARGERATIO = 4.5; // boundary values following SciPy if ((x < 0) || (a < 0)) { diff --git a/aten/src/ATen/native/cuda/Math.cuh b/aten/src/ATen/native/cuda/Math.cuh index 1d603132e689..1fa245af1a4d 100644 --- a/aten/src/ATen/native/cuda/Math.cuh +++ b/aten/src/ATen/native/cuda/Math.cuh @@ -231,7 +231,7 @@ const auto lcm_string = jiterator_stringify( const auto digamma_string = jiterator_stringify( template T digamma(T x) { - static const double PI_f64 = 3.14159265358979323846; + static constexpr double PI_f64 = 3.14159265358979323846; // Short-circuits if x is +/- 0 and returns -/+ ∞ per the C++ standard if (x == 0) { @@ -3072,9 +3072,9 @@ template static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) { // [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma using accscalar_t = at::acc_type; - static const double PI_f64 = 3.14159265358979323846; - const accscalar_t PSI_10 = 2.25175258906672110764; - const accscalar_t A[] = { + static constexpr double PI_f64 = 3.14159265358979323846; + constexpr accscalar_t PSI_10 = 2.25175258906672110764; + constexpr accscalar_t A[] = { 8.33333333333333333333E-2, -2.10927960927960927961E-2, 7.57575757575757575758E-3, diff --git a/aten/src/ATen/native/cuda/UpSample.cuh b/aten/src/ATen/native/cuda/UpSample.cuh index 50428b377da8..09e094ea2bf0 100644 --- a/aten/src/ATen/native/cuda/UpSample.cuh +++ b/aten/src/ATen/native/cuda/UpSample.cuh @@ -277,7 +277,7 @@ struct BilinearFilterFunctor { return 0; } - static const int size = 2; + static constexpr int size = 2; }; // taken from @@ -301,7 +301,7 @@ struct BicubicFilterFunctor { return 0; } - static const int size = 4; + static constexpr int size = 4; }; template diff --git a/aten/src/ATen/native/mkldnn/Matmul.cpp b/aten/src/ATen/native/mkldnn/Matmul.cpp index 740c056a7f23..fbc8294f45cf 100644 --- a/aten/src/ATen/native/mkldnn/Matmul.cpp +++ b/aten/src/ATen/native/mkldnn/Matmul.cpp @@ -416,7 +416,7 @@ static inline bool checksize(const Tensor& mat1, const Tensor& mat2){ // else if dim = 3, mat1's size = (b * m * n), mat2's size = (b * n * k) // else called from aten::mv, mat1.size = (m * n), mat2.size = (n) // only m * n * b * k(if exist) are large enough we can get benefit from mkldnn optimized gemm kernel - static const int64_t mkldnn_gemm_min_size = 16 * 16 * 16; + constexpr int64_t mkldnn_gemm_min_size = 16 * 16 * 16; if (mat1.dim() == 1 && mat2.dim() == 1) { // 
aten::dot return mat1.size(0) > mkldnn_gemm_min_size; diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index 028047e4d6ac..293dfb20b9bf 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -3551,7 +3551,7 @@ void dequantize_tensor_per_tensor_affine_cpu( #if defined(__ARM_NEON__) || defined(__aarch64__) -const static int PARALLEL_THRESHOLD = 1 << 20; +constexpr static int PARALLEL_THRESHOLD = 1 << 20; // Generic template defaults to naive quantize implementation template diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index 897eefd91d21..7a80b166f8cb 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -1388,7 +1388,7 @@ namespace at::native { TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() <= 1, "onednn int8 linear: act scale/zp size should be 1/<=1"); static std::optional other = std::nullopt; - static const std::string_view binary_post_op = "none"; + constexpr std::string_view binary_post_op = "none"; int64_t act_zp = act_zero_point.numel() == 1 ? act_zero_point.item().toLong() : 0; return linear_int8_with_onednn_weight( act, act_scale.item().toDouble(), act_zp, diff --git a/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp b/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp index cd00a351b0e3..31221cd9bf26 100644 --- a/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp +++ b/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp @@ -16,8 +16,8 @@ namespace { #ifdef USE_PYTORCH_QNNPACK -const static float qnnpack_softmax_output_scale = 0x1.0p-8f; -const static int qnnpack_softmax_output_zero_point = 0; +constexpr static float qnnpack_softmax_output_scale = 0x1.0p-8f; +constexpr static int qnnpack_softmax_output_zero_point = 0; bool is_qnnpack_compatible( const Tensor& qx, diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_thread_apply_logsumexp.h b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_thread_apply_logsumexp.h index e3dc0778e46b..156034954d9e 100644 --- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_thread_apply_logsumexp.h +++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_thread_apply_logsumexp.h @@ -110,9 +110,9 @@ class ApplyLogSumExp { using ElementCompute = ElementCompute_; using ElementLSE = ElementLSE_; - static int const kElementsPerAccess = ElementsPerAccess; - static int const kCount = kElementsPerAccess; - static const ScaleType::Kind kScale = + static int constexpr kElementsPerAccess = ElementsPerAccess; + static int constexpr kCount = kElementsPerAccess; + static constexpr ScaleType::Kind kScale = cutlass::epilogue::thread::ScaleType::NoBetaScaling; using FragmentOutput = Array; diff --git a/aten/src/ATen/test/pow_test.cpp b/aten/src/ATen/test/pow_test.cpp index 95bb48b341f5..6391c3c8228c 100644 --- a/aten/src/ATen/test/pow_test.cpp +++ b/aten/src/ATen/test/pow_test.cpp @@ -14,16 +14,16 @@ using namespace at; namespace { -const auto int_min = std::numeric_limits::min(); -const auto int_max = std::numeric_limits::max(); -const auto long_min = std::numeric_limits::min(); -const auto long_max = std::numeric_limits::max(); -const auto float_lowest = std::numeric_limits::lowest(); -const auto float_min = std::numeric_limits::min(); 
-const auto float_max = std::numeric_limits::max(); -const auto double_lowest = std::numeric_limits::lowest(); -const auto double_min = std::numeric_limits::min(); -const auto double_max = std::numeric_limits::max(); +constexpr auto int_min = std::numeric_limits::min(); +constexpr auto int_max = std::numeric_limits::max(); +constexpr auto long_min = std::numeric_limits::min(); +constexpr auto long_max = std::numeric_limits::max(); +constexpr auto float_lowest = std::numeric_limits::lowest(); +constexpr auto float_min = std::numeric_limits::min(); +constexpr auto float_max = std::numeric_limits::max(); +constexpr auto double_lowest = std::numeric_limits::lowest(); +constexpr auto double_min = std::numeric_limits::min(); +constexpr auto double_max = std::numeric_limits::max(); const std::vector ints { int_min, diff --git a/aten/src/ATen/xpu/XPUGeneratorImpl.cpp b/aten/src/ATen/xpu/XPUGeneratorImpl.cpp index 14f3059cc2b3..7a0859671ba7 100644 --- a/aten/src/ATen/xpu/XPUGeneratorImpl.cpp +++ b/aten/src/ATen/xpu/XPUGeneratorImpl.cpp @@ -146,9 +146,9 @@ uint64_t XPUGeneratorImpl::seed() { c10::intrusive_ptr XPUGeneratorImpl::get_state() const { // The RNG state comprises the seed, and an offset used for Philox. - static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(uint64_t); - static const size_t total_size = seed_size + offset_size; + constexpr size_t seed_size = sizeof(uint64_t); + constexpr size_t offset_size = sizeof(uint64_t); + constexpr size_t total_size = seed_size + offset_size; // The internal state is returned as a CPU byte tensor. auto state_tensor = at::detail::empty_cpu( @@ -170,9 +170,9 @@ c10::intrusive_ptr XPUGeneratorImpl::get_state() const { void XPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { at::xpu::assertNotCapturing( "Please ensure to utilize the XPUGeneratorImpl::set_state_index method during capturing."); - static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(uint64_t); - static const size_t total_size = seed_size + offset_size; + constexpr size_t seed_size = sizeof(uint64_t); + constexpr size_t offset_size = sizeof(uint64_t); + constexpr size_t total_size = seed_size + offset_size; at::detail::check_rng_state(new_state); From 5d9b0242762e7a416a789365e987b63dfe6b030a Mon Sep 17 00:00:00 2001 From: Shangdi Yu Date: Thu, 16 Oct 2025 13:25:16 -0700 Subject: [PATCH 010/123] Add mingw to docker (#165560) Add mingw to `pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11` docker image to support AOTI cross-compilation This PR will make docker container rebuild, and upgrade python version from 3.13.7 to 3.13.8. 
and it relies on https://github.com/pytorch/pytorch/pull/165667 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165560 Approved by: https://github.com/malfet --- .ci/docker/build.sh | 2 ++ .ci/docker/common/install_mingw.sh | 10 ++++++++++ .ci/docker/ubuntu/Dockerfile | 5 +++++ 3 files changed, 17 insertions(+) create mode 100644 .ci/docker/common/install_mingw.sh diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index ff0df5a1983a..a23c85bc60a5 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -113,6 +113,7 @@ case "$tag" in UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} TRITON=yes + INSTALL_MINGW=yes ;; pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11) CUDA_VERSION=13.0.0 @@ -361,6 +362,7 @@ docker build \ --build-arg "OPENBLAS=${OPENBLAS:-}" \ --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \ --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \ + --build-arg "INSTALL_MINGW=${INSTALL_MINGW:-}" \ -f $(dirname ${DOCKERFILE})/Dockerfile \ -t "$tmp_tag" \ "$@" \ diff --git a/.ci/docker/common/install_mingw.sh b/.ci/docker/common/install_mingw.sh new file mode 100644 index 000000000000..6232a0d0245c --- /dev/null +++ b/.ci/docker/common/install_mingw.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -ex + +# Install MinGW-w64 for Windows cross-compilation +apt-get update +apt-get install -y g++-mingw-w64-x86-64-posix + +echo "MinGW-w64 installed successfully" +x86_64-w64-mingw32-g++ --version diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 1edc8c60c2f0..3f22a1276921 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -103,6 +103,11 @@ COPY ci_commit_pins/torchbench.txt torchbench.txt RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt +ARG INSTALL_MINGW +COPY ./common/install_mingw.sh install_mingw.sh +RUN if [ -n "${INSTALL_MINGW}" ]; then bash ./install_mingw.sh; fi +RUN rm install_mingw.sh + ARG TRITON ARG TRITON_CPU From d82527b32ad0e09309ff874458139ecf6994e030 Mon Sep 17 00:00:00 2001 From: Shangdi Yu Date: Thu, 16 Oct 2025 13:25:17 -0700 Subject: [PATCH 011/123] [Windows] Add AOTI cross-compilation CI (#165573) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165573 Approved by: https://github.com/malfet ghstack dependencies: #165560 --- .ci/pytorch/test.sh | 18 + .github/workflows/_linux-test.yml | 40 ++ .github/workflows/_win-build.yml | 25 ++ .github/workflows/trunk.yml | 17 + .../test_aoti_cross_compile_windows.py | 371 ++++++++++++++++++ tools/testing/discover_tests.py | 1 + 6 files changed, 472 insertions(+) create mode 100644 test/inductor/test_aoti_cross_compile_windows.py diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 9ca0decd087e..3e2dc09ef495 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -485,6 +485,22 @@ test_inductor_aoti() { /usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile } +test_inductor_aoti_cross_compile_for_windows() { + + TEST_REPORTS_DIR=$(pwd)/test/test-reports + mkdir -p "$TEST_REPORTS_DIR" + + # Set WINDOWS_CUDA_HOME environment variable + WINDOWS_CUDA_HOME="$(pwd)/win-torch-wheel-extracted" + export WINDOWS_CUDA_HOME + + echo "WINDOWS_CUDA_HOME is set to: $WINDOWS_CUDA_HOME" + echo "Contents:" + ls -lah 
"$(pwd)/win-torch-wheel-extracted/lib/x64/" || true + + python test/inductor/test_aoti_cross_compile_windows.py -k compile --package-dir "$TEST_REPORTS_DIR" --win-torch-lib-dir "$(pwd)/win-torch-wheel-extracted/torch/lib" +} + test_inductor_cpp_wrapper_shard() { if [[ -z "$NUM_TEST_SHARDS" ]]; then echo "NUM_TEST_SHARDS must be defined to run a Python test shard" @@ -1718,6 +1734,8 @@ elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then test_inductor_triton_cpu elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then test_inductor_micro_benchmark +elif [[ "${TEST_CONFIG}" == *aoti_cross_compile_for_windows* ]]; then + test_inductor_aoti_cross_compile_for_windows elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then install_torchvision id=$((SHARD_NUMBER-1)) diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index 89f13d3fea8f..29c2fc8e0847 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -224,6 +224,46 @@ jobs: continue-on-error: true uses: ./.github/actions/download-td-artifacts + - name: Download Windows torch wheel for cross-compilation + if: matrix.win_torch_wheel_artifact != '' + uses: seemethere/download-artifact-s3@1da556a7aa0a088e3153970611f6c432d58e80e6 # v4.2.0 + with: + name: ${{ matrix.win_torch_wheel_artifact }} + path: win-torch-wheel + + - name: Extract Windows wheel and setup CUDA libraries + if: matrix.win_torch_wheel_artifact != '' + shell: bash + run: | + set -x + + # Find the wheel file + WHEEL_FILE=$(find win-torch-wheel -name "*.whl" -type f | head -n 1) + if [ -z "$WHEEL_FILE" ]; then + echo "Error: No wheel file found in win-torch-wheel directory" + exit 1 + fi + echo "Found wheel file: $WHEEL_FILE" + + # Unzip the wheel file + unzip -q "$WHEEL_FILE" -d win-torch-wheel-extracted + echo "Extracted wheel contents" + + # Setup CUDA libraries (cuda.lib and cudart.lib) directory + mkdir -p win-torch-wheel-extracted/lib/x64 + if [ -f "win-torch-wheel/cuda.lib" ]; then + mv win-torch-wheel/cuda.lib win-torch-wheel-extracted/lib/x64/ + echo "Moved cuda.lib to win-torch-wheel-extracted/lib/x64/" + fi + if [ -f "win-torch-wheel/cudart.lib" ]; then + mv win-torch-wheel/cudart.lib win-torch-wheel-extracted/lib/x64/ + echo "Moved cudart.lib to win-torch-wheel-extracted/lib/x64/" + fi + + # Verify CUDA libraries are present + echo "CUDA libraries:" + ls -la win-torch-wheel-extracted/lib/x64/ || echo "No CUDA libraries found" + - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml index 153f6007b3f0..0fd3cf7f3972 100644 --- a/.github/workflows/_win-build.yml +++ b/.github/workflows/_win-build.yml @@ -168,6 +168,31 @@ jobs: run: | .ci/pytorch/win-build.sh + # Collect Windows torch libs and CUDA libs for cross-compilation + - name: Collect Windows CUDA libs for cross-compilation + if: steps.build.outcome != 'skipped' && inputs.cuda-version != 'cpu' + shell: bash + run: | + set -ex + + # Create directory structure if does not exist + mkdir -p /c/${{ github.run_id }}/build-results + + # Copy CUDA libs + CUDA_PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${{ inputs.cuda-version }}" + + if [ -f "${CUDA_PATH}/lib/x64/cuda.lib" ]; then + cp "${CUDA_PATH}/lib/x64/cuda.lib" /c/${{ github.run_id }}/build-results/ + fi + + if [ -f "${CUDA_PATH}/lib/x64/cudart.lib" ]; then + cp "${CUDA_PATH}/lib/x64/cudart.lib" /c/${{ github.run_id }}/build-results/ + fi + + # List collected files + echo "Collected CUDA 
libs:" + ls -lah /c/${{ github.run_id }}/build-results/*.lib + # Upload to github so that people can click and download artifacts - name: Upload artifacts to s3 if: steps.build.outcome != 'skipped' diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index c8aab0aee10e..710b6cfa9eaf 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -200,6 +200,23 @@ jobs: cuda-arch-list: '8.0' secrets: inherit + # Test cross-compiled models with Windows libs extracted from wheel + cross-compile-linux-test: + name: cross-compile-linux-test + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-build + - get-label-type + - win-vs2022-cuda12_8-py3-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "aoti_cross_compile_for_windows", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", win_torch_wheel_artifact: "win-vs2022-cuda12.8-py3" }, + ]} + secrets: inherit + verify-cachebench-cpu-build: name: verify-cachebench-cpu-build uses: ./.github/workflows/_linux-build.yml diff --git a/test/inductor/test_aoti_cross_compile_windows.py b/test/inductor/test_aoti_cross_compile_windows.py new file mode 100644 index 000000000000..04065add9081 --- /dev/null +++ b/test/inductor/test_aoti_cross_compile_windows.py @@ -0,0 +1,371 @@ +# Owner(s): ["module: inductor"] +import os +import platform +import tempfile +import unittest +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Optional + +import torch +import torch._inductor.config +from torch._inductor.test_case import TestCase +from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU, requires_gpu + + +@dataclass +class ModelTestConfig: + """Configuration for a model test case.""" + + name: str + model_class: type + example_inputs: tuple[torch.Tensor, ...] + dynamic_shapes: Optional[dict[str, Any]] = None + inductor_configs: Optional[dict[str, Any]] = None + rtol: float = 1e-4 + atol: float = 1e-4 + + +class WindowsCrossCompilationTestFramework: + """ + Framework for testing cross-compilation from Linux to Windows. + + Provides reusable logic for creating compile and load test methods. 
+ """ + + _base_path: Optional[Path] = None + _win_torch_libs_path: Optional[str] = None + + @classmethod + def base_path(cls) -> Path: + """Get or create the base path for package files.""" + if cls._base_path is None: + cls._base_path = Path(tempfile.mkdtemp(prefix="aoti_cross_compile_")) + return cls._base_path + + @classmethod + def set_base_path(cls, path: Optional[Path | str] = None) -> None: + """Set the base path for package files.""" + cls._base_path = Path(path) if path else None + + @classmethod + def set_win_torch_libs_path(cls, path: Optional[str] = None) -> None: + """Set the path for Windows torch libs.""" + cls._win_torch_libs_path = path + + @classmethod + def get_package_path(cls, model_name: str) -> str: + """Get the path for a model's .pt2 package file.""" + package_dir = cls.base_path() + package_dir.mkdir(parents=True, exist_ok=True) + return str(package_dir / f"{model_name}_windows.pt2") + + @classmethod + def get_win_torch_libs_path(cls) -> str: + """Get the path for Windows torch libs.""" + if cls._win_torch_libs_path is None: + raise RuntimeError("Windows torch libs path not set") + return str(cls._win_torch_libs_path) + + @classmethod + def create_compile_test(cls, config: ModelTestConfig): + """Create a compile test method for a model configuration.""" + + def compile_test(self): + if platform.system() == "Windows": + raise unittest.SkipTest( + "This test should run on Linux for cross-compilation" + ) + + self.assertTrue("WINDOWS_CUDA_HOME" in os.environ) + + with torch.no_grad(): + # Windows cross-compilation is only used for GPU. + # AOTI for CPU should be able to work as native compilation on Windows. + device = GPU_TYPE + model = config.model_class().to(device=device) + example_inputs = config.example_inputs + + # Inputs should already be on GPU_TYPE but ensure they are + example_inputs = tuple(inp.to(device) for inp in example_inputs) + + # Export the model + exported = torch.export.export( + model, example_inputs, dynamic_shapes=config.dynamic_shapes + ) + + # Prepare inductor configs + inductor_configs = { + "aot_inductor.cross_target_platform": "windows", + "aot_inductor.precompile_headers": False, + "aot_inductor.package_constants_on_disk_format": "binary_blob", + "aot_inductor.package_constants_in_so": False, + "aot_inductor.aoti_shim_library_path": cls.get_win_torch_libs_path(), + } + if config.inductor_configs: + inductor_configs.update(config.inductor_configs) + + # Compile and package directly to the expected location + package_path = cls.get_package_path(config.name) + torch._inductor.aoti_compile_and_package( + exported, + package_path=package_path, + inductor_configs=inductor_configs, + ) + + self.assertTrue( + os.path.exists(package_path), + f"Package file should exist at {package_path}", + ) + + return compile_test + + @classmethod + def create_load_test(cls, config: ModelTestConfig): + """Create a load test method for a model configuration.""" + + def load_test(self): + if platform.system() != "Windows": + raise unittest.SkipTest("This test should run on Windows") + + if not HAS_GPU: + raise unittest.SkipTest("Test requires GPU") + + package_path = cls.get_package_path(config.name) + if not os.path.exists(package_path): + raise unittest.SkipTest( + f"Package file not found at {package_path}. " + f"Run test_{config.name}_compile first." + ) + + with torch.no_grad(): + # Windows cross-compilation is only used for GPU. + # AOTI for CPU should be able to work as native compilation on Windows. 
+ device = GPU_TYPE + + # Create original model for comparison + original_model = config.model_class().to(device=device) + example_inputs = config.example_inputs + + # Inputs should already be on GPU_TYPE but ensure they are + example_inputs = tuple(inp.to(device) for inp in example_inputs) + + # Load the compiled package + loaded_model = torch._inductor.aoti_load_package(package_path) + + # Test with the same inputs + original_output = original_model(*example_inputs) + loaded_output = loaded_model(*example_inputs) + + # Compare outputs + torch.testing.assert_close( + original_output, loaded_output, rtol=config.rtol, atol=config.atol + ) + + return load_test + + +def auto_generate_tests(test_class): + """ + Class decorator to automatically generate compile/load test methods + from _define_* methods that return ModelTestConfig. + """ + # Find all _define_* methods that return ModelTestConfig + define_methods = {} + for name in dir(test_class): + if name.startswith("_define_") and callable(getattr(test_class, name)): + method = getattr(test_class, name) + # Try to call the method to see if it returns ModelTestConfig + try: + # Create a temporary instance to call the method + temp_instance = test_class.__new__(test_class) + result = method(temp_instance) + if isinstance(result, ModelTestConfig): + define_methods[name] = result + except Exception: + # If method fails, skip it + pass + + # Generate compile/load methods for each discovered definition + for define_name, config in define_methods.items(): + model_name = define_name[8:] # Remove '_define_' prefix + + # Create compile test method + compile_method_name = f"test_{model_name}_compile" + compile_method = WindowsCrossCompilationTestFramework.create_compile_test( + config + ) + compile_method.__name__ = compile_method_name + compile_method.__doc__ = f"Step 1: Cross-compile {model_name} model on Linux" + compile_method = requires_gpu()(compile_method) + setattr(test_class, compile_method_name, compile_method) + + # Create load test method + load_method_name = f"test_{model_name}_load" + load_method = WindowsCrossCompilationTestFramework.create_load_test(config) + load_method.__name__ = load_method_name + load_method.__doc__ = f"Step 2: Load and test {model_name} model on Windows" + load_method = requires_gpu()(load_method) + setattr(test_class, load_method_name, load_method) + + return test_class + + +@auto_generate_tests +class TestAOTInductorWindowsCrossCompilation(TestCase): + """ + Test class for AOT Inductor Windows cross-compilation. + + Define test methods that return ModelTestConfig, and the decorator + will auto-generate compile/load test methods. 
+ """ + + def _define_simple(self): + """Define the Simple model and its test configuration.""" + + class Simple(torch.nn.Module): + def __init__(self): + super().__init__() + self.fc1 = torch.nn.Linear(10, 16) + self.relu = torch.nn.ReLU() + self.fc2 = torch.nn.Linear(16, 1) + self.sigmoid = torch.nn.Sigmoid() + + def forward(self, x): + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + x = self.sigmoid(x) + return x + + return ModelTestConfig( + name="simple", + model_class=Simple, + example_inputs=(torch.randn(8, 10, device=GPU_TYPE),), + dynamic_shapes={"x": {0: torch.export.Dim("batch", min=1, max=1024)}}, + ) + + def _define_simple_cnn(self): + """Define the SimpleCNN model and its test configuration.""" + + class SimpleCNN(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv1 = torch.nn.Conv2d(3, 16, 3) + self.relu = torch.nn.ReLU() + self.pool = torch.nn.AdaptiveAvgPool2d((1, 1)) + self.fc = torch.nn.Linear(16, 10) + + def forward(self, x): + x = self.conv1(x) + x = self.relu(x) + x = self.pool(x) + x = x.flatten(1) + x = self.fc(x) + return x + + return ModelTestConfig( + name="simple_cnn", + model_class=SimpleCNN, + example_inputs=(torch.randn(2, 3, 32, 32, device=GPU_TYPE),), + dynamic_shapes={"x": {0: torch.export.Dim("batch", min=1, max=16)}}, + rtol=1e-3, + atol=1e-3, + ) + + def _define_transformer(self): + """Define the SimpleTransformer model and its test configuration.""" + + class SimpleTransformer(torch.nn.Module): + def __init__(self): + super().__init__() + self.embedding = torch.nn.Linear(128, 256) + self.attention = torch.nn.MultiheadAttention(256, 8, batch_first=True) + self.norm1 = torch.nn.LayerNorm(256) + self.ffn = torch.nn.Sequential( + torch.nn.Linear(256, 1024), + torch.nn.ReLU(), + torch.nn.Linear(1024, 256), + ) + self.norm2 = torch.nn.LayerNorm(256) + self.output = torch.nn.Linear(256, 10) + + def forward(self, x): + # x shape: (batch, seq_len, input_dim) + x = self.embedding(x) + attn_out, _ = self.attention(x, x, x) + x = self.norm1(x + attn_out) + ffn_out = self.ffn(x) + x = self.norm2(x + ffn_out) + x = x.mean(dim=1) # Global average pooling + x = self.output(x) + return x + + return ModelTestConfig( + name="transformer", + model_class=SimpleTransformer, + example_inputs=(torch.randn(4, 16, 128, device=GPU_TYPE),), + dynamic_shapes={"x": {0: torch.export.Dim("batch", min=1, max=32)}}, + rtol=1e-3, + atol=1e-3, + ) + + +if __name__ == "__main__": + import sys + + from torch._inductor.test_case import run_tests + + # Check for --package-dir argument and remove it before unittest sees it + package_dir = None + win_torch_lib_dir = None + filtered_argv = [] + i = 0 + while i < len(sys.argv): + if sys.argv[i] == "--package-dir": + if i + 1 < len(sys.argv): + package_dir = sys.argv[i + 1] + i += 2 # Skip both --package-dir and its value + else: + print("Error: --package-dir requires a valid directory path") + sys.exit(1) + elif sys.argv[i].startswith("--package-dir="): + package_dir = sys.argv[i].split("=", 1)[1] + i += 1 + elif sys.argv[i] == "--win-torch-lib-dir": + if i + 1 < len(sys.argv): + win_torch_lib_dir = sys.argv[i + 1] + i += 2 # Skip both --win-torch-lib-dir and its value + else: + print("Error: --win-torch-lib-dir requires a valid directory path") + sys.exit(1) + elif sys.argv[i].startswith("--win-torch-lib-dir="): + win_torch_lib_dir = sys.argv[i].split("=", 1)[1] + i += 1 + else: + filtered_argv.append(sys.argv[i]) + i += 1 + + # Validate and set the base path for package storage + if package_dir: + try: + 
package_path = Path(package_dir) + package_path.mkdir(parents=True, exist_ok=True) + # Test write access + test_file = package_path / ".test_write" + test_file.touch() + test_file.unlink() + WindowsCrossCompilationTestFramework.set_base_path(package_path) + except Exception: + print("Error: --package-dir requires a valid directory path") + sys.exit(1) + + # Set Windows torch libs path if provided (only needed for compile tests) + if win_torch_lib_dir: + WindowsCrossCompilationTestFramework.set_win_torch_libs_path(win_torch_lib_dir) + + # Update sys.argv to remove our custom arguments + sys.argv = filtered_argv + + if HAS_GPU: + run_tests(needs="filelock") diff --git a/tools/testing/discover_tests.py b/tools/testing/discover_tests.py index 13511b1ec129..1210326a02db 100644 --- a/tools/testing/discover_tests.py +++ b/tools/testing/discover_tests.py @@ -107,6 +107,7 @@ TESTS = discover_tests( "lazy/test_meta_kernel", "lazy/test_extract_compiled_graph", "test/inductor/test_aot_inductor_utils", + "inductor/test_aoti_cross_compile_windows", "onnx/test_onnxscript_no_runtime", "onnx/test_pytorch_onnx_onnxruntime_cuda", "onnx/test_models", From 9726553653ee1c53fc9a1f436a92b29f456082ca Mon Sep 17 00:00:00 2001 From: Aaron Gokaslan Date: Fri, 17 Oct 2025 01:07:36 +0000 Subject: [PATCH 012/123] [BE][Ez]: Use sys.executable instead of hardcoded Python (#165679) Handles edgecase to ensure proper interpreter is called. Inspired by #165633 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165679 Approved by: https://github.com/FindHao --- torch/utils/_get_clean_triton.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torch/utils/_get_clean_triton.py b/torch/utils/_get_clean_triton.py index 98ee54a1c23d..fbbabc3f50e6 100644 --- a/torch/utils/_get_clean_triton.py +++ b/torch/utils/_get_clean_triton.py @@ -3,6 +3,7 @@ import argparse import os import re import subprocess +import sys from pathlib import Path @@ -107,7 +108,7 @@ def process_file( env["TORCHINDUCTOR_DUMP_LAUNCH_PARAMS"] = "1" result = subprocess.run( - ["python", input_filename], + [sys.executable, input_filename], env=env, capture_output=True, text=True, From 11e20843086cf58b3976ed3ac75ac1bbbebd715d Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 17 Oct 2025 02:01:53 +0000 Subject: [PATCH 013/123] Revert "[Mem Snapshot] Add Metadata Field (#165490)" This reverts commit 5b3ea758951558e7d9f681ae784acb57eaa07910. 
Reverted https://github.com/pytorch/pytorch/pull/165490 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/165490#issuecomment-3413491091)) --- c10/cuda/CUDACachingAllocator.cpp | 27 +------------------------- c10/cuda/CUDACachingAllocator.h | 19 ++---------------- test/test_cuda.py | 22 --------------------- torch/_C/__init__.pyi.in | 2 -- torch/csrc/cuda/Module.cpp | 10 ---------- torch/csrc/cuda/memory_snapshot.cpp | 2 -- torch/cuda/memory.py | 30 ----------------------------- 7 files changed, 3 insertions(+), 109 deletions(-) diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 25058f87264f..48413e7a6f34 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -1260,9 +1260,6 @@ class DeviceCachingAllocator { // thread local compile context for each device static thread_local std::stack compile_context; - // thread local user metadata for annotating allocations - static thread_local std::string user_metadata; - public: // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) explicit DeviceCachingAllocator(c10::DeviceIndex id) @@ -1305,14 +1302,6 @@ class DeviceCachingAllocator { } } - void setUserMetadata(const std::string& metadata) { - user_metadata = metadata; - } - - std::string getUserMetadata() { - return user_metadata; - } - bool checkPoolLiveAllocations( MempoolId_t mempool_id, const std::unordered_set& expected_live_allocations) const { @@ -3693,8 +3682,7 @@ class DeviceCachingAllocator { mempool_id, getApproximateTime(), record_context_ >= RecordContext::ALLOC ? std::move(context) : nullptr, - compile_string, - user_metadata); + compile_string); // Callbacks should not include any Pytorch call for (const auto& cb : trace_trackers_) { @@ -3749,7 +3737,6 @@ static void uncached_delete(void* ptr) { static void local_raw_delete(void* ptr); thread_local std::stack DeviceCachingAllocator::compile_context; -thread_local std::string DeviceCachingAllocator::user_metadata; #ifdef __cpp_lib_hardware_interference_size using std::hardware_destructive_interference_size; #else @@ -3947,18 +3934,6 @@ class NativeCachingAllocator : public CUDAAllocator { device_allocator[device]->popCompileContext(); } - void setUserMetadata(const std::string& metadata) override { - c10::DeviceIndex device = 0; - C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); - device_allocator[device]->setUserMetadata(metadata); - } - - std::string getUserMetadata() override { - c10::DeviceIndex device = 0; - C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); - return device_allocator[device]->getUserMetadata(); - } - bool isHistoryEnabled() override { c10::DeviceIndex device = 0; C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h index fbe5dab18e0a..89274c9f9946 100644 --- a/c10/cuda/CUDACachingAllocator.h +++ b/c10/cuda/CUDACachingAllocator.h @@ -118,8 +118,7 @@ struct TraceEntry { MempoolId_t mempool, approx_time_t time, std::shared_ptr context = nullptr, - std::string compile_context = "", - std::string user_metadata = "") + std::string compile_context = "") : action_(action), device_(device), addr_(addr), @@ -127,8 +126,7 @@ struct TraceEntry { stream_(stream), size_(size), mempool_(std::move(mempool)), - compile_context_(std::move(compile_context)), - user_metadata_(std::move(user_metadata)) { + 
compile_context_(std::move(compile_context)) { time_.approx_t_ = time; } Action action_; @@ -140,7 +138,6 @@ struct TraceEntry { MempoolId_t mempool_; trace_time_ time_{}; std::string compile_context_; - std::string user_metadata_; }; // Calls made by record_function will save annotations @@ -300,10 +297,6 @@ class CUDAAllocator : public DeviceAllocator { const std::vector>& /*md*/) {} virtual void pushCompileContext(std::string& md) {} virtual void popCompileContext() {} - virtual void setUserMetadata(const std::string& metadata) {} - virtual std::string getUserMetadata() { - return ""; - } virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0; // Attached AllocatorTraceTracker callbacks will be called while the @@ -543,14 +536,6 @@ inline void enablePeerAccess( get()->enablePeerAccess(dev, dev_to_access); } -inline void setUserMetadata(const std::string& metadata) { - get()->setUserMetadata(metadata); -} - -inline std::string getUserMetadata() { - return get()->getUserMetadata(); -} - } // namespace c10::cuda::CUDACachingAllocator namespace c10::cuda { diff --git a/test/test_cuda.py b/test/test_cuda.py index 05302ad97661..667bccd82c24 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -4378,28 +4378,6 @@ class TestCudaMallocAsync(TestCase): finally: torch.cuda.memory._record_memory_history(None) - @unittest.skipIf( - TEST_CUDAMALLOCASYNC, "setContextRecorder not supported by CUDAMallocAsync" - ) - @requiresCppContext - def test_memory_plots_metadata(self): - for context in ["alloc", "all", "state"]: - try: - torch._C._cuda_clearCublasWorkspaces() - torch.cuda.memory.empty_cache() - torch.cuda.memory._set_memory_metadata("metadata test") - torch.cuda.memory._record_memory_history(context="all") - x = torch.rand(3, 4, device="cuda") - del x - torch.cuda.memory.empty_cache() - torch.cuda.memory._set_memory_metadata("") - - ss = torch.cuda.memory._snapshot() - for event in ss["device_traces"][0]: - self.assertTrue(event["user_metadata"] == "metadata test") - finally: - torch.cuda.memory._record_memory_history(None) - @unittest.skipIf( TEST_CUDAMALLOCASYNC, "setContextRecorder not supported by CUDAMallocAsync" ) diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index b99fd3f2b80a..244200216ec9 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -2081,8 +2081,6 @@ def _cuda_hostMemoryStats() -> dict[str, Any]: ... def _cuda_resetAccumulatedHostMemoryStats() -> None: ... def _cuda_resetPeakHostMemoryStats() -> None: ... def _cuda_memorySnapshot(mempool_id: tuple[_int, _int] | None) -> dict[str, Any]: ... -def _cuda_setMemoryMetadata(metadata: str) -> None: ... -def _cuda_getMemoryMetadata() -> str: ... 
def _cuda_record_memory_history_legacy( enabled: _bool, record_context: _bool, diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index 32ade3680980..0950192457d6 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -765,7 +765,6 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* arg) { py::str frames_s = "frames"; py::str time_us_s = "time_us"; py::str compile_context_s = "compile_context"; - py::str user_metadata_s = "user_metadata"; py::list empty_frames; std::vector to_gather_frames; @@ -883,7 +882,6 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* arg) { trace_entry[stream_s] = int64_t(te.stream_); trace_entry[time_us_s] = te.time_.t_; trace_entry[compile_context_s] = te.compile_context_; - trace_entry[user_metadata_s] = te.user_metadata_; trace.append(trace_entry); } traces.append(trace); @@ -1139,14 +1137,6 @@ static void registerCudaDeviceProperties(PyObject* module) { return c10::cuda::CUDACachingAllocator::isHistoryEnabled(); }); - m.def("_cuda_setMemoryMetadata", [](const std::string& metadata) { - c10::cuda::CUDACachingAllocator::setUserMetadata(metadata); - }); - - m.def("_cuda_getMemoryMetadata", []() { - return c10::cuda::CUDACachingAllocator::getUserMetadata(); - }); - m.def("_cuda_get_conv_benchmark_empty_cache", []() { return at::native::_cudnn_get_conv_benchmark_empty_cache(); }); diff --git a/torch/csrc/cuda/memory_snapshot.cpp b/torch/csrc/cuda/memory_snapshot.cpp index 830159d0a919..d4382aa8cb32 100644 --- a/torch/csrc/cuda/memory_snapshot.cpp +++ b/torch/csrc/cuda/memory_snapshot.cpp @@ -311,7 +311,6 @@ std::string _memory_snapshot_pickled() { IValue is_expandable_s = "is_expandable"; IValue time_us_s = "time_us"; IValue compile_contexts_s = "compile_context"; - IValue user_metadata_s = "user_metadata"; auto empty_frames = new_list(); @@ -429,7 +428,6 @@ std::string _memory_snapshot_pickled() { trace_entry.insert(size_s, (int64_t)te.size_); trace_entry.insert(stream_s, int64_t(te.stream_)); trace_entry.insert(compile_contexts_s, te.compile_context_); - trace_entry.insert(user_metadata_s, te.user_metadata_); if (te.context_) { auto sc = getFromContext(te.context_); frame_tracebacks.push_back(sc); diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py index e4b125eb4258..5eeaf3a8253f 100644 --- a/torch/cuda/memory.py +++ b/torch/cuda/memory.py @@ -1063,36 +1063,6 @@ def _dump_snapshot(filename="dump_snapshot.pickle"): pickle.dump(s, f) -def _set_memory_metadata(metadata: str): - """ - Set custom metadata that will be attached to all subsequent CUDA memory allocations. - - This metadata will be recorded in the memory snapshot for all allocations made - after this call until the metadata is cleared or changed. - - Args: - metadata (str): Custom metadata string to attach to allocations. - Pass an empty string to clear the metadata. - - Example: - >>> torch.cuda.memory._set_memory_metadata("training_phase") - >>> # All allocations here will have "training_phase" metadata - >>> x = torch.randn(100, 100, device="cuda") - >>> torch.cuda.memory._set_memory_metadata("") # Clear metadata - """ - torch._C._cuda_setMemoryMetadata(metadata) - - -def _get_memory_metadata() -> str: - """ - Get the current custom metadata that is being attached to CUDA memory allocations. - - Returns: - str: The current metadata string, or empty string if no metadata is set. 
- """ - return torch._C._cuda_getMemoryMetadata() - - def _save_segment_usage(filename="output.svg", snapshot=None): if snapshot is None: snapshot = _snapshot() From d0add0be436582ab7d7e46828458704de66854ab Mon Sep 17 00:00:00 2001 From: bobrenjc93 Date: Thu, 16 Oct 2025 14:11:00 -0700 Subject: [PATCH 014/123] [torchfuzz] check in some more ignore regexes (#164749) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164749 Approved by: https://github.com/pianpwk --- tools/experimental/torchfuzz/multi_process_fuzzer.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/experimental/torchfuzz/multi_process_fuzzer.py b/tools/experimental/torchfuzz/multi_process_fuzzer.py index 1a602dbf4842..18cc3b62ee25 100644 --- a/tools/experimental/torchfuzz/multi_process_fuzzer.py +++ b/tools/experimental/torchfuzz/multi_process_fuzzer.py @@ -66,6 +66,12 @@ IGNORE_PATTERNS: list[re.Pattern] = [ re.compile( r"torch\._inductor\.exc\.InductorError: CppCompileError: C\+\+ compile error" ), # https://github.com/pytorch/pytorch/issues/164686 + re.compile( + r"\.item\(\) # dtype=" + ), # https://github.com/pytorch/pytorch/issues/164725 + re.compile( + r"dimensionality of sizes \(0\) must match dimensionality of strides \(1\)" + ), # https://github.com/pytorch/pytorch/issues/164814 # Add more patterns here as needed, e.g.: # re.compile(r"Some other error message"), ] From 7dabfb07cb896e9c31734c17d215e59418e071e0 Mon Sep 17 00:00:00 2001 From: bobrenjc93 Date: Thu, 16 Oct 2025 14:11:01 -0700 Subject: [PATCH 015/123] [torchfuzz] add support for --stop-at-first-failure flag (#165529) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165529 Approved by: https://github.com/pianpwk ghstack dependencies: #164749 --- tools/experimental/torchfuzz/codegen.py | 4 +- tools/experimental/torchfuzz/fuzzer.py | 33 ++++- .../torchfuzz/multi_process_fuzzer.py | 140 ++++++++++++++++++ 3 files changed, 173 insertions(+), 4 deletions(-) diff --git a/tools/experimental/torchfuzz/codegen.py b/tools/experimental/torchfuzz/codegen.py index 8b0f2c8860fb..592d9322bcd6 100644 --- a/tools/experimental/torchfuzz/codegen.py +++ b/tools/experimental/torchfuzz/codegen.py @@ -196,7 +196,7 @@ class FuzzTemplate: class DefaultFuzzTemplate(FuzzTemplate): def __init__(self): - from torchfuzz.checks import EagerVsFullGraphDynamicCompileWithNumericsCheck + from torchfuzz.checks import EagerVsFullGraphDynamicCompileCheck super().__init__( supported_ops=[ @@ -236,7 +236,7 @@ class DefaultFuzzTemplate(FuzzTemplate): # Regularization "torch.nn.functional.dropout", ], - check=EagerVsFullGraphDynamicCompileWithNumericsCheck(), + check=EagerVsFullGraphDynamicCompileCheck(), ) def spec_distribution(self): diff --git a/tools/experimental/torchfuzz/fuzzer.py b/tools/experimental/torchfuzz/fuzzer.py index e683b71519fb..5c54fded9f8a 100644 --- a/tools/experimental/torchfuzz/fuzzer.py +++ b/tools/experimental/torchfuzz/fuzzer.py @@ -241,7 +241,7 @@ if __name__ == "__main__": import argparse try: - from multi_process_fuzzer import run_multi_process_fuzzer + from multi_process_fuzzer import run_multi_process_fuzzer, run_until_failure except ImportError: # If importing as a module fails, import from the same directory import os @@ -249,7 +249,7 @@ if __name__ == "__main__": current_dir = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(0, current_dir) - from multi_process_fuzzer import run_multi_process_fuzzer + from multi_process_fuzzer import run_multi_process_fuzzer, run_until_failure # Set up command-line argument 
parsing parser = argparse.ArgumentParser( @@ -296,6 +296,11 @@ if __name__ == "__main__": action="store_true", help="Print detailed output for all runs (not just failures)", ) + parser.add_argument( + "--stop-at-first-failure", + action="store_true", + help="Pick a random seed and keep iterating until finding a failure (exits with non-zero code)", + ) # Legacy arguments parser.add_argument( @@ -337,6 +342,30 @@ if __name__ == "__main__": supported_ops=parsed_supported_ops, op_weights=(parsed_weights if parsed_weights else None), ) + elif args.stop_at_first_failure: + # Stop-at-first-failure mode + # Default number of processes + if args.processes is None: + cpu_count = mp.cpu_count() + args.processes = max(1, min(16, int(cpu_count * 0.75))) + + if args.processes < 1: + print("❌ Error: Number of processes must be at least 1") + sys.exit(1) + + try: + run_until_failure( + num_processes=args.processes, + verbose=args.verbose, + template=args.template, + supported_ops=args.supported_ops, + ) + except Exception as e: + print(f"❌ Unexpected error: {str(e)}") + import traceback + + traceback.print_exc() + sys.exit(1) elif args.start is not None or args.count is not None: # Multi-process fuzzing mode if args.start is None: diff --git a/tools/experimental/torchfuzz/multi_process_fuzzer.py b/tools/experimental/torchfuzz/multi_process_fuzzer.py index 18cc3b62ee25..520c03271fe7 100644 --- a/tools/experimental/torchfuzz/multi_process_fuzzer.py +++ b/tools/experimental/torchfuzz/multi_process_fuzzer.py @@ -522,3 +522,143 @@ def _print_operation_distribution(results: list[FuzzerResult]) -> None: persist_print( "\n📊 No operation statistics collected (no successful runs with stats)" ) + + +def run_until_failure( + num_processes: Optional[int] = None, + verbose: bool = False, + template: str = "default", + supported_ops: Optional[str] = None, +) -> None: + """ + Run the multi-process fuzzer with a random starting seed, iterating until a failure is found. 
+ + Args: + num_processes: Number of worker processes to use + verbose: Whether to print detailed output + template: The template to use for code generation + supported_ops: Comma-separated ops string with optional weights + + Returns: + Exits with non-zero code when a failure is found + """ + import random + + # Pick a random seed to start from + initial_seed = random.randint(0, 2**31 - 1) + + persist_print( + f"🎲 Starting continuous fuzzing with random initial seed: {initial_seed}" + ) + persist_print(f"🚀 Using {num_processes} processes") + persist_print( + f"🔧 Command template: python fuzzer.py --seed {{seed}} --template {template}" + ) + persist_print("🎯 Running until first failure is found...") + persist_print("=" * 60) + + start_time = time.time() + current_seed = initial_seed + total_successful = 0 + total_ignored = 0 + batch_size = 100 # Process seeds in batches of 100 + + try: + while True: + # Process a batch of seeds + seeds = list(range(current_seed, current_seed + batch_size)) + + with mp.Pool(processes=num_processes) as pool: + future_results = [] + for seed in seeds: + future = pool.apply_async( + run_fuzzer_with_seed, (seed, template, supported_ops) + ) + future_results.append((seed, future)) + + # Set up progress bar for this batch + if HAS_TQDM: + from tqdm import tqdm + + pbar = tqdm( + total=len(seeds), + desc=f"Batch starting at seed {current_seed}", + file=sys.stdout, + bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}] ✅/🚫={postfix}", + dynamic_ncols=True, + ) + pbar.set_postfix_str(f"{total_successful}/{total_ignored}") + + def write_func(msg): + pbar.write(msg) + else: + pbar = None + + # Collect results as they complete + for seed, future in future_results: + result: FuzzerResult = future.get() + + if result.ignored_pattern_idx != -1: + total_ignored += 1 + + if result.success: + total_successful += 1 + elif result.ignored_pattern_idx == -1: + # Found a failure that is not ignored! 
+ if HAS_TQDM and pbar: + pbar.close() + + elapsed = time.time() - start_time + persist_print("\n" + "=" * 60) + persist_print("🎯 FAILURE FOUND!") + persist_print("=" * 60) + persist_print(f"❌ Failing seed: {result.seed}") + persist_print( + f"⏱️ Duration for this seed: {result.duration:.2f}s" + ) + persist_print(f"⏱️ Total time elapsed: {elapsed:.2f}s") + persist_print(f"✅ Successful seeds tested: {total_successful}") + persist_print(f"🚫 Ignored seeds: {total_ignored}") + persist_print( + f"📊 Total seeds tested: {total_successful + total_ignored + 1}" + ) + persist_print("\n💥 Failure output:") + persist_print("-" * 60) + print_output_lines(result.output, persist_print) + persist_print("-" * 60) + persist_print( + f"\n🔄 Reproduce with: python fuzzer.py --seed {result.seed} --template {template}" + ) + + # Exit with non-zero code + sys.exit(1) + + # Update progress bar + if HAS_TQDM and pbar: + pbar.set_postfix_str(f"{total_successful}/{total_ignored}") + pbar.update(1) + elif verbose: + status_emoji = "✅" if result.success else "🚫" + persist_print(f"Seed {result.seed}: {status_emoji}") + + # Close progress bar for this batch + if HAS_TQDM and pbar: + pbar.close() + + # Move to next batch + current_seed += batch_size + + except KeyboardInterrupt: + persist_print("\n🛑 Interrupted by user (Ctrl+C)") + elapsed = time.time() - start_time + persist_print("=" * 60) + persist_print("📈 SUMMARY (interrupted)") + persist_print("=" * 60) + persist_print(f"⏱️ Total time: {elapsed:.2f}s") + persist_print(f"✅ Successful seeds: {total_successful}") + persist_print(f"🚫 Ignored seeds: {total_ignored}") + persist_print(f"📊 Total seeds tested: {total_successful + total_ignored}") + persist_print( + f"⚡ Throughput: {((total_successful + total_ignored) / (elapsed / 3600)):.2f} seeds/hr" + ) + sys.exit(130) From 9fccbdd4f05820fed8ccf66422b056c932649d62 Mon Sep 17 00:00:00 2001 From: Mu-Chu Lee Date: Wed, 15 Oct 2025 10:41:51 -0700 Subject: [PATCH 016/123] Fix incorrect function signature in template (#165567) Summary: In https://github.com/pytorch/pytorch/pull/148305 we refactored the grid argument out, but it's not reflected in our template. Test Plan: Included in commit. 
python test/inductor/test_aot_inductor.py AOTInductorTestABICompatibleGpu.test_cond_symint_input_disable_one_pass_cuda Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/165567 Approved by: https://github.com/desertfire --- test/inductor/test_aot_inductor.py | 33 ++++++++++++++++++++++++++++++ torch/_inductor/codegen/wrapper.py | 1 - 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/test/inductor/test_aot_inductor.py b/test/inductor/test_aot_inductor.py index 335bf7e1e5ea..0e9ff43cc87e 100644 --- a/test/inductor/test_aot_inductor.py +++ b/test/inductor/test_aot_inductor.py @@ -2340,6 +2340,39 @@ class AOTInductorTestsTemplate: dynamic_shapes=dynamic_shapes, ) + def test_cond_symint_input_disable_one_pass(self): + class M(torch.nn.Module): + def forward(self, x, y, z): + a = y.shape[0] + b = z.shape[0] + + def true_fn(x): + return x + a + + def false_fn(x): + return x + b * z + + return torch.cond(x.shape[0] > 5, true_fn, false_fn, (x,)) + + input1 = ( + torch.ones(3, 3, device=self.device), + torch.ones(5, device=self.device), + torch.ones(3, 3, device=self.device), + ) + input2 = ( + torch.ones(10, 3, device=self.device), + torch.ones(6, device=self.device), + torch.ones(10, 3, device=self.device), + ) + inputs = (input1, input2) + dynamic_shapes = {"x": {0: Dim("d")}, "y": {0: Dim("d1")}, "z": {0: Dim("d")}} + with torch._inductor.config.patch({"triton.autotune_at_compile_time": False}): + self.check_model_with_multiple_inputs( + M(), + inputs, + dynamic_shapes=dynamic_shapes, + ) + def test_while_loop_simple(self): inputs = ( torch.randn((10, 20), device=self.device), diff --git a/torch/_inductor/codegen/wrapper.py b/torch/_inductor/codegen/wrapper.py index dc613c467587..efef044fa1e7 100644 --- a/torch/_inductor/codegen/wrapper.py +++ b/torch/_inductor/codegen/wrapper.py @@ -2631,7 +2631,6 @@ class PythonWrapperCodegen(CodeGen): if len(kernel.launchers) == 0: kernel.precompile() kernel.save_gpu_kernel( - grid=(0, 0, 0), # use dummy grid stream="stream", # use dummy stream launcher=kernel.launchers[0], ) From 3154482072cefc49b69bd377a0774707b021fea7 Mon Sep 17 00:00:00 2001 From: Eddie Yan Date: Fri, 17 Oct 2025 02:45:04 +0000 Subject: [PATCH 017/123] [CUDA][cuBLAS] Only `xFail` `addmm` with reduced precision reductions on non-RTX skus (#165379) RTX Blackwells don't behave quite like their datacenter counterparts here Pull Request resolved: https://github.com/pytorch/pytorch/pull/165379 Approved by: https://github.com/Skylion007 --- test/test_matmul_cuda.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/test_matmul_cuda.py b/test/test_matmul_cuda.py index 08a724671d6e..61f5642830dd 100644 --- a/test/test_matmul_cuda.py +++ b/test/test_matmul_cuda.py @@ -56,14 +56,15 @@ if TEST_CUDA: # Protects against includes accidentally setting the default dtype assert torch.get_default_dtype() is torch.float32 -def xfailIfSM100OrLaterAndCondition(condition_fn): +def xfailIfSM100OrLaterNonRTXAndCondition(condition_fn): """ - Conditionally xfail tests on SM100+ based on a condition function. + Conditionally xfail tests on SM100+ datacenter SKUs based on a condition function. The condition function receives the test parameters dict and returns True to xfail. 
""" + computeCapabilityCheck = SM100OrLater and torch.cuda.get_device_capability()[0] != 12 return decorateIf( unittest.expectedFailure, - lambda params: SM100OrLater and condition_fn(params) + lambda params: computeCapabilityCheck and condition_fn(params) ) @@ -163,7 +164,7 @@ class TestMatmulCuda(InductorTestCase): self.cublas_addmm(size, dtype, False) @onlyCUDA - @xfailIfSM100OrLaterAndCondition(lambda params: params.get('dtype') == torch.bfloat16 and params.get('size') == 10000) + @xfailIfSM100OrLaterNonRTXAndCondition(lambda params: params.get('dtype') == torch.bfloat16 and params.get('size') == 10000) # imported 'tol' as 'xtol' to avoid aliasing in code above @toleranceOverride({torch.float16: xtol(atol=7e-1, rtol=2e-1), torch.bfloat16: xtol(atol=1e1, rtol=2e-1)}) From 861cdb887b73818a7e96dc07c5aa6a308216daa4 Mon Sep 17 00:00:00 2001 From: eellison Date: Thu, 16 Oct 2025 10:30:35 -0700 Subject: [PATCH 018/123] use statically_known_leq & *=2 instead of bound_sympy in persistent rblock (#165657) While these should be equivalent, we've found instances where they are not, and an error was caused. update until we figure out underlying issue. Differential Revision: [D84835898](https://our.internmc.facebook.com/intern/diff/D84835898) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165657 Approved by: https://github.com/bobrenjc93 --- torch/_inductor/codegen/triton.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index c24cde56358b..62aa8e7c88cf 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -26,7 +26,6 @@ from torch._dynamo.utils import identity, preserve_rng_state from torch._prims_common import is_integer_dtype from torch.utils._ordered_set import OrderedSet from torch.utils._sympy.functions import CeilDiv, FloorDiv, ModularIndexing -from torch.utils._sympy.value_ranges import bound_sympy from torch.utils._triton import has_triton_package, has_triton_stable_tma_api from ...utils._sympy.symbol import free_symbol_is_type, prefix_str, symbol_is_type, SymT @@ -5111,16 +5110,13 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]): val = int(rnumel) val = next_power_of_2(val) else: - val = bound_sympy(rnumel).upper - assert isinstance(val, int) or val.is_constant() + val = 2 + while not V.graph.sizevars.statically_known_leq(rnumel, val): + if val > 16 * 1024: + raise ValueError(f"Failed to find static RBLOCK for {rnumel}") + val *= 2 - if val == torch.utils._sympy.numbers.IntInfinity(): - raise ValueError(f"Failed to find static RBLOCK for {rnumel}") - - val = next_power_of_2(int(val)) - - if val > 16 * 1024: - raise ValueError(f"Failed to find static RBLOCK for {rnumel}") + return val return val From fcbde24c1cb54f3e0417e123bdb9ae09da134c8d Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Fri, 17 Oct 2025 03:25:31 +0000 Subject: [PATCH 019/123] [ONNX] Remove common imports from torchlib (#165156) The Rank and IsScalar functions are no longer used in the torchlib. 
Requires onnxscript v0.5.4 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165156 Approved by: https://github.com/Skylion007, https://github.com/cyyever --- .ci/docker/common/install_onnx.sh | 2 +- test/onnx/torchlib/ops_test_common.py | 1 - torch/onnx/_internal/exporter/_building.py | 39 --------------------- torch/onnx/_internal/exporter/_core.py | 3 -- torch/onnx/_internal/exporter/_ir_passes.py | 22 ------------ 5 files changed, 1 insertion(+), 66 deletions(-) diff --git a/.ci/docker/common/install_onnx.sh b/.ci/docker/common/install_onnx.sh index 183b5b65c90a..b0615b8a84c1 100755 --- a/.ci/docker/common/install_onnx.sh +++ b/.ci/docker/common/install_onnx.sh @@ -20,7 +20,7 @@ pip_install \ pip_install coloredlogs packaging pip_install onnxruntime==1.23.0 -pip_install onnxscript==0.5.3 +pip_install onnxscript==0.5.4 # Cache the transformers model to be used later by ONNX tests. We need to run the transformers # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/ diff --git a/test/onnx/torchlib/ops_test_common.py b/test/onnx/torchlib/ops_test_common.py index 72243faf3b50..d1206da0e07d 100644 --- a/test/onnx/torchlib/ops_test_common.py +++ b/test/onnx/torchlib/ops_test_common.py @@ -592,7 +592,6 @@ def graph_executor( proto = onnxscript_function.to_function_proto() ir_function = ir.serde.deserialize_function(proto) onnx_model.functions[identifier] = ir_function - _ir_passes.add_torchlib_common_imports(onnx_model, opset_version=opset_version) _ir_passes.add_opset_imports(onnx_model) # Make sure the model is valid model_proto = ir.to_proto(onnx_model) diff --git a/torch/onnx/_internal/exporter/_building.py b/torch/onnx/_internal/exporter/_building.py index dbe38f81680c..608591ca04c2 100644 --- a/torch/onnx/_internal/exporter/_building.py +++ b/torch/onnx/_internal/exporter/_building.py @@ -646,45 +646,6 @@ class OpRecorder(evaluator.Evaluator): kwargs: Mapping[str, AllowedArgType], ) -> _tensors.SymbolicTensor | Sequence[_tensors.SymbolicTensor] | bool | int: try: - # TODO(justinchuby): Remove this once IsScalar and Rank are removed - # Special cases for handling IsScalar and Rank - if function.name == "IsScalar": - if len(args) != 1: - raise TypeError( - f"Expected 1 positional argument for function '{function}', got {len(args)}." - ) - if isinstance(args[0], _tensors.SymbolicTensor): - if args[0].rank is not None: - return args[0].rank == 0 - else: - # Fall to call add_function_call - pass - elif isinstance(args[0], Sequence): - return False - else: - # Python constants are scalars - return True - if function.name == "Rank": - if len(args) != 1: - raise TypeError( - f"Expected 1 positional argument for function '{function}', got {len(args)}." 
- ) - if isinstance(args[0], _tensors.SymbolicTensor): - if args[0].rank is not None: - return args[0].rank - else: - # Fall to call add_function_call - pass - elif isinstance(args[0], Sequence): - if all(isinstance(arg, (int, float)) for arg in args[0]): - return 1 - else: - # Fall to call add_function_call - pass - else: - # Python constants are scalars - return 0 - # NOTE: signature should be written to function in the registration process if hasattr(function, "_pt_onnx_signature"): op_signature = function._pt_onnx_signature # type: ignore[attr-defined] diff --git a/torch/onnx/_internal/exporter/_core.py b/torch/onnx/_internal/exporter/_core.py index 06b12d8b1931..5696273f7b66 100644 --- a/torch/onnx/_internal/exporter/_core.py +++ b/torch/onnx/_internal/exporter/_core.py @@ -1249,9 +1249,6 @@ def _exported_program_to_onnx_program( # TODO: Decide if we should keep mutated buffers as inputs/outputs - # TODO(justinchuby): Remove the hack - _ir_passes.add_torchlib_common_imports(model) - # Collect and add opset imports to the model _ir_passes.add_opset_imports(model) diff --git a/torch/onnx/_internal/exporter/_ir_passes.py b/torch/onnx/_internal/exporter/_ir_passes.py index 8a715e245597..9391b642b009 100644 --- a/torch/onnx/_internal/exporter/_ir_passes.py +++ b/torch/onnx/_internal/exporter/_ir_passes.py @@ -90,28 +90,6 @@ def rename_axis(model: ir.Model, rename_mapping: dict[str, str]) -> None: value.shape = ir.Shape(new_shape) -def add_torchlib_common_imports( - model: ir.Model, opset_version: int = _constants.TORCHLIB_OPSET -) -> None: - """Hack to add torchlib common imports to the model.""" - - try: - # TODO(justinchuby): Remove this hack and improved onnxscript - from onnxscript.function_libs.torch_lib.ops import common as common_ops - - model.opset_imports["pkg.onnxscript.torch_lib.common"] = 1 - rank_func = ir.serde.deserialize_function(common_ops.Rank.to_function_proto()) - rank_func.opset_imports[""] = opset_version - is_scalar_func = ir.serde.deserialize_function( - common_ops.IsScalar.to_function_proto() - ) - is_scalar_func.opset_imports[""] = opset_version - model.functions[rank_func.identifier()] = rank_func - model.functions[is_scalar_func.identifier()] = is_scalar_func - except Exception: - logger.exception("Failed to add torchlib common imports to the model.") - - def _maybe_set_opset_version( opset_imports: dict[str, int], domain: str, version: int | None ) -> None: From 43d78423ac224cce432bf34ed9627035169d5433 Mon Sep 17 00:00:00 2001 From: Maggie Moss Date: Fri, 17 Oct 2025 04:15:22 +0000 Subject: [PATCH 020/123] Pyrefly suppressions 2 (#165692) This is the last directory to opt in for the regular mypy.ini file. 
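For reference, the suppressions added in this diff are inline comments of the form shown below (this example is lifted from the change to torch/_inductor/runtime/runtime_utils.py; the error code after the second `#` varies per call site):

```python
# pyrefly: ignore # missing-attribute
items = sorted(cfg.kwargs.items())
```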
Will put up a diff to remove unused ignores before making sure we're also type checking all the files in the mypy strict configurations Test plan: dmypy restart && python3 scripts/lintrunner.py -a pyrefly check step 1: delete lines in the pyrefly.toml file from the project-excludes field step 2: run pyrefly check step 3: add suppressions, clean up unused suppressions before: https://gist.github.com/maggiemoss/4b3bf2037014e116bc00706a16aef199 after: INFO 0 errors (6,884 ignored) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165692 Approved by: https://github.com/oulgen --- pyrefly.toml | 4 +++- torch/_inductor/codegen/common.py | 1 + torch/_inductor/codegen/cpp_gemm_template.py | 2 ++ torch/_inductor/codegen/cpp_wrapper_gpu.py | 1 + torch/_inductor/codegen/mps.py | 2 ++ torch/_inductor/codegen/simd.py | 1 + torch/_inductor/codegen/wrapper_fxir.py | 1 + torch/_inductor/runtime/autotune_cache.py | 8 ++++++++ torch/_inductor/runtime/benchmarking.py | 2 ++ .../runtime/caching/implementations.py | 1 + .../runtime/coordinate_descent_tuner.py | 11 +++++++---- torch/_inductor/runtime/hints.py | 2 ++ torch/_inductor/runtime/runtime_utils.py | 5 +++++ torch/_inductor/runtime/static_cuda_launcher.py | 17 +++++++++++++++++ torch/fx/experimental/proxy_tensor.py | 1 + 15 files changed, 54 insertions(+), 5 deletions(-) diff --git a/pyrefly.toml b/pyrefly.toml index ad74e4df084c..88054d605258 100644 --- a/pyrefly.toml +++ b/pyrefly.toml @@ -22,8 +22,10 @@ project-includes = [ project-excludes = [ # ==== below will be enabled directory by directory ==== # ==== to test Pyrefly on a specific directory, simply comment it out ==== - "torch/_inductor/runtime", "torch/_inductor/codegen/triton.py", + "torch/_inductor/runtime/triton_helpers.py", + "torch/_inductor/runtime/triton_heuristics.py", + "torch/_inductor/runtime/halide_helpers.py", # formatting issues, will turn on after adjusting where suppressions can be # in import statements "torch/linalg/__init__.py", diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py index 36ded3aea2fe..743baec01dfa 100644 --- a/torch/_inductor/codegen/common.py +++ b/torch/_inductor/codegen/common.py @@ -1739,6 +1739,7 @@ class KernelArgs: for outer, inner in chain( # pyrefly: ignore # bad-argument-type self.input_buffers.items(), + # pyrefly: ignore # bad-argument-type self.output_buffers.items(), ): if outer in self.inplace_buffers or isinstance(inner, RemovedArg): diff --git a/torch/_inductor/codegen/cpp_gemm_template.py b/torch/_inductor/codegen/cpp_gemm_template.py index 9b26105bab10..cb17b5a7deb0 100644 --- a/torch/_inductor/codegen/cpp_gemm_template.py +++ b/torch/_inductor/codegen/cpp_gemm_template.py @@ -1480,6 +1480,7 @@ class CppGemmTemplate(CppTemplate): gemm_output_buffer = ir.Buffer( # pyrefly: ignore # missing-attribute name=gemm_output_name, + # pyrefly: ignore # missing-attribute layout=template_buffer.layout, ) current_input_buffer = gemm_output_buffer @@ -1503,6 +1504,7 @@ class CppGemmTemplate(CppTemplate): current_input_buffer = ir.Buffer( # pyrefly: ignore # missing-attribute name=buffer_name, + # pyrefly: ignore # missing-attribute layout=template_buffer.layout, ) diff --git a/torch/_inductor/codegen/cpp_wrapper_gpu.py b/torch/_inductor/codegen/cpp_wrapper_gpu.py index d1ddc7e1cd40..dd4a3a984d34 100644 --- a/torch/_inductor/codegen/cpp_wrapper_gpu.py +++ b/torch/_inductor/codegen/cpp_wrapper_gpu.py @@ -824,6 +824,7 @@ class CppWrapperGpu(CppWrapperCpu): call_args, arg_types = self.prepare_triton_wrapper_args( # 
pyrefly: ignore # bad-argument-type call_args, + # pyrefly: ignore # bad-argument-type arg_types, ) wrapper_name = f"call_{kernel_name}" diff --git a/torch/_inductor/codegen/mps.py b/torch/_inductor/codegen/mps.py index a74506d7247a..fb3939531b71 100644 --- a/torch/_inductor/codegen/mps.py +++ b/torch/_inductor/codegen/mps.py @@ -683,6 +683,7 @@ class MetalKernel(SIMDKernel): # pyrefly: ignore # missing-argument t for t in self.range_tree_nodes.values() + # pyrefly: ignore # missing-argument if t.is_reduction ) cmp_op = ">" if reduction_type == "argmax" else "<" @@ -865,6 +866,7 @@ class MetalKernel(SIMDKernel): # pyrefly: ignore # missing-argument t.numel for t in self.range_trees + # pyrefly: ignore # missing-argument if t.is_reduction ) # If using dynamic shapes, set the threadgroup size to be the diff --git a/torch/_inductor/codegen/simd.py b/torch/_inductor/codegen/simd.py index e2294f05ddca..79d0b603220a 100644 --- a/torch/_inductor/codegen/simd.py +++ b/torch/_inductor/codegen/simd.py @@ -968,6 +968,7 @@ class SIMDKernel(Kernel[CSEVariableType], Generic[CSEVariableType]): # pyrefly: ignore # missing-argument t for t in self.range_trees + # pyrefly: ignore # missing-argument if not t.is_reduction or self.inside_reduction ] diff --git a/torch/_inductor/codegen/wrapper_fxir.py b/torch/_inductor/codegen/wrapper_fxir.py index 72c8e0335508..e123f9592770 100644 --- a/torch/_inductor/codegen/wrapper_fxir.py +++ b/torch/_inductor/codegen/wrapper_fxir.py @@ -1004,6 +1004,7 @@ class FxConverter: # pyrefly: ignore # missing-attribute call_kwargs[key] for key in signature + # pyrefly: ignore # missing-attribute if key not in cfg.kwargs ] diff --git a/torch/_inductor/runtime/autotune_cache.py b/torch/_inductor/runtime/autotune_cache.py index 3c55a9cd1b08..63d7a52ff7d7 100644 --- a/torch/_inductor/runtime/autotune_cache.py +++ b/torch/_inductor/runtime/autotune_cache.py @@ -275,8 +275,11 @@ class AutotuneCache: triton_cache_hash: str | None = None, ) -> None: data = { + # pyrefly: ignore # missing-attribute **config.kwargs, + # pyrefly: ignore # missing-attribute "num_warps": config.num_warps, + # pyrefly: ignore # missing-attribute "num_stages": config.num_stages, "configs_hash": self.configs_hash, "found_by_coordesc": found_by_coordesc, @@ -570,15 +573,20 @@ def _load_cached_autotuning( ) # Create the triton_config with the appropriate arguments + # pyrefly: ignore # bad-argument-count triton_config = Config(best_config, **config_args) + # pyrefly: ignore # missing-attribute triton_config.found_by_coordesc = True return triton_config matching_configs = [ cfg for cfg in configs + # pyrefly: ignore # missing-attribute if all(val == best_config.get(key) for key, val in cfg.kwargs.items()) + # pyrefly: ignore # missing-attribute and cfg.num_warps == best_config.get("num_warps") + # pyrefly: ignore # missing-attribute and cfg.num_stages == best_config.get("num_stages") ] if len(matching_configs) != 1: diff --git a/torch/_inductor/runtime/benchmarking.py b/torch/_inductor/runtime/benchmarking.py index 698484658ddd..ee504b1a0575 100644 --- a/torch/_inductor/runtime/benchmarking.py +++ b/torch/_inductor/runtime/benchmarking.py @@ -123,6 +123,7 @@ class Benchmarker: - The runtime of `fn(*fn_args, **fn_kwargs)`, in milliseconds. 
""" inferred_device = None + # pyrefly: ignore # bad-assignment for arg_or_kwarg in chain(fn_args, fn_kwargs.values()): if not isinstance(arg_or_kwarg, torch.Tensor): continue @@ -196,6 +197,7 @@ class TritonBenchmarker(Benchmarker): @may_distort_benchmarking_result @time_and_count + # pyrefly: ignore # bad-override def benchmark_gpu( self: Self, _callable: Callable[[], Any], diff --git a/torch/_inductor/runtime/caching/implementations.py b/torch/_inductor/runtime/caching/implementations.py index abc113caae93..8292b957f562 100644 --- a/torch/_inductor/runtime/caching/implementations.py +++ b/torch/_inductor/runtime/caching/implementations.py @@ -190,6 +190,7 @@ class _OnDiskCacheImpl(_CacheImpl): Defaults to empty string if not specified. """ self._cache_dir: Path = self._base_dir / (sub_dir or "") + # pyrefly: ignore # bad-assignment self._flock: FileLock = FileLock(str(self._cache_dir / "dir.lock")) @property diff --git a/torch/_inductor/runtime/coordinate_descent_tuner.py b/torch/_inductor/runtime/coordinate_descent_tuner.py index faa2b06bcaf1..30e0acfca4fe 100644 --- a/torch/_inductor/runtime/coordinate_descent_tuner.py +++ b/torch/_inductor/runtime/coordinate_descent_tuner.py @@ -186,6 +186,7 @@ class CoordescTuner: def check_all_tuning_directions( self, + # pyrefly: ignore # missing-attribute func: Callable[["triton.Config"], float], best_config, best_timing, @@ -255,10 +256,12 @@ class CoordescTuner: def autotune( self, - func: Callable[["triton.Config"], float], - baseline_config: "triton.Config", - baseline_timing: float | None = None, - ) -> "triton.Config": + func: Callable[ + ["triton.Config"], float # pyrefly: ignore # missing-attribute + ], + baseline_config: "triton.Config", # pyrefly: ignore # missing-attribute + baseline_timing: float | None = None, # pyrefly: ignore # missing-attribute + ) -> "triton.Config": # pyrefly: ignore # missing-attribute if baseline_timing is None: baseline_timing = self.call_func(func, baseline_config) diff --git a/torch/_inductor/runtime/hints.py b/torch/_inductor/runtime/hints.py index 1cff04d04079..71ba05011e41 100644 --- a/torch/_inductor/runtime/hints.py +++ b/torch/_inductor/runtime/hints.py @@ -88,11 +88,13 @@ if has_triton_package(): divisible_by_16=None, equal_to_1=None, ): + # pyrefly: ignore # not-iterable return {(x,): [["tt.divisibility", 16]] for x in divisible_by_16} else: # Define a namedtuple as a fallback when AttrsDescriptor is not available AttrsDescriptorWrapper = collections.namedtuple( # type: ignore[no-redef, name-match] + # pyrefly: ignore # invalid-argument "AttrsDescriptor", ["divisible_by_16", "equal_to_1"], defaults=[(), ()], diff --git a/torch/_inductor/runtime/runtime_utils.py b/torch/_inductor/runtime/runtime_utils.py index 21cd5987f8f4..30087d95663a 100644 --- a/torch/_inductor/runtime/runtime_utils.py +++ b/torch/_inductor/runtime/runtime_utils.py @@ -68,8 +68,11 @@ def triton_config_to_hashable(cfg: Config) -> Hashable: Convert triton config to a tuple that can uniquely identify it. We can use the return value as a dictionary key. 
""" + # pyrefly: ignore # missing-attribute items = sorted(cfg.kwargs.items()) + # pyrefly: ignore # missing-attribute items.append(("num_warps", cfg.num_warps)) + # pyrefly: ignore # missing-attribute items.append(("num_stages", cfg.num_stages)) return tuple(items) @@ -103,6 +106,7 @@ def get_max_y_grid() -> int: try: + # pyrefly: ignore # import-error import colorama HAS_COLORAMA = True @@ -114,6 +118,7 @@ except ModuleNotFoundError: if HAS_COLORAMA: def _color_text(msg: str, color: str) -> str: + # pyrefly: ignore # missing-attribute return getattr(colorama.Fore, color.upper()) + msg + colorama.Fore.RESET else: diff --git a/torch/_inductor/runtime/static_cuda_launcher.py b/torch/_inductor/runtime/static_cuda_launcher.py index a5e511052b28..e7d4705740e5 100644 --- a/torch/_inductor/runtime/static_cuda_launcher.py +++ b/torch/_inductor/runtime/static_cuda_launcher.py @@ -34,21 +34,29 @@ class StaticallyLaunchedCudaKernel: """ def __init__(self, kernel: CompiledKernel) -> None: + # pyrefly: ignore # missing-attribute self.name = kernel.src.fn.__name__ + # pyrefly: ignore # missing-attribute self.cubin_raw = kernel.asm.get("cubin", None) + # pyrefly: ignore # missing-attribute self.cubin_path = kernel._cubin_path # Used by torch.compile to filter constants in older triton versions + # pyrefly: ignore # missing-attribute self.arg_names = kernel.src.fn.arg_names # Const exprs that are declared by the triton kernel directly # Used to generate the kernel launcher's def args + # pyrefly: ignore # missing-attribute self.declared_constexprs = kernel.src.fn.constexprs + # pyrefly: ignore # missing-attribute self.hash = kernel.hash if triton_knobs is None: + # pyrefly: ignore # missing-attribute launch_enter = kernel.__class__.launch_enter_hook + # pyrefly: ignore # missing-attribute launch_exit = kernel.__class__.launch_exit_hook else: launch_enter = triton_knobs.runtime.launch_enter_hook @@ -70,12 +78,15 @@ class StaticallyLaunchedCudaKernel: raise NotImplementedError( "We don't support launch enter or launch exit hooks" ) + # pyrefly: ignore # missing-attribute self.num_warps = kernel.metadata.num_warps self.shared = ( + # pyrefly: ignore # missing-attribute kernel.shared if hasattr(kernel, "shared") else kernel.metadata.shared ) def needs_scratch_arg(scratch_name: str, param_name: str) -> bool: + # pyrefly: ignore # missing-attribute if hasattr(kernel.metadata, param_name): if getattr(kernel.metadata, param_name) > 0: raise NotImplementedError( @@ -91,6 +102,7 @@ class StaticallyLaunchedCudaKernel: # same situation for profile scratch - triton-lang/triton#7258 self.has_profile_scratch = needs_scratch_arg("Profile", "profile_scratch_size") + # pyrefly: ignore # missing-attribute self.arg_tys = self.arg_ty_from_signature(kernel.src) self.function: int | None = None # Loaded by load_kernel(on the parent process) num_ctas = 1 @@ -170,6 +182,7 @@ class StaticallyLaunchedCudaKernel: def arg_ty_from_signature(self, src: ASTSource) -> str: def index_key(i: Any) -> int: if isinstance(i, str): + # pyrefly: ignore # missing-attribute return src.fn.arg_names.index(i) elif isinstance(i, tuple): # In triton 3.3, src.fn.constants has tuples as a key @@ -177,6 +190,7 @@ class StaticallyLaunchedCudaKernel: else: return i + # pyrefly: ignore # missing-attribute signature = {index_key(key): value for key, value in src.signature.items()} # Triton uses these as the main way to filter out constants passed to their cubin constants = [index_key(key) for key in getattr(src, "constants", dict())] @@ -198,6 +212,7 @@ 
class StaticallyLaunchedCudaKernel: if ty == "constexpr" or i in constants: pass else: + # pyrefly: ignore # bad-argument-type params.append(self.extract_type(ty)) return "".join(params) @@ -235,6 +250,7 @@ class StaticallyLaunchedCudaKernel: if has_scratch: arg_tys = arg_tys + "O" args = (*args, None) + # pyrefly: ignore # bad-argument-type assert len(args) == len(arg_tys) # TODO: can handle grid functions here or in C++, so @@ -247,6 +263,7 @@ class StaticallyLaunchedCudaKernel: self.num_warps, self.shared, arg_tys, + # pyrefly: ignore # bad-argument-type args, stream, ) diff --git a/torch/fx/experimental/proxy_tensor.py b/torch/fx/experimental/proxy_tensor.py index 805d59008e02..28a60bafcac8 100644 --- a/torch/fx/experimental/proxy_tensor.py +++ b/torch/fx/experimental/proxy_tensor.py @@ -421,6 +421,7 @@ def get_proxy_slot( else: # Attempt to build it from first principles. _build_proxy_for_sym_expr(tracer, obj.node.expr, obj) + # pyrefly: ignore # no-matching-overload value = tracker.get(obj) if value is None: From 7e150467f753360277c00585e4e689f91f3aef63 Mon Sep 17 00:00:00 2001 From: Tushar Jain Date: Fri, 17 Oct 2025 04:43:41 +0000 Subject: [PATCH 021/123] allow providing full fr trace path (#165639) Summary: - allow users to specify the full path instead of fr suffixing the rank id - this will be used by torchft to provide the global rank id accross all replicas - we can't just prefix the replica id because analysis tool expects the file name to provide a unique integer --- [//]: # (BEGIN SAPLING FOOTER) Stack created with [Sapling](https://sapling-scm.com). Best reviewed with [ReviewStack](https://reviewstack.dev/pytorch/pytorch/pull/165639). * #165638 * #165640 * #165677 * #165642 * __->__ #165639 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165639 Approved by: https://github.com/fduwjj --- torch/csrc/distributed/c10d/FlightRecorder.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/FlightRecorder.cpp b/torch/csrc/distributed/c10d/FlightRecorder.cpp index e817c2dd2f63..a404b627752a 100644 --- a/torch/csrc/distributed/c10d/FlightRecorder.cpp +++ b/torch/csrc/distributed/c10d/FlightRecorder.cpp @@ -7,7 +7,10 @@ namespace c10d { void DebugInfoWriter::write(const std::string& trace) { std::string filename = filename_; if (enable_dynamic_filename_) { - filename = c10::str(getCvarString({"TORCH_FR_DUMP_TEMP_FILE"}, ""), rank_); + LOG(INFO) << "Writing Flight Recorder debug info to a dynamic file name"; + filename = c10::str(getCvarString({"TORCH_FR_DUMP_TEMP_FILE"}, "")); + } else { + LOG(INFO) << "Writing Flight Recorder debug info to a static file name"; } // Open a file for writing. The ios::binary flag is used to write data as // binary. From 364624e2091749d34aecbad843262643ad9a366f Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Fri, 17 Oct 2025 05:30:03 +0000 Subject: [PATCH 022/123] [codemod][lowrisk] Remove unused exception parameter from some files (#165700) Summary: `-Wunused-exception-parameter` has identified an unused exception parameter. This diff removes it. This: ``` try { ... } catch (exception& e) { // no use of e } ``` should instead be written as ``` } catch (exception&) { ``` If the code compiles, this is safe to land. 
Test Plan: Sandcastle Differential Revision: D84868162 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165700 Approved by: https://github.com/Skylion007 --- aten/src/ATen/native/TensorAdvancedIndexingUtils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/native/TensorAdvancedIndexingUtils.h b/aten/src/ATen/native/TensorAdvancedIndexingUtils.h index bc6c2533eac5..6f127b711d3e 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexingUtils.h +++ b/aten/src/ATen/native/TensorAdvancedIndexingUtils.h @@ -77,7 +77,7 @@ inline AdvancedIndex make_info(Tensor self, IOptTensorListRef orig) { // next broadcast all index tensors together try { indices = expand_outplace(indices); - } catch (std::exception& e) { + } catch (std::exception&) { TORCH_CHECK_INDEX( false, "shape mismatch: indexing tensors could not be broadcast together" From 9e94ec76b8b29812a1c9dcbb46f00b44e8c3719d Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 17 Oct 2025 06:14:09 +0000 Subject: [PATCH 023/123] Revert "Turn some const variables into constexpr in C++ code (#165401)" This reverts commit 5b2afe4c5dc87786ca65bf22ca9a78f7c21a33a4. Reverted https://github.com/pytorch/pytorch/pull/165401 on behalf of https://github.com/seemethere due to This is breaking test/distributions/test_distributions.py::TestDistributions::test_binomial_sample on HUD, see https://hud.pytorch.org/pytorch/pytorch/commit/5b2afe4c5dc87786ca65bf22ca9a78f7c21a33a4 ([comment](https://github.com/pytorch/pytorch/pull/165401#issuecomment-3414023134)) --- aten/src/ATen/core/PhiloxRNGEngine.h | 8 ++-- aten/src/ATen/cuda/CUDAGeneratorImpl.cpp | 12 ++--- aten/src/ATen/native/Activation.cpp | 4 +- aten/src/ATen/native/BlasKernel.cpp | 4 +- aten/src/ATen/native/Distributions.h | 4 +- aten/src/ATen/native/Math.h | 6 +-- aten/src/ATen/native/Normalization.cpp | 2 +- aten/src/ATen/native/cpu/UpSampleKernel.cpp | 6 +-- aten/src/ATen/native/cuda/DilatedMaxPool2d.cu | 2 +- aten/src/ATen/native/cuda/Embedding.cu | 4 +- aten/src/ATen/native/cuda/IGammaKernel.cu | 46 +++++++++---------- aten/src/ATen/native/cuda/Math.cuh | 8 ++-- aten/src/ATen/native/cuda/UpSample.cuh | 4 +- aten/src/ATen/native/mkldnn/Matmul.cpp | 2 +- .../cpu/kernels/QuantizedOpKernels.cpp | 2 +- .../src/ATen/native/quantized/cpu/qlinear.cpp | 2 +- .../ATen/native/quantized/cpu/qsoftmax.cpp | 4 +- .../epilogue_thread_apply_logsumexp.h | 6 +-- aten/src/ATen/test/pow_test.cpp | 20 ++++---- aten/src/ATen/xpu/XPUGeneratorImpl.cpp | 12 ++--- 20 files changed, 79 insertions(+), 79 deletions(-) diff --git a/aten/src/ATen/core/PhiloxRNGEngine.h b/aten/src/ATen/core/PhiloxRNGEngine.h index e8bac545933c..413055d3fad6 100644 --- a/aten/src/ATen/core/PhiloxRNGEngine.h +++ b/aten/src/ATen/core/PhiloxRNGEngine.h @@ -229,10 +229,10 @@ private: } - static constexpr uint32_t kPhilox10A = 0x9E3779B9; - static constexpr uint32_t kPhilox10B = 0xBB67AE85; - static constexpr uint32_t kPhiloxSA = 0xD2511F53; - static constexpr uint32_t kPhiloxSB = 0xCD9E8D57; + static const uint32_t kPhilox10A = 0x9E3779B9; + static const uint32_t kPhilox10B = 0xBB67AE85; + static const uint32_t kPhiloxSA = 0xD2511F53; + static const uint32_t kPhiloxSB = 0xCD9E8D57; }; typedef philox_engine Philox4_32; diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp index 2e387fbc264d..9f7c9ba881e9 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp @@ -325,9 +325,9 @@ uint64_t CUDAGeneratorImpl::seed() { */ 
c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { // The RNG state comprises the seed, and an offset used for Philox. - constexpr size_t seed_size = sizeof(uint64_t); - constexpr size_t offset_size = sizeof(int64_t); - constexpr size_t total_size = seed_size + offset_size; + static const size_t seed_size = sizeof(uint64_t); + static const size_t offset_size = sizeof(int64_t); + static const size_t total_size = seed_size + offset_size; auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt); auto rng_state = state_tensor.data_ptr(); @@ -346,9 +346,9 @@ c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { * and size of the internal state. */ void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { - constexpr size_t seed_size = sizeof(uint64_t); - constexpr size_t offset_size = sizeof(int64_t); - constexpr size_t total_size = seed_size + offset_size; + static const size_t seed_size = sizeof(uint64_t); + static const size_t offset_size = sizeof(int64_t); + static const size_t total_size = seed_size + offset_size; detail::check_rng_state(new_state); diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index c164120a1f3c..861c51f16097 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -240,8 +240,8 @@ TORCH_META_FUNC(gelu_backward) ( namespace at::native { -static constexpr double SELU_ALPHA = 1.6732632423543772848170429916717; -static constexpr double SELU_SCALE = 1.0507009873554804934193349852946; +static const double SELU_ALPHA = 1.6732632423543772848170429916717; +static const double SELU_SCALE = 1.0507009873554804934193349852946; DEFINE_DISPATCH(elu_stub); DEFINE_DISPATCH(elu_backward_stub); diff --git a/aten/src/ATen/native/BlasKernel.cpp b/aten/src/ATen/native/BlasKernel.cpp index b476ca3cff8f..a77604c535c1 100644 --- a/aten/src/ATen/native/BlasKernel.cpp +++ b/aten/src/ATen/native/BlasKernel.cpp @@ -286,7 +286,7 @@ template void scal_fast_path(int *n, scalar_t *a, scalar_t *x, int *in #if AT_BUILD_WITH_BLAS() template <> bool scal_use_fast_path(int64_t n, int64_t incx) { - auto constexpr intmax = std::numeric_limits::max(); + auto intmax = std::numeric_limits::max(); return n <= intmax && incx <= intmax; } @@ -315,7 +315,7 @@ bool gemv_use_fast_path( int64_t incx, [[maybe_unused]] float beta, int64_t incy) { - auto constexpr intmax = std::numeric_limits::max(); + auto intmax = std::numeric_limits::max(); return (m <= intmax) && (n <= intmax) && (lda <= intmax) && (incx > 0) && (incx <= intmax) && (incy > 0) && (incy <= intmax); } diff --git a/aten/src/ATen/native/Distributions.h b/aten/src/ATen/native/Distributions.h index ab7d82dbeab4..1c9db44aebb0 100644 --- a/aten/src/ATen/native/Distributions.h +++ b/aten/src/ATen/native/Distributions.h @@ -127,7 +127,7 @@ C10_DEVICE scalar_t sample_gamma(scalar_t alpha, BaseSampler C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) { - constexpr static scalar_t kTailValues[] = { + const static scalar_t kTailValues[] = { 0.0810614667953272, 0.0413406959554092, 0.0276779256849983, @@ -139,7 +139,7 @@ C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) { 0.00925546218271273, 0.00833056343336287 }; - if (k <= sizeof(kTailValues)/sizeof(scalar_t)) { + if (k <= 9) { return kTailValues[static_cast(k)]; } scalar_t kp1sq = (k + 1) * (k + 1); diff --git a/aten/src/ATen/native/Math.h b/aten/src/ATen/native/Math.h index 4677542706f6..b261da5fe54e 100644 --- 
a/aten/src/ATen/native/Math.h +++ b/aten/src/ATen/native/Math.h @@ -581,7 +581,7 @@ scalar_t ratevl(scalar_t x, const scalar_t num[], int64_t M, template static scalar_t lanczos_sum_expg_scaled(scalar_t x) { // lanczos approximation - static constexpr scalar_t lanczos_sum_expg_scaled_num[13] = { + static const scalar_t lanczos_sum_expg_scaled_num[13] = { 0.006061842346248906525783753964555936883222, 0.5098416655656676188125178644804694509993, 19.51992788247617482847860966235652136208, @@ -596,7 +596,7 @@ static scalar_t lanczos_sum_expg_scaled(scalar_t x) { 103794043.1163445451906271053616070238554, 56906521.91347156388090791033559122686859 }; - static constexpr scalar_t lanczos_sum_expg_scaled_denom[13] = { + static const scalar_t lanczos_sum_expg_scaled_denom[13] = { 1., 66., 1925., @@ -712,7 +712,7 @@ static scalar_t _igamc_helper_series(scalar_t a, scalar_t x) { template static scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) { // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1] - static constexpr scalar_t d[25][25] = + static const scalar_t d[25][25] = {{-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2, 1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4, 3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6, diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 72526162d133..86941806d307 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -62,7 +62,7 @@ #include #include -static constexpr int MIOPEN_DIM_MAX = 5; +static const int MIOPEN_DIM_MAX = 5; namespace at::meta { diff --git a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp index e59e5985bf7f..bd421aad111d 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp @@ -1038,7 +1038,7 @@ struct HelperInterpNearest : public HelperInterpBase { // We keep this structure for BC and consider as deprecated. 
// See HelperInterpNearestExact as replacement - static constexpr int interp_size = 1; + static const int interp_size = 1; static inline void init_indices_weights( at::ScalarType output_type, @@ -1155,7 +1155,7 @@ struct HelperInterpNearestExact : public HelperInterpNearest { struct HelperInterpLinear : public HelperInterpBase { - static constexpr int interp_size = 2; + static const int interp_size = 2; // Compute indices and weights for each interpolated dimension // indices_weights = { @@ -1275,7 +1275,7 @@ struct HelperInterpLinear : public HelperInterpBase { struct HelperInterpCubic : public HelperInterpBase { - static constexpr int interp_size = 4; + static const int interp_size = 4; // Compute indices and weights for each interpolated dimension // indices_weights = { diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index 344906a2a4df..edb502688860 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -249,7 +249,7 @@ __global__ void max_pool_forward_nhwc( } -static constexpr int BLOCK_THREADS = 256; +static const int BLOCK_THREADS = 256; template #if defined (USE_ROCM) diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index adc300a5a9ef..602dfd6e5288 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -36,9 +36,9 @@ namespace at::native { namespace { #if defined(USE_ROCM) -static constexpr int BLOCKDIMY = 16; +static const int BLOCKDIMY = 16; #else -static constexpr int BLOCKDIMY = 32; +static const int BLOCKDIMY = 32; #endif template diff --git a/aten/src/ATen/native/cuda/IGammaKernel.cu b/aten/src/ATen/native/cuda/IGammaKernel.cu index 73db6272be9e..624f080d9f6e 100644 --- a/aten/src/ATen/native/cuda/IGammaKernel.cu +++ b/aten/src/ATen/native/cuda/IGammaKernel.cu @@ -82,7 +82,7 @@ __host__ __device__ scalar_t lanczos_sum_expg_scaled(scalar_t x) { // lanczos approximation using accscalar_t = at::acc_type; - constexpr accscalar_t lanczos_sum_expg_scaled_num[13] = { + static const accscalar_t lanczos_sum_expg_scaled_num[13] = { 0.006061842346248906525783753964555936883222, 0.5098416655656676188125178644804694509993, 19.51992788247617482847860966235652136208, @@ -97,7 +97,7 @@ __host__ __device__ scalar_t lanczos_sum_expg_scaled(scalar_t x) { 103794043.1163445451906271053616070238554, 56906521.91347156388090791033559122686859 }; - constexpr accscalar_t lanczos_sum_expg_scaled_denom[13] = { + static const accscalar_t lanczos_sum_expg_scaled_denom[13] = { 1., 66., 1925., @@ -126,10 +126,10 @@ __host__ __device__ scalar_t _igam_helper_fac(scalar_t a, scalar_t x) { using accscalar_t = at::acc_type; accscalar_t ax, fac, res, num, numfac; - constexpr accscalar_t MAXLOG = std::is_same_v ? + static const accscalar_t MAXLOG = std::is_same_v ? 7.09782712893383996843E2 : 88.72283905206835; - constexpr accscalar_t EXP1 = 2.718281828459045; - constexpr accscalar_t lanczos_g = 6.024680040776729583740234375; + static const accscalar_t EXP1 = 2.718281828459045; + static const accscalar_t lanczos_g = 6.024680040776729583740234375; if (::fabs(a - x) > 0.4 * ::fabs(a)) { ax = a * ::log(x) - x - ::lgamma(a); @@ -158,9 +158,9 @@ __host__ __device__ scalar_t _igam_helper_series(scalar_t a, scalar_t x) { // Compute igam using DLMF 8.11.4. [igam1] using accscalar_t = at::acc_type; - constexpr accscalar_t MACHEP = std::is_same_v ? + static const accscalar_t MACHEP = std::is_same_v ? 
1.11022302462515654042E-16 : 5.9604644775390625E-8; - constexpr int MAXITER = 2000; + static const int MAXITER = 2000; int i; accscalar_t ans, ax, c, r; @@ -196,8 +196,8 @@ __host__ __device__ scalar_t _igamc_helper_series(scalar_t a, scalar_t x) { accscalar_t fac = 1; accscalar_t sum = 0; accscalar_t term, logx; - constexpr int MAXITER = 2000; - constexpr accscalar_t MACHEP = std::is_same_v ? + static const int MAXITER = 2000; + static const accscalar_t MACHEP = std::is_same_v ? 1.11022302462515654042E-16 : 5.9604644775390625E-8; for (n = 1; n < MAXITER; n++) { @@ -219,7 +219,7 @@ __host__ __device__ scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1] using accscalar_t = at::acc_type; - constexpr accscalar_t d[25][25] = + static const accscalar_t d[25][25] = {{-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2, 1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4, 3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6, 8.296711340953086e-7, -1.7665952736826079e-7, 6.7078535434014986e-9, 1.0261809784240308e-8, -4.3820360184533532e-9, 9.1476995822367902e-10, -2.551419399494625e-11, -5.8307721325504251e-11, 2.4361948020667416e-11, -5.0276692801141756e-12, 1.1004392031956135e-13, 3.3717632624009854e-13, -1.3923887224181621e-13, 2.8534893807047443e-14, -5.1391118342425726e-16, -1.9752288294349443e-15}, {-1.8518518518518519e-3, -3.4722222222222222e-3, 2.6455026455026455e-3, -9.9022633744855967e-4, 2.0576131687242798e-4, -4.0187757201646091e-7, -1.8098550334489978e-5, 7.6491609160811101e-6, -1.6120900894563446e-6, 4.6471278028074343e-9, 1.378633446915721e-7, -5.752545603517705e-8, 1.1951628599778147e-8, -1.7543241719747648e-11, -1.0091543710600413e-9, 4.1627929918425826e-10, -8.5639070264929806e-11, 6.0672151016047586e-14, 7.1624989648114854e-12, -2.9331866437714371e-12, 5.9966963656836887e-13, -2.1671786527323314e-16, -4.9783399723692616e-14, 2.0291628823713425e-14, -4.13125571381061e-15}, {4.1335978835978836e-3, -2.6813271604938272e-3, 7.7160493827160494e-4, 2.0093878600823045e-6, -1.0736653226365161e-4, 5.2923448829120125e-5, -1.2760635188618728e-5, 3.4235787340961381e-8, 1.3721957309062933e-6, -6.298992138380055e-7, 1.4280614206064242e-7, -2.0477098421990866e-10, -1.4092529910867521e-8, 6.228974084922022e-9, -1.3670488396617113e-9, 9.4283561590146782e-13, 1.2872252400089318e-10, -5.5645956134363321e-11, 1.1975935546366981e-11, -4.1689782251838635e-15, -1.0940640427884594e-12, 4.6622399463901357e-13, -9.905105763906906e-14, 1.8931876768373515e-17, 8.8592218725911273e-15}, @@ -248,7 +248,7 @@ __host__ __device__ scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t int k, n, sgn; int maxpow = 0; - constexpr accscalar_t MACHEP = std::is_same_v ? + static const accscalar_t MACHEP = std::is_same_v ? 1.11022302462515654042E-16 : 5.9604644775390625E-8; accscalar_t lambda = x / a; accscalar_t sigma = (x - a) / a; @@ -314,12 +314,12 @@ __host__ __device__ scalar_t _igamc_helper_continued_fraction(scalar_t a, scalar int i; accscalar_t ans, ax, c, yc, r, t, y, z; accscalar_t pk, pkm1, pkm2, qk, qkm1, qkm2; - constexpr int MAXITER = 2000; - constexpr accscalar_t MACHEP = std::is_same_v ? + static const int MAXITER = 2000; + static const accscalar_t MACHEP = std::is_same_v ? 1.11022302462515654042E-16 : 5.9604644775390625E-8; - constexpr accscalar_t BIG = std::is_same_v ? + static const accscalar_t BIG = std::is_same_v ? 
4.503599627370496e15 : 16777216.; - constexpr accscalar_t BIGINV = std::is_same_v ? + static const accscalar_t BIGINV = std::is_same_v ? 2.22044604925031308085e-16 : 5.9604644775390625E-8; ax = _igam_helper_fac(a, x); @@ -385,10 +385,10 @@ __noinline__ __host__ __device__ scalar_t calc_igammac(scalar_t a, scalar_t x) { using accscalar_t = at::acc_type; accscalar_t absxma_a; - constexpr accscalar_t SMALL = 20.0; - constexpr accscalar_t LARGE = 200.0; - constexpr accscalar_t SMALLRATIO = 0.3; - constexpr accscalar_t LARGERATIO = 4.5; + static const accscalar_t SMALL = 20.0; + static const accscalar_t LARGE = 200.0; + static const accscalar_t SMALLRATIO = 0.3; + static const accscalar_t LARGERATIO = 4.5; if ((x < 0) || (a < 0)) { // out of defined-region of the function @@ -467,10 +467,10 @@ __noinline__ __host__ __device__ scalar_t calc_igamma(scalar_t a, scalar_t x) { using accscalar_t = at::acc_type; accscalar_t absxma_a; - constexpr accscalar_t SMALL = 20.0; - constexpr accscalar_t LARGE = 200.0; - constexpr accscalar_t SMALLRATIO = 0.3; - constexpr accscalar_t LARGERATIO = 4.5; + static const accscalar_t SMALL = 20.0; + static const accscalar_t LARGE = 200.0; + static const accscalar_t SMALLRATIO = 0.3; + static const accscalar_t LARGERATIO = 4.5; // boundary values following SciPy if ((x < 0) || (a < 0)) { diff --git a/aten/src/ATen/native/cuda/Math.cuh b/aten/src/ATen/native/cuda/Math.cuh index 1fa245af1a4d..1d603132e689 100644 --- a/aten/src/ATen/native/cuda/Math.cuh +++ b/aten/src/ATen/native/cuda/Math.cuh @@ -231,7 +231,7 @@ const auto lcm_string = jiterator_stringify( const auto digamma_string = jiterator_stringify( template T digamma(T x) { - static constexpr double PI_f64 = 3.14159265358979323846; + static const double PI_f64 = 3.14159265358979323846; // Short-circuits if x is +/- 0 and returns -/+ ∞ per the C++ standard if (x == 0) { @@ -3072,9 +3072,9 @@ template static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) { // [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma using accscalar_t = at::acc_type; - static constexpr double PI_f64 = 3.14159265358979323846; - constexpr accscalar_t PSI_10 = 2.25175258906672110764; - constexpr accscalar_t A[] = { + static const double PI_f64 = 3.14159265358979323846; + const accscalar_t PSI_10 = 2.25175258906672110764; + const accscalar_t A[] = { 8.33333333333333333333E-2, -2.10927960927960927961E-2, 7.57575757575757575758E-3, diff --git a/aten/src/ATen/native/cuda/UpSample.cuh b/aten/src/ATen/native/cuda/UpSample.cuh index 09e094ea2bf0..50428b377da8 100644 --- a/aten/src/ATen/native/cuda/UpSample.cuh +++ b/aten/src/ATen/native/cuda/UpSample.cuh @@ -277,7 +277,7 @@ struct BilinearFilterFunctor { return 0; } - static constexpr int size = 2; + static const int size = 2; }; // taken from @@ -301,7 +301,7 @@ struct BicubicFilterFunctor { return 0; } - static constexpr int size = 4; + static const int size = 4; }; template diff --git a/aten/src/ATen/native/mkldnn/Matmul.cpp b/aten/src/ATen/native/mkldnn/Matmul.cpp index fbc8294f45cf..740c056a7f23 100644 --- a/aten/src/ATen/native/mkldnn/Matmul.cpp +++ b/aten/src/ATen/native/mkldnn/Matmul.cpp @@ -416,7 +416,7 @@ static inline bool checksize(const Tensor& mat1, const Tensor& mat2){ // else if dim = 3, mat1's size = (b * m * n), mat2's size = (b * n * k) // else called from aten::mv, mat1.size = (m * n), mat2.size = (n) // only m * n * b * k(if exist) are large enough we can get benefit from mkldnn optimized gemm kernel - constexpr int64_t 
mkldnn_gemm_min_size = 16 * 16 * 16; + static const int64_t mkldnn_gemm_min_size = 16 * 16 * 16; if (mat1.dim() == 1 && mat2.dim() == 1) { // aten::dot return mat1.size(0) > mkldnn_gemm_min_size; diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index 293dfb20b9bf..028047e4d6ac 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -3551,7 +3551,7 @@ void dequantize_tensor_per_tensor_affine_cpu( #if defined(__ARM_NEON__) || defined(__aarch64__) -constexpr static int PARALLEL_THRESHOLD = 1 << 20; +const static int PARALLEL_THRESHOLD = 1 << 20; // Generic template defaults to naive quantize implementation template diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index 7a80b166f8cb..897eefd91d21 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -1388,7 +1388,7 @@ namespace at::native { TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() <= 1, "onednn int8 linear: act scale/zp size should be 1/<=1"); static std::optional other = std::nullopt; - constexpr std::string_view binary_post_op = "none"; + static const std::string_view binary_post_op = "none"; int64_t act_zp = act_zero_point.numel() == 1 ? act_zero_point.item().toLong() : 0; return linear_int8_with_onednn_weight( act, act_scale.item().toDouble(), act_zp, diff --git a/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp b/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp index 31221cd9bf26..cd00a351b0e3 100644 --- a/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp +++ b/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp @@ -16,8 +16,8 @@ namespace { #ifdef USE_PYTORCH_QNNPACK -constexpr static float qnnpack_softmax_output_scale = 0x1.0p-8f; -constexpr static int qnnpack_softmax_output_zero_point = 0; +const static float qnnpack_softmax_output_scale = 0x1.0p-8f; +const static int qnnpack_softmax_output_zero_point = 0; bool is_qnnpack_compatible( const Tensor& qx, diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_thread_apply_logsumexp.h b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_thread_apply_logsumexp.h index 156034954d9e..e3dc0778e46b 100644 --- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_thread_apply_logsumexp.h +++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_thread_apply_logsumexp.h @@ -110,9 +110,9 @@ class ApplyLogSumExp { using ElementCompute = ElementCompute_; using ElementLSE = ElementLSE_; - static int constexpr kElementsPerAccess = ElementsPerAccess; - static int constexpr kCount = kElementsPerAccess; - static constexpr ScaleType::Kind kScale = + static int const kElementsPerAccess = ElementsPerAccess; + static int const kCount = kElementsPerAccess; + static const ScaleType::Kind kScale = cutlass::epilogue::thread::ScaleType::NoBetaScaling; using FragmentOutput = Array; diff --git a/aten/src/ATen/test/pow_test.cpp b/aten/src/ATen/test/pow_test.cpp index 6391c3c8228c..95bb48b341f5 100644 --- a/aten/src/ATen/test/pow_test.cpp +++ b/aten/src/ATen/test/pow_test.cpp @@ -14,16 +14,16 @@ using namespace at; namespace { -constexpr auto int_min = std::numeric_limits::min(); -constexpr auto int_max = std::numeric_limits::max(); -constexpr auto long_min = std::numeric_limits::min(); 
-constexpr auto long_max = std::numeric_limits::max(); -constexpr auto float_lowest = std::numeric_limits::lowest(); -constexpr auto float_min = std::numeric_limits::min(); -constexpr auto float_max = std::numeric_limits::max(); -constexpr auto double_lowest = std::numeric_limits::lowest(); -constexpr auto double_min = std::numeric_limits::min(); -constexpr auto double_max = std::numeric_limits::max(); +const auto int_min = std::numeric_limits::min(); +const auto int_max = std::numeric_limits::max(); +const auto long_min = std::numeric_limits::min(); +const auto long_max = std::numeric_limits::max(); +const auto float_lowest = std::numeric_limits::lowest(); +const auto float_min = std::numeric_limits::min(); +const auto float_max = std::numeric_limits::max(); +const auto double_lowest = std::numeric_limits::lowest(); +const auto double_min = std::numeric_limits::min(); +const auto double_max = std::numeric_limits::max(); const std::vector ints { int_min, diff --git a/aten/src/ATen/xpu/XPUGeneratorImpl.cpp b/aten/src/ATen/xpu/XPUGeneratorImpl.cpp index 7a0859671ba7..14f3059cc2b3 100644 --- a/aten/src/ATen/xpu/XPUGeneratorImpl.cpp +++ b/aten/src/ATen/xpu/XPUGeneratorImpl.cpp @@ -146,9 +146,9 @@ uint64_t XPUGeneratorImpl::seed() { c10::intrusive_ptr XPUGeneratorImpl::get_state() const { // The RNG state comprises the seed, and an offset used for Philox. - constexpr size_t seed_size = sizeof(uint64_t); - constexpr size_t offset_size = sizeof(uint64_t); - constexpr size_t total_size = seed_size + offset_size; + static const size_t seed_size = sizeof(uint64_t); + static const size_t offset_size = sizeof(uint64_t); + static const size_t total_size = seed_size + offset_size; // The internal state is returned as a CPU byte tensor. auto state_tensor = at::detail::empty_cpu( @@ -170,9 +170,9 @@ c10::intrusive_ptr XPUGeneratorImpl::get_state() const { void XPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { at::xpu::assertNotCapturing( "Please ensure to utilize the XPUGeneratorImpl::set_state_index method during capturing."); - constexpr size_t seed_size = sizeof(uint64_t); - constexpr size_t offset_size = sizeof(uint64_t); - constexpr size_t total_size = seed_size + offset_size; + static const size_t seed_size = sizeof(uint64_t); + static const size_t offset_size = sizeof(uint64_t); + static const size_t total_size = seed_size + offset_size; at::detail::check_rng_state(new_state); From 24879f0de97e0caaafa083ddc5ee28d6079fb1c0 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 16 Oct 2025 16:50:46 -0700 Subject: [PATCH 024/123] [dynamo] Use Variable Builder to build the property fget object (#165683) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165683 Approved by: https://github.com/ezyang, https://github.com/williamwen42 --- test/dynamo/test_functions.py | 28 +++++++++++++++++++++++-- torch/_dynamo/variables/user_defined.py | 8 ++----- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/test/dynamo/test_functions.py b/test/dynamo/test_functions.py index 3e155f5e590b..d16676cda8ee 100644 --- a/test/dynamo/test_functions.py +++ b/test/dynamo/test_functions.py @@ -5173,10 +5173,9 @@ class DefaultsTests(torch._dynamo.test_case.TestCase): res = opt_fn(x) self.assertEqual(ref, res) - @unittest.expectedFailure def test_property_class_transmute(self): class PropertyGetter: - def __call__(self): + def __call__(self, obj): return True p = property(PropertyGetter()) @@ -5195,6 +5194,31 @@ class DefaultsTests(torch._dynamo.test_case.TestCase): x = torch.randn(1) 
self.assertEqual(opt_mod(x), x + 1) + def test_property_functools_partial(self): + def p_getter(obj, *, delta: int): + # Use instance state + a bound constant + return (getattr(obj, "flag", 0) + delta) > 0 + + class Mod(torch.nn.Module): + def __init__(self, flag: int): + super().__init__() + self.flag = flag + + # fget is a functools.partial object + p = property(functools.partial(p_getter, delta=1)) + + def forward(self, x): + if self.p: # calls p_getter(self, delta=1) + return x + 1 + else: + raise RuntimeError("whoops") + + mod = Mod(flag=1) + + opt_mod = torch.compile(mod, backend="eager", fullgraph=True) + x = torch.randn(1) + self.assertEqual(opt_mod(x), x + 1) + instantiate_parametrized_tests(FunctionTests) instantiate_parametrized_tests(DefaultsTests) diff --git a/torch/_dynamo/variables/user_defined.py b/torch/_dynamo/variables/user_defined.py index e214bb0e2b9d..c17a1b9392d2 100644 --- a/torch/_dynamo/variables/user_defined.py +++ b/torch/_dynamo/variables/user_defined.py @@ -1458,12 +1458,8 @@ class UserDefinedObjectVariable(UserDefinedVariable): # Get the getter function source = AttrSource(source, "fget") - # Avoid using UserMethodVariable here because there is no way to - # access the method object here. Direct inline by creating the - # UserFunctionVariable. - return variables.UserFunctionVariable( - subobj.fget, source=source - ).call_function(tx, [self], {}) + fget_vt = VariableTracker.build(tx, subobj.fget, source=source) + return fget_vt.call_function(tx, [self], {}) elif isinstance(subobj, _collections._tuplegetter): # namedtuple fields are represented by _tuplegetter, and here we # emulate its `__get__`, which is implemented in C. From f1d882212afc3a73ce1e319d80b6406f9dc4a0c8 Mon Sep 17 00:00:00 2001 From: Shangdi Yu Date: Fri, 17 Oct 2025 07:18:43 +0000 Subject: [PATCH 025/123] [annotate] add annotate_fn function decorator (#165703) Example usage: ``` @fx_traceback.annotate_fn({"pp_stage": 1}) def example_function(x): return x * x class SimpleLinear(nn.Module): def __init__(self): super().__init__() self.linear = nn.Linear(3, 2) def forward(self, x): with fx_traceback.annotate({"pp_stage": 0}): y = self.linear(x) y = example_function(y) return y - 1 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165703 Approved by: https://github.com/SherlockNoMad --- .../test_aot_joint_with_descriptors.py | 40 +++++++++++++++++++ torch/fx/traceback.py | 37 +++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/test/functorch/test_aot_joint_with_descriptors.py b/test/functorch/test_aot_joint_with_descriptors.py index 167215bb8be1..d797b36748d0 100644 --- a/test/functorch/test_aot_joint_with_descriptors.py +++ b/test/functorch/test_aot_joint_with_descriptors.py @@ -922,6 +922,46 @@ class inner_f(torch.nn.Module): in custom_metadata ) + def test_preserve_annotate_function(self): + """Test basic annotate_fn usage""" + + @fx_traceback.annotate_fn({"pp_stage": 1}) + def example_function(x): + return x * x + + class SimpleLinear(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(3, 2) + + def forward(self, x): + with fx_traceback.annotate({"pp_stage": 0}): + y = self.linear(x) + y = example_function(y) + return y - 1 + + inputs = (torch.randn(4, 3),) + model = SimpleLinear() + + for with_export in [True, False]: + graph_module = graph_capture(model, inputs, with_export) + custom_metadata = fx_traceback._get_custom_metadata(graph_module) + self.assertExpectedInline( + str(custom_metadata), + """\ +('call_function', 't', 
{'pp_stage': 0}) +('call_function', 'addmm', {'pp_stage': 0}) +('call_function', 'mul', {'pp_stage': 1}) +('call_function', 'mul_1', {'pp_stage': 1}) +('call_function', 'mul_2', {'pp_stage': 1}) +('call_function', 't_1', {'pp_stage': 0}) +('call_function', 'mm', {'pp_stage': 0}) +('call_function', 't_2', {'pp_stage': 0}) +('call_function', 'sum_1', {'pp_stage': 0}) +('call_function', 'view', {'pp_stage': 0}) +('call_function', 't_3', {'pp_stage': 0})""", + ) + if __name__ == "__main__": run_tests() diff --git a/torch/fx/traceback.py b/torch/fx/traceback.py index 3d1e3b7c5d53..56b5f5041aa1 100644 --- a/torch/fx/traceback.py +++ b/torch/fx/traceback.py @@ -18,6 +18,7 @@ log = logging.getLogger(__name__) __all__ = [ "annotate", + "annotate_fn", "preserve_node_meta", "has_preserved_node_meta", "set_stack_trace", @@ -291,6 +292,42 @@ def annotate(annotation_dict: dict): del current_meta["custom"] +@compatibility(is_backward_compatible=False) +def annotate_fn(annotation_dict: dict): + """ + A decorator that wraps a function with the annotate context manager. + Use this when you want to annotate an entire function instead of a specific code block. + + Note: + This API is **not backward compatible** and may evolve in future releases. + + Note: + This API is not compatible with fx.symbolic_trace or jit.trace. It's intended + to be used with PT2 family of tracers, e.g. torch.export and dynamo. + + Args: + annotation_dict (dict): A dictionary of custom key-value pairs to inject + into the FX trace metadata for all operations in the function. + + Example: + >>> @annotate_fn({"pp_stage": 1}) + ... def my_function(x): + ... return x + 1 + # All operations in my_function will have {"pp_stage": 1} in their metadata. + """ + from functools import wraps + + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + with annotate(annotation_dict): + return func(*args, **kwargs) + + return wrapper + + return decorator + + @compatibility(is_backward_compatible=False) def set_grad_fn_seq_nr(seq_nr): global current_meta From e925dfcc6b4fd76d744d04ecaa451fc2936155a8 Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Fri, 17 Oct 2025 07:27:06 +0000 Subject: [PATCH 026/123] Enable all SIM rules except disabled ones (#164645) `SIM` rules are useful for simplifying boolean expressions and enhances code readability. 
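For illustration, a minimal sketch of the kind of rewrites this enables (hypothetical names; the before/after patterns mirror hunks in this patch):

```python
# Hypothetical values; the before/after patterns are the point.
root_operators = {"aten::add": 2}
op = "aten::mul"
disable_autograd = True
dtype = "uint8"

# before: root_operators[op] if op in root_operators else 0
count = root_operators.get(op, 0)

# before: True if disable_autograd else False
find_unused_parameters = bool(disable_autograd)

# before: not dtype == "uint8"
exact_dtype = dtype != "uint8"

print(count, find_unused_parameters, exact_dtype)  # 0 True False
```

Rules that do not fit this codebase, such as SIM300 (Yoda condition detected), stay in the `ignore` list in pyproject.toml, and a few intentional patterns are suppressed inline with `# noqa`.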
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164645 Approved by: https://github.com/ezyang, https://github.com/mlazos --- .github/scripts/trymerge.py | 2 +- benchmarks/dynamo/common.py | 2 +- benchmarks/transformer/score_mod.py | 2 +- pyproject.toml | 4 ++-- test/ao/sparsity/test_activation_sparsifier.py | 2 +- .../_shard/sharded_tensor/test_sharded_tensor.py | 4 ++-- test/distributed/checkpoint/test_checkpoint.py | 2 +- .../fsdp/test_fsdp_freezing_weights.py | 2 +- test/distributed/pipelining/test_schedule.py | 2 +- test/distributed/tensor/test_dtensor.py | 8 +++----- test/distributed/test_c10d_nccl.py | 6 +----- test/dynamo/test_python_autograd.py | 2 +- test/dynamo/test_subclasses.py | 2 +- test/export/test_passes.py | 4 +--- test/functorch/test_control_flow.py | 15 +++++---------- test/fx/test_fx_traceback.py | 4 +--- test/inductor/test_b2b_gemm.py | 12 +++--------- test/inductor/test_benchmark_fusion.py | 2 +- test/inductor/test_compiled_optimizers.py | 4 ++-- test/inductor/test_cudagraph_trees.py | 2 +- test/inductor/test_flex_attention.py | 14 +++++--------- test/inductor/test_flex_decoding.py | 14 ++++++-------- test/inductor/test_graph_transform_observer.py | 2 +- test/inductor/test_mkldnn_pattern_matcher.py | 10 +++------- test/inductor/test_torchinductor.py | 10 +++++----- test/inductor/test_torchinductor_opinfo.py | 2 +- test/mobile/model_test/update_production_ops.py | 10 +++------- test/onnx/test_pytorch_onnx_onnxruntime.py | 4 ++-- test/quantization/core/test_quantized_op.py | 8 ++++---- test/test_autograd.py | 2 +- test/test_cuda.py | 2 +- test/test_decomp.py | 2 +- test/test_fx.py | 2 +- test/test_indexing.py | 2 +- test/test_jit.py | 6 +++--- test/test_jit_autocast.py | 2 +- test/test_nn.py | 2 +- test/test_numpy_interop.py | 2 +- test/test_pruning_op.py | 2 +- test/test_reductions.py | 2 +- test/test_scaled_matmul_cuda.py | 2 +- test/test_segment_reductions.py | 6 +++--- test/test_serialization.py | 2 +- test/test_sparse_csr.py | 2 +- test/test_tensor_creation_ops.py | 2 +- test/test_torchfuzz_repros.py | 6 ++++-- test/torch_np/numpy_tests/core/test_dtype.py | 2 +- torch/_inductor/analysis/profile_analysis.py | 7 ++----- torch/_inductor/codegen/triton.py | 4 ++-- torch/_inductor/ir.py | 2 +- torch/_inductor/sizevars.py | 6 +++--- torch/distributed/_state_dict_utils.py | 2 +- torch/nn/functional.py | 4 ++-- torchgen/gen_vmap_plumbing.py | 2 +- 54 files changed, 98 insertions(+), 134 deletions(-) diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py index 07a07a5126c4..c258284a00d8 100755 --- a/.github/scripts/trymerge.py +++ b/.github/scripts/trymerge.py @@ -1092,7 +1092,7 @@ class GitHubPR: editor = node["editor"] return GitHubComment( body_text=node["bodyText"], - created_at=node["createdAt"] if "createdAt" in node else "", + created_at=node.get("createdAt", ""), author_login=node["author"]["login"], author_url=node["author"].get("url", None), author_association=node["authorAssociation"], diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index a31ae2b335c2..b81f8a9dbd24 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -4060,7 +4060,7 @@ def run(runner, args, original_dir=None): else: optimize_ctx = torch._dynamo.optimize(args.backend, nopython=args.nopython) experiment = ( - speedup_experiment if not args.backend == "torchao" else latency_experiment + speedup_experiment if args.backend != "torchao" else latency_experiment ) if args.accuracy: output_filename = 
f"accuracy_{args.backend}.csv" diff --git a/benchmarks/transformer/score_mod.py b/benchmarks/transformer/score_mod.py index f812ede7f635..520fb26994e1 100644 --- a/benchmarks/transformer/score_mod.py +++ b/benchmarks/transformer/score_mod.py @@ -271,7 +271,7 @@ def run_single_backend_sdpa( if config.calculate_bwd_time: # TODO: debug backward pass for njt - if eager_sdpa and not config.attn_type == "document_mask": + if eager_sdpa and config.attn_type != "document_mask": d_out = torch.randn_like(out_eager.transpose(1, 2)).transpose(1, 2) backward_eager_time = benchmark_torch_function_in_microseconds( out_eager.backward, d_out, retain_graph=True diff --git a/pyproject.toml b/pyproject.toml index f75261ba6ffb..8e29c1c81d56 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -180,6 +180,7 @@ ignore = [ "SIM116", # Disable Use a dictionary instead of consecutive `if` statements "SIM117", "SIM118", + "SIM300", # Yoda condition detected "UP007", # keep-runtime-typing "UP045", # keep-runtime-typing "TC006", @@ -195,8 +196,7 @@ select = [ "E", "EXE", "F", - "SIM1", - "SIM911", + "SIM", "W", # Not included in flake8 "FURB", diff --git a/test/ao/sparsity/test_activation_sparsifier.py b/test/ao/sparsity/test_activation_sparsifier.py index 122c368368e6..0f3f36ecda9f 100644 --- a/test/ao/sparsity/test_activation_sparsifier.py +++ b/test/ao/sparsity/test_activation_sparsifier.py @@ -55,7 +55,7 @@ class TestActivationSparsifier(TestCase): for key, config in sparsifier_defaults.items(): # all the keys in combined_defaults should be present in sparsifier defaults - assert config == combined_defaults.get(key, None) + assert config == combined_defaults.get(key) def _check_register_layer( self, activation_sparsifier, defaults, sparse_config, layer_args_list diff --git a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py index f62e4d29617d..b39b3075060f 100644 --- a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py +++ b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py @@ -3074,7 +3074,7 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase): wrong_dtype_shards, [10, 10], init_rrefs=True ) - tensor_requires_grad = True if self.rank == 0 else False + tensor_requires_grad = self.rank == 0 wrong_requires_grad_shards = [ sharded_tensor.Shard( torch.randn( @@ -3121,7 +3121,7 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase): wrong_pin_memory_local_shards, [10, 10], init_rrefs=True ) - tensor_pin_memory = True if self.rank == 0 else False + tensor_pin_memory = self.rank == 0 wrong_pin_memory_shards_cross_ranks = [ sharded_tensor.Shard( torch.randn(5, 5, pin_memory=tensor_pin_memory), local_shard_metadata diff --git a/test/distributed/checkpoint/test_checkpoint.py b/test/distributed/checkpoint/test_checkpoint.py index 09c1924cf294..0bc5bf69f2a5 100644 --- a/test/distributed/checkpoint/test_checkpoint.py +++ b/test/distributed/checkpoint/test_checkpoint.py @@ -152,7 +152,7 @@ class TestStorageBase: self.rank = 0 if not dist.is_initialized() else dist.get_rank() def _get_ranks(self, name): - return self.fail_conf[name] if name in self.fail_conf else None + return self.fail_conf.get(name, None) def _fail_rank(self, name): ranks = self._get_ranks(name) diff --git a/test/distributed/fsdp/test_fsdp_freezing_weights.py b/test/distributed/fsdp/test_fsdp_freezing_weights.py index ad318a6bf752..730b8cd7308e 100644 --- a/test/distributed/fsdp/test_fsdp_freezing_weights.py +++ 
b/test/distributed/fsdp/test_fsdp_freezing_weights.py @@ -155,7 +155,7 @@ class TestFreezingWeights(FSDPTest): ddp_kwargs = { "device_ids": [self.rank], - "find_unused_parameters": True if disable_autograd else False, + "find_unused_parameters": bool(disable_autograd), } model = self._create_model( diff --git a/test/distributed/pipelining/test_schedule.py b/test/distributed/pipelining/test_schedule.py index 6305b5cecdbc..714ab8f65911 100644 --- a/test/distributed/pipelining/test_schedule.py +++ b/test/distributed/pipelining/test_schedule.py @@ -66,7 +66,7 @@ class MockPipelineStage(_PipelineStageBase): self.num_stages = kwargs.get("num_stages", 1) self.group_size = kwargs.get("group_size", 1) self.group_rank = kwargs.get("group_rank", 0) - self.group = kwargs.get("group", None) + self.group = kwargs.get("group") def _create_grad_recv_info(self, *args, **kwargs): return None diff --git a/test/distributed/tensor/test_dtensor.py b/test/distributed/tensor/test_dtensor.py index e2368a0ef220..0a607581a340 100644 --- a/test/distributed/tensor/test_dtensor.py +++ b/test/distributed/tensor/test_dtensor.py @@ -1066,7 +1066,7 @@ class TestDTensorPlacementTypes(DTensorTestBase): assert_array_equal(expected_pad_sizes, pad_sizes) is_tensor_empty = [ - False if splitted_tensor.numel() > 0 else True + not splitted_tensor.numel() > 0 for splitted_tensor in splitted_tensor_list ] expected_is_tensor_empty = [True] * self.world_size @@ -1089,12 +1089,10 @@ class TestDTensorPlacementTypes(DTensorTestBase): for i, tensor in enumerate(splitted_tensor_list) ] expected_is_tensor_empty = [ - False if idx < size else True - for idx, _ in enumerate(range(self.world_size)) + not idx < size for idx, _ in enumerate(range(self.world_size)) ] is_tensor_empty = [ - False if unpadded_tensor.numel() > 0 else True - for unpadded_tensor in unpadded_list + not unpadded_tensor.numel() > 0 for unpadded_tensor in unpadded_list ] assert_array_equal(expected_is_tensor_empty, is_tensor_empty) diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 23287fa2d5c9..7410255d27a8 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -2770,11 +2770,7 @@ class WorkHookTest(MultiProcessTestCase): # from rank0 to other ranks. However, this is DDP's internal implementation, # which is subject to change in future versions. self.assertTrue(num_hook_fired[OpType.BROADCAST] > 0) - ctor_allreduce = ( - num_hook_fired[OpType.ALLREDUCE] - if OpType.ALLREDUCE in num_hook_fired - else 0 - ) + ctor_allreduce = num_hook_fired.get(OpType.ALLREDUCE, 0) x = torch.zeros(2, 1000).cuda(self.rank) ddp(x).sum().backward() diff --git a/test/dynamo/test_python_autograd.py b/test/dynamo/test_python_autograd.py index 2acaf67add69..a615c653f56c 100644 --- a/test/dynamo/test_python_autograd.py +++ b/test/dynamo/test_python_autograd.py @@ -82,7 +82,7 @@ def grad(L, desired_results: list[Variable]) -> list[Variable]: # look up dL_dentries. If a variable is never used to compute the loss, # we consider its gradient None, see the note below about zeros for more information. 
def gather_grad(entries: list[str]): - return [dL_d[entry] if entry in dL_d else None for entry in entries] + return [dL_d.get(entry) for entry in entries] # propagate the gradient information backward for entry in reversed(gradient_tape): diff --git a/test/dynamo/test_subclasses.py b/test/dynamo/test_subclasses.py index 0242badeb99e..c590abe63788 100644 --- a/test/dynamo/test_subclasses.py +++ b/test/dynamo/test_subclasses.py @@ -286,7 +286,7 @@ class OptionalScaledTensor(torch.Tensor): def __tensor_unflatten__(inner_tensors, metadata, outer_size, outer_stride): return OptionalScaledTensor( inner_tensors["_data"], - inner_tensors["_scale"] if "_scale" in inner_tensors else None, + inner_tensors.get("_scale", None), constant=metadata["_constant"], ) diff --git a/test/export/test_passes.py b/test/export/test_passes.py index e93a66ed572b..9cf442c27a2b 100644 --- a/test/export/test_passes.py +++ b/test/export/test_passes.py @@ -358,9 +358,7 @@ def _sequential_split_inline_tests(): for i, node in enumerate(insert_locs): with gm.graph.inserting_before(node): - gm.graph.call_function( - torch._C._set_grad_enabled, (True if i % 2 == 0 else False,), {} - ) + gm.graph.call_function(torch._C._set_grad_enabled, (i % 2 == 0,), {}) return gm x = torch.randn(2, 2) diff --git a/test/functorch/test_control_flow.py b/test/functorch/test_control_flow.py index 47e4481ef6af..e47aaa9e9e2b 100644 --- a/test/functorch/test_control_flow.py +++ b/test/functorch/test_control_flow.py @@ -2932,9 +2932,7 @@ class GraphModule(torch.nn.Module): if autograd: result_flat = pytree.tree_leaves(result) result_exp_flat = pytree.tree_leaves(result_exp) - exp_grad_mask = [ - True if r.requires_grad else False for r in result_exp_flat - ] + exp_grad_mask = [bool(r.requires_grad) for r in result_exp_flat] self.check_autograd( [r for r, m in zip(result_flat, exp_grad_mask) if m], [r for r, m in zip(result_exp_flat, exp_grad_mask) if m], @@ -3741,9 +3739,7 @@ class AssociativeScanTests(TestCase): ): result_flat = pytree.tree_leaves(result) result_exp_flat = pytree.tree_leaves(result_exp) - exp_grad_mask = [ - True if r.requires_grad else False for r in result_exp_flat - ] + exp_grad_mask = [bool(r.requires_grad) for r in result_exp_flat] self._check_autograd( [r for r, m in zip(result_flat, exp_grad_mask) if m], @@ -5710,10 +5706,9 @@ def forward(self, arg0_1): ) def test_while_loop_tracing(self, while_loop_test): fn, inp = WHILE_LOOP_TESTS[while_loop_test] - allow_non_fake_inputs = ( - False - if while_loop_test not in ("simple_with_linear", "nested_with_linear") - else True + allow_non_fake_inputs = while_loop_test in ( + "simple_with_linear", + "nested_with_linear", ) self._check_tracing(fn, inp, allow_non_fake_inputs) diff --git a/test/fx/test_fx_traceback.py b/test/fx/test_fx_traceback.py index 05369d17078b..1db681ddfd71 100644 --- a/test/fx/test_fx_traceback.py +++ b/test/fx/test_fx_traceback.py @@ -177,9 +177,7 @@ class TestFXNodeSource(TestCase): for node_name_2 in node_name_to_from_node: if node_name_2 in { node_name_1, - same_ancestor_nodes[node_name_1] - if node_name_1 in same_ancestor_nodes - else None, + same_ancestor_nodes.get(node_name_1), }: self.assertEqual( node_name_to_from_node[node_name_1], diff --git a/test/inductor/test_b2b_gemm.py b/test/inductor/test_b2b_gemm.py index 60bbfd6c4922..fa5194fc8340 100644 --- a/test/inductor/test_b2b_gemm.py +++ b/test/inductor/test_b2b_gemm.py @@ -164,9 +164,7 @@ class B2BGEMMTest(TestCase): self.assertTrue("B2B_GEMM_LEFT_TRITON_ENTRANCE" not in code) 
self.assertTrue("B2B_GEMM_RIGHT_TRITON_ENTRANCE" not in code) - @unittest.skipIf( - not (os.environ.get("DO_PERF_TEST") == "1"), "Perf test not enabled" - ) + @unittest.skipIf(os.environ.get("DO_PERF_TEST") != "1", "Perf test not enabled") @torch._dynamo.config.patch(recompile_limit=32) def test_plain_b2b_gemm_performance(self): """compare torch.compile(f, b2b_gemm = off) with torch.compile(f, b2b_gemm = on)""" @@ -219,9 +217,7 @@ class B2BGEMMTest(TestCase): # flaky test assertion: disabled # self.assertTrue(average_speedup > 1) - @unittest.skipIf( - not (os.environ.get("DO_PERF_TEST") == "1"), "Perf test not enabled" - ) + @unittest.skipIf(os.environ.get("DO_PERF_TEST") != "1", "Perf test not enabled") @torch._dynamo.config.patch(recompile_limit=32) def test_gelu_b2b_gemm_performance(self): """compare torch.compile(f, b2b_gemm = off) with torch.compile(f, b2b_gemm = on)""" @@ -276,9 +272,7 @@ class B2BGEMMTest(TestCase): # flaky test assertion: disabled # self.assertTrue(average_speedup > 1) - @unittest.skipIf( - not (os.environ.get("DO_PERF_TEST") == "1"), "Perf test not enabled" - ) + @unittest.skipIf(os.environ.get("DO_PERF_TEST") != "1", "Perf test not enabled") @torch._dynamo.config.patch(recompile_limit=32) def test_gelu_mlp_b2b_gemm_performance(self): """compare torch.compile(f, b2b_gemm = off) with torch.compile(f, b2b_gemm = on)""" diff --git a/test/inductor/test_benchmark_fusion.py b/test/inductor/test_benchmark_fusion.py index 56310adc977d..335b22061be5 100644 --- a/test/inductor/test_benchmark_fusion.py +++ b/test/inductor/test_benchmark_fusion.py @@ -165,7 +165,7 @@ class BenchmarkFusionTestTemplate: _, out_code = run_and_get_code(foo_c, m, inp) # occasionally, CI will make this one kernel. just skip in this case - if not out_code[0].count("def triton_") == 2: + if out_code[0].count("def triton_") != 2: return # should be multiple triton invocations diff --git a/test/inductor/test_compiled_optimizers.py b/test/inductor/test_compiled_optimizers.py index 4c3d394b3e9f..36a4424683a9 100644 --- a/test/inductor/test_compiled_optimizers.py +++ b/test/inductor/test_compiled_optimizers.py @@ -289,7 +289,7 @@ def build_opt_kwarg_db(): has_tensor_lr = False for key, val in kwargs.items(): - if (not key == "lr" and not key == "betas") and ( + if (key != "lr" and key != "betas") and ( not isinstance(val, bool) or (isinstance(val, bool) and val) ): name += "_" + key @@ -450,7 +450,7 @@ def make_test( stack.enter_context(config.patch({"triton.cudagraphs": True})) kwargs_compiled = deepcopy(kwargs) - if isinstance(kwargs.get("lr", None), torch.Tensor): + if isinstance(kwargs.get("lr"), torch.Tensor): kwargs["lr"] = kwargs["lr"].to(device) kwargs_compiled["lr"] = kwargs_compiled["lr"].to(device) diff --git a/test/inductor/test_cudagraph_trees.py b/test/inductor/test_cudagraph_trees.py index 3e91e3ae2876..f9949ec710c8 100644 --- a/test/inductor/test_cudagraph_trees.py +++ b/test/inductor/test_cudagraph_trees.py @@ -177,7 +177,7 @@ if HAS_CUDA_AND_TRITON: def get_manager(self, device_index=None): return torch._inductor.cudagraph_trees.get_container( - self.device_idx if not device_index else device_index + device_index if device_index else self.device_idx ).tree_manager def get_roots(self): diff --git a/test/inductor/test_flex_attention.py b/test/inductor/test_flex_attention.py index 1081afc25520..d8d4b2a46f91 100644 --- a/test/inductor/test_flex_attention.py +++ b/test/inductor/test_flex_attention.py @@ -585,9 +585,7 @@ class TestFlexAttention(InductorTestCase): ) q_ref, k_ref, v_ref = 
query_key_value_clones(q, k, v) q_gold, k_gold, v_gold = query_key_value_clones(q, k, v, torch.float64) - sdpa_partial = create_attention( - score_mod, block_mask, enable_gqa=(not Q_H == KV_H) - ) + sdpa_partial = create_attention(score_mod, block_mask, enable_gqa=(Q_H != KV_H)) compiled_sdpa = torch.compile(sdpa_partial) golden_out = sdpa_partial(q_gold, k_gold, v_gold) @@ -761,7 +759,7 @@ class TestFlexAttention(InductorTestCase): return_lse=return_lse, block_mask=converted_block_mask, score_mod=converted_score_mod, - enable_gqa=(not Q_H == KV_H), + enable_gqa=(Q_H != KV_H), kernel_options=kernel_options, ) else: @@ -774,7 +772,7 @@ class TestFlexAttention(InductorTestCase): return_lse=return_lse, block_mask=converted_block_mask, score_mod=converted_score_mod, - enable_gqa=(not Q_H == KV_H), + enable_gqa=(Q_H != KV_H), kernel_options=kernel_options, ) return compiled_out, compiled_lse @@ -819,9 +817,7 @@ class TestFlexAttention(InductorTestCase): if block_mask is None: block_mask = create_block_mask(noop_mask, Q_B, 1, Q_S, KV_S, device=device) - sdpa_partial = create_attention( - score_mod, block_mask, enable_gqa=(not Q_H == KV_H) - ) + sdpa_partial = create_attention(score_mod, block_mask, enable_gqa=(Q_H != KV_H)) golden_out, golden_lse = sdpa_partial(q_gold, k_gold, v_gold, return_lse=True) ref_out, ref_lse = sdpa_partial(q_ref, k_ref, v_ref, return_lse=True) @@ -1466,7 +1462,7 @@ class TestFlexAttention(InductorTestCase): block_mask = create_block_mask(mask_mod, Bq, 1, S, S, device=device) attention = functools.partial( - flex_attention, block_mask=block_mask, enable_gqa=(not Hq == Hkv) + flex_attention, block_mask=block_mask, enable_gqa=(Hq != Hkv) ) self.run_test_with_call(attention, dtype, device, Bq, Hq, S, D, Bkv, Hkv, S, D) diff --git a/test/inductor/test_flex_decoding.py b/test/inductor/test_flex_decoding.py index ce0985c57269..a794f5e6e521 100644 --- a/test/inductor/test_flex_decoding.py +++ b/test/inductor/test_flex_decoding.py @@ -412,7 +412,7 @@ class TestFlexDecoding(InductorTestCase): sdpa_partial = create_attention( score_mod, block_mask, - enable_gqa=(not Q_H == KV_H), + enable_gqa=(Q_H != KV_H), kernel_options=kernel_options, ) compiled_sdpa = torch.compile(sdpa_partial) @@ -607,7 +607,7 @@ class TestFlexDecoding(InductorTestCase): return_lse=True, block_mask=converted_block_mask, score_mod=converted_score_mod, - enable_gqa=(not Q_H == KV_H), + enable_gqa=(Q_H != KV_H), ) else: compiled_lse = None @@ -618,7 +618,7 @@ class TestFlexDecoding(InductorTestCase): return_lse=False, block_mask=converted_block_mask, score_mod=converted_score_mod, - enable_gqa=(not Q_H == KV_H), + enable_gqa=(Q_H != KV_H), ) return compiled_out, compiled_lse @@ -664,9 +664,7 @@ class TestFlexDecoding(InductorTestCase): if block_mask is None: block_mask = create_block_mask(noop_mask, Q_B, 1, 1, KV_S, device=device) - sdpa_partial = create_attention( - score_mod, block_mask, enable_gqa=(not Q_H == KV_H) - ) + sdpa_partial = create_attention(score_mod, block_mask, enable_gqa=(Q_H != KV_H)) golden_out, gold_lse = sdpa_partial(q_gold, k_gold, v_gold, return_lse=True) ref_out, ref_lse = sdpa_partial(q_ref, k_ref, v_ref, return_lse=True) @@ -906,7 +904,7 @@ class TestFlexDecoding(InductorTestCase): sdpa_partial = create_attention( score_mod=score_mod, block_mask=None, - enable_gqa=(not Hq == Hkv), + enable_gqa=(Hq != Hkv), ) compiled_sdpa = torch.compile(sdpa_partial) ref_out = sdpa_partial(q, k, v) @@ -1144,7 +1142,7 @@ class TestFlexDecoding(InductorTestCase): def head_attention_mod(kv_head_num): 
head_type = torch.tensor( - [False if i % kv_head_num == 0 else True for i in range(kv_head_num)], + [i % kv_head_num != 0 for i in range(kv_head_num)], dtype=torch.bool, device=device, ) diff --git a/test/inductor/test_graph_transform_observer.py b/test/inductor/test_graph_transform_observer.py index 2bd0b6ef43f1..e30f2189cd42 100644 --- a/test/inductor/test_graph_transform_observer.py +++ b/test/inductor/test_graph_transform_observer.py @@ -22,7 +22,7 @@ except ImportError: HAS_PYDOT = False -HAS_DOT = True if shutil.which("dot") is not None else False +HAS_DOT = shutil.which("dot") is not None class TestGraphTransformObserver(TestCase): diff --git a/test/inductor/test_mkldnn_pattern_matcher.py b/test/inductor/test_mkldnn_pattern_matcher.py index 16f88b3c9419..02cf97432900 100644 --- a/test/inductor/test_mkldnn_pattern_matcher.py +++ b/test/inductor/test_mkldnn_pattern_matcher.py @@ -835,9 +835,7 @@ class TestPatternMatcher(TestPatternMatcherBase): for dtype in dtypes: torch._dynamo.reset() - autocast_enabled = ( - True if dtype in [torch.bfloat16, torch.float16] else False - ) + autocast_enabled = dtype in [torch.bfloat16, torch.float16] with ( torch.no_grad(), torch.autocast( @@ -4421,14 +4419,12 @@ class TestPatternMatcher(TestPatternMatcherBase): out_feature = 64 q_min, q_max = -32, 31 # we only test for qlinear_binary in this case - test_for_pointwise_binary = ( - True - if M == 1 + test_for_pointwise_binary = bool( + M == 1 and inplace_add and not expand_a_scale and not dynamic and not has_bias - else False ) if test_for_pointwise_binary and not IS_X86: self.skipTest("Some UTs are only supported on x86_64 CPUs") diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index ff04091fafa3..0b1f43c1b3d6 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -706,7 +706,7 @@ def check_model_gpu( if check_lowp: def downcast_fn(x): - if not isinstance(x, torch.Tensor) or not x.dtype == torch.float: + if not isinstance(x, torch.Tensor) or x.dtype != torch.float: return x return torch.empty_strided( x.size(), x.stride(), device=GPU_TYPE, dtype=torch.half @@ -4694,7 +4694,7 @@ class CommonTemplate: # Make sure we compute also with fp16 in the reference. Otherwise, # the reference will compute with fp32 and cast back to fp16, which # causes numeric differences beyond tolerance. - reference_in_float=False if torch.version.hip else True, + reference_in_float=not torch.version.hip, ) def test_convolution2(self): @@ -4728,7 +4728,7 @@ class CommonTemplate: # Make sure we compute also with fp16 in the reference. Otherwise, # the reference will compute with fp32 and cast back to fp16, which # causes numeric differences beyond tolerance. - reference_in_float=False if torch.version.hip else True, + reference_in_float=not torch.version.hip, ) @skip_if_gpu_halide @@ -4779,7 +4779,7 @@ class CommonTemplate: # Make sure we compute also with fp16 in the reference. Otherwise, # the reference will compute with fp32 and cast back to fp16, which # causes numeric differences beyond tolerance. 
- reference_in_float=False if torch.version.hip else True, + reference_in_float=not torch.version.hip, ) def test_conv2d_channels_last(self): @@ -12970,7 +12970,7 @@ def forward(self, arg0_1: "Sym(s77)", arg1_1: "Sym(s27)", arg2_1: "Sym(s53)", ar ) res = torch.compile(fn)(20) - self.assertTrue(torch.all((0 <= res) & (res < 10)).item()) + self.assertTrue(torch.all((res >= 0) & (res < 10)).item()) @torch._inductor.config.patch(force_shape_pad=True) @skip_if_gpu_halide # correctness issue diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py index 3c36d1405dd2..dd6e9cb47097 100644 --- a/test/inductor/test_torchinductor_opinfo.py +++ b/test/inductor/test_torchinductor_opinfo.py @@ -1220,7 +1220,7 @@ class TestInductorOpInfo(TestCase): # not exercised in test_ops_gradients atm. The problem is not # complex32 per-se (which is supported by data movement only ops) # but that when we do backwards we expect other ops like add to work - and not dtype == torch.complex32 + and dtype != torch.complex32 ) samples = op.sample_inputs(device, dtype, requires_grad=requires_grad) diff --git a/test/mobile/model_test/update_production_ops.py b/test/mobile/model_test/update_production_ops.py index ec616d24ec1f..b4549a585e15 100644 --- a/test/mobile/model_test/update_production_ops.py +++ b/test/mobile/model_test/update_production_ops.py @@ -17,17 +17,13 @@ with open(sys.argv[1]) as input_yaml_file: for info in model_infos: for op in info["root_operators"]: # aggregate occurance per op - root_operators[op] = 1 + (root_operators[op] if op in root_operators else 0) + root_operators[op] = 1 + (root_operators.get(op, 0)) for op in info["traced_operators"]: # aggregate occurance per op - traced_operators[op] = 1 + ( - traced_operators[op] if op in traced_operators else 0 - ) + traced_operators[op] = 1 + (traced_operators.get(op, 0)) # merge dtypes for each kernel for kernal, dtypes in info["kernel_metadata"].items(): - new_dtypes = dtypes + ( - kernel_metadata[kernal] if kernal in kernel_metadata else [] - ) + new_dtypes = dtypes + (kernel_metadata.get(kernal, [])) kernel_metadata[kernal] = list(set(new_dtypes)) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 6fa49ed61b71..5c11682deeda 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -4879,7 +4879,7 @@ class TestONNXRuntime(onnx_test_common._TestONNXRuntime): @skipScriptTest() def test_rnn_no_bias(self): def make_model(layers, packed_sequence): - batch_first = True if packed_sequence == 2 else False + batch_first = packed_sequence == 2 model = torch.nn.RNN( RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, @@ -4900,7 +4900,7 @@ class TestONNXRuntime(onnx_test_common._TestONNXRuntime): return model def make_input(batch_size, layers, packed_sequence): - batch_first = True if packed_sequence == 2 else False + batch_first = packed_sequence == 2 seq_lengths = np.random.randint(1, RNN_SEQUENCE_LENGTH + 1, size=batch_size) seq_lengths = sorted(map(int, seq_lengths), reverse=True) inputs = [torch.randn(l, RNN_INPUT_SIZE) for l in seq_lengths] diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py index 0840eeb1be42..d8a35264f7de 100644 --- a/test/quantization/core/test_quantized_op.py +++ b/test/quantization/core/test_quantized_op.py @@ -7045,8 +7045,8 @@ class TestQuantizedConv(TestCase): # ONEDNN only supports symmetric quantization of weight if W_zero_point is not None: 
W_zero_point = len(W_zero_point) * [0] - fp32_output = True if qconv_output_dtype is torch.float32 else False - bfloat16_output = True if qconv_output_dtype is torch.bfloat16 else False + fp32_output = qconv_output_dtype is torch.float32 + bfloat16_output = qconv_output_dtype is torch.bfloat16 if fp32_output or bfloat16_output: Y_scale = 1.0 Y_zero_point = 0 @@ -7905,8 +7905,8 @@ class TestQuantizedConv(TestCase): weight_in_channel_last_format=False, ): # We assume FP8 quantization is always symmetric - fp32_output = True if qconv_output_dtype is torch.float32 else False - bfloat16_output = True if qconv_output_dtype is torch.bfloat16 else False + fp32_output = qconv_output_dtype is torch.float32 + bfloat16_output = qconv_output_dtype is torch.bfloat16 if fp32_output or bfloat16_output: Y_scale = 1.0 X2_scale = 1.0 diff --git a/test/test_autograd.py b/test/test_autograd.py index 081349b23116..bebe89e09657 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -11861,7 +11861,7 @@ class TestAutogradDeviceType(TestCase): def test_nonzero(tensor, value, expected): tensor[0] = value self.assertEqual(expected, bool(tensor)) - self.assertEqual(expected, True if tensor else False) + self.assertEqual(expected, bool(tensor)) test_nonzero(l, 0, False) test_nonzero(l, -2, True) diff --git a/test/test_cuda.py b/test/test_cuda.py index 667bccd82c24..fc52c2b92067 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -577,7 +577,7 @@ print(t.is_pinned()) src = torch.randn( 1000000, device="cuda" if dst == "cpu" else "cpu", - pin_memory=True if dst == "cuda" else False, + pin_memory=dst == "cuda", ) _test_to_non_blocking(src, try_non_blocking, dst) diff --git a/test/test_decomp.py b/test/test_decomp.py index e77f0a7467d9..f5c791c8cbe8 100644 --- a/test/test_decomp.py +++ b/test/test_decomp.py @@ -945,7 +945,7 @@ def forward(self, scores_1, mask_1, value_1): # not exercised in test_ops_gradients atm. The problem is not # complex32 per-se (which is supported by data movement only ops) # but that when we do backwards we expect other ops like add to work - and not dtype == torch.complex32 + and dtype != torch.complex32 ) samples = op.sample_inputs(device, dtype, requires_grad=requires_grad) diff --git a/test/test_fx.py b/test/test_fx.py index 1f6296a509fc..76dd7e15df93 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -3584,7 +3584,7 @@ class TestFX(JitTestCase): class LeafTracerNotB(Tracer): def is_leaf_module(self, module, name): - return False if "b" in name else True + return "b" not in name # Recompile calls added "for fun", since they # chain __call__ wrappers. 
diff --git a/test/test_indexing.py b/test/test_indexing.py index 28d320d90d0e..fa91b5903410 100644 --- a/test/test_indexing.py +++ b/test/test_indexing.py @@ -2036,7 +2036,7 @@ class TestIndexing(TestCase): index = torch.tensor([0], device=device) x.index_fill_(1, index, 0) self.assertEqual(x, torch.tensor([[0, 2], [0, 5]], dtype=dtype, device=device)) - if not x.is_complex() and not device == "meta": + if not x.is_complex() and device != "meta": with self.assertRaisesRegex(RuntimeError, r"Scalar"): x.index_fill_(1, index, 1 + 1j) # Make sure that the result stays 0-dim while applied to diff --git a/test/test_jit.py b/test/test_jit.py index fb7088a2875f..6a3c968f86dd 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -6723,7 +6723,7 @@ a") @torch.jit.script def testNoThrows(t): c1 = 1 - if (False and bool(t[1])) or (True or bool(t[1])): + if (False and bool(t[1])) or (True or bool(t[1])): # noqa: SIM222,SIM223 c1 = 0 return c1 @@ -15758,7 +15758,7 @@ dedent """ def fn(d): # type: (Dict[str, int]) -> List[int] out = [1] - for i in range(d["hi"] if "hi" in d else 6): + for i in range(d.get("hi", 6)): out.append(i) # noqa: PERF402 return out @@ -16104,7 +16104,7 @@ M = 10 S = 5 def add_nn_module_test(*args, **kwargs): - no_grad = False if 'no_grad' not in kwargs else kwargs['no_grad'] + no_grad = kwargs.get('no_grad', False) if 'desc' in kwargs and 'eval' in kwargs['desc']: # eval() is not supported, so skip these tests diff --git a/test/test_jit_autocast.py b/test/test_jit_autocast.py index dcdf78ff4b89..0559a728aef9 100644 --- a/test/test_jit_autocast.py +++ b/test/test_jit_autocast.py @@ -111,7 +111,7 @@ class TestAutocast(JitTestCase): def test_runtime_autocast_state_expr(self): @torch.jit.script def fn(a, b): - with autocast(enabled=True if a[0][0] > 0.5 else False): + with autocast(enabled=bool((a[0][0] > 0.5).item())): return torch.mm(a, b) # runtime values for autocast enable argument are not supported with self.assertRaises(RuntimeError): diff --git a/test/test_nn.py b/test/test_nn.py index 6a33d0d16ead..f0307e79fc20 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -3522,7 +3522,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""") nn.RNN(10, 20, batch_first=True) ] # ROCm RNN does not issue warning about single contig chunk of memory, so don't assert it - first_warn = False if torch.version.hip else True + first_warn = not torch.version.hip for rnn in rnns: rnn.cuda() input = torch.randn(5, 4, 10, requires_grad=True, device="cuda") diff --git a/test/test_numpy_interop.py b/test/test_numpy_interop.py index ca7e65fc6247..724cc974047b 100644 --- a/test/test_numpy_interop.py +++ b/test/test_numpy_interop.py @@ -205,7 +205,7 @@ class TestNumPyInterop(TestCase): x = x.conj() y = x.resolve_conj() expect_error = ( - requires_grad or sparse or conj or not device == "cpu" + requires_grad or sparse or conj or device != "cpu" ) error_msg = r"Use (t|T)ensor\..*(\.numpy\(\))?" 
if not force and expect_error: diff --git a/test/test_pruning_op.py b/test/test_pruning_op.py index 5d24a9a31cbe..d8e42d781390 100644 --- a/test/test_pruning_op.py +++ b/test/test_pruning_op.py @@ -18,7 +18,7 @@ class PruningOpTest(TestCase): def _generate_rowwise_mask(self, embedding_rows): indicator = torch.from_numpy((np.random.random_sample(embedding_rows)).astype(np.float32)) threshold = float(np.random.random_sample()) - mask = torch.BoolTensor([True if val >= threshold else False for val in indicator]) + mask = torch.BoolTensor([val >= threshold for val in indicator]) return mask def _test_rowwise_prune_op(self, embedding_rows, embedding_dims, indices_type, weights_dtype): diff --git a/test/test_reductions.py b/test/test_reductions.py index 7aabe08abef2..e4fa54491dd0 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -1899,7 +1899,7 @@ class TestReductions(TestCase): # Note [all, any uint8 compatibility]: However for compatibility reason, # for `uint8`, they return Tensor of same dtype `uint8`. # Reference: https://github.com/pytorch/pytorch/pull/47878#issuecomment-747108561 - exact_dtype = True if dtype != torch.uint8 else False + exact_dtype = dtype != torch.uint8 def _test_all_any(x): self.compare_with_numpy(torch.all, np.all, x) diff --git a/test/test_scaled_matmul_cuda.py b/test/test_scaled_matmul_cuda.py index 604a001c495f..c0b96595de6e 100644 --- a/test/test_scaled_matmul_cuda.py +++ b/test/test_scaled_matmul_cuda.py @@ -1204,7 +1204,7 @@ class TestFP8Matmul(TestCase): events = sorted(events, key=lambda x: x['ts']) # ROCm carveout is invisible except for kernels running slower on fewer CUs no_carveout, carveout_0, carveout, no_carveout_again = [float(evt.get("dur", "0.0")) for evt in events] - if True or not (no_carveout < carveout and carveout_0 < carveout and no_carveout_again < carveout): + if True or not (no_carveout < carveout and carveout_0 < carveout and no_carveout_again < carveout): # noqa: SIM222 # something went wrong, print more info to help debug flaky test print("ROCm debug info for test_honor_sm_carveout") print("cu_count", cu_count) diff --git a/test/test_segment_reductions.py b/test/test_segment_reductions.py index 815bbc7dbc3d..18159044407c 100644 --- a/test/test_segment_reductions.py +++ b/test/test_segment_reductions.py @@ -129,7 +129,7 @@ class TestSegmentReductions(TestCase): for reduction in reductions: for initial in [0, None]: - check_backward = True if initial is not None else False + check_backward = initial is not None initial_value = initial default_value = get_default_value(initial_value, reduction) if reduction == "max": @@ -186,7 +186,7 @@ class TestSegmentReductions(TestCase): for reduction in reductions: for initial in [0, None]: - check_backward = True if initial is not None else False + check_backward = initial is not None initial_value = initial default_value = get_default_value(initial_value, reduction) if reduction == "max": @@ -244,7 +244,7 @@ class TestSegmentReductions(TestCase): for reduction in reductions: for initial in [0, None]: - check_backward = True if initial is not None else False + check_backward = initial is not None initial_value = initial default_value = get_default_value(initial_value, reduction) if reduction == "max": diff --git a/test/test_serialization.py b/test/test_serialization.py index 677dabfee96a..7c4208b6a0d6 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -4553,7 +4553,7 @@ class TestSerialization(TestCase, SerializationMixin): with TemporaryFileName() as f: 
torch.save(m, f) try: - old_value = os.environ[env_var] if env_var in os.environ else None + old_value = os.environ.get(env_var, None) os.environ[env_var] = "1" # if weights_only is explicitly set, TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD cannot override it with self.assertRaisesRegex(pickle.UnpicklingError, "Weights only load failed"): diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py index 3f4729d36ee9..65e800f6eba1 100644 --- a/test/test_sparse_csr.py +++ b/test/test_sparse_csr.py @@ -4099,7 +4099,7 @@ class TestSparseCompressedTritonKernels(TestCase): left_alpha = make_tensor(M, dtype=dtype, device=device, low=0.5, high=high) if has_left_alpha else None right_alpha = make_tensor(N, dtype=dtype, device=device, low=0.5, high=high) if has_right_alpha else None - if 0 and op == "bsr_dense_addmm": + if 0 and op == "bsr_dense_addmm": # noqa: SIM223 # Find optimal kernel parameters, the speed-up is # about 10x for running this test. # diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index fce2d50c59ba..8a76397f0516 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -3498,7 +3498,7 @@ class TestRandomTensorCreation(TestCase): else: t.uniform_(from_, to_) range_ = to_ - from_ - if not (dtype == torch.bfloat16) and not ( + if dtype != torch.bfloat16 and not ( dtype == torch.half and device == 'cpu') and not torch.isnan(t).all(): delta = alpha * range_ double_t = t.to(torch.double) diff --git a/test/test_torchfuzz_repros.py b/test/test_torchfuzz_repros.py index d4131d649372..adfdd755bc7b 100644 --- a/test/test_torchfuzz_repros.py +++ b/test/test_torchfuzz_repros.py @@ -359,7 +359,9 @@ class TestFuzzerCompileIssues(TestCase): t3 = arg1 # size=(1,), stride=(1,), dtype=int64, device=cuda t4 = arg2 # size=(1,), stride=(1,), dtype=int64, device=cuda t5 = t3 + t3 + t4 # size=(1,), stride=(1,), dtype=int64, device=cuda - t6 = torch.exp(t5) # size=(1,), stride=(1,), dtype=int64, device=cuda + t6 = torch.exp( # noqa: F841 + t5 + ) # size=(1,), stride=(1,), dtype=int64, device=cuda # noqa: F841 t7 = torch.nn.functional.layer_norm( t2, (111,) ) # size=(49, 112, 111), stride=(12432, 111, 1), dtype=float32, device=cuda @@ -436,7 +438,7 @@ class TestFuzzerCompileIssues(TestCase): torch.manual_seed(9) def foo(arg0): - var_node_1 = arg0 # size=(1, 2), stride=(2, 1), dtype=int64, device=cuda + var_node_1 = arg0 # size=(1, 2), stride=(2, 1), dtype=int64, device=cuda # noqa: F841 var_node_5 = torch.full( (1, 2), -66, dtype=torch.int32 ) # size=(1, 2), stride=(2, 1), dtype=int32, device=cuda diff --git a/test/torch_np/numpy_tests/core/test_dtype.py b/test/torch_np/numpy_tests/core/test_dtype.py index d548f49b4cc4..18622aa0d6ae 100644 --- a/test/torch_np/numpy_tests/core/test_dtype.py +++ b/test/torch_np/numpy_tests/core/test_dtype.py @@ -100,7 +100,7 @@ class TestBuiltin(TestCase): # dtypes results in False/True when compared to valid dtypes. # Here 7 cannot be converted to dtype. 
No exceptions should be raised - assert not np.dtype(np.int32) == 7, "dtype richcompare failed for ==" + assert np.dtype(np.int32) != 7, "dtype richcompare failed for ==" assert np.dtype(np.int32) != 7, "dtype richcompare failed for !=" @parametrize("operation", [operator.le, operator.lt, operator.ge, operator.gt]) diff --git a/torch/_inductor/analysis/profile_analysis.py b/torch/_inductor/analysis/profile_analysis.py index a9f89009c210..28e02a7a60e2 100644 --- a/torch/_inductor/analysis/profile_analysis.py +++ b/torch/_inductor/analysis/profile_analysis.py @@ -416,11 +416,8 @@ class JsonProfile: # pyrefly: ignore # bad-assignment self.dtype = dtype else: - if dtype in _dtype_map: - # pyrefly: ignore # bad-assignment - self.dtype = _dtype_map[dtype] - else: - self.dtype = None + # pyrefly: ignore # bad-assignment + self.dtype = _dtype_map.get(dtype) self._create_devices() def convert_dtype(self, event: dict[str, Any]) -> Optional[torch.dtype]: diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index 62aa8e7c88cf..adf4b6609347 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -1363,7 +1363,7 @@ class TritonOverrides(OpOverrides): value = triton_reshape(value, initial_shape, shape_2d) # broadcast if needed - broadcast_needed = not (shape_2d == [YBLOCK, RBLOCK]) + broadcast_needed = shape_2d != [YBLOCK, RBLOCK] if broadcast_needed: value = f"tl.broadcast_to({value}, ({YBLOCK}, {RBLOCK}))" @@ -1385,7 +1385,7 @@ class TritonOverrides(OpOverrides): value = f"tl.trans({value})" # broadcast if needed - broadcast_needed = not (shape_2d == [XBLOCK, RBLOCK]) + broadcast_needed = shape_2d != [XBLOCK, RBLOCK] if broadcast_needed: value = f"tl.broadcast_to({value}, ({RBLOCK}, {XBLOCK}))" else: diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py index 4952daee3095..4c28ee8faf59 100644 --- a/torch/_inductor/ir.py +++ b/torch/_inductor/ir.py @@ -1570,7 +1570,7 @@ class Reduction(Loops): and V.graph.sizevars.size_hint_or_throw(reduction_numel) < config.unroll_reductions_threshold and (sympy_product(ranges) != 1 or is_gpu(device.type)) - and not (reduction_type == "dot") + and reduction_type != "dot" ): # When native matmul, don't unroll the dot reduction. diff --git a/torch/_inductor/sizevars.py b/torch/_inductor/sizevars.py index 6b9fa34700ba..322a8f0ea06c 100644 --- a/torch/_inductor/sizevars.py +++ b/torch/_inductor/sizevars.py @@ -834,7 +834,7 @@ class SizeVarAllocator: any_unbacked_lhs = has_free_unbacked_symbols(lhs) any_unbacked_rhs = has_free_unbacked_symbols(rhs) if any_unbacked_lhs != any_unbacked_rhs: - return True if any_unbacked_rhs else False + return bool(any_unbacked_rhs) # Handles cases where LHS contains the RHS. In other words, # RHS is a sub-expression of LHS. For example: @@ -848,12 +848,12 @@ class SizeVarAllocator: degrees_lhs = len(self.eq_graph[lhs]) degrees_rhs = len(self.eq_graph[rhs]) if degrees_lhs != degrees_rhs: - return True if degrees_lhs > degrees_rhs else False + return degrees_lhs > degrees_rhs # Try to apply union-by-rank optimization to flatten the # leader trees. if self.rank[x] != self.rank[y]: - return True if self.rank[x] > self.rank[y] else False + return self.rank[x] > self.rank[y] # Fallback to sympy.Basic.compare for a deterministic ordering. 
return lhs.compare(rhs) == -1 diff --git a/torch/distributed/_state_dict_utils.py b/torch/distributed/_state_dict_utils.py index cea7903bd0e2..06aa9db81e9c 100644 --- a/torch/distributed/_state_dict_utils.py +++ b/torch/distributed/_state_dict_utils.py @@ -708,7 +708,7 @@ def _distribute_state_dict( local_state_dict[key] = value.cpu() else: assert isinstance(value, torch.Tensor) - local_state = local_state_dict.get(key, None) + local_state = local_state_dict.get(key) if local_state is None: continue elif isinstance(local_state, DTensor): diff --git a/torch/nn/functional.py b/torch/nn/functional.py index ef4ed35008cc..9f1438d3780c 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -6686,7 +6686,7 @@ def scaled_mm( # So, we need to convert None arguments for lists in python # explicitly into empty lists. def list_or_empty(l: list[_Any] | None) -> list[_Any]: - return [] if not l else l + return l if l else [] def enum_list_as_int_list(l: _Any | list[_Any]) -> list[_Any]: if not isinstance(l, list): @@ -6772,7 +6772,7 @@ def scaled_grouped_mm( # So, we need to convert None arguments for lists in python # explicitly into empty lists. def list_or_empty(l: list[_Any] | None) -> list[_Any]: - return [] if not l else l + return l if l else [] def enum_list_as_int_list(l: _Any | list[_Any]) -> list[_Any]: if not isinstance(l, list): diff --git a/torchgen/gen_vmap_plumbing.py b/torchgen/gen_vmap_plumbing.py index 0632e7c4b969..daf60589a0cc 100644 --- a/torchgen/gen_vmap_plumbing.py +++ b/torchgen/gen_vmap_plumbing.py @@ -150,7 +150,7 @@ def gen_vmap_inplace_plumbing(native_function: NativeFunction) -> str | None: assert schema.kind() == SchemaKind.inplace if not is_mutated_arg(schema.arguments.flat_all[0]): return None - if not len([arg for arg in schema.arguments.flat_all if is_mutated_arg(arg)]) == 1: + if len([arg for arg in schema.arguments.flat_all if is_mutated_arg(arg)]) != 1: return None # Only support cases where all returns are Tensors or vector From fdd560afd1d413a9f814cbf7cc2a72e0d39b0117 Mon Sep 17 00:00:00 2001 From: Pian Pawakapan Date: Fri, 17 Oct 2025 07:55:25 +0000 Subject: [PATCH 027/123] [export] preserve_node_meta by default (#165524) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/165524 Approved by: https://github.com/malaybag --- test/export/test_export.py | 14 ++++++++++++++ torch/export/_trace.py | 12 ++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/test/export/test_export.py b/test/export/test_export.py index 23a7ad9bff1e..e4a789316359 100755 --- a/test/export/test_export.py +++ b/test/export/test_export.py @@ -721,6 +721,20 @@ class TestExport(TestCase): ) self.assertEqual(node.meta["from_node"][-1].graph_id, graph_id) + def test_fx_annotate(self): + class Foo(torch.nn.Module): + def forward(self, x): + x += 1 + with torch.fx.traceback.annotate({"a": "b"}): + x += 1 + x += 1 + return x + + ep = export(Foo(), (torch.randn(2),)) + + add_1 = list(ep.graph.nodes)[2] + self.assertTrue("custom" in add_1.meta and add_1.meta["custom"].get("a") == "b") + @requires_gpu def test_flex_attention_export(self): from torch.nn.attention.flex_attention import create_block_mask, flex_attention diff --git a/torch/export/_trace.py b/torch/export/_trace.py index 803c9fc2080d..b3ee2e18f0d8 100644 --- a/torch/export/_trace.py +++ b/torch/export/_trace.py @@ -812,7 +812,10 @@ def _export_to_torch_ir( prefer_deferred_runtime_asserts_over_guards=prefer_deferred_runtime_asserts_over_guards, ) - with 
torch._dynamo.config.patch(dataclasses.asdict(dynamo_cfg)): + with ( + torch._dynamo.config.patch(dataclasses.asdict(dynamo_cfg)), + torch.fx.traceback.preserve_node_meta(), + ): try: module_call_specs: dict[str, dict[str, pytree.TreeSpec]] = ( _ExportModuleSpecTrackerDict() @@ -902,6 +905,7 @@ def _export_to_aten_ir( _ignore_backend_decomps(), _compiling_state_context(), custom_triton_ops_decomposition_ctx(), + torch.fx.traceback.preserve_node_meta(), ): gm, graph_signature = transform(aot_export_module)( mod, @@ -1930,9 +1934,8 @@ def _non_strict_export( in mod._forward_pre_hooks.values() ): _check_input_constraints_pre_hook(mod, args, kwargs) - with torch.fx.traceback.preserve_node_meta(): - args = (*args, *kwargs.values()) - tree_out = torch.fx.Interpreter(mod).run(*args) + args = (*args, *kwargs.values()) + tree_out = torch.fx.Interpreter(mod).run(*args) else: tree_out = mod(*args, **kwargs) flat_outs, out_spec = pytree.tree_flatten(tree_out) @@ -2029,6 +2032,7 @@ def _non_strict_export( ), _fakify_module_inputs(fake_args, fake_kwargs, fake_mode), _override_builtin_ops(), + torch.fx.traceback.preserve_node_meta(), ): aten_export_artifact = _to_aten_func( # type: ignore[operator] patched_mod, From 51348c021935a0b8dee082a8a2c32bed2ecf636d Mon Sep 17 00:00:00 2001 From: "Yu, Guangye" Date: Fri, 17 Oct 2025 10:01:00 +0000 Subject: [PATCH 028/123] Give a friendly message for older Intel GPU (#165622) # Motivation Notify the user if the GPU is older than officially supported. This provides a friendly warning that the GPU may work, but the experience could be unstable. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165622 Approved by: https://github.com/EikanWang --- c10/xpu/XPUFunctions.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/c10/xpu/XPUFunctions.cpp b/c10/xpu/XPUFunctions.cpp index 6947c078483e..f8e7305ab63c 100644 --- a/c10/xpu/XPUFunctions.cpp +++ b/c10/xpu/XPUFunctions.cpp @@ -120,6 +120,22 @@ inline void initGlobalDevicePoolState() { TORCH_CHECK( gDevicePool.devices.size() <= std::numeric_limits::max(), "Too many XPU devices, DeviceIndex overflowed!"); + // Check each device's architecture and issue a warning if it is older than + // the officially supported range (Intel GPUs starting from Arc (Alchemist) + // series). + namespace syclex = sycl::ext::oneapi::experimental; + for (const auto& device : gDevicePool.devices) { + auto architecture = device->get_info(); + if (architecture < syclex::architecture::intel_gpu_acm_g10) { + TORCH_WARN( + "The detected GPU (", + device->get_info(), + ") is not officially supported by PyTorch XPU. Running workloads on this device may result in unexpected behavior.\n", + "For stable and fully supported execution, please use GPUs based on Intel Arc (Alchemist) series or newer.\n", + "Refer to the hardware prerequisites for more information: ", + "https://github.com/pytorch/pytorch/blob/main/docs/source/notes/get_start_xpu.rst#hardware-prerequisite"); + } + } #if defined(_WIN32) && SYCL_COMPILER_VERSION < 20250000 // The default context feature is disabled by default on Windows for SYCL From b44fb149069b44bb043f4b3374d08676c3f40635 Mon Sep 17 00:00:00 2001 From: "Yu, Guangye" Date: Fri, 17 Oct 2025 10:01:01 +0000 Subject: [PATCH 029/123] Remove unused parameter when query extension attribute (#165623) # Motivation This code is no longer needed since SYCL compiler 2025.0. We are now using compiler 2025.2 (two tool uplifts later), so it can be safely removed. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165623 Approved by: https://github.com/EikanWang ghstack dependencies: #165622 --- c10/xpu/XPUFunctions.cpp | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/c10/xpu/XPUFunctions.cpp b/c10/xpu/XPUFunctions.cpp index f8e7305ab63c..26edf295d1fc 100644 --- a/c10/xpu/XPUFunctions.cpp +++ b/c10/xpu/XPUFunctions.cpp @@ -137,16 +137,6 @@ inline void initGlobalDevicePoolState() { } } -#if defined(_WIN32) && SYCL_COMPILER_VERSION < 20250000 - // The default context feature is disabled by default on Windows for SYCL - // compiler versions earlier than 2025.0.0. - std::vector deviceList; - for (auto it = gDevicePool.devices.begin(); it != gDevicePool.devices.end(); - ++it) { - deviceList.push_back(*(*it)); - } - gDevicePool.context = std::make_unique(deviceList); -#else // The default context is utilized for each Intel GPU device, allowing the // retrieval of the context from any GPU device. const auto& platform = gDevicePool.devices[0]->get_platform(); @@ -156,7 +146,6 @@ inline void initGlobalDevicePoolState() { #else platform.ext_oneapi_get_default_context()); #endif -#endif } inline void initDevicePoolCallOnce() { @@ -181,9 +170,9 @@ void initDeviceProperties(DeviceProp* device_prop, DeviceIndex device) { #define ASSIGN_DEVICE_ASPECT(member) \ device_prop->has_##member = raw_device.has(sycl::aspect::member); -#define ASSIGN_EXP_CL_ASPECT(member) \ - device_prop->has_##member = raw_device.ext_oneapi_supports_cl_extension( \ - "cl_intel_" #member, &cl_version); +#define ASSIGN_EXP_CL_ASPECT(member) \ + device_prop->has_##member = \ + raw_device.ext_oneapi_supports_cl_extension("cl_intel_" #member); #define ASSIGN_EXP_DEVICE_PROP(property) \ device_prop->property = \ @@ -198,8 +187,6 @@ void initDeviceProperties(DeviceProp* device_prop, DeviceIndex device) { AT_FORALL_XPU_DEVICE_ASPECT(ASSIGN_DEVICE_ASPECT); - // TODO: Remove cl_version since it is unnecessary. - sycl::ext::oneapi::experimental::cl_version cl_version; AT_FORALL_XPU_EXP_CL_ASPECT(ASSIGN_EXP_CL_ASPECT); #if SYCL_COMPILER_VERSION >= 20250000 From d0c24b392cbb7b213d22e42c52c6c2d1ac2da1bd Mon Sep 17 00:00:00 2001 From: Xilun Wu <12968408+XilunWu@users.noreply.github.com> Date: Thu, 16 Oct 2025 14:28:28 -0700 Subject: [PATCH 030/123] [APF Logging][Error Trait] To fill the errorTraits for ChildFailedError with signal abort (re-attempt of #165476) (#165688) **Summary** Land @guoding83128 's PR https://github.com/pytorch/pytorch/pull/165476 on his behalf due to EasyCLA blocking. Refer his original PR for detail. But in short, elastic leaves 'errorTraits' as unknown when the error dump file is missing, this PR adds a "system terminated error" to such case so the internal scuba table can correctly aggregate. 
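For illustration, the fallback described above roughly amounts to tagging the placeholder error data of a failed process whose error dump file never appeared (a sketch of the resulting data shape, not the literal code):

```
# Sketch: placeholder error data for a failed process whose error dump
# file is missing, after this change. Previously only the empty "message"
# field was present, so the error category stayed unknown.
error_file_data = {
    "message": "",
    "errorTraits": {
        "category": "system_terminated_error",
        "retryability": "False",
    },
}
```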
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165688 Approved by: https://github.com/fduwjj --- .../elastic/multiprocessing/errors/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/torch/distributed/elastic/multiprocessing/errors/__init__.py b/torch/distributed/elastic/multiprocessing/errors/__init__.py index 174c89aa98a8..fa6abc8794b6 100644 --- a/torch/distributed/elastic/multiprocessing/errors/__init__.py +++ b/torch/distributed/elastic/multiprocessing/errors/__init__.py @@ -79,9 +79,9 @@ __all__ = [ logger = get_logger(__name__) -JSON = dict +JSON = dict[str, Any] -_EMPTY_ERROR_DATA = {"message": ""} +_EMPTY_ERROR_DATA: dict[str, Any] = {"message": ""} _NOT_AVAILABLE = "" _R = TypeVar("_R") @@ -143,6 +143,10 @@ class ProcessFailure: f" received by PID {self.pid}" ) else: + self.error_file_data["errorTraits"] = { + "category": "system_terminated_error", + "retryability": "False", + } self.message = "To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html" def _get_error_data(self, error_file_data: dict[str, Any]) -> tuple[str, int]: From 9fe3b2afbeff12080b483af1ee23e1c9d9fb0421 Mon Sep 17 00:00:00 2001 From: Joel Schlosser Date: Fri, 8 Aug 2025 17:38:47 -0400 Subject: [PATCH 031/123] Remove torch.serialization entries from the doc ignore list (#160224) Follows the approach done in #158581 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160224 Approved by: https://github.com/janeyx99 --- docs/source/conf.py | 14 +++----------- docs/source/torch.aliases.md | 19 +++++++++++++++++++ docs/source/torch.rst | 1 - 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index d21e67c1caad..410f24a974c1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -492,6 +492,9 @@ coverage_ignore_functions = [ "amp_definitely_not_available", # torch.mtia.memory "reset_peak_memory_stats", + # torch.compiler + "load_cache_artifacts", + "save_cache_artifacts", # torch.cuda.nccl "all_gather", "all_reduce", @@ -1727,17 +1730,6 @@ coverage_ignore_functions = [ "tensorboard_trace_handler", # torch.return_types "pytree_register_structseq", - # torch.serialization - "check_module_version_greater_or_equal", - "default_restore_location", - "load", - "location_tag", - "mkdtemp", - "normalize_storage_type", - "save", - "storage_to_tensor_type", - "validate_cuda_device", - "validate_hpu_device", # torch.signal.windows.windows "bartlett", "blackman", diff --git a/docs/source/torch.aliases.md b/docs/source/torch.aliases.md index 882b642265d4..2639fdf0d929 100644 --- a/docs/source/torch.aliases.md +++ b/docs/source/torch.aliases.md @@ -32,3 +32,22 @@ in which they are defined. Feel free to use either the top-level version in ``to unique_consecutive unravel_index ``` + +```{eval-rst} +.. automodule:: torch.serialization +.. currentmodule:: torch.serialization +.. autosummary:: + :toctree: generated + :nosignatures: + + check_module_version_greater_or_equal + default_restore_location + load + location_tag + mkdtemp + normalize_storage_type + save + storage_to_tensor_type + validate_cuda_device + validate_hpu_device +``` diff --git a/docs/source/torch.rst b/docs/source/torch.rst index 068ffb52c0ad..47f8aa4a8951 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -808,7 +808,6 @@ Operator Tags .. py:module:: torch.utils.viz .. py:module:: torch.quasirandom .. py:module:: torch.return_types -.. py:module:: torch.serialization .. 
py:module:: torch.signal.windows.windows .. py:module:: torch.sparse.semi_structured .. py:module:: torch.storage From 202f83dc4ed9a2fcc7ea43fef61fbcad0c2ee987 Mon Sep 17 00:00:00 2001 From: Jerry Mannil <65309407+jerrymannil@users.noreply.github.com> Date: Fri, 17 Oct 2025 09:12:27 +0000 Subject: [PATCH 032/123] [ROCm][layer_norm] Use __builtin_amdgcn_rcpf(x) instead of 1.f/x (#165589) Replace (more) exact calculation with hardware approximation. Benefits: Reduced code size. Improved performance for certain scenarios. Experiments show low reduction in precision. Experiments show no significant performance regressions. bfloat16 as well as float16 related calculations may benefit largely from this change. Co-author: @mhalk @amd-hhashemi Pull Request resolved: https://github.com/pytorch/pytorch/pull/165589 Approved by: https://github.com/jeffdaily --- aten/src/ATen/native/cuda/layer_norm_kernel.cu | 8 ++++++++ cmake/Dependencies.cmake | 11 +++++++++++ cmake/Summary.cmake | 11 ++++++----- setup.py | 4 ++++ 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index 940680eb3682..c457bd3dba75 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -141,7 +141,11 @@ WelfordDataLN cuWelfordOnlineSum( if constexpr (!rms_norm){ U delta = val - curr_sum.mean; U new_count = curr_sum.count + 1.f; +#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL) + U new_mean = curr_sum.mean + delta * __builtin_amdgcn_rcpf(new_count); +#else U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster +#endif return {new_mean, curr_sum.sigma2 + delta * (val - new_mean), new_count}; } else{ return {0.f, curr_sum.sigma2 + val * val, 0}; @@ -159,7 +163,11 @@ WelfordDataLN cuWelfordCombine( U count = dataA.count + dataB.count; U mean, sigma2; if (count > decltype(dataB.count){0}) { +#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL) + auto coef = __builtin_amdgcn_rcpf(count); +#else auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division +#endif auto nA = dataA.count * coef; auto nB = dataB.count * coef; mean = nA*dataA.mean + nB*dataB.mean; diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 90fc3f284ac7..733183ef50bd 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1044,6 +1044,17 @@ if(USE_ROCM) list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling) endif(CMAKE_BUILD_TYPE MATCHES Debug) + # Get EnVar 'USE_LAYERNORM_FAST_RECIPROCAL' (or default to on). 
+ if(DEFINED ENV{USE_LAYERNORM_FAST_RECIPROCAL}) + set(USE_LAYERNORM_FAST_RECIPROCAL $ENV{USE_LAYERNORM_FAST_RECIPROCAL}) + else() + set(USE_LAYERNORM_FAST_RECIPROCAL ON) + endif() + + if(USE_LAYERNORM_FAST_RECIPROCAL) + add_definitions(-DUSE_LAYERNORM_FAST_RECIPROCAL) + endif() + # needed for compat with newer versions of hip-clang that introduced C++20 mangling rules list(APPEND HIP_HIPCC_FLAGS -fclang-abi-compat=17) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 1fa1398a8917..60951d6c6867 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -128,11 +128,12 @@ function(caffe2_print_configuration_summary) endif() message(STATUS " USE_ROCM : ${USE_ROCM}") if(${USE_ROCM}) - message(STATUS " ROCM_VERSION : ${ROCM_VERSION}") - message(STATUS " USE_FLASH_ATTENTION : ${USE_FLASH_ATTENTION}") - message(STATUS " USE_MEM_EFF_ATTENTION : ${USE_MEM_EFF_ATTENTION}") - message(STATUS " USE_ROCM_CK_SDPA : ${USE_ROCM_CK_SDPA}") - message(STATUS " USE_ROCM_CK_GEMM : ${USE_ROCM_CK_GEMM}") + message(STATUS " ROCM_VERSION : ${ROCM_VERSION}") + message(STATUS " USE_FLASH_ATTENTION : ${USE_FLASH_ATTENTION}") + message(STATUS " USE_MEM_EFF_ATTENTION : ${USE_MEM_EFF_ATTENTION}") + message(STATUS " USE_ROCM_CK_SDPA : ${USE_ROCM_CK_SDPA}") + message(STATUS " USE_ROCM_CK_GEMM : ${USE_ROCM_CK_GEMM}") + message(STATUS " USE_LAYERNORM_FAST_RECIPROCAL : ${USE_LAYERNORM_FAST_RECIPROCAL}") endif() message(STATUS " BUILD_NVFUSER : ${BUILD_NVFUSER}") message(STATUS " USE_EIGEN_FOR_BLAS : ${CAFFE2_USE_EIGEN_FOR_BLAS}") diff --git a/setup.py b/setup.py index bdfab24a0b32..a980a5f35216 100644 --- a/setup.py +++ b/setup.py @@ -156,6 +156,10 @@ # USE_ROCM_KERNEL_ASSERT=1 # Enable kernel assert in ROCm platform # +# USE_LAYERNORM_FAST_RECIPROCAL +# If set, enables the use of builtin functions for fast reciprocals (1/x) w.r.t. +# layer normalization. Default: enabled. +# # USE_ROCM_CK_GEMM=1 # Enable building CK GEMM backend in ROCm platform # From cb6e4d7d825dfb23e4c4ff2547150cec6273048c Mon Sep 17 00:00:00 2001 From: Simon Layton Date: Thu, 16 Oct 2025 13:45:19 -0700 Subject: [PATCH 033/123] User-passed alpha to scaled_gemm (#165563) Summary: Add optional user-passed `alpha` argument to `at::cuda::blas::scaled_gemm`, necessary for two-level-scaled NVFP4 gemm calls (where the global de-scales are folded into the `alpha` argument. Global de-scales are naturally device tensors, but using cublas' device-pointer mode for `alpha`/`beta` has an interesting lifetime implication - the `alpha` tensor must be valid & correct until the end of the matmul call, *not* just the launch (as for host values). To enable this, I added device-constant memory for `one` and `zero`, along with a statically-held single-fp32-value tensor, which is valid from the first passed-`alpha` invocation of `scaled_gemm` to the end of the program. User-passed values are copied into this perpetual buffer to ensure lifetime requirements are met. 
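For illustration, a minimal standalone sketch of this device-pointer-mode pattern (not the exact code in this PR; `ensure_alpha_buffer` and `set_device_alpha` are made-up helper names, and error handling is omitted):

```
// Sketch only: the surrounding matmul descriptor setup is assumed to exist.
#include <cublasLt.h>
#include <cuda_runtime.h>
#include <cstdint>

// Process-lifetime fp32 buffer, so it stays valid until any matmul that
// reads it on the device has actually executed (not just been enqueued).
static float* ensure_alpha_buffer() {
  static float* buf = nullptr;
  if (buf == nullptr) {
    cudaMalloc(&buf, sizeof(float));
  }
  return buf;
}

// Copy a user-passed alpha into the persistent buffer and switch the matmul
// descriptor to device-pointer mode; the returned pointer (together with a
// device-resident zero for beta) is what gets handed to cublasLtMatmul().
const float* set_device_alpha(cublasLtMatmulDesc_t desc,
                              float host_alpha,
                              cudaStream_t stream) {
  float* dev_alpha = ensure_alpha_buffer();
  // Stream-ordered copy: a new alpha cannot clobber a value that an
  // earlier, still-running matmul on the same stream is reading.
  cudaMemcpyAsync(dev_alpha, &host_alpha, sizeof(float),
                  cudaMemcpyHostToDevice, stream);
  int32_t mode = CUBLASLT_POINTER_MODE_DEVICE;
  cublasLtMatmulDescSetAttribute(desc, CUBLASLT_MATMUL_DESC_POINTER_MODE,
                                 &mode, sizeof(mode));
  return dev_alpha;
}
```

The sketch only shows the alpha/beta handling; in the real call the returned device pointer replaces the usual host-side `&alpha_val` argument to `cublasLtMatmul`.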
Test Plan: Reviewers: Subscribers: Tasks: Tags: Signed-off-by: Simon Layton Pull Request resolved: https://github.com/pytorch/pytorch/pull/165563 Approved by: https://github.com/drisspg, https://github.com/eqy --- aten/src/ATen/cuda/CUDABlas.cpp | 51 +++++++++++++++----- aten/src/ATen/cuda/CUDABlas.h | 3 +- aten/src/ATen/cuda/detail/BLASConstants.cu | 54 ++++++++++++++++++++++ aten/src/ATen/cuda/detail/BLASConstants.h | 11 +++++ aten/src/ATen/cuda/tunable/TunableGemm.h | 3 +- aten/src/ATen/native/cuda/Blas.cpp | 6 ++- torch/utils/hipify/cuda_to_hip_mappings.py | 3 ++ 7 files changed, 116 insertions(+), 15 deletions(-) create mode 100644 aten/src/ATen/cuda/detail/BLASConstants.cu create mode 100644 aten/src/ATen/cuda/detail/BLASConstants.h diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 13716736c577..6933099bb1f3 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -16,6 +16,8 @@ #include #include +#include + #ifdef USE_ROCM #include #include @@ -1954,13 +1956,15 @@ void scaled_gemm( const void *result_scale_ptr, int64_t result_ld, ScalarType result_dtype, - bool use_fast_accum) { + bool use_fast_accum, + const std::optional& alpha) { // Note: see `cublasCommonArgs` for various non-intuitive manupulations // of input arguments to this function. const auto computeType = CUBLAS_COMPUTE_32F; const auto scaleType = CUDA_R_32F; - const float alpha_val = 1.0; - const float beta_val = 0.0; + // Note: alpha_val may change later depending on user-passed argument + float alpha_val = 1.0; + float beta_val = 0.0; CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, _cublasOpFromChar(transa)); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb)); @@ -2031,6 +2035,33 @@ void scaled_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, CUBLASLT_EPILOGUE_BIAS); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype)); } + + // Handle user-passed alpha + float *alpha_ptr = &alpha_val; + float *beta_ptr = &beta_val; + + if (alpha.has_value()) { + auto& a = alpha.value(); + + // if device-tensor + if (a.is_cuda()) { + // NOTE: there are lifetime requirements on device-side pointers for alpha/beta -- the value must be + // valid & correct until the cublas call finishes (not is scheduled like host-side values). Thus + // we need to use allocations for alpha/beta that have some guarantees on lifetime - a statically + // managed 4B buffer for alpha that we'll copy the passed alpha value into, and constant memory + // for beta respectively. + float *user_alpha_ptr = at::cuda::detail::get_user_alpha_ptr(); + at::Tensor user_alpha = at::from_blob(user_alpha_ptr, {1}, TensorOptions().device(kCUDA).dtype(kFloat)); + user_alpha.copy_(a); + // Tell cublasLt we're using device-side pointers for alpha/beta + auto pointer_mode = CUBLASLT_POINTER_MODE_DEVICE; + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_POINTER_MODE, pointer_mode); + alpha_ptr = user_alpha.data_ptr(); + beta_ptr = at::cuda::detail::get_cublas_device_zero(); + } else { + alpha_val = a.item(); + } + } // For other data types, use the get_scale_mode function based on scaling type // The SCALE_MODE attrs only exist in cuBLAS 12.8+/ROCm 7.0 or in recent hipblaslt, // but we must invoke get_scale_mode anyways to trigger the version checks. 
@@ -2048,6 +2079,7 @@ void scaled_gemm( cublasLtMatmulHeuristicResult_t heuristicResult = {}; int returnedResult = 0; cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle(); + TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( ltHandle, computeDesc.descriptor(), @@ -2088,10 +2120,10 @@ void scaled_gemm( auto is_valid_status = hipblaslt_ext::matmulIsAlgoSupported( ltHandle, computeDesc.descriptor(), - &alpha_val, + alpha_ptr, Adesc.descriptor(), Bdesc.descriptor(), - &beta_val, + beta_ptr, Cdesc.descriptor(), Ddesc.descriptor(), all_algos[i].algo, @@ -2110,17 +2142,14 @@ void scaled_gemm( cublasStatus_t cublasStatus = cublasLtMatmul( ltHandle, computeDesc.descriptor(), - &alpha_val, + alpha_ptr, mat1_ptr, Adesc.descriptor(), mat2_ptr, Bdesc.descriptor(), - &beta_val, -#ifdef USE_ROCM + beta_ptr, + // NOTE: always use result_ptr here, because cuBLASLt w/device beta=0 can't handle nullptr either result_ptr, // unused, since beta_val is 0, but hipblaslt can't handle nullptr -#else - nullptr, -#endif // ifdef USE_ROCM Cdesc.descriptor(), result_ptr, Ddesc.descriptor(), diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h index 6618658704a7..0295948311a5 100644 --- a/aten/src/ATen/cuda/CUDABlas.h +++ b/aten/src/ATen/cuda/CUDABlas.h @@ -161,7 +161,8 @@ void scaled_gemm( const void* result_scale_ptr, int64_t result_ld, ScalarType result_dtype, - bool use_fast_accum); + bool use_fast_accum, + const std::optional& alpha); #define CUDABLAS_BGEMM_ARGTYPES(Dtype) CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype) diff --git a/aten/src/ATen/cuda/detail/BLASConstants.cu b/aten/src/ATen/cuda/detail/BLASConstants.cu new file mode 100644 index 000000000000..967388044705 --- /dev/null +++ b/aten/src/ATen/cuda/detail/BLASConstants.cu @@ -0,0 +1,54 @@ +#include +#include +#include + +#include + +namespace at { +namespace cuda { +namespace detail { + +__device__ __constant__ float cublas_one_device; +__device__ __constant__ float cublas_zero_device; + +float *get_cublas_device_one() { + static c10::once_flag init_flag; + + c10::call_once(init_flag, []() { + const float one = 1.f; + AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_one_device, &one, sizeof(float))); + }); + + float *ptr; + AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast(&ptr), cublas_one_device)); + return ptr; +} + +float *get_cublas_device_zero() { + static c10::once_flag init_flag; + + c10::call_once(init_flag, []() { + const float zero = 0.f; + AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_zero_device, &zero, sizeof(float))); + }); + + float *ptr; + AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast(&ptr), cublas_zero_device)); + return ptr; +} + +float *get_user_alpha_ptr() { + static float *alpha_ptr; + + static c10::once_flag init_flag; + + c10::call_once(init_flag, []() { + AT_CUDA_CHECK(cudaMalloc(&alpha_ptr, sizeof(float))); + }); + + return alpha_ptr; +} + +} // namespace detail +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/detail/BLASConstants.h b/aten/src/ATen/cuda/detail/BLASConstants.h new file mode 100644 index 000000000000..d62aaf1330ee --- /dev/null +++ b/aten/src/ATen/cuda/detail/BLASConstants.h @@ -0,0 +1,11 @@ +#pragma once + +#include + +namespace at::cuda::detail { + +float *get_cublas_device_one(); +float *get_cublas_device_zero(); +float *get_user_alpha_ptr(); + +} // namespace at::cuda::detail diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h index d941c230630c..c014d1ea569c 100644 --- 
a/aten/src/ATen/cuda/tunable/TunableGemm.h +++ b/aten/src/ATen/cuda/tunable/TunableGemm.h @@ -109,7 +109,8 @@ class DefaultScaledGemmOp : public Callable> { params->c_scale_ptr, params->ldc, params->c_dtype, - params->use_fast_accum); + params->use_fast_accum, + std::nullopt /* alpha */); return OK; } }; diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 1e7c4600efc5..4ee35013ab77 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -1359,7 +1359,8 @@ _scaled_gemm( const ScalingType scaling_choice_a, const ScalingType scaling_choice_b, const std::optional& bias, const bool use_fast_accum, - Tensor& out) { + Tensor& out, + const std::optional& alpha = std::nullopt) { cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, std::nullopt, scaling_choice_a, scaling_choice_b); const auto out_dtype_ = args.result->scalar_type(); TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt"); @@ -1410,7 +1411,8 @@ _scaled_gemm( args.scale_result_ptr, args.result_ld, out_dtype_, - use_fast_accum); + use_fast_accum, + alpha); return out; } } diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py index 54442fe403e9..d1d9a08c71c5 100644 --- a/torch/utils/hipify/cuda_to_hip_mappings.py +++ b/torch/utils/hipify/cuda_to_hip_mappings.py @@ -7702,8 +7702,11 @@ CUDA_IDENTIFIER_MAP = collections.OrderedDict( ("CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE", ("HIPBLASLT_MATMUL_DESC_BIAS_DATA_TYPE", CONV_MATH_FUNC, API_BLAS)), ("CUBLASLT_MATMUL_DESC_A_SCALE_MODE", ("HIPBLASLT_MATMUL_DESC_A_SCALE_MODE", CONV_MATH_FUNC, API_BLAS)), ("CUBLASLT_MATMUL_DESC_B_SCALE_MODE", ("HIPBLASLT_MATMUL_DESC_B_SCALE_MODE", CONV_MATH_FUNC, API_BLAS)), + ("CUBLASLT_MATMUL_DESC_POINTER_MODE", ("HIPBLASLT_MATMUL_DESC_POINTER_MODE", CONV_MATH_FUNC, API_BLAS)), ("CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0", ("HIPBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0", CONV_MATH_FUNC, API_BLAS)), ("CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3", ("HIPBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3", CONV_MATH_FUNC, API_BLAS)), + ("CUBLASLT_POINTER_MODE_DEVICE", ("HIPBLASLT_POINTER_MODE_DEVICE", CONV_NUMERIC_LITERAL, API_BLAS)), + ("CUBLASLT_POINTER_MODE_HOST", ("HIPBLASLT_POINTER_MODE_HOST", CONV_NUMERIC_LITERAL, API_BLAS)), ("cublasLtMatrixLayout_t", ("hipblasLtMatrixLayout_t", CONV_MATH_FUNC, API_BLAS)), ("cublasLtMatrixLayoutOpaque_t", ("hipblasLtMatrixLayoutOpaque_t", CONV_MATH_FUNC, API_BLAS)), ("cublasLtMatrixLayoutAttribute_t", ("hipblasLtMatrixLayoutAttribute_t", CONV_MATH_FUNC, API_BLAS)), From 4a22139eeaa136c25461d87ee025714442d565ad Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 16 Oct 2025 21:12:36 -0700 Subject: [PATCH 034/123] [MPS][BE] Fix unused variable warning (#165726) Namely this one ``` /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/Shape.metal:19:18: warning: unused variable 'output_sizes' [-Wunused-variable] constant auto& output_sizes = shared_params.output_sizes; ^ /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/Shape.metal:85:1: note: in instantiation of function template specialization 'cat' requested here REGISTER_CAT_FOR_INDEX_TYPE(int64_t); ^ /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/Shape.metal:69:3: note: expanded from macro 'REGISTER_CAT_FOR_INDEX_TYPE' REGISTER_CAT_OP_ALL_INPUT_TYPES(I, float); \ ^ 
/Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/Shape.metal:55:3: note: expanded from macro 'REGISTER_CAT_OP_ALL_INPUT_TYPES' REGISTER_CAT_OP(I, float, T_out); \ ^ /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/Shape.metal:47:15: note: expanded from macro 'REGISTER_CAT_OP' kernel void cat( \ ``` Repeated about 20-30 times Pull Request resolved: https://github.com/pytorch/pytorch/pull/165726 Approved by: https://github.com/Skylion007 --- aten/src/ATen/native/mps/kernels/Shape.metal | 1 - 1 file changed, 1 deletion(-) diff --git a/aten/src/ATen/native/mps/kernels/Shape.metal b/aten/src/ATen/native/mps/kernels/Shape.metal index 44cf6f1e8d56..5c7aed8c01e6 100644 --- a/aten/src/ATen/native/mps/kernels/Shape.metal +++ b/aten/src/ATen/native/mps/kernels/Shape.metal @@ -16,7 +16,6 @@ kernel void cat( auto ndim = shared_params.ndim; auto cat_dim = shared_params.cat_dim; constant auto& output_strides = shared_params.output_strides; - constant auto& output_sizes = shared_params.output_sizes; auto cat_dim_offset = input_params.cat_dim_offset; auto input_element_offset = input_params.input_element_offset; From 80d2ca7566cc38e68b964c1ce168b9320ed8e006 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 17 Oct 2025 11:23:13 +0000 Subject: [PATCH 035/123] Revert "[annotate] add annotate_fn function decorator (#165703)" This reverts commit f1d882212afc3a73ce1e319d80b6406f9dc4a0c8. Reverted https://github.com/pytorch/pytorch/pull/165703 on behalf of https://github.com/lw due to [GH job link](https://github.com/pytorch/pytorch/actions/runs/18585518705/job/52989521797) [HUD commit link](https://hud.pytorch.org/pytorch/pytorch/commit/f1d882212afc3a73ce1e319d80b6406f9dc4a0c8) ([comment](https://github.com/pytorch/pytorch/pull/165703#issuecomment-3415073467)) --- .../test_aot_joint_with_descriptors.py | 40 ------------------- torch/fx/traceback.py | 37 ----------------- 2 files changed, 77 deletions(-) diff --git a/test/functorch/test_aot_joint_with_descriptors.py b/test/functorch/test_aot_joint_with_descriptors.py index d797b36748d0..167215bb8be1 100644 --- a/test/functorch/test_aot_joint_with_descriptors.py +++ b/test/functorch/test_aot_joint_with_descriptors.py @@ -922,46 +922,6 @@ class inner_f(torch.nn.Module): in custom_metadata ) - def test_preserve_annotate_function(self): - """Test basic annotate_fn usage""" - - @fx_traceback.annotate_fn({"pp_stage": 1}) - def example_function(x): - return x * x - - class SimpleLinear(nn.Module): - def __init__(self): - super().__init__() - self.linear = nn.Linear(3, 2) - - def forward(self, x): - with fx_traceback.annotate({"pp_stage": 0}): - y = self.linear(x) - y = example_function(y) - return y - 1 - - inputs = (torch.randn(4, 3),) - model = SimpleLinear() - - for with_export in [True, False]: - graph_module = graph_capture(model, inputs, with_export) - custom_metadata = fx_traceback._get_custom_metadata(graph_module) - self.assertExpectedInline( - str(custom_metadata), - """\ -('call_function', 't', {'pp_stage': 0}) -('call_function', 'addmm', {'pp_stage': 0}) -('call_function', 'mul', {'pp_stage': 1}) -('call_function', 'mul_1', {'pp_stage': 1}) -('call_function', 'mul_2', {'pp_stage': 1}) -('call_function', 't_1', {'pp_stage': 0}) -('call_function', 'mm', {'pp_stage': 0}) -('call_function', 't_2', {'pp_stage': 0}) -('call_function', 'sum_1', {'pp_stage': 0}) -('call_function', 'view', {'pp_stage': 0}) -('call_function', 't_3', {'pp_stage': 0})""", - ) - if __name__ == "__main__": run_tests() diff --git 
a/torch/fx/traceback.py b/torch/fx/traceback.py index 56b5f5041aa1..3d1e3b7c5d53 100644 --- a/torch/fx/traceback.py +++ b/torch/fx/traceback.py @@ -18,7 +18,6 @@ log = logging.getLogger(__name__) __all__ = [ "annotate", - "annotate_fn", "preserve_node_meta", "has_preserved_node_meta", "set_stack_trace", @@ -292,42 +291,6 @@ def annotate(annotation_dict: dict): del current_meta["custom"] -@compatibility(is_backward_compatible=False) -def annotate_fn(annotation_dict: dict): - """ - A decorator that wraps a function with the annotate context manager. - Use this when you want to annotate an entire function instead of a specific code block. - - Note: - This API is **not backward compatible** and may evolve in future releases. - - Note: - This API is not compatible with fx.symbolic_trace or jit.trace. It's intended - to be used with PT2 family of tracers, e.g. torch.export and dynamo. - - Args: - annotation_dict (dict): A dictionary of custom key-value pairs to inject - into the FX trace metadata for all operations in the function. - - Example: - >>> @annotate_fn({"pp_stage": 1}) - ... def my_function(x): - ... return x + 1 - # All operations in my_function will have {"pp_stage": 1} in their metadata. - """ - from functools import wraps - - def decorator(func): - @wraps(func) - def wrapper(*args, **kwargs): - with annotate(annotation_dict): - return func(*args, **kwargs) - - return wrapper - - return decorator - - @compatibility(is_backward_compatible=False) def set_grad_fn_seq_nr(seq_nr): global current_meta From 574c9fc9503e55f512693eedc52ac627e4330bb6 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 17 Oct 2025 12:24:08 +0000 Subject: [PATCH 036/123] Revert "Remove torch.serialization entries from the doc ignore list (#160224)" This reverts commit 9fe3b2afbeff12080b483af1ee23e1c9d9fb0421. Reverted https://github.com/pytorch/pytorch/pull/160224 on behalf of https://github.com/lw due to [GH job link](https://github.com/pytorch/pytorch/actions/runs/18588004962/job/52997748336) [HUD commit link](https://hud.pytorch.org/pytorch/pytorch/commit/9fe3b2afbeff12080b483af1ee23e1c9d9fb0421) ([comment](https://github.com/pytorch/pytorch/pull/160224#issuecomment-3415345175)) --- docs/source/conf.py | 14 +++++++++++--- docs/source/torch.aliases.md | 19 ------------------- docs/source/torch.rst | 1 + 3 files changed, 12 insertions(+), 22 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 410f24a974c1..d21e67c1caad 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -492,9 +492,6 @@ coverage_ignore_functions = [ "amp_definitely_not_available", # torch.mtia.memory "reset_peak_memory_stats", - # torch.compiler - "load_cache_artifacts", - "save_cache_artifacts", # torch.cuda.nccl "all_gather", "all_reduce", @@ -1730,6 +1727,17 @@ coverage_ignore_functions = [ "tensorboard_trace_handler", # torch.return_types "pytree_register_structseq", + # torch.serialization + "check_module_version_greater_or_equal", + "default_restore_location", + "load", + "location_tag", + "mkdtemp", + "normalize_storage_type", + "save", + "storage_to_tensor_type", + "validate_cuda_device", + "validate_hpu_device", # torch.signal.windows.windows "bartlett", "blackman", diff --git a/docs/source/torch.aliases.md b/docs/source/torch.aliases.md index 2639fdf0d929..882b642265d4 100644 --- a/docs/source/torch.aliases.md +++ b/docs/source/torch.aliases.md @@ -32,22 +32,3 @@ in which they are defined. 
Feel free to use either the top-level version in ``to unique_consecutive unravel_index ``` - -```{eval-rst} -.. automodule:: torch.serialization -.. currentmodule:: torch.serialization -.. autosummary:: - :toctree: generated - :nosignatures: - - check_module_version_greater_or_equal - default_restore_location - load - location_tag - mkdtemp - normalize_storage_type - save - storage_to_tensor_type - validate_cuda_device - validate_hpu_device -``` diff --git a/docs/source/torch.rst b/docs/source/torch.rst index 47f8aa4a8951..068ffb52c0ad 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -808,6 +808,7 @@ Operator Tags .. py:module:: torch.utils.viz .. py:module:: torch.quasirandom .. py:module:: torch.return_types +.. py:module:: torch.serialization .. py:module:: torch.signal.windows.windows .. py:module:: torch.sparse.semi_structured .. py:module:: torch.storage From 5d4da26ed067d2d70102f30967f1b09f8fb7018a Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 17 Oct 2025 12:27:16 +0000 Subject: [PATCH 037/123] Revert "[export] preserve_node_meta by default (#165524)" This reverts commit fdd560afd1d413a9f814cbf7cc2a72e0d39b0117. Reverted https://github.com/pytorch/pytorch/pull/165524 on behalf of https://github.com/lw due to test/functorch/test_control_flow.py::TestControlFlowTraced::test_cond_symint_closure [GH job link](https://github.com/pytorch/pytorch/actions/runs/18586312291/job/52991654051) [HUD commit link](https://hud.pytorch.org/pytorch/pytorch/commit/fdd560afd1d413a9f814cbf7cc2a72e0d39b0117) ([comment](https://github.com/pytorch/pytorch/pull/165524#issuecomment-3415352522)) --- test/export/test_export.py | 14 -------------- torch/export/_trace.py | 12 ++++-------- 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/test/export/test_export.py b/test/export/test_export.py index e4a789316359..23a7ad9bff1e 100755 --- a/test/export/test_export.py +++ b/test/export/test_export.py @@ -721,20 +721,6 @@ class TestExport(TestCase): ) self.assertEqual(node.meta["from_node"][-1].graph_id, graph_id) - def test_fx_annotate(self): - class Foo(torch.nn.Module): - def forward(self, x): - x += 1 - with torch.fx.traceback.annotate({"a": "b"}): - x += 1 - x += 1 - return x - - ep = export(Foo(), (torch.randn(2),)) - - add_1 = list(ep.graph.nodes)[2] - self.assertTrue("custom" in add_1.meta and add_1.meta["custom"].get("a") == "b") - @requires_gpu def test_flex_attention_export(self): from torch.nn.attention.flex_attention import create_block_mask, flex_attention diff --git a/torch/export/_trace.py b/torch/export/_trace.py index b3ee2e18f0d8..803c9fc2080d 100644 --- a/torch/export/_trace.py +++ b/torch/export/_trace.py @@ -812,10 +812,7 @@ def _export_to_torch_ir( prefer_deferred_runtime_asserts_over_guards=prefer_deferred_runtime_asserts_over_guards, ) - with ( - torch._dynamo.config.patch(dataclasses.asdict(dynamo_cfg)), - torch.fx.traceback.preserve_node_meta(), - ): + with torch._dynamo.config.patch(dataclasses.asdict(dynamo_cfg)): try: module_call_specs: dict[str, dict[str, pytree.TreeSpec]] = ( _ExportModuleSpecTrackerDict() @@ -905,7 +902,6 @@ def _export_to_aten_ir( _ignore_backend_decomps(), _compiling_state_context(), custom_triton_ops_decomposition_ctx(), - torch.fx.traceback.preserve_node_meta(), ): gm, graph_signature = transform(aot_export_module)( mod, @@ -1934,8 +1930,9 @@ def _non_strict_export( in mod._forward_pre_hooks.values() ): _check_input_constraints_pre_hook(mod, args, kwargs) - args = (*args, *kwargs.values()) - tree_out = 
torch.fx.Interpreter(mod).run(*args) + with torch.fx.traceback.preserve_node_meta(): + args = (*args, *kwargs.values()) + tree_out = torch.fx.Interpreter(mod).run(*args) else: tree_out = mod(*args, **kwargs) flat_outs, out_spec = pytree.tree_flatten(tree_out) @@ -2032,7 +2029,6 @@ def _non_strict_export( ), _fakify_module_inputs(fake_args, fake_kwargs, fake_mode), _override_builtin_ops(), - torch.fx.traceback.preserve_node_meta(), ): aten_export_artifact = _to_aten_func( # type: ignore[operator] patched_mod, From 7231118db3156de661fa76fb0ccc91ecfdbc1416 Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Fri, 17 Oct 2025 13:24:46 +0000 Subject: [PATCH 038/123] Turn some const variables into constexpr in C++ code (#165401) This PR checks the C++ code and turns some const variables into constexpr. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165401 Approved by: https://github.com/Skylion007 --- aten/src/ATen/core/PhiloxRNGEngine.h | 8 ++-- aten/src/ATen/cuda/CUDAGeneratorImpl.cpp | 12 ++--- aten/src/ATen/native/Activation.cpp | 4 +- aten/src/ATen/native/BlasKernel.cpp | 4 +- aten/src/ATen/native/Distributions.h | 5 +- aten/src/ATen/native/Math.h | 6 +-- aten/src/ATen/native/Normalization.cpp | 2 +- aten/src/ATen/native/cpu/UpSampleKernel.cpp | 6 +-- aten/src/ATen/native/cuda/DilatedMaxPool2d.cu | 2 +- aten/src/ATen/native/cuda/Embedding.cu | 4 +- aten/src/ATen/native/cuda/IGammaKernel.cu | 46 +++++++++---------- aten/src/ATen/native/cuda/Math.cuh | 8 ++-- aten/src/ATen/native/cuda/UpSample.cuh | 4 +- aten/src/ATen/native/mkldnn/Matmul.cpp | 2 +- .../cpu/kernels/QuantizedOpKernels.cpp | 2 +- .../src/ATen/native/quantized/cpu/qlinear.cpp | 2 +- .../ATen/native/quantized/cpu/qsoftmax.cpp | 4 +- .../epilogue_thread_apply_logsumexp.h | 6 +-- aten/src/ATen/test/pow_test.cpp | 20 ++++---- aten/src/ATen/xpu/XPUGeneratorImpl.cpp | 12 ++--- 20 files changed, 80 insertions(+), 79 deletions(-) diff --git a/aten/src/ATen/core/PhiloxRNGEngine.h b/aten/src/ATen/core/PhiloxRNGEngine.h index 413055d3fad6..e8bac545933c 100644 --- a/aten/src/ATen/core/PhiloxRNGEngine.h +++ b/aten/src/ATen/core/PhiloxRNGEngine.h @@ -229,10 +229,10 @@ private: } - static const uint32_t kPhilox10A = 0x9E3779B9; - static const uint32_t kPhilox10B = 0xBB67AE85; - static const uint32_t kPhiloxSA = 0xD2511F53; - static const uint32_t kPhiloxSB = 0xCD9E8D57; + static constexpr uint32_t kPhilox10A = 0x9E3779B9; + static constexpr uint32_t kPhilox10B = 0xBB67AE85; + static constexpr uint32_t kPhiloxSA = 0xD2511F53; + static constexpr uint32_t kPhiloxSB = 0xCD9E8D57; }; typedef philox_engine Philox4_32; diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp index 9f7c9ba881e9..2e387fbc264d 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp @@ -325,9 +325,9 @@ uint64_t CUDAGeneratorImpl::seed() { */ c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { // The RNG state comprises the seed, and an offset used for Philox. 
- static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(int64_t); - static const size_t total_size = seed_size + offset_size; + constexpr size_t seed_size = sizeof(uint64_t); + constexpr size_t offset_size = sizeof(int64_t); + constexpr size_t total_size = seed_size + offset_size; auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt); auto rng_state = state_tensor.data_ptr(); @@ -346,9 +346,9 @@ c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { * and size of the internal state. */ void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { - static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(int64_t); - static const size_t total_size = seed_size + offset_size; + constexpr size_t seed_size = sizeof(uint64_t); + constexpr size_t offset_size = sizeof(int64_t); + constexpr size_t total_size = seed_size + offset_size; detail::check_rng_state(new_state); diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index 861c51f16097..c164120a1f3c 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -240,8 +240,8 @@ TORCH_META_FUNC(gelu_backward) ( namespace at::native { -static const double SELU_ALPHA = 1.6732632423543772848170429916717; -static const double SELU_SCALE = 1.0507009873554804934193349852946; +static constexpr double SELU_ALPHA = 1.6732632423543772848170429916717; +static constexpr double SELU_SCALE = 1.0507009873554804934193349852946; DEFINE_DISPATCH(elu_stub); DEFINE_DISPATCH(elu_backward_stub); diff --git a/aten/src/ATen/native/BlasKernel.cpp b/aten/src/ATen/native/BlasKernel.cpp index a77604c535c1..b476ca3cff8f 100644 --- a/aten/src/ATen/native/BlasKernel.cpp +++ b/aten/src/ATen/native/BlasKernel.cpp @@ -286,7 +286,7 @@ template void scal_fast_path(int *n, scalar_t *a, scalar_t *x, int *in #if AT_BUILD_WITH_BLAS() template <> bool scal_use_fast_path(int64_t n, int64_t incx) { - auto intmax = std::numeric_limits::max(); + auto constexpr intmax = std::numeric_limits::max(); return n <= intmax && incx <= intmax; } @@ -315,7 +315,7 @@ bool gemv_use_fast_path( int64_t incx, [[maybe_unused]] float beta, int64_t incy) { - auto intmax = std::numeric_limits::max(); + auto constexpr intmax = std::numeric_limits::max(); return (m <= intmax) && (n <= intmax) && (lda <= intmax) && (incx > 0) && (incx <= intmax) && (incy > 0) && (incy <= intmax); } diff --git a/aten/src/ATen/native/Distributions.h b/aten/src/ATen/native/Distributions.h index 1c9db44aebb0..755fe00b1f1c 100644 --- a/aten/src/ATen/native/Distributions.h +++ b/aten/src/ATen/native/Distributions.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -127,7 +128,7 @@ C10_DEVICE scalar_t sample_gamma(scalar_t alpha, BaseSampler C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) { - const static scalar_t kTailValues[] = { + constexpr static scalar_t kTailValues[] = { 0.0810614667953272, 0.0413406959554092, 0.0276779256849983, @@ -139,7 +140,7 @@ C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) { 0.00925546218271273, 0.00833056343336287 }; - if (k <= 9) { + if (k < std::size(kTailValues)) { return kTailValues[static_cast(k)]; } scalar_t kp1sq = (k + 1) * (k + 1); diff --git a/aten/src/ATen/native/Math.h b/aten/src/ATen/native/Math.h index b261da5fe54e..4677542706f6 100644 --- a/aten/src/ATen/native/Math.h +++ b/aten/src/ATen/native/Math.h @@ -581,7 +581,7 @@ scalar_t 
ratevl(scalar_t x, const scalar_t num[], int64_t M, template static scalar_t lanczos_sum_expg_scaled(scalar_t x) { // lanczos approximation - static const scalar_t lanczos_sum_expg_scaled_num[13] = { + static constexpr scalar_t lanczos_sum_expg_scaled_num[13] = { 0.006061842346248906525783753964555936883222, 0.5098416655656676188125178644804694509993, 19.51992788247617482847860966235652136208, @@ -596,7 +596,7 @@ static scalar_t lanczos_sum_expg_scaled(scalar_t x) { 103794043.1163445451906271053616070238554, 56906521.91347156388090791033559122686859 }; - static const scalar_t lanczos_sum_expg_scaled_denom[13] = { + static constexpr scalar_t lanczos_sum_expg_scaled_denom[13] = { 1., 66., 1925., @@ -712,7 +712,7 @@ static scalar_t _igamc_helper_series(scalar_t a, scalar_t x) { template static scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) { // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1] - static const scalar_t d[25][25] = + static constexpr scalar_t d[25][25] = {{-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2, 1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4, 3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6, diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 86941806d307..72526162d133 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -62,7 +62,7 @@ #include #include -static const int MIOPEN_DIM_MAX = 5; +static constexpr int MIOPEN_DIM_MAX = 5; namespace at::meta { diff --git a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp index bd421aad111d..e59e5985bf7f 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp @@ -1038,7 +1038,7 @@ struct HelperInterpNearest : public HelperInterpBase { // We keep this structure for BC and consider as deprecated. 
// See HelperInterpNearestExact as replacement - static const int interp_size = 1; + static constexpr int interp_size = 1; static inline void init_indices_weights( at::ScalarType output_type, @@ -1155,7 +1155,7 @@ struct HelperInterpNearestExact : public HelperInterpNearest { struct HelperInterpLinear : public HelperInterpBase { - static const int interp_size = 2; + static constexpr int interp_size = 2; // Compute indices and weights for each interpolated dimension // indices_weights = { @@ -1275,7 +1275,7 @@ struct HelperInterpLinear : public HelperInterpBase { struct HelperInterpCubic : public HelperInterpBase { - static const int interp_size = 4; + static constexpr int interp_size = 4; // Compute indices and weights for each interpolated dimension // indices_weights = { diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index edb502688860..344906a2a4df 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -249,7 +249,7 @@ __global__ void max_pool_forward_nhwc( } -static const int BLOCK_THREADS = 256; +static constexpr int BLOCK_THREADS = 256; template #if defined (USE_ROCM) diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index 602dfd6e5288..adc300a5a9ef 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -36,9 +36,9 @@ namespace at::native { namespace { #if defined(USE_ROCM) -static const int BLOCKDIMY = 16; +static constexpr int BLOCKDIMY = 16; #else -static const int BLOCKDIMY = 32; +static constexpr int BLOCKDIMY = 32; #endif template diff --git a/aten/src/ATen/native/cuda/IGammaKernel.cu b/aten/src/ATen/native/cuda/IGammaKernel.cu index 624f080d9f6e..73db6272be9e 100644 --- a/aten/src/ATen/native/cuda/IGammaKernel.cu +++ b/aten/src/ATen/native/cuda/IGammaKernel.cu @@ -82,7 +82,7 @@ __host__ __device__ scalar_t lanczos_sum_expg_scaled(scalar_t x) { // lanczos approximation using accscalar_t = at::acc_type; - static const accscalar_t lanczos_sum_expg_scaled_num[13] = { + constexpr accscalar_t lanczos_sum_expg_scaled_num[13] = { 0.006061842346248906525783753964555936883222, 0.5098416655656676188125178644804694509993, 19.51992788247617482847860966235652136208, @@ -97,7 +97,7 @@ __host__ __device__ scalar_t lanczos_sum_expg_scaled(scalar_t x) { 103794043.1163445451906271053616070238554, 56906521.91347156388090791033559122686859 }; - static const accscalar_t lanczos_sum_expg_scaled_denom[13] = { + constexpr accscalar_t lanczos_sum_expg_scaled_denom[13] = { 1., 66., 1925., @@ -126,10 +126,10 @@ __host__ __device__ scalar_t _igam_helper_fac(scalar_t a, scalar_t x) { using accscalar_t = at::acc_type; accscalar_t ax, fac, res, num, numfac; - static const accscalar_t MAXLOG = std::is_same_v ? + constexpr accscalar_t MAXLOG = std::is_same_v ? 7.09782712893383996843E2 : 88.72283905206835; - static const accscalar_t EXP1 = 2.718281828459045; - static const accscalar_t lanczos_g = 6.024680040776729583740234375; + constexpr accscalar_t EXP1 = 2.718281828459045; + constexpr accscalar_t lanczos_g = 6.024680040776729583740234375; if (::fabs(a - x) > 0.4 * ::fabs(a)) { ax = a * ::log(x) - x - ::lgamma(a); @@ -158,9 +158,9 @@ __host__ __device__ scalar_t _igam_helper_series(scalar_t a, scalar_t x) { // Compute igam using DLMF 8.11.4. [igam1] using accscalar_t = at::acc_type; - static const accscalar_t MACHEP = std::is_same_v ? + constexpr accscalar_t MACHEP = std::is_same_v ? 
1.11022302462515654042E-16 : 5.9604644775390625E-8; - static const int MAXITER = 2000; + constexpr int MAXITER = 2000; int i; accscalar_t ans, ax, c, r; @@ -196,8 +196,8 @@ __host__ __device__ scalar_t _igamc_helper_series(scalar_t a, scalar_t x) { accscalar_t fac = 1; accscalar_t sum = 0; accscalar_t term, logx; - static const int MAXITER = 2000; - static const accscalar_t MACHEP = std::is_same_v ? + constexpr int MAXITER = 2000; + constexpr accscalar_t MACHEP = std::is_same_v ? 1.11022302462515654042E-16 : 5.9604644775390625E-8; for (n = 1; n < MAXITER; n++) { @@ -219,7 +219,7 @@ __host__ __device__ scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1] using accscalar_t = at::acc_type; - static const accscalar_t d[25][25] = + constexpr accscalar_t d[25][25] = {{-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2, 1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4, 3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6, 8.296711340953086e-7, -1.7665952736826079e-7, 6.7078535434014986e-9, 1.0261809784240308e-8, -4.3820360184533532e-9, 9.1476995822367902e-10, -2.551419399494625e-11, -5.8307721325504251e-11, 2.4361948020667416e-11, -5.0276692801141756e-12, 1.1004392031956135e-13, 3.3717632624009854e-13, -1.3923887224181621e-13, 2.8534893807047443e-14, -5.1391118342425726e-16, -1.9752288294349443e-15}, {-1.8518518518518519e-3, -3.4722222222222222e-3, 2.6455026455026455e-3, -9.9022633744855967e-4, 2.0576131687242798e-4, -4.0187757201646091e-7, -1.8098550334489978e-5, 7.6491609160811101e-6, -1.6120900894563446e-6, 4.6471278028074343e-9, 1.378633446915721e-7, -5.752545603517705e-8, 1.1951628599778147e-8, -1.7543241719747648e-11, -1.0091543710600413e-9, 4.1627929918425826e-10, -8.5639070264929806e-11, 6.0672151016047586e-14, 7.1624989648114854e-12, -2.9331866437714371e-12, 5.9966963656836887e-13, -2.1671786527323314e-16, -4.9783399723692616e-14, 2.0291628823713425e-14, -4.13125571381061e-15}, {4.1335978835978836e-3, -2.6813271604938272e-3, 7.7160493827160494e-4, 2.0093878600823045e-6, -1.0736653226365161e-4, 5.2923448829120125e-5, -1.2760635188618728e-5, 3.4235787340961381e-8, 1.3721957309062933e-6, -6.298992138380055e-7, 1.4280614206064242e-7, -2.0477098421990866e-10, -1.4092529910867521e-8, 6.228974084922022e-9, -1.3670488396617113e-9, 9.4283561590146782e-13, 1.2872252400089318e-10, -5.5645956134363321e-11, 1.1975935546366981e-11, -4.1689782251838635e-15, -1.0940640427884594e-12, 4.6622399463901357e-13, -9.905105763906906e-14, 1.8931876768373515e-17, 8.8592218725911273e-15}, @@ -248,7 +248,7 @@ __host__ __device__ scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t int k, n, sgn; int maxpow = 0; - static const accscalar_t MACHEP = std::is_same_v ? + constexpr accscalar_t MACHEP = std::is_same_v ? 1.11022302462515654042E-16 : 5.9604644775390625E-8; accscalar_t lambda = x / a; accscalar_t sigma = (x - a) / a; @@ -314,12 +314,12 @@ __host__ __device__ scalar_t _igamc_helper_continued_fraction(scalar_t a, scalar int i; accscalar_t ans, ax, c, yc, r, t, y, z; accscalar_t pk, pkm1, pkm2, qk, qkm1, qkm2; - static const int MAXITER = 2000; - static const accscalar_t MACHEP = std::is_same_v ? + constexpr int MAXITER = 2000; + constexpr accscalar_t MACHEP = std::is_same_v ? 1.11022302462515654042E-16 : 5.9604644775390625E-8; - static const accscalar_t BIG = std::is_same_v ? + constexpr accscalar_t BIG = std::is_same_v ? 
4.503599627370496e15 : 16777216.; - static const accscalar_t BIGINV = std::is_same_v ? + constexpr accscalar_t BIGINV = std::is_same_v ? 2.22044604925031308085e-16 : 5.9604644775390625E-8; ax = _igam_helper_fac(a, x); @@ -385,10 +385,10 @@ __noinline__ __host__ __device__ scalar_t calc_igammac(scalar_t a, scalar_t x) { using accscalar_t = at::acc_type; accscalar_t absxma_a; - static const accscalar_t SMALL = 20.0; - static const accscalar_t LARGE = 200.0; - static const accscalar_t SMALLRATIO = 0.3; - static const accscalar_t LARGERATIO = 4.5; + constexpr accscalar_t SMALL = 20.0; + constexpr accscalar_t LARGE = 200.0; + constexpr accscalar_t SMALLRATIO = 0.3; + constexpr accscalar_t LARGERATIO = 4.5; if ((x < 0) || (a < 0)) { // out of defined-region of the function @@ -467,10 +467,10 @@ __noinline__ __host__ __device__ scalar_t calc_igamma(scalar_t a, scalar_t x) { using accscalar_t = at::acc_type; accscalar_t absxma_a; - static const accscalar_t SMALL = 20.0; - static const accscalar_t LARGE = 200.0; - static const accscalar_t SMALLRATIO = 0.3; - static const accscalar_t LARGERATIO = 4.5; + constexpr accscalar_t SMALL = 20.0; + constexpr accscalar_t LARGE = 200.0; + constexpr accscalar_t SMALLRATIO = 0.3; + constexpr accscalar_t LARGERATIO = 4.5; // boundary values following SciPy if ((x < 0) || (a < 0)) { diff --git a/aten/src/ATen/native/cuda/Math.cuh b/aten/src/ATen/native/cuda/Math.cuh index 1d603132e689..1fa245af1a4d 100644 --- a/aten/src/ATen/native/cuda/Math.cuh +++ b/aten/src/ATen/native/cuda/Math.cuh @@ -231,7 +231,7 @@ const auto lcm_string = jiterator_stringify( const auto digamma_string = jiterator_stringify( template T digamma(T x) { - static const double PI_f64 = 3.14159265358979323846; + static constexpr double PI_f64 = 3.14159265358979323846; // Short-circuits if x is +/- 0 and returns -/+ ∞ per the C++ standard if (x == 0) { @@ -3072,9 +3072,9 @@ template static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) { // [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma using accscalar_t = at::acc_type; - static const double PI_f64 = 3.14159265358979323846; - const accscalar_t PSI_10 = 2.25175258906672110764; - const accscalar_t A[] = { + static constexpr double PI_f64 = 3.14159265358979323846; + constexpr accscalar_t PSI_10 = 2.25175258906672110764; + constexpr accscalar_t A[] = { 8.33333333333333333333E-2, -2.10927960927960927961E-2, 7.57575757575757575758E-3, diff --git a/aten/src/ATen/native/cuda/UpSample.cuh b/aten/src/ATen/native/cuda/UpSample.cuh index 50428b377da8..09e094ea2bf0 100644 --- a/aten/src/ATen/native/cuda/UpSample.cuh +++ b/aten/src/ATen/native/cuda/UpSample.cuh @@ -277,7 +277,7 @@ struct BilinearFilterFunctor { return 0; } - static const int size = 2; + static constexpr int size = 2; }; // taken from @@ -301,7 +301,7 @@ struct BicubicFilterFunctor { return 0; } - static const int size = 4; + static constexpr int size = 4; }; template diff --git a/aten/src/ATen/native/mkldnn/Matmul.cpp b/aten/src/ATen/native/mkldnn/Matmul.cpp index 740c056a7f23..fbc8294f45cf 100644 --- a/aten/src/ATen/native/mkldnn/Matmul.cpp +++ b/aten/src/ATen/native/mkldnn/Matmul.cpp @@ -416,7 +416,7 @@ static inline bool checksize(const Tensor& mat1, const Tensor& mat2){ // else if dim = 3, mat1's size = (b * m * n), mat2's size = (b * n * k) // else called from aten::mv, mat1.size = (m * n), mat2.size = (n) // only m * n * b * k(if exist) are large enough we can get benefit from mkldnn optimized gemm kernel - static const int64_t 
mkldnn_gemm_min_size = 16 * 16 * 16; + constexpr int64_t mkldnn_gemm_min_size = 16 * 16 * 16; if (mat1.dim() == 1 && mat2.dim() == 1) { // aten::dot return mat1.size(0) > mkldnn_gemm_min_size; diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index 028047e4d6ac..293dfb20b9bf 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -3551,7 +3551,7 @@ void dequantize_tensor_per_tensor_affine_cpu( #if defined(__ARM_NEON__) || defined(__aarch64__) -const static int PARALLEL_THRESHOLD = 1 << 20; +constexpr static int PARALLEL_THRESHOLD = 1 << 20; // Generic template defaults to naive quantize implementation template diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index 897eefd91d21..7a80b166f8cb 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -1388,7 +1388,7 @@ namespace at::native { TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() <= 1, "onednn int8 linear: act scale/zp size should be 1/<=1"); static std::optional other = std::nullopt; - static const std::string_view binary_post_op = "none"; + constexpr std::string_view binary_post_op = "none"; int64_t act_zp = act_zero_point.numel() == 1 ? act_zero_point.item().toLong() : 0; return linear_int8_with_onednn_weight( act, act_scale.item().toDouble(), act_zp, diff --git a/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp b/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp index cd00a351b0e3..31221cd9bf26 100644 --- a/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp +++ b/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp @@ -16,8 +16,8 @@ namespace { #ifdef USE_PYTORCH_QNNPACK -const static float qnnpack_softmax_output_scale = 0x1.0p-8f; -const static int qnnpack_softmax_output_zero_point = 0; +constexpr static float qnnpack_softmax_output_scale = 0x1.0p-8f; +constexpr static int qnnpack_softmax_output_zero_point = 0; bool is_qnnpack_compatible( const Tensor& qx, diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_thread_apply_logsumexp.h b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_thread_apply_logsumexp.h index e3dc0778e46b..156034954d9e 100644 --- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_thread_apply_logsumexp.h +++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_thread_apply_logsumexp.h @@ -110,9 +110,9 @@ class ApplyLogSumExp { using ElementCompute = ElementCompute_; using ElementLSE = ElementLSE_; - static int const kElementsPerAccess = ElementsPerAccess; - static int const kCount = kElementsPerAccess; - static const ScaleType::Kind kScale = + static int constexpr kElementsPerAccess = ElementsPerAccess; + static int constexpr kCount = kElementsPerAccess; + static constexpr ScaleType::Kind kScale = cutlass::epilogue::thread::ScaleType::NoBetaScaling; using FragmentOutput = Array; diff --git a/aten/src/ATen/test/pow_test.cpp b/aten/src/ATen/test/pow_test.cpp index 95bb48b341f5..6391c3c8228c 100644 --- a/aten/src/ATen/test/pow_test.cpp +++ b/aten/src/ATen/test/pow_test.cpp @@ -14,16 +14,16 @@ using namespace at; namespace { -const auto int_min = std::numeric_limits::min(); -const auto int_max = std::numeric_limits::max(); -const auto long_min = std::numeric_limits::min(); -const auto long_max 
= std::numeric_limits::max(); -const auto float_lowest = std::numeric_limits::lowest(); -const auto float_min = std::numeric_limits::min(); -const auto float_max = std::numeric_limits::max(); -const auto double_lowest = std::numeric_limits::lowest(); -const auto double_min = std::numeric_limits::min(); -const auto double_max = std::numeric_limits::max(); +constexpr auto int_min = std::numeric_limits::min(); +constexpr auto int_max = std::numeric_limits::max(); +constexpr auto long_min = std::numeric_limits::min(); +constexpr auto long_max = std::numeric_limits::max(); +constexpr auto float_lowest = std::numeric_limits::lowest(); +constexpr auto float_min = std::numeric_limits::min(); +constexpr auto float_max = std::numeric_limits::max(); +constexpr auto double_lowest = std::numeric_limits::lowest(); +constexpr auto double_min = std::numeric_limits::min(); +constexpr auto double_max = std::numeric_limits::max(); const std::vector ints { int_min, diff --git a/aten/src/ATen/xpu/XPUGeneratorImpl.cpp b/aten/src/ATen/xpu/XPUGeneratorImpl.cpp index 14f3059cc2b3..7a0859671ba7 100644 --- a/aten/src/ATen/xpu/XPUGeneratorImpl.cpp +++ b/aten/src/ATen/xpu/XPUGeneratorImpl.cpp @@ -146,9 +146,9 @@ uint64_t XPUGeneratorImpl::seed() { c10::intrusive_ptr XPUGeneratorImpl::get_state() const { // The RNG state comprises the seed, and an offset used for Philox. - static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(uint64_t); - static const size_t total_size = seed_size + offset_size; + constexpr size_t seed_size = sizeof(uint64_t); + constexpr size_t offset_size = sizeof(uint64_t); + constexpr size_t total_size = seed_size + offset_size; // The internal state is returned as a CPU byte tensor. auto state_tensor = at::detail::empty_cpu( @@ -170,9 +170,9 @@ c10::intrusive_ptr XPUGeneratorImpl::get_state() const { void XPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { at::xpu::assertNotCapturing( "Please ensure to utilize the XPUGeneratorImpl::set_state_index method during capturing."); - static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(uint64_t); - static const size_t total_size = seed_size + offset_size; + constexpr size_t seed_size = sizeof(uint64_t); + constexpr size_t offset_size = sizeof(uint64_t); + constexpr size_t total_size = seed_size + offset_size; at::detail::check_rng_state(new_state); From ce29d0d796df40f484884e7b8db8b60567dcd95b Mon Sep 17 00:00:00 2001 From: PaulZhang12 Date: Thu, 16 Oct 2025 12:16:03 -0700 Subject: [PATCH 039/123] [ATen] Vectorize 8 elements on 16 bit data types for sum/mean (#165055) Benchmarks for a full reduction + reduction on the contiguous dimension. Vectorized loads do not occur on the non contiguous dimension. 
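A minimal sketch of the element-packing arithmetic behind this (an editorial illustration assuming a 16-byte / 128-bit vector load width; not code from this patch): a vectorized load holds 16 / sizeof(element) values, so 2-byte types such as Half and BFloat16 pack 8 elements per load, while 4-byte float packs only 4.

```
// Host-side C++ sketch of the packing arithmetic only; not kernel code.
// Assumes a 16-byte (128-bit) vector load width.
#include <cstdint>
#include <cstdio>

template <typename T>
constexpr int elements_per_16_byte_load() {
  return 16 / static_cast<int>(sizeof(T));  // vector width / element size
}

int main() {
  // 2-byte stand-in for Half/BFloat16 vs. 4-byte float.
  std::printf("2-byte dtype: %d elements per load\n",
              elements_per_16_byte_load<std::uint16_t>());
  std::printf("4-byte dtype: %d elements per load\n",
              elements_per_16_byte_load<float>());
  return 0;
}
```
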
Benchmarking done for FP16/BF16, ~6% improvement on average across shapes, up to ~24% for single reduction on contiguous dimension and 46% for full reduce: **BF16** ``` Tensor Shape Operation Full reduce (ms) Contiguous dim (ms) Full reduce (ms) Contiguous dim (ms) Full reduce diff % Contiguous diff % ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (256, 256) mean 0.022686 0.008263 0.015498 0.008117 +46.38% +1.80% (256, 256) sum 0.022769 0.008269 0.015628 0.008185 +45.69% +1.03% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (512, 512) mean 0.014116 0.009545 0.012892 0.008839 +9.49% +7.99% (512, 512) sum 0.014110 0.009892 0.012891 0.008878 +9.46% +11.42% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 1024) mean 0.014727 0.012642 0.014061 0.010519 +4.74% +20.18% (1024, 1024) sum 0.014376 0.012636 0.014069 0.010595 +2.18% +19.26% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (2048, 2048) mean 0.018663 0.018294 0.018171 0.014678 +2.71% +24.64% (2048, 2048) sum 0.018638 0.017931 0.018142 0.014713 +2.73% +21.87% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (4096, 4096) mean 0.034216 0.036953 0.033520 0.030585 +2.08% +20.82% (4096, 4096) sum 0.034196 0.036942 0.033518 0.030676 +2.02% +20.43% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 8192) mean 0.087763 0.095201 0.085439 0.084960 +2.72% +12.05% (8192, 8192) sum 0.088079 0.095592 0.085353 0.084632 +3.19% +12.95% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 16384) mean 0.148174 0.149705 0.146274 0.138865 +1.30% +7.81% (8192, 16384) sum 0.147820 0.149371 0.146419 0.138752 +0.96% +7.65% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 32768) mean 0.266144 0.260807 0.265953 0.253330 +0.07% +2.95% (8192, 32768) sum 0.266572 0.261163 0.265729 0.253294 +0.32% +3.11% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 65536) mean 0.502034 0.486312 0.498417 0.481246 +0.73% +1.05% (8192, 65536) sum 0.501597 0.486351 0.497735 0.481579 +0.78% +0.99% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 131072) mean 0.971178 0.942988 0.957164 0.938316 +1.46% +0.50% (8192, 131072) sum 0.971189 0.943232 0.956814 0.937816 +1.50% +0.58% 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 262144) mean 1.953728 1.877648 1.904937 1.861692 +2.56% +0.86% (8192, 262144) sum 1.953969 1.877538 1.905990 1.862547 +2.52% +0.80% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (4096, 262144) mean 0.970408 0.940965 0.957871 0.936732 +1.31% +0.45% (4096, 262144) sum 0.970919 0.941652 0.957765 0.936676 +1.37% +0.53% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (2048, 262144) mean 0.501477 0.486976 0.497964 0.483570 +0.71% +0.70% (2048, 262144) sum 0.501955 0.487213 0.498210 0.483218 +0.75% +0.83% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 262144) mean 0.266536 0.257111 0.265642 0.255439 +0.34% +0.65% (1024, 262144) sum 0.266613 0.257096 0.265427 0.255472 +0.45% +0.64% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (512, 131072) mean 0.087805 0.091200 0.085818 0.087851 +2.32% +3.81% (512, 131072) sum 0.087788 0.091249 0.085373 0.087944 +2.83% +3.76% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1000, 1000) mean 0.014503 0.012328 0.013663 0.010190 +6.15% +20.98% (1000, 1000) sum 0.014545 0.012378 0.013662 0.010579 +6.46% +17.01% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 129) mean 0.014163 0.008371 0.012893 0.008828 +9.85% -5.18% (1024, 129) sum 0.014132 0.008751 0.013234 0.008868 +6.79% -1.32% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 257) mean 0.014296 0.009101 0.013334 0.008563 +7.21% +6.28% (1024, 257) sum 0.014302 0.009058 0.013020 0.008672 +9.85% +4.45% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 587) mean 0.014127 0.010997 0.013443 0.009944 +5.09% +10.59% (1024, 587) sum 0.014471 0.011373 0.013123 0.010354 +10.27% +9.84% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (2048, 977) mean 0.015607 0.013566 0.015089 0.012152 +3.43% +11.64% (2048, 977) sum 0.015953 0.013580 0.015039 0.011861 +6.08% +14.49% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 128) mean 0.013982 0.008058 0.012747 0.008139 +9.69% -1.00% (1024, 128) sum 0.013967 0.008071 0.012726 0.007859 +9.75% +2.70% 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 128) mean 0.014378 0.009627 0.013712 0.009395 +4.86% +2.47% (8192, 128) sum 0.014389 0.009965 0.013718 0.009521 +4.89% +4.66% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 130) mean 0.014156 0.008267 0.012895 0.008833 +9.78% -6.41% (1024, 130) sum 0.013797 0.008277 0.012903 0.008512 +6.93% -2.76% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 130) mean 0.014977 0.010026 0.013911 0.009876 +7.66% +1.52% (8192, 130) sum 0.014994 0.010043 0.014235 0.009604 +5.33% +4.57% ==================================================================================================================================================================================== ``` **FP16** ``` Tensor Shape Operation Full reduce (ms) Contiguous dim (ms) Full reduce (ms) Contiguous dim (ms) Full reduce diff % Contiguous diff % ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (256, 256) mean 0.022804 0.008298 0.015888 0.007848 +43.53% +5.73% (256, 256) sum 0.023215 0.008328 0.015677 0.007850 +48.08% +6.09% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (512, 512) mean 0.013777 0.009988 0.012884 0.008512 +6.93% +17.34% (512, 512) sum 0.013775 0.009622 0.012870 0.009028 +7.03% +6.58% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 1024) mean 0.014740 0.012322 0.013708 0.010239 +7.53% +20.34% (1024, 1024) sum 0.014762 0.012756 0.013722 0.010307 +7.58% +23.76% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (2048, 2048) mean 0.018700 0.018364 0.018135 0.015078 +3.12% +21.79% (2048, 2048) sum 0.018276 0.018415 0.018471 0.015127 -1.06% +21.74% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (4096, 4096) mean 0.034518 0.037000 0.033838 0.030617 +2.01% +20.85% (4096, 4096) sum 0.034569 0.037448 0.033842 0.031100 +2.15% +20.41% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 8192) mean 0.087675 0.095176 0.085328 0.084105 +2.75% +13.16% (8192, 8192) sum 0.088102 0.095211 0.085707 0.084090 +2.79% +13.23% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 16384) mean 0.147800 0.149263 0.146388 0.138390 +0.96% +7.86% (8192, 16384) sum 0.148147 0.148957 0.146439 0.138801 +1.17% +7.32% 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 32768) mean 0.266316 0.260294 0.265829 0.253411 +0.18% +2.72% (8192, 32768) sum 0.266562 0.260717 0.265744 0.253308 +0.31% +2.92% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 65536) mean 0.502035 0.486077 0.498139 0.481374 +0.78% +0.98% (8192, 65536) sum 0.501571 0.485733 0.498353 0.481350 +0.65% +0.91% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 131072) mean 0.971343 0.943016 0.956600 0.938622 +1.54% +0.47% (8192, 131072) sum 0.971463 0.942991 0.957352 0.938334 +1.47% +0.50% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 262144) mean 1.952722 1.877165 1.906406 1.861455 +2.43% +0.84% (8192, 262144) sum 1.952634 1.876388 1.904677 1.861282 +2.52% +0.81% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (4096, 262144) mean 0.970697 0.941298 0.956964 0.936160 +1.44% +0.55% (4096, 262144) sum 0.969981 0.941078 0.957016 0.936260 +1.35% +0.51% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (2048, 262144) mean 0.501577 0.487208 0.498422 0.483493 +0.63% +0.77% (2048, 262144) sum 0.502029 0.487124 0.497854 0.483643 +0.84% +0.72% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 262144) mean 0.266416 0.257383 0.265928 0.255140 +0.18% +0.88% (1024, 262144) sum 0.266434 0.257081 0.265817 0.255143 +0.23% +0.76% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (512, 131072) mean 0.087858 0.091296 0.085816 0.087745 +2.38% +4.05% (512, 131072) sum 0.088144 0.091314 0.085664 0.087864 +2.90% +3.93% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1000, 1000) mean 0.014977 0.012393 0.014141 0.010614 +5.91% +16.76% (1000, 1000) sum 0.014589 0.012804 0.014118 0.010320 +3.34% +24.07% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 129) mean 0.014208 0.008383 0.013273 0.008440 +7.04% -0.68% (1024, 129) sum 0.013804 0.008863 0.013265 0.009003 +4.06% -1.56% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 257) mean 0.014378 0.009109 0.013037 0.009038 +10.29% +0.79% (1024, 257) sum 0.014387 0.009113 0.013396 0.008698 +7.40% +4.77% 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 587) mean 0.014207 0.011037 0.013182 0.010391 +7.78% +6.22% (1024, 587) sum 0.014588 0.011453 0.013539 0.010049 +7.75% +13.97% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (2048, 977) mean 0.016024 0.013614 0.015448 0.011845 +3.73% +14.93% (2048, 977) sum 0.015990 0.014033 0.015406 0.012278 +3.79% +14.29% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 128) mean 0.014037 0.007804 0.013143 0.008242 +6.80% -5.31% (1024, 128) sum 0.014041 0.007847 0.012759 0.007850 +10.05% -0.04% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 128) mean 0.014361 0.009644 0.014075 0.009061 +2.03% +6.43% (8192, 128) sum 0.014366 0.010032 0.013702 0.009181 +4.85% +9.27% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 130) mean 0.014226 0.008696 0.012894 0.008835 +10.33% -1.57% (1024, 130) sum 0.013830 0.008740 0.013288 0.008989 +4.08% -2.77% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 130) mean 0.015036 0.010019 0.013917 0.009538 +8.04% +5.04% (8192, 130) sum 0.014652 0.010403 0.013900 0.009565 +5.41% +8.76% ==================================================================================================================================================================================== ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165055 Approved by: https://github.com/ngimel ghstack dependencies: #165494, #164790 --- aten/src/ATen/native/cuda/Reduce.cuh | 4 --- .../ATen/native/cuda/ReduceMomentKernel.cu | 7 ++++- .../ATen/native/cuda/ReduceSumProdKernel.cu | 27 ++++++++----------- 3 files changed, 17 insertions(+), 21 deletions(-) diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh index 953aacf181b4..ad3f63797240 100644 --- a/aten/src/ATen/native/cuda/Reduce.cuh +++ b/aten/src/ATen/native/cuda/Reduce.cuh @@ -1097,11 +1097,7 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){ // threads with different threadIdx.x are independent and will produce results for different outputs. // In such case, values in each loaded vector always correspond to different outputs. if (fastest_moving_stride == sizeof(scalar_t)) { -#ifdef USE_ROCM if (reduction_on_fastest_striding_dimension && dim0 >= 128 && iter.num_reduce_dims() == 1) { -#else - if (reduction_on_fastest_striding_dimension && dim0 > 128 && iter.num_reduce_dims() == 1 && vt0 >= input_vec_size) { -#endif // Case 1: "vectorize along input" // Note that if vt0 < ReduceConfig::vec_size, then this means the register pressure could be high, in such case, // we should avoid vectorization. 
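The two hunks below switch the sum/mean kernels to a compile-time dispatch on element size. A minimal standalone sketch of that pattern follows; `launch_reduce` and the concrete `vt0`/`input_vec_size` values are hypothetical placeholders for illustration, not the actual `gpu_reduce_kernel` signature.

```
// Illustrative C++17 sketch of compile-time dispatch on element size.
// launch_reduce and the vt0/input_vec_size values are hypothetical.
#include <cstddef>
#include <cstdint>
#include <cstdio>

template <int vt0, int input_vec_size, typename scalar_t>
void launch_reduce(const scalar_t* /*data*/, std::size_t /*n*/) {
  std::printf("vt0=%d input_vec_size=%d elem_size=%zu\n",
              vt0, input_vec_size, sizeof(scalar_t));
}

template <typename scalar_t>
void reduce_dispatch(const scalar_t* data, std::size_t n) {
  constexpr bool is_16_bits = sizeof(scalar_t) == 2;  // e.g. Half / BFloat16
  if constexpr (is_16_bits) {
    // 16-bit types: request wider (8-element) vectorized loads on the input.
    launch_reduce</*vt0=*/4, /*input_vec_size=*/8>(data, n);
  } else {
    // Other types: keep the default vectorization width.
    launch_reduce</*vt0=*/4, /*input_vec_size=*/4>(data, n);
  }
}

int main() {
  std::uint16_t half_like[8] = {};
  float f32[8] = {};
  reduce_dispatch(half_like, 8);
  reduce_dispatch(f32, 8);
  return 0;
}
```

Because the branch is `if constexpr` inside a template, only the selected instantiation is generated for each scalar type, so the non-16-bit path pays no cost for the wider-load variant.
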
diff --git a/aten/src/ATen/native/cuda/ReduceMomentKernel.cu b/aten/src/ATen/native/cuda/ReduceMomentKernel.cu index d7d7fabecc95..cabe86b313e9 100644 --- a/aten/src/ATen/native/cuda/ReduceMomentKernel.cu +++ b/aten/src/ATen/native/cuda/ReduceMomentKernel.cu @@ -39,9 +39,14 @@ static void std_var_kernel_cuda(TensorIterator& iter, double correction, bool ta template void mean_kernel_impl(TensorIterator& iter) { // returns acc_t for all non-complex dtypes and returns T for c10::complex + constexpr bool is_16_bits = sizeof(scalar_t) == 2; using factor_t = typename c10::scalar_value_type::type; factor_t factor = static_cast(iter.num_output_elements()) / iter.numel(); - gpu_reduce_kernel(iter, MeanOps {factor}); + if constexpr (is_16_bits) { + gpu_reduce_kernel(iter, MeanOps {factor}); + } else { + gpu_reduce_kernel(iter, MeanOps {factor}); + } } static void mean_kernel_cuda(TensorIterator& iter) { diff --git a/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu b/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu index eedbb6fa8129..36f0835890de 100644 --- a/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu +++ b/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu @@ -13,24 +13,19 @@ namespace at::native { template struct sum_functor { void operator()(TensorIterator& iter) { -#ifdef USE_ROCM - // Half and BFloat16 can be packed in groups of up to 8 elements and - // can use *_DWORDX4 instructions to achieve that. - const bool is_16_bits = - ( (std::is_same::value) || - (std::is_same::value) ); - if (is_16_bits) { + const auto sum_combine = [] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { + return a + b; + }; + constexpr bool is_16_bits = sizeof(scalar_t) == 2; + if constexpr (is_16_bits) { gpu_reduce_kernel( - iter, func_wrapper([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { - return a + b; - })); - return; + iter, func_wrapper(sum_combine) + ); + } else { + gpu_reduce_kernel( + iter, func_wrapper(sum_combine) + ); } -#endif - gpu_reduce_kernel( - iter, func_wrapper([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { - return a + b; - })); } }; From 6ece527fc5b9fa35a210f410e73a0a65d8f98e5d Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 16 Oct 2025 15:45:35 -0700 Subject: [PATCH 040/123] [CI] Add aarch64 operator benchmark (#165585) Running on Graviton4 Skip ConvTranspose1d benchmarks if PyTorch is compiled with ACL, due to https://github.com/pytorch/pytorch/issues/165654 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165585 Approved by: https://github.com/huydhn --- .github/workflows/operator_benchmark.yml | 24 + ...i_operator_benchmark_eager_float32_cpu.csv | 1319 +++++++++++++++++ benchmarks/operator_benchmark/pt/conv_test.py | 16 +- 3 files changed, 1353 insertions(+), 6 deletions(-) create mode 100644 benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv diff --git a/.github/workflows/operator_benchmark.yml b/.github/workflows/operator_benchmark.yml index 09f14b545cdb..40fb3b8d0c85 100644 --- a/.github/workflows/operator_benchmark.yml +++ b/.github/workflows/operator_benchmark.yml @@ -52,3 +52,27 @@ jobs: docker-image: ${{ needs.x86-opbenchmark-build.outputs.docker-image }} test-matrix: ${{ needs.x86-opbenchmark-build.outputs.test-matrix }} secrets: inherit + + aarch64-opbenchmark-build: + if: github.repository_owner == 'pytorch' + name: aarch64-opbenchmark-build + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-aarch64-py3.10 + runner: linux.arm64.m7g.4xlarge + docker-image-name: 
ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11 + test-matrix: | + { include: [ + { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.arm64.m8g.4xlarge" }, + ]} + secrets: inherit + + aarch64-opbenchmark-test: + name: aarch64-opbenchmark-test + uses: ./.github/workflows/_linux-test.yml + needs: aarch64-opbenchmark-build + with: + build-environment: linux-jammy-aarch64-py3.10 + docker-image: ${{ needs.aarch64-opbenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.aarch64-opbenchmark-build.outputs.test-matrix }} + secrets: inherit diff --git a/benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv b/benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv new file mode 100644 index 000000000000..dfc72e4665dd --- /dev/null +++ b/benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv @@ -0,0 +1,1319 @@ +Benchmarking Framework,Benchmarking Module Name,Case Name,tag,run_backward,Execution Time,Peak Memory (KB) +PyTorch,add,add_M1_N1_K1_cpu,short,False,4.244240,0.000000 +PyTorch,add,add_M64_N64_K64_cpu,short,False,56.719577,0.000000 +PyTorch,add,add_M64_N64_K128_cpu,short,False,56.826275,0.000000 +PyTorch,add,add_M1_N1_K1_cpu_bwdall_BACKWARD,short,True,47.834313,0.000000 +PyTorch,add,add_M1_N1_K1_cpu_bwd1_BACKWARD,short,True,47.872547,0.000000 +PyTorch,add,add_M1_N1_K1_cpu_bwd2_BACKWARD,short,True,47.790496,0.000000 +PyTorch,add,add_M64_N64_K64_cpu_bwdall_BACKWARD,short,True,216.173346,0.000000 +PyTorch,add,add_M64_N64_K64_cpu_bwd1_BACKWARD,short,True,217.600432,0.000000 +PyTorch,add,add_M64_N64_K64_cpu_bwd2_BACKWARD,short,True,216.916940,0.000000 +PyTorch,add,add_M64_N64_K128_cpu_bwdall_BACKWARD,short,True,250.406573,0.000000 +PyTorch,add,add_M64_N64_K128_cpu_bwd1_BACKWARD,short,True,250.049463,0.000000 +PyTorch,add,add_M64_N64_K128_cpu_bwd2_BACKWARD,short,True,250.817280,0.000000 +PyTorch,arange,arange_start0_end1000_step2.5_cpu_dtypetorch.float32,short,False,7.851754,0.000000 +PyTorch,arange,arange_start-1024_end2048_step1_cpu_dtypetorch.float32,short,False,8.597164,0.000000 +PyTorch,as_strided,"as_strided_M8_N8_size(2,2)_stride(1,1)_storage_offset0_cpu",short,False,3.503591,0.000000 +PyTorch,as_strided,"as_strided_M256_N256_size(32,32)_stride(1,1)_storage_offset0_cpu",short,False,3.584804,0.000000 +PyTorch,as_strided,"as_strided_M512_N512_size(64,64)_stride(2,2)_storage_offset1_cpu",short,False,3.723034,0.000000 +PyTorch,batchnorm,batchnorm_M1_N256_K3136_cpu_trainingTrue_cudnnFalse,short,False,343.685714,0.000000 +PyTorch,batchnorm,batchnorm_M1_N256_K3136_cpu_trainingFalse_cudnnFalse,short,False,96.169117,0.000000 +PyTorch,batchnorm,batchnorm_M1_N256_K3136_cpu_trainingTrue_cudnnFalse_bwdall_BACKWARD,short,True,335.407438,0.000000 +PyTorch,batchnorm,batchnorm_M1_N256_K3136_cpu_trainingTrue_cudnnFalse_bwd1_BACKWARD,short,True,337.885862,0.000000 +PyTorch,batchnorm,batchnorm_M1_N256_K3136_cpu_trainingFalse_cudnnFalse_bwdall_BACKWARD,short,True,326.908147,0.000000 +PyTorch,batchnorm,batchnorm_M1_N256_K3136_cpu_trainingFalse_cudnnFalse_bwd1_BACKWARD,short,True,329.085216,0.000000 +PyTorch,batchnorm,batchnorm_N3136_C256_cpu_trainingTrue_cudnnFalse,short,False,363.524665,0.000000 +PyTorch,batchnorm,batchnorm_N3136_C256_cpu_trainingFalse_cudnnFalse,short,False,129.891489,0.000000 +PyTorch,batchnorm,batchnorm_N3136_C256_cpu_trainingTrue_cudnnFalse_bwdall_BACKWARD,short,True,484.415291,0.000000 
+PyTorch,batchnorm,batchnorm_N3136_C256_cpu_trainingTrue_cudnnFalse_bwd1_BACKWARD,short,True,486.083544,0.000000 +PyTorch,batchnorm,batchnorm_N3136_C256_cpu_trainingFalse_cudnnFalse_bwdall_BACKWARD,short,True,439.912925,0.000000 +PyTorch,batchnorm,batchnorm_N3136_C256_cpu_trainingFalse_cudnnFalse_bwd1_BACKWARD,short,True,439.728483,0.000000 +PyTorch,add_,add__M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,3.355920,0.000000 +PyTorch,add_,add__M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,54.177022,0.000000 +PyTorch,add_,add__M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,55.350490,0.000000 +PyTorch,sub_,sub__M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,1.466720,0.000000 +PyTorch,sub_,sub__M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,53.482515,0.000000 +PyTorch,sub_,sub__M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,55.382850,0.000000 +PyTorch,mul_,mul__M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,3.065535,0.000000 +PyTorch,mul_,mul__M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,51.635021,0.000000 +PyTorch,mul_,mul__M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,54.669222,0.000000 +PyTorch,copy_,copy__M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,1.412698,0.000000 +PyTorch,copy_,copy__M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,50.044207,0.000000 +PyTorch,copy_,copy__M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,49.480417,0.000000 +PyTorch,div_,div__M1_N1_K1_cpu_dtype_onetorch.float32_dtype_twotorch.float32,short,False,3.127072,0.000000 +PyTorch,div_,div__M64_N64_K64_cpu_dtype_onetorch.float32_dtype_twotorch.float32,short,False,59.241161,0.000000 +PyTorch,div_,div__M64_N64_K128_cpu_dtype_onetorch.float32_dtype_twotorch.float32,short,False,59.852816,0.000000 +PyTorch,add,"add_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.float32",short,False,57.006677,0.000000 +PyTorch,sub,"sub_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.float32",short,False,55.606088,0.000000 +PyTorch,div,"div_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.float32",short,False,58.529255,0.000000 +PyTorch,mul,"mul_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.float32",short,False,54.645077,0.000000 +PyTorch,add,add_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,4.397014,0.000000 +PyTorch,add,add_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,59.243500,0.000000 +PyTorch,add,add_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,57.947691,0.000000 +PyTorch,sub,sub_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,1.925851,0.000000 +PyTorch,sub,sub_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,57.308320,0.000000 +PyTorch,sub,sub_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,57.787743,0.000000 +PyTorch,div,div_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,7.978539,0.000000 +PyTorch,div,div_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,159.754860,0.000000 +PyTorch,div,div_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,165.360235,0.000000 +PyTorch,mul,mul_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,3.928136,0.000000 +PyTorch,mul,mul_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,56.413499,0.000000 
+PyTorch,mul,mul_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,55.925090,0.000000 +PyTorch,logical_and,"logical_and_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.bool",short,False,78.404254,0.000000 +PyTorch,logical_and,logical_and_M1_N1_K1_cpu_dtype_onetorch.bool_dtype_twotorch.bool,short,False,5.354032,0.000000 +PyTorch,logical_and,logical_and_M64_N64_K64_cpu_dtype_onetorch.bool_dtype_twotorch.bool,short,False,54.072783,0.000000 +PyTorch,logical_and,logical_and_M64_N64_K128_cpu_dtype_onetorch.bool_dtype_twotorch.bool,short,False,53.680283,0.000000 +PyTorch,bmm,bmm_B2_M1_N8_K2_cpu_dtypetorch.float32,short,False,4.407892,0.000000 +PyTorch,bmm,bmm_B2_M1_N8_K2_cpu_dtypetorch.bfloat16,short,False,4.213927,0.000000 +PyTorch,bmm,bmm_B128_M64_N32_K64_cpu_dtypetorch.float32,short,False,200.303424,0.000000 +PyTorch,bmm,bmm_B128_M64_N32_K64_cpu_dtypetorch.bfloat16,short,False,229.912606,0.000000 +PyTorch,baddbmm,baddbmm_B2_M1_N8_K2_cpu_dtypetorch.float32,short,False,6.631313,0.000000 +PyTorch,baddbmm,baddbmm_B2_M1_N8_K2_cpu_dtypetorch.bfloat16,short,False,6.476986,0.000000 +PyTorch,baddbmm,baddbmm_B128_M64_N32_K64_cpu_dtypetorch.float32,short,False,266.065131,0.000000 +PyTorch,baddbmm,baddbmm_B128_M64_N32_K64_cpu_dtypetorch.bfloat16,short,False,295.503063,0.000000 +PyTorch,cat,"cat_sizes(1,1,1)_N2_dim0_cpu",short,False,4.301950,0.000000 +PyTorch,cat,"cat_sizes(512,512,2)_N2_dim1_cpu",short,False,99.093415,0.000000 +PyTorch,cat,"cat_sizes(128,1024,2)_N2_dim1_cpu",short,False,96.771578,0.000000 +PyTorch,channel_shuffle,channel_shuffle_batch_size2_channels_per_group16_height16_width16_groups2_channel_lastTrue,short,False,52.475549,0.000000 +PyTorch,channel_shuffle,channel_shuffle_batch_size2_channels_per_group16_height16_width16_groups2_channel_lastFalse,short,False,46.483135,0.000000 +PyTorch,channel_shuffle,channel_shuffle_batch_size2_channels_per_group32_height32_width32_groups2_channel_lastTrue,short,False,57.179441,0.000000 +PyTorch,channel_shuffle,channel_shuffle_batch_size2_channels_per_group32_height32_width32_groups2_channel_lastFalse,short,False,51.114112,0.000000 +PyTorch,channel_shuffle,channel_shuffle_batch_size4_channels_per_group32_height32_width32_groups4_channel_lastTrue,short,False,77.045573,0.000000 +PyTorch,channel_shuffle,channel_shuffle_batch_size4_channels_per_group32_height32_width32_groups4_channel_lastFalse,short,False,57.527440,0.000000 +PyTorch,channel_shuffle,channel_shuffle_batch_size4_channels_per_group64_height64_width64_groups4_channel_lastTrue,short,False,299.237060,0.000000 +PyTorch,channel_shuffle,channel_shuffle_batch_size4_channels_per_group64_height64_width64_groups4_channel_lastFalse,short,False,165.268507,0.000000 +PyTorch,channel_shuffle,channel_shuffle_batch_size8_channels_per_group64_height64_width64_groups8_channel_lastTrue,short,False,1034.480289,0.000000 +PyTorch,channel_shuffle,channel_shuffle_batch_size8_channels_per_group64_height64_width64_groups8_channel_lastFalse,short,False,627.552450,0.000000 +PyTorch,channel_shuffle,channel_shuffle_batch_size16_channels_per_group64_height64_width64_groups16_channel_lastTrue,short,False,4709.313910,0.000000 +PyTorch,channel_shuffle,channel_shuffle_batch_size16_channels_per_group64_height64_width64_groups16_channel_lastFalse,short,False,2470.991690,0.000000 +PyTorch,chunk,chunk_M8_N8_chunks2_cpu,short,False,6.881959,0.000000 +PyTorch,chunk,chunk_M256_N512_chunks2_cpu,short,False,7.016489,0.000000 +PyTorch,chunk,chunk_M512_N512_chunks2_cpu,short,False,6.829479,0.000000 
+PyTorch,Conv1d,Conv1d_IC128_OC256_kernel3_stride1_N1_L64_cpu,short,False,161.526501,0.000000 +PyTorch,Conv1d,Conv1d_IC256_OC256_kernel3_stride2_N4_L64_cpu,short,False,389.396360,0.000000 +PyTorch,Conv2d,Conv2d_IC256_OC256_kernel3_stride1_N1_H16_W16_G1_pad0_cpu,short,False,837.232033,0.000000 +PyTorch,ConvTranspose2d,ConvTranspose2d_IC256_OC256_kernel3_stride1_N1_H16_W16_G1_pad0_cpu,short,False,1259.104354,0.000000 +PyTorch,Conv2dPointwise,Conv2dPointwise_IC256_OC256_stride1_N1_H16_W16_G1_pad0_cpu,short,False,423.592581,0.000000 +PyTorch,Conv3d,Conv3d_IC64_OC64_kernel3_stride1_N8_D4_H16_W16_cpu,short,False,4713.401237,0.000000 +PyTorch,ConvTranspose3d,ConvTranspose3d_IC64_OC64_kernel3_stride1_N8_D4_H16_W16_cpu,short,False,9798.085490,0.000000 +PyTorch,diag,diag_dim1_M64_N64_diagonal0_outTrue_cpu,short,False,9.983573,0.000000 +PyTorch,diag,diag_dim2_M128_N128_diagonal-10_outFalse_cpu,short,False,7.817579,0.000000 +PyTorch,diag,diag_dim1_M256_N256_diagonal20_outTrue_cpu,short,False,102.008750,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,25.932070,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,79.094040,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,25.618948,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,71.670897,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,25.800482,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,63.936052,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,25.779446,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,70.597326,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,26.118981,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,62.572553,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,26.209740,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,62.822163,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,25.702759,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,66.037250,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,25.827319,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,71.249488,0.000000 
+PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,25.775656,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,62.907740,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,25.834111,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,75.054840,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,26.253773,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,61.943780,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,26.276609,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,61.851260,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,25.689124,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,69.262678,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,25.672505,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,73.133838,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,25.631939,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,66.750426,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,25.913212,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,64.675854,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,26.447855,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,61.601586,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,26.252401,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,61.955597,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,25.703098,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,68.315884,0.000000 
+PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,25.807940,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,75.701812,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,25.857585,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,62.865699,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,25.785043,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,63.303901,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,26.329548,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,61.085350,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,26.401250,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,61.327850,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetTrue_cpu_BACKWARD,short,True,76.646453,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetFalse_cpu_BACKWARD,short,True,76.408263,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,66.143049,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,66.626689,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetTrue_cpu_BACKWARD,short,True,78.586541,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetFalse_cpu_BACKWARD,short,True,78.437226,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,67.294776,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,67.519295,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetTrue_cpu_BACKWARD,short,True,83.240654,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetFalse_cpu_BACKWARD,short,True,82.798171,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,70.350631,0.000000 
+PyTorch,embeddingbag,embeddingbag_embeddingbags10_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,71.047552,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetTrue_cpu_BACKWARD,short,True,76.947381,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetFalse_cpu_BACKWARD,short,True,76.043851,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,68.641934,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,68.768893,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetTrue_cpu_BACKWARD,short,True,78.648941,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetFalse_cpu_BACKWARD,short,True,77.599791,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,69.483032,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,69.184328,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetTrue_cpu_BACKWARD,short,True,83.075783,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetFalse_cpu_BACKWARD,short,True,83.171316,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,72.100870,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags120_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,72.667771,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetTrue_cpu_BACKWARD,short,True,77.178308,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetFalse_cpu_BACKWARD,short,True,76.987765,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,173.891298,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,174.383305,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetTrue_cpu_BACKWARD,short,True,78.001683,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetFalse_cpu_BACKWARD,short,True,78.145431,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,174.426247,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,173.456537,0.000000 
+PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetTrue_cpu_BACKWARD,short,True,83.578019,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetFalse_cpu_BACKWARD,short,True,83.350259,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,179.564871,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags1000_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,181.208623,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetTrue_cpu_BACKWARD,short,True,76.724585,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetFalse_cpu_BACKWARD,short,True,77.335260,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,172.416292,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,170.913750,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetTrue_cpu_BACKWARD,short,True,77.864377,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetFalse_cpu_BACKWARD,short,True,77.955812,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,173.070785,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,173.094255,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetTrue_cpu_BACKWARD,short,True,82.591598,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetFalse_cpu_BACKWARD,short,True,82.869897,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,181.269854,0.000000 +PyTorch,embeddingbag,embeddingbag_embeddingbags2300_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,181.079995,0.000000 +PyTorch,embedding,embedding_num_embeddings10_embedding_dim64_input_size8_cpu,short,False,13.257645,0.000000 +PyTorch,embedding,embedding_num_embeddings10_embedding_dim64_input_size16_cpu,short,False,13.274894,0.000000 +PyTorch,embedding,embedding_num_embeddings10_embedding_dim64_input_size64_cpu,short,False,13.594135,0.000000 +PyTorch,embedding,embedding_num_embeddings120_embedding_dim64_input_size8_cpu,short,False,13.210569,0.000000 +PyTorch,embedding,embedding_num_embeddings120_embedding_dim64_input_size16_cpu,short,False,13.358302,0.000000 +PyTorch,embedding,embedding_num_embeddings120_embedding_dim64_input_size64_cpu,short,False,13.676537,0.000000 +PyTorch,embedding,embedding_num_embeddings1000_embedding_dim64_input_size8_cpu,short,False,13.230114,0.000000 
+PyTorch,embedding,embedding_num_embeddings1000_embedding_dim64_input_size16_cpu,short,False,13.316872,0.000000 +PyTorch,embedding,embedding_num_embeddings1000_embedding_dim64_input_size64_cpu,short,False,13.728165,0.000000 +PyTorch,embedding,embedding_num_embeddings2300_embedding_dim64_input_size8_cpu,short,False,13.240829,0.000000 +PyTorch,embedding,embedding_num_embeddings2300_embedding_dim64_input_size16_cpu,short,False,13.322630,0.000000 +PyTorch,embedding,embedding_num_embeddings2300_embedding_dim64_input_size64_cpu,short,False,13.678991,0.000000 +PyTorch,embedding,embedding_num_embeddings10_embedding_dim64_input_size8_cpu_BACKWARD,short,True,52.434260,0.000000 +PyTorch,embedding,embedding_num_embeddings10_embedding_dim64_input_size16_cpu_BACKWARD,short,True,54.270657,0.000000 +PyTorch,embedding,embedding_num_embeddings10_embedding_dim64_input_size64_cpu_BACKWARD,short,True,60.054990,0.000000 +PyTorch,embedding,embedding_num_embeddings120_embedding_dim64_input_size8_cpu_BACKWARD,short,True,55.491721,0.000000 +PyTorch,embedding,embedding_num_embeddings120_embedding_dim64_input_size16_cpu_BACKWARD,short,True,56.325304,0.000000 +PyTorch,embedding,embedding_num_embeddings120_embedding_dim64_input_size64_cpu_BACKWARD,short,True,61.959455,0.000000 +PyTorch,embedding,embedding_num_embeddings1000_embedding_dim64_input_size8_cpu_BACKWARD,short,True,158.577292,0.000000 +PyTorch,embedding,embedding_num_embeddings1000_embedding_dim64_input_size16_cpu_BACKWARD,short,True,157.616690,0.000000 +PyTorch,embedding,embedding_num_embeddings1000_embedding_dim64_input_size64_cpu_BACKWARD,short,True,164.962560,0.000000 +PyTorch,embedding,embedding_num_embeddings2300_embedding_dim64_input_size8_cpu_BACKWARD,short,True,191.301190,0.000000 +PyTorch,embedding,embedding_num_embeddings2300_embedding_dim64_input_size16_cpu_BACKWARD,short,True,196.503447,0.000000 +PyTorch,embedding,embedding_num_embeddings2300_embedding_dim64_input_size64_cpu_BACKWARD,short,True,201.295830,0.000000 +PyTorch,fill_,fill__N1_cpu_dtypetorch.int32,short,False,1.126186,0.000000 +PyTorch,fill_,fill__N1024_cpu_dtypetorch.int32,short,False,2.565226,0.000000 +PyTorch,fill_,fill__N2048_cpu_dtypetorch.int32,short,False,2.978169,0.000000 +PyTorch,gather,gather_M256_N512_dim0_cpu,short,False,113.958748,0.000000 +PyTorch,gather,gather_M512_N512_dim1_cpu,short,False,72.347757,0.000000 +PyTorch,GroupNormBenchmark,"GroupNormBenchmark_dims(32,8,16)_num_groups2",short,False,60.884617,0.000000 +PyTorch,GroupNormBenchmark,"GroupNormBenchmark_dims(32,8,16)_num_groups4",short,False,53.373645,0.000000 +PyTorch,GroupNormBenchmark,"GroupNormBenchmark_dims(32,8,56,56)_num_groups2",short,False,113.483659,0.000000 +PyTorch,GroupNormBenchmark,"GroupNormBenchmark_dims(32,8,56,56)_num_groups4",short,False,114.206127,0.000000 +PyTorch,Hardsigmoid,Hardsigmoid_N1_C3_H256_W256_cpu,short,False,66.121431,0.000000 +PyTorch,Hardsigmoid,Hardsigmoid_N4_C3_H256_W256_cpu,short,False,74.423833,0.000000 +PyTorch,Hardswish,Hardswish_N1_C3_H256_W256_cpu,short,False,67.379220,0.000000 +PyTorch,Hardswish,Hardswish_N4_C3_H256_W256_cpu,short,False,82.693655,0.000000 +PyTorch,index_add_,index_add__M8_N32_K1_dim0_cpu_dtypetorch.float32,short,False,7.053411,0.000000 +PyTorch,index_add_,index_add__M256_N512_K1_dim1_cpu_dtypetorch.float32,short,False,13.263054,0.000000 +PyTorch,index_add_,index_add__M512_N512_K1_dim2_cpu_dtypetorch.float32,short,False,108.319590,0.000000 +PyTorch,index_select,index_select_M8_N8_K1_dim1_cpu,short,False,4.514675,0.000000 
+PyTorch,index_select,index_select_M256_N512_K1_dim1_cpu,short,False,54.654160,0.000000 +PyTorch,index_select,index_select_M512_N512_K1_dim1_cpu,short,False,103.358516,0.000000 +PyTorch,index_select,index_select_M8_N8_K2_dim1_cpu,short,False,4.561579,0.000000 +PyTorch,index_select,index_select_M256_N512_K2_dim1_cpu,short,False,212.789483,0.000000 +PyTorch,index_select,index_select_M512_N512_K2_dim1_cpu,short,False,430.552168,0.000000 +PyTorch,InstanceNormBenchmark,"InstanceNormBenchmark_dims(32,8,16)",short,False,169.785802,0.000000 +PyTorch,InstanceNormBenchmark,"InstanceNormBenchmark_dims(32,8,56,56)",short,False,359.232437,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,60,40)_output_size(24,24)_channels_lastTrue_modenearest",short,False,10.529644,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,60,40)_output_size(24,24)_channels_lastTrue_modelinear",short,False,12.189028,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,60,40)_output_size(24,24)_channels_lastTrue_modebicubic",short,False,46.246996,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,60,40)_output_size(24,24)_channels_lastFalse_modenearest",short,False,22.743285,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,60,40)_output_size(24,24)_channels_lastFalse_modelinear",short,False,24.601899,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,60,40)_output_size(24,24)_channels_lastFalse_modebicubic",short,False,34.769822,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,600,400)_output_size(240,240)_channels_lastTrue_modenearest",short,False,128.987081,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,600,400)_output_size(240,240)_channels_lastTrue_modelinear",short,False,193.039880,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,600,400)_output_size(240,240)_channels_lastTrue_modebicubic",short,False,487.996140,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,600,400)_output_size(240,240)_channels_lastFalse_modenearest",short,False,80.409450,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,600,400)_output_size(240,240)_channels_lastFalse_modelinear",short,False,112.757609,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,600,400)_output_size(240,240)_channels_lastFalse_modebicubic",short,False,291.153090,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,320,320)_output_size(256,256)_channels_lastTrue_modenearest",short,False,136.694490,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,320,320)_output_size(256,256)_channels_lastTrue_modelinear",short,False,207.920459,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,320,320)_output_size(256,256)_channels_lastTrue_modebicubic",short,False,547.632725,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,320,320)_output_size(256,256)_channels_lastFalse_modenearest",short,False,81.090366,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,320,320)_output_size(256,256)_channels_lastFalse_modelinear",short,False,117.256844,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,320,320)_output_size(256,256)_channels_lastFalse_modebicubic",short,False,319.923544,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,60,40)_output_size(24,24)_channels_lastTrue_modenearest",short,False,10.135673,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,60,40)_output_size(24,24)_channels_lastTrue_modelinear",short,False,11.241479,0.000000 
+PyTorch,interpolate,"interpolate_input_size(1,1,60,40)_output_size(24,24)_channels_lastTrue_modebicubic",short,False,25.862923,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,60,40)_output_size(24,24)_channels_lastFalse_modenearest",short,False,9.880939,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,60,40)_output_size(24,24)_channels_lastFalse_modelinear",short,False,11.446106,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,60,40)_output_size(24,24)_channels_lastFalse_modebicubic",short,False,25.877143,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,600,400)_output_size(240,240)_channels_lastTrue_modenearest",short,False,80.987965,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,600,400)_output_size(240,240)_channels_lastTrue_modelinear",short,False,112.928955,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,600,400)_output_size(240,240)_channels_lastTrue_modebicubic",short,False,293.535760,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,600,400)_output_size(240,240)_channels_lastFalse_modenearest",short,False,80.649728,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,600,400)_output_size(240,240)_channels_lastFalse_modelinear",short,False,112.735063,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,600,400)_output_size(240,240)_channels_lastFalse_modebicubic",short,False,292.594442,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,320,320)_output_size(256,256)_channels_lastTrue_modenearest",short,False,81.071167,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,320,320)_output_size(256,256)_channels_lastTrue_modelinear",short,False,119.073692,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,320,320)_output_size(256,256)_channels_lastTrue_modebicubic",short,False,325.062960,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,320,320)_output_size(256,256)_channels_lastFalse_modenearest",short,False,80.776966,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,320,320)_output_size(256,256)_channels_lastFalse_modelinear",short,False,118.075726,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,320,320)_output_size(256,256)_channels_lastFalse_modebicubic",short,False,325.422923,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,60,40)_output_size(24,24)_channels_lastTrue_modenearest_dtypetorch.uint8",short,False,10.408200,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,60,40)_output_size(24,24)_channels_lastFalse_modenearest_dtypetorch.uint8",short,False,23.989929,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,600,400)_output_size(240,240)_channels_lastTrue_modenearest_dtypetorch.uint8",short,False,142.707918,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,600,400)_output_size(240,240)_channels_lastFalse_modenearest_dtypetorch.uint8",short,False,100.752786,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,320,320)_output_size(256,256)_channels_lastTrue_modenearest_dtypetorch.uint8",short,False,153.185516,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,3,320,320)_output_size(256,256)_channels_lastFalse_modenearest_dtypetorch.uint8",short,False,104.761840,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,60,40)_output_size(24,24)_channels_lastTrue_modenearest_dtypetorch.uint8",short,False,9.870818,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,60,40)_output_size(24,24)_channels_lastFalse_modenearest_dtypetorch.uint8",short,False,9.931431,0.000000 
+PyTorch,interpolate,"interpolate_input_size(1,1,600,400)_output_size(240,240)_channels_lastTrue_modenearest_dtypetorch.uint8",short,False,99.600515,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,600,400)_output_size(240,240)_channels_lastFalse_modenearest_dtypetorch.uint8",short,False,99.164257,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,320,320)_output_size(256,256)_channels_lastTrue_modenearest_dtypetorch.uint8",short,False,103.419602,0.000000 +PyTorch,interpolate,"interpolate_input_size(1,1,320,320)_output_size(256,256)_channels_lastFalse_modenearest_dtypetorch.uint8",short,False,103.148608,0.000000 +PyTorch,LayerNormBenchmark,"LayerNormBenchmark_dims(1,8,16)",short,False,9.418410,0.000000 +PyTorch,LayerNormBenchmark,"LayerNormBenchmark_dims(8,8,16)",short,False,57.969351,0.000000 +PyTorch,LayerNormBenchmark,"LayerNormBenchmark_dims(32,8,16)",short,False,59.316279,0.000000 +PyTorch,LayerNormBenchmark,"LayerNormBenchmark_dims(64,128,56,56)",short,False,2573.762285,0.000000 +PyTorch,linear,linear_N1_IN1_OUT1_cpu,short,False,17.240207,0.000000 +PyTorch,linear,linear_N4_IN256_OUT128_cpu,short,False,70.636017,0.000000 +PyTorch,linear,linear_N16_IN512_OUT256_cpu,short,False,155.853732,0.000000 +PyTorch,matmul,matmul_M1_N1_K1_trans_aTrue_trans_bFalse_cpu,short,False,5.217676,0.000000 +PyTorch,matmul,matmul_M128_N128_K128_trans_aTrue_trans_bFalse_cpu,short,False,130.986713,0.000000 +PyTorch,matmul,matmul_M256_N256_K256_trans_aFalse_trans_bTrue_cpu,short,False,4967.684160,0.000000 +PyTorch,mm,mm_M1_N1_K1_cpu_dtypetorch.float32,short,False,4.969217,0.000000 +PyTorch,mm,mm_M64_N64_K64_cpu_dtypetorch.float32,short,False,56.936066,0.000000 +PyTorch,mm,mm_M64_N64_K128_cpu_dtypetorch.float32,short,False,59.284410,0.000000 +PyTorch,nan_to_num,nan_to_num_M16_N64_dtypetorch.float32_replace_infTrue,short,False,6.358168,0.000000 +PyTorch,nan_to_num,nan_to_num_M16_N64_dtypetorch.float32_replace_infFalse,short,False,6.798741,0.000000 +PyTorch,nan_to_num,nan_to_num_M16_N64_dtypetorch.float64_replace_infTrue,short,False,8.008753,0.000000 +PyTorch,nan_to_num,nan_to_num_M16_N64_dtypetorch.float64_replace_infFalse,short,False,8.567021,0.000000 +PyTorch,nan_to_num,nan_to_num_M16_N64_dtypetorch.float32_replace_infTrue,short,False,6.319673,0.000000 +PyTorch,nan_to_num,nan_to_num_M16_N64_dtypetorch.float32_replace_infFalse,short,False,6.744320,0.000000 +PyTorch,nan_to_num,nan_to_num_M16_N64_dtypetorch.float64_replace_infTrue,short,False,8.063743,0.000000 +PyTorch,nan_to_num,nan_to_num_M16_N64_dtypetorch.float64_replace_infFalse,short,False,8.583122,0.000000 +PyTorch,nan_to_num,nan_to_num_M64_N64_dtypetorch.float32_replace_infTrue,short,False,7.557407,0.000000 +PyTorch,nan_to_num,nan_to_num_M64_N64_dtypetorch.float32_replace_infFalse,short,False,8.056106,0.000000 +PyTorch,nan_to_num,nan_to_num_M64_N64_dtypetorch.float64_replace_infTrue,short,False,13.849453,0.000000 +PyTorch,nan_to_num,nan_to_num_M64_N64_dtypetorch.float64_replace_infFalse,short,False,14.596365,0.000000 +PyTorch,nan_to_num,nan_to_num_M64_N64_dtypetorch.float32_replace_infTrue,short,False,7.504524,0.000000 +PyTorch,nan_to_num,nan_to_num_M64_N64_dtypetorch.float32_replace_infFalse,short,False,8.090356,0.000000 +PyTorch,nan_to_num,nan_to_num_M64_N64_dtypetorch.float64_replace_infTrue,short,False,14.077416,0.000000 +PyTorch,nan_to_num,nan_to_num_M64_N64_dtypetorch.float64_replace_infFalse,short,False,14.615643,0.000000 +PyTorch,nan_to_num_,nan_to_num__M16_N64_dtypetorch.float32_replace_infTrue,short,False,4.053200,0.000000 
+PyTorch,nan_to_num_,nan_to_num__M16_N64_dtypetorch.float32_replace_infFalse,short,False,4.485825,0.000000 +PyTorch,nan_to_num_,nan_to_num__M16_N64_dtypetorch.float64_replace_infTrue,short,False,5.800954,0.000000 +PyTorch,nan_to_num_,nan_to_num__M16_N64_dtypetorch.float64_replace_infFalse,short,False,6.403105,0.000000 +PyTorch,nan_to_num_,nan_to_num__M16_N64_dtypetorch.float32_replace_infTrue,short,False,4.020517,0.000000 +PyTorch,nan_to_num_,nan_to_num__M16_N64_dtypetorch.float32_replace_infFalse,short,False,4.438027,0.000000 +PyTorch,nan_to_num_,nan_to_num__M16_N64_dtypetorch.float64_replace_infTrue,short,False,5.689130,0.000000 +PyTorch,nan_to_num_,nan_to_num__M16_N64_dtypetorch.float64_replace_infFalse,short,False,6.420881,0.000000 +PyTorch,nan_to_num_,nan_to_num__M64_N64_dtypetorch.float32_replace_infTrue,short,False,4.984703,0.000000 +PyTorch,nan_to_num_,nan_to_num__M64_N64_dtypetorch.float32_replace_infFalse,short,False,5.660661,0.000000 +PyTorch,nan_to_num_,nan_to_num__M64_N64_dtypetorch.float64_replace_infTrue,short,False,11.735412,0.000000 +PyTorch,nan_to_num_,nan_to_num__M64_N64_dtypetorch.float64_replace_infFalse,short,False,12.347645,0.000000 +PyTorch,nan_to_num_,nan_to_num__M64_N64_dtypetorch.float32_replace_infTrue,short,False,5.176911,0.000000 +PyTorch,nan_to_num_,nan_to_num__M64_N64_dtypetorch.float32_replace_infFalse,short,False,5.569892,0.000000 +PyTorch,nan_to_num_,nan_to_num__M64_N64_dtypetorch.float64_replace_infTrue,short,False,11.676570,0.000000 +PyTorch,nan_to_num_,nan_to_num__M64_N64_dtypetorch.float64_replace_infFalse,short,False,12.506719,0.000000 +PyTorch,MaxPool1d,MaxPool1d_kernel3_stride1_N8_C256_L256_cpu,short,False,121.343571,0.000000 +PyTorch,AvgPool1d,AvgPool1d_kernel3_stride1_N8_C256_L256_cpu,short,False,315.454573,0.000000 +PyTorch,MaxPool2d,"MaxPool2d_kernel[3,1]_stride[2,1]_N1_C16_H32_W32_cpu",short,False,58.314310,0.000000 +PyTorch,AvgPool2d,"AvgPool2d_kernel[3,1]_stride[2,1]_N1_C16_H32_W32_cpu",short,False,55.510125,0.000000 +PyTorch,AdaptiveMaxPool2d,"AdaptiveMaxPool2d_kernel[3,1]_stride[2,1]_N1_C16_H32_W32_cpu",short,False,63.309880,0.000000 +PyTorch,FractionalMaxPool2d,"FractionalMaxPool2d_kernel[3,1]_stride[2,1]_N1_C16_H32_W32_cpu",short,False,66.127681,0.000000 +PyTorch,MaxPool3d,"MaxPool3d_kernel[3,1,3]_stride[2,1,2]_N1_C16_D16_H32_W32_cpu",short,False,236.593780,0.000000 +PyTorch,AvgPool3d,"AvgPool3d_kernel[3,1,3]_stride[2,1,2]_N1_C16_D16_H32_W32_cpu",short,False,100.692771,0.000000 +PyTorch,AdaptiveMaxPool3d,"AdaptiveMaxPool3d_kernel[3,1,3]_stride[2,1,2]_N1_C16_D16_H32_W32_cpu",short,False,192.562352,0.000000 +PyTorch,FractionalMaxPool3d,"FractionalMaxPool3d_kernel[3,1,3]_stride[2,1,2]_N1_C16_D16_H32_W32_cpu",short,False,66.164532,0.000000 +PyTorch,fmod,fmod_M1_N1_K1_cpu_dtypetorch.int32,short,False,3.635065,0.000000 +PyTorch,fmod,fmod_M1_N1_K1_cpu_dtypetorch.float32,short,False,3.901028,0.000000 +PyTorch,fmod,fmod_M1_N1_K1_cpu_dtypetorch.float64,short,False,4.041925,0.000000 +PyTorch,fmod,fmod_M64_N64_K64_cpu_dtypetorch.int32,short,False,129.514345,0.000000 +PyTorch,fmod,fmod_M64_N64_K64_cpu_dtypetorch.float32,short,False,151.149918,0.000000 +PyTorch,fmod,fmod_M64_N64_K64_cpu_dtypetorch.float64,short,False,746.067340,0.000000 +PyTorch,fmod,fmod_M64_N64_K128_cpu_dtypetorch.int32,short,False,210.913781,0.000000 +PyTorch,fmod,fmod_M64_N64_K128_cpu_dtypetorch.float32,short,False,252.686828,0.000000 +PyTorch,fmod,fmod_M64_N64_K128_cpu_dtypetorch.float64,short,False,1484.044931,0.000000 
+PyTorch,remainder,remainder_M1_N1_K1_cpu_dtypetorch.int32,short,False,3.976802,0.000000 +PyTorch,remainder,remainder_M1_N1_K1_cpu_dtypetorch.float32,short,False,4.075495,0.000000 +PyTorch,remainder,remainder_M1_N1_K1_cpu_dtypetorch.float64,short,False,3.834691,0.000000 +PyTorch,remainder,remainder_M64_N64_K64_cpu_dtypetorch.int32,short,False,146.646648,0.000000 +PyTorch,remainder,remainder_M64_N64_K64_cpu_dtypetorch.float32,short,False,170.557022,0.000000 +PyTorch,remainder,remainder_M64_N64_K64_cpu_dtypetorch.float64,short,False,867.868537,0.000000 +PyTorch,remainder,remainder_M64_N64_K128_cpu_dtypetorch.int32,short,False,243.740380,0.000000 +PyTorch,remainder,remainder_M64_N64_K128_cpu_dtypetorch.float32,short,False,292.164866,0.000000 +PyTorch,remainder,remainder_M64_N64_K128_cpu_dtypetorch.float64,short,False,1730.402555,0.000000 +PyTorch,Softmax,Softmax_N1_C3_H256_W256_cpu,short,False,122.847048,0.000000 +PyTorch,Softmax,Softmax_N4_C3_H256_W256_cpu,short,False,317.788112,0.000000 +PyTorch,Softmax2d,Softmax2d_N1_C3_H256_W256_cpu,short,False,120.565735,0.000000 +PyTorch,Softmax2d,Softmax2d_N4_C3_H256_W256_cpu,short,False,316.982444,0.000000 +PyTorch,LogSoftmax,LogSoftmax_N1_C3_H256_W256_cpu,short,False,162.530153,0.000000 +PyTorch,LogSoftmax,LogSoftmax_N4_C3_H256_W256_cpu,short,False,266.478752,0.000000 +PyTorch,split,split_M8_N8_parts2_cpu,short,False,6.753952,0.000000 +PyTorch,split,split_M256_N512_parts2_cpu,short,False,6.873656,0.000000 +PyTorch,split,split_M512_N512_parts2_cpu,short,False,6.848019,0.000000 +PyTorch,stack,"stack_sizes(1,1,1)_N2_cpu_dim0",short,False,5.736891,0.000000 +PyTorch,stack,"stack_sizes(1,1,1)_N2_cpu_dim1",short,False,6.185757,0.000000 +PyTorch,stack,"stack_sizes(1,1,1)_N2_cpu_dim2",short,False,6.094516,0.000000 +PyTorch,stack,"stack_sizes(1,1,1)_N2_cpu_dim3",short,False,6.894034,0.000000 +PyTorch,stack,"stack_sizes(512,512,2)_N2_cpu_dim0",short,False,98.350665,0.000000 +PyTorch,stack,"stack_sizes(512,512,2)_N2_cpu_dim1",short,False,100.461322,0.000000 +PyTorch,stack,"stack_sizes(512,512,2)_N2_cpu_dim2",short,False,218.911485,0.000000 +PyTorch,stack,"stack_sizes(512,512,2)_N2_cpu_dim3",short,False,166.567879,0.000000 +PyTorch,stack,"stack_sizes(128,1024,2)_N2_cpu_dim0",short,False,99.504077,0.000000 +PyTorch,stack,"stack_sizes(128,1024,2)_N2_cpu_dim1",short,False,98.383429,0.000000 +PyTorch,stack,"stack_sizes(128,1024,2)_N2_cpu_dim2",short,False,153.173778,0.000000 +PyTorch,stack,"stack_sizes(128,1024,2)_N2_cpu_dim3",short,False,123.909933,0.000000 +PyTorch,sum,sum_R64_V32_dim0_contiguousTrue_cpu,short,False,6.692267,0.000000 +PyTorch,sum,sum_R64_V32_dim0_contiguousFalse_cpu,short,False,8.023065,0.000000 +PyTorch,sum,sum_R64_V32_dim1_contiguousTrue_cpu,short,False,6.881371,0.000000 +PyTorch,sum,sum_R64_V32_dim1_contiguousFalse_cpu,short,False,7.601940,0.000000 +PyTorch,sum,sum_R64_V512_dim0_contiguousTrue_cpu,short,False,44.774431,0.000000 +PyTorch,sum,sum_R64_V512_dim0_contiguousFalse_cpu,short,False,49.214148,0.000000 +PyTorch,sum,sum_R64_V512_dim1_contiguousTrue_cpu,short,False,45.532505,0.000000 +PyTorch,sum,sum_R64_V512_dim1_contiguousFalse_cpu,short,False,51.539750,0.000000 +PyTorch,sum,sum_R256_V32_dim0_contiguousTrue_cpu,short,False,7.732977,0.000000 +PyTorch,sum,sum_R256_V32_dim0_contiguousFalse_cpu,short,False,9.670269,0.000000 +PyTorch,sum,sum_R256_V32_dim1_contiguousTrue_cpu,short,False,7.691115,0.000000 +PyTorch,sum,sum_R256_V32_dim1_contiguousFalse_cpu,short,False,9.625176,0.000000 
+PyTorch,sum,sum_R256_V512_dim0_contiguousTrue_cpu,short,False,50.954394,0.000000 +PyTorch,sum,sum_R256_V512_dim0_contiguousFalse_cpu,short,False,57.957757,0.000000 +PyTorch,sum,sum_R256_V512_dim1_contiguousTrue_cpu,short,False,53.592068,0.000000 +PyTorch,sum,sum_R256_V512_dim1_contiguousFalse_cpu,short,False,51.339726,0.000000 +PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N16_cpu,short,False,7.040985,0.000000 +PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N64_cpu,short,False,7.168604,0.000000 +PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N128_cpu,short,False,7.434442,0.000000 +PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N16_cpu,short,False,7.078318,0.000000 +PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N64_cpu,short,False,7.426670,0.000000 +PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N128_cpu,short,False,7.679027,0.000000 +PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N16_cpu,short,False,7.281365,0.000000 +PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N64_cpu,short,False,7.682783,0.000000 +PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N128_cpu,short,False,8.381938,0.000000 +PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N16_cpu,short,False,7.039854,0.000000 +PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N64_cpu,short,False,7.399855,0.000000 +PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N128_cpu,short,False,7.715193,0.000000 +PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N16_cpu,short,False,7.255140,0.000000 +PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N64_cpu,short,False,7.753522,0.000000 +PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N128_cpu,short,False,8.364281,0.000000 +PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N16_cpu,short,False,7.476377,0.000000 +PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N64_cpu,short,False,8.458564,0.000000 +PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N128_cpu,short,False,9.391939,0.000000 +PyTorch,addcmul,addcmul_M1_N2_cpu_dtypetorch.float32,short,False,4.461410,0.000000 +PyTorch,addcmul,addcmul_M1_N2_cpu_dtypetorch.bfloat16,short,False,4.560082,0.000000 +PyTorch,addcmul,addcmul_M32_N64_cpu_dtypetorch.float32,short,False,5.141248,0.000000 +PyTorch,addcmul,addcmul_M32_N64_cpu_dtypetorch.bfloat16,short,False,5.819053,0.000000 +PyTorch,addcdiv,addcdiv_M1_N2_cpu_dtypetorch.float32,short,False,4.922033,0.000000 +PyTorch,addcdiv,addcdiv_M1_N2_cpu_dtypetorch.bfloat16,short,False,4.861055,0.000000 +PyTorch,addcdiv,addcdiv_M32_N64_cpu_dtypetorch.float32,short,False,5.560473,0.000000 +PyTorch,addcdiv,addcdiv_M32_N64_cpu_dtypetorch.bfloat16,short,False,6.113489,0.000000 +PyTorch,topk,"topk_shape(16,4)_k4_dim1_cpu_dtypetorch.float32",short,False,6.656324,0.000000 +PyTorch,topk,"topk_shape(1048576,)_k16_dim0_cpu_dtypetorch.float32",short,False,2137.073922,0.000000 
+PyTorch,where,"where_cond_shape(8,16,1)_input_shape(1,)_other_shape(1,)_cpu_dtypetorch.float32",short,False,6.551560,0.000000 +PyTorch,where,"where_cond_shape(8,16,1)_input_shape(16,1)_other_shape(8,16,1)_cpu_dtypetorch.float32",short,False,6.548704,0.000000 +PyTorch,where,"where_cond_shape(8,16,1)_input_shape(8,1,1)_other_shape(1,)_cpu_dtypetorch.float32",short,False,6.417945,0.000000 +PyTorch,relu,"relu_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,9.394759,0.000000 +PyTorch,relu,"relu_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,9.308802,0.000000 +PyTorch,relu,"relu_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,9.267544,0.000000 +PyTorch,relu,"relu_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,9.685650,0.000000 +PyTorch,relu,"relu_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,9.606769,0.000000 +PyTorch,relu,"relu_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,9.553571,0.000000 +PyTorch,relu,"relu_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,80.796781,0.000000 +PyTorch,relu,"relu_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,91.592676,0.000000 +PyTorch,relu,"relu_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,83.363830,0.000000 +PyTorch,relu,"relu_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,82.888682,0.000000 +PyTorch,relu,"relu_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,97.166943,0.000000 +PyTorch,relu,"relu_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,104.243662,0.000000 +PyTorch,relu6,"relu6_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,8.418549,0.000000 +PyTorch,relu6,"relu6_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,8.500449,0.000000 +PyTorch,relu6,"relu6_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,8.443481,0.000000 +PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,8.960919,0.000000 +PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,8.986856,0.000000 +PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,8.814634,0.000000 +PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,80.921564,0.000000 +PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,81.595518,0.000000 +PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,85.112929,0.000000 +PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,84.740682,0.000000 +PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,85.530059,0.000000 +PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,106.365863,0.000000 +PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,8.055478,0.000000 +PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,8.238628,0.000000 +PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,8.119306,0.000000 +PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,8.683609,0.000000 
+PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,8.759866,0.000000 +PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,8.594149,0.000000 +PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,77.579946,0.000000 +PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,83.634438,0.000000 +PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,84.316144,0.000000 +PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,84.438504,0.000000 +PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,84.312683,0.000000 +PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,105.458681,0.000000 +PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,6.480224,0.000000 +PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,6.658893,0.000000 +PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,6.502791,0.000000 +PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,7.091508,0.000000 +PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,7.071250,0.000000 +PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,7.143394,0.000000 +PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,344.615549,0.000000 +PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,360.922264,0.000000 +PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,360.622480,0.000000 +PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,344.514761,0.000000 +PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,361.637229,0.000000 +PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,360.860964,0.000000 +PyTorch,functional.leaky_relu,"functional.leaky_relu_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,12.176948,0.000000 +PyTorch,functional.leaky_relu,"functional.leaky_relu_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,11.734075,0.000000 +PyTorch,functional.leaky_relu,"functional.leaky_relu_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,11.181202,0.000000 +PyTorch,functional.leaky_relu,"functional.leaky_relu_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,13.658838,0.000000 +PyTorch,functional.leaky_relu,"functional.leaky_relu_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,13.976081,0.000000 
+PyTorch,functional.leaky_relu,"functional.leaky_relu_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,12.947895,0.000000 +PyTorch,functional.leaky_relu,"functional.leaky_relu_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,437.285316,0.000000 +PyTorch,functional.leaky_relu,"functional.leaky_relu_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,445.478465,0.000000 +PyTorch,functional.leaky_relu,"functional.leaky_relu_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,523.076388,0.000000 +PyTorch,functional.leaky_relu,"functional.leaky_relu_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,442.810632,0.000000 +PyTorch,functional.leaky_relu,"functional.leaky_relu_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,449.038734,0.000000 +PyTorch,functional.leaky_relu,"functional.leaky_relu_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,541.625834,0.000000 +PyTorch,functional.sigmoid,"functional.sigmoid_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,6.427155,0.000000 +PyTorch,functional.sigmoid,"functional.sigmoid_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,6.355635,0.000000 +PyTorch,functional.sigmoid,"functional.sigmoid_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,6.445739,0.000000 +PyTorch,functional.sigmoid,"functional.sigmoid_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,7.175534,0.000000 +PyTorch,functional.sigmoid,"functional.sigmoid_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,7.055749,0.000000 +PyTorch,functional.sigmoid,"functional.sigmoid_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,7.111532,0.000000 +PyTorch,functional.sigmoid,"functional.sigmoid_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,321.942471,0.000000 +PyTorch,functional.sigmoid,"functional.sigmoid_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,412.526749,0.000000 +PyTorch,functional.sigmoid,"functional.sigmoid_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,413.297580,0.000000 +PyTorch,functional.sigmoid,"functional.sigmoid_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,322.569442,0.000000 +PyTorch,functional.sigmoid,"functional.sigmoid_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,413.410907,0.000000 +PyTorch,functional.sigmoid,"functional.sigmoid_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,414.466411,0.000000 +PyTorch,functional.tanh,"functional.tanh_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,6.392274,0.000000 +PyTorch,functional.tanh,"functional.tanh_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,6.349999,0.000000 +PyTorch,functional.tanh,"functional.tanh_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,6.554333,0.000000 +PyTorch,functional.tanh,"functional.tanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,7.061919,0.000000 +PyTorch,functional.tanh,"functional.tanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,7.149233,0.000000 +PyTorch,functional.tanh,"functional.tanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,7.086558,0.000000 +PyTorch,functional.tanh,"functional.tanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,406.644221,0.000000 
+PyTorch,functional.tanh,"functional.tanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,373.447059,0.000000 +PyTorch,functional.tanh,"functional.tanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,371.772997,0.000000 +PyTorch,functional.tanh,"functional.tanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,409.167217,0.000000 +PyTorch,functional.tanh,"functional.tanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,373.676758,0.000000 +PyTorch,functional.tanh,"functional.tanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,374.537943,0.000000 +PyTorch,functional.hardswish,"functional.hardswish_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,9.930822,0.000000 +PyTorch,functional.hardswish,"functional.hardswish_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,10.116378,0.000000 +PyTorch,functional.hardswish,"functional.hardswish_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,10.149234,0.000000 +PyTorch,functional.hardswish,"functional.hardswish_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,11.481823,0.000000 +PyTorch,functional.hardswish,"functional.hardswish_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,11.614461,0.000000 +PyTorch,functional.hardswish,"functional.hardswish_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,11.762893,0.000000 +PyTorch,functional.hardswish,"functional.hardswish_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,335.415021,0.000000 +PyTorch,functional.hardswish,"functional.hardswish_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,350.660354,0.000000 +PyTorch,functional.hardswish,"functional.hardswish_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,351.735603,0.000000 +PyTorch,functional.hardswish,"functional.hardswish_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,336.152532,0.000000 +PyTorch,functional.hardswish,"functional.hardswish_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,350.996697,0.000000 +PyTorch,functional.hardswish,"functional.hardswish_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,353.547824,0.000000 +PyTorch,functional.elu,"functional.elu_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,10.267545,0.000000 +PyTorch,functional.elu,"functional.elu_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,10.379921,0.000000 +PyTorch,functional.elu,"functional.elu_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,10.477865,0.000000 +PyTorch,functional.elu,"functional.elu_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,11.684307,0.000000 +PyTorch,functional.elu,"functional.elu_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,12.064549,0.000000 +PyTorch,functional.elu,"functional.elu_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,12.134612,0.000000 +PyTorch,functional.elu,"functional.elu_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,154.252406,0.000000 +PyTorch,functional.elu,"functional.elu_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,450.243138,0.000000 +PyTorch,functional.elu,"functional.elu_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,449.014350,0.000000 
+PyTorch,functional.elu,"functional.elu_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,153.808653,0.000000 +PyTorch,functional.elu,"functional.elu_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,445.457985,0.000000 +PyTorch,functional.elu,"functional.elu_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,453.355262,0.000000 +PyTorch,functional.celu,"functional.celu_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,9.940230,0.000000 +PyTorch,functional.celu,"functional.celu_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,10.151808,0.000000 +PyTorch,functional.celu,"functional.celu_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,10.292930,0.000000 +PyTorch,functional.celu,"functional.celu_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,11.492981,0.000000 +PyTorch,functional.celu,"functional.celu_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,11.703474,0.000000 +PyTorch,functional.celu,"functional.celu_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,11.779910,0.000000 +PyTorch,functional.celu,"functional.celu_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,156.045063,0.000000 +PyTorch,functional.celu,"functional.celu_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,446.178772,0.000000 +PyTorch,functional.celu,"functional.celu_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,449.322654,0.000000 +PyTorch,functional.celu,"functional.celu_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,False,155.598436,0.000000 +PyTorch,functional.celu,"functional.celu_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,False,451.376561,0.000000 +PyTorch,functional.celu,"functional.celu_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,False,456.399200,0.000000 +PyTorch,add,add_N2_dtypetorch.quint8_contigFalse,short,False,54.525704,0.000000 +PyTorch,add,add_N2_dtypetorch.quint8_contigTrue,short,False,48.507417,0.000000 +PyTorch,add,add_N2_dtypetorch.qint8_contigFalse,short,False,54.165648,0.000000 +PyTorch,add,add_N2_dtypetorch.qint8_contigTrue,short,False,49.270978,0.000000 +PyTorch,add,add_N2_dtypetorch.qint32_contigFalse,short,False,10.166548,0.000000 +PyTorch,add,add_N2_dtypetorch.qint32_contigTrue,short,False,9.839232,0.000000 +PyTorch,add,add_N8_dtypetorch.quint8_contigFalse,short,False,55.172433,0.000000 +PyTorch,add,add_N8_dtypetorch.quint8_contigTrue,short,False,46.703761,0.000000 +PyTorch,add,add_N8_dtypetorch.qint8_contigFalse,short,False,55.712299,0.000000 +PyTorch,add,add_N8_dtypetorch.qint8_contigTrue,short,False,47.370029,0.000000 +PyTorch,add,add_N8_dtypetorch.qint32_contigFalse,short,False,11.358310,0.000000 +PyTorch,add,add_N8_dtypetorch.qint32_contigTrue,short,False,11.571205,0.000000 +PyTorch,add,add_N64_dtypetorch.quint8_contigFalse,short,False,59.735500,0.000000 +PyTorch,add,add_N64_dtypetorch.quint8_contigTrue,short,False,47.242686,0.000000 +PyTorch,add,add_N64_dtypetorch.qint8_contigFalse,short,False,60.975918,0.000000 +PyTorch,add,add_N64_dtypetorch.qint8_contigTrue,short,False,47.022490,0.000000 +PyTorch,add,add_N64_dtypetorch.qint32_contigFalse,short,False,29.096942,0.000000 +PyTorch,add,add_N64_dtypetorch.qint32_contigTrue,short,False,89.559198,0.000000 +PyTorch,add,add_N512_dtypetorch.quint8_contigFalse,short,False,213.117569,0.000000 
+PyTorch,add,add_N512_dtypetorch.quint8_contigTrue,short,False,58.900791,0.000000 +PyTorch,add,add_N512_dtypetorch.qint8_contigFalse,short,False,212.745501,0.000000 +PyTorch,add,add_N512_dtypetorch.qint8_contigTrue,short,False,58.136227,0.000000 +PyTorch,add,add_N512_dtypetorch.qint32_contigFalse,short,False,186.300471,0.000000 +PyTorch,add,add_N512_dtypetorch.qint32_contigTrue,short,False,690.767958,0.000000 +PyTorch,add_relu,add_relu_N2_dtypetorch.quint8_contigFalse,short,False,10.009465,0.000000 +PyTorch,add_relu,add_relu_N2_dtypetorch.quint8_contigTrue,short,False,9.746104,0.000000 +PyTorch,add_relu,add_relu_N2_dtypetorch.qint8_contigFalse,short,False,10.162506,0.000000 +PyTorch,add_relu,add_relu_N2_dtypetorch.qint8_contigTrue,short,False,9.701948,0.000000 +PyTorch,add_relu,add_relu_N2_dtypetorch.qint32_contigFalse,short,False,10.097318,0.000000 +PyTorch,add_relu,add_relu_N2_dtypetorch.qint32_contigTrue,short,False,9.738773,0.000000 +PyTorch,add_relu,add_relu_N8_dtypetorch.quint8_contigFalse,short,False,11.193524,0.000000 +PyTorch,add_relu,add_relu_N8_dtypetorch.quint8_contigTrue,short,False,11.319229,0.000000 +PyTorch,add_relu,add_relu_N8_dtypetorch.qint8_contigFalse,short,False,11.153031,0.000000 +PyTorch,add_relu,add_relu_N8_dtypetorch.qint8_contigTrue,short,False,11.185324,0.000000 +PyTorch,add_relu,add_relu_N8_dtypetorch.qint32_contigFalse,short,False,11.368479,0.000000 +PyTorch,add_relu,add_relu_N8_dtypetorch.qint32_contigTrue,short,False,11.326698,0.000000 +PyTorch,add_relu,add_relu_N64_dtypetorch.quint8_contigFalse,short,False,29.288667,0.000000 +PyTorch,add_relu,add_relu_N64_dtypetorch.quint8_contigTrue,short,False,81.897881,0.000000 +PyTorch,add_relu,add_relu_N64_dtypetorch.qint8_contigFalse,short,False,39.738525,0.000000 +PyTorch,add_relu,add_relu_N64_dtypetorch.qint8_contigTrue,short,False,82.035375,0.000000 +PyTorch,add_relu,add_relu_N64_dtypetorch.qint32_contigFalse,short,False,43.063633,0.000000 +PyTorch,add_relu,add_relu_N64_dtypetorch.qint32_contigTrue,short,False,89.797751,0.000000 +PyTorch,add_relu,add_relu_N512_dtypetorch.quint8_contigFalse,short,False,186.276330,0.000000 +PyTorch,add_relu,add_relu_N512_dtypetorch.quint8_contigTrue,short,False,621.216089,0.000000 +PyTorch,add_relu,add_relu_N512_dtypetorch.qint8_contigFalse,short,False,397.837161,0.000000 +PyTorch,add_relu,add_relu_N512_dtypetorch.qint8_contigTrue,short,False,626.707880,0.000000 +PyTorch,add_relu,add_relu_N512_dtypetorch.qint32_contigFalse,short,False,399.039524,0.000000 +PyTorch,add_relu,add_relu_N512_dtypetorch.qint32_contigTrue,short,False,695.372335,0.000000 +PyTorch,mul,mul_N2_dtypetorch.quint8_contigFalse,short,False,10.792049,0.000000 +PyTorch,mul,mul_N2_dtypetorch.quint8_contigTrue,short,False,10.337356,0.000000 +PyTorch,mul,mul_N2_dtypetorch.qint8_contigFalse,short,False,29.766997,0.000000 +PyTorch,mul,mul_N2_dtypetorch.qint8_contigTrue,short,False,10.670764,0.000000 +PyTorch,mul,mul_N2_dtypetorch.qint32_contigFalse,short,False,10.747730,0.000000 +PyTorch,mul,mul_N2_dtypetorch.qint32_contigTrue,short,False,10.272625,0.000000 +PyTorch,mul,mul_N8_dtypetorch.quint8_contigFalse,short,False,11.249079,0.000000 +PyTorch,mul,mul_N8_dtypetorch.quint8_contigTrue,short,False,10.184144,0.000000 +PyTorch,mul,mul_N8_dtypetorch.qint8_contigFalse,short,False,412.500754,0.000000 +PyTorch,mul,mul_N8_dtypetorch.qint8_contigTrue,short,False,380.488152,0.000000 +PyTorch,mul,mul_N8_dtypetorch.qint32_contigFalse,short,False,11.217967,0.000000 
+PyTorch,mul,mul_N8_dtypetorch.qint32_contigTrue,short,False,10.372477,0.000000 +PyTorch,mul,mul_N64_dtypetorch.quint8_contigFalse,short,False,26.384046,0.000000 +PyTorch,mul,mul_N64_dtypetorch.quint8_contigTrue,short,False,13.281053,0.000000 +PyTorch,mul,mul_N64_dtypetorch.qint8_contigFalse,short,False,427.333217,0.000000 +PyTorch,mul,mul_N64_dtypetorch.qint8_contigTrue,short,False,378.800277,0.000000 +PyTorch,mul,mul_N64_dtypetorch.qint32_contigFalse,short,False,22.636102,0.000000 +PyTorch,mul,mul_N64_dtypetorch.qint32_contigTrue,short,False,13.891831,0.000000 +PyTorch,mul,mul_N512_dtypetorch.quint8_contigFalse,short,False,324.837860,0.000000 +PyTorch,mul,mul_N512_dtypetorch.quint8_contigTrue,short,False,70.655191,0.000000 +PyTorch,mul,mul_N512_dtypetorch.qint8_contigFalse,short,False,697.828340,0.000000 +PyTorch,mul,mul_N512_dtypetorch.qint8_contigTrue,short,False,414.893995,0.000000 +PyTorch,mul,mul_N512_dtypetorch.qint32_contigFalse,short,False,140.090565,0.000000 +PyTorch,mul,mul_N512_dtypetorch.qint32_contigTrue,short,False,72.970641,0.000000 +PyTorch,add_scalar,add_scalar_N2_dtypetorch.quint8_contigFalse,short,False,9.650154,0.000000 +PyTorch,add_scalar,add_scalar_N2_dtypetorch.quint8_contigTrue,short,False,9.056958,0.000000 +PyTorch,add_scalar,add_scalar_N2_dtypetorch.qint8_contigFalse,short,False,10.032105,0.000000 +PyTorch,add_scalar,add_scalar_N2_dtypetorch.qint8_contigTrue,short,False,9.419741,0.000000 +PyTorch,add_scalar,add_scalar_N2_dtypetorch.qint32_contigFalse,short,False,9.857270,0.000000 +PyTorch,add_scalar,add_scalar_N2_dtypetorch.qint32_contigTrue,short,False,9.260383,0.000000 +PyTorch,add_scalar,add_scalar_N8_dtypetorch.quint8_contigFalse,short,False,10.275563,0.000000 +PyTorch,add_scalar,add_scalar_N8_dtypetorch.quint8_contigTrue,short,False,8.914322,0.000000 +PyTorch,add_scalar,add_scalar_N8_dtypetorch.qint8_contigFalse,short,False,9.973162,0.000000 +PyTorch,add_scalar,add_scalar_N8_dtypetorch.qint8_contigTrue,short,False,9.329676,0.000000 +PyTorch,add_scalar,add_scalar_N8_dtypetorch.qint32_contigFalse,short,False,9.742725,0.000000 +PyTorch,add_scalar,add_scalar_N8_dtypetorch.qint32_contigTrue,short,False,9.058522,0.000000 +PyTorch,add_scalar,add_scalar_N64_dtypetorch.quint8_contigFalse,short,False,20.745533,0.000000 +PyTorch,add_scalar,add_scalar_N64_dtypetorch.quint8_contigTrue,short,False,11.517188,0.000000 +PyTorch,add_scalar,add_scalar_N64_dtypetorch.qint8_contigFalse,short,False,14.588801,0.000000 +PyTorch,add_scalar,add_scalar_N64_dtypetorch.qint8_contigTrue,short,False,9.918611,0.000000 +PyTorch,add_scalar,add_scalar_N64_dtypetorch.qint32_contigFalse,short,False,13.542074,0.000000 +PyTorch,add_scalar,add_scalar_N64_dtypetorch.qint32_contigTrue,short,False,10.794776,0.000000 +PyTorch,add_scalar,add_scalar_N512_dtypetorch.quint8_contigFalse,short,False,120.869888,0.000000 +PyTorch,add_scalar,add_scalar_N512_dtypetorch.quint8_contigTrue,short,False,75.806970,0.000000 +PyTorch,add_scalar,add_scalar_N512_dtypetorch.qint8_contigFalse,short,False,81.201255,0.000000 +PyTorch,add_scalar,add_scalar_N512_dtypetorch.qint8_contigTrue,short,False,55.456395,0.000000 +PyTorch,add_scalar,add_scalar_N512_dtypetorch.qint32_contigFalse,short,False,85.280151,0.000000 +PyTorch,add_scalar,add_scalar_N512_dtypetorch.qint32_contigTrue,short,False,59.971946,0.000000 +PyTorch,mul_scalar,mul_scalar_N2_dtypetorch.quint8_contigFalse,short,False,9.801843,0.000000 +PyTorch,mul_scalar,mul_scalar_N2_dtypetorch.quint8_contigTrue,short,False,9.290992,0.000000 
+PyTorch,mul_scalar,mul_scalar_N2_dtypetorch.qint8_contigFalse,short,False,9.980126,0.000000 +PyTorch,mul_scalar,mul_scalar_N2_dtypetorch.qint8_contigTrue,short,False,9.359637,0.000000 +PyTorch,mul_scalar,mul_scalar_N2_dtypetorch.qint32_contigFalse,short,False,9.915617,0.000000 +PyTorch,mul_scalar,mul_scalar_N2_dtypetorch.qint32_contigTrue,short,False,9.210668,0.000000 +PyTorch,mul_scalar,mul_scalar_N8_dtypetorch.quint8_contigFalse,short,False,9.820922,0.000000 +PyTorch,mul_scalar,mul_scalar_N8_dtypetorch.quint8_contigTrue,short,False,9.130066,0.000000 +PyTorch,mul_scalar,mul_scalar_N8_dtypetorch.qint8_contigFalse,short,False,9.822860,0.000000 +PyTorch,mul_scalar,mul_scalar_N8_dtypetorch.qint8_contigTrue,short,False,9.208939,0.000000 +PyTorch,mul_scalar,mul_scalar_N8_dtypetorch.qint32_contigFalse,short,False,9.923802,0.000000 +PyTorch,mul_scalar,mul_scalar_N8_dtypetorch.qint32_contigTrue,short,False,9.228233,0.000000 +PyTorch,mul_scalar,mul_scalar_N64_dtypetorch.quint8_contigFalse,short,False,13.801614,0.000000 +PyTorch,mul_scalar,mul_scalar_N64_dtypetorch.quint8_contigTrue,short,False,9.730629,0.000000 +PyTorch,mul_scalar,mul_scalar_N64_dtypetorch.qint8_contigFalse,short,False,14.292015,0.000000 +PyTorch,mul_scalar,mul_scalar_N64_dtypetorch.qint8_contigTrue,short,False,9.772135,0.000000 +PyTorch,mul_scalar,mul_scalar_N64_dtypetorch.qint32_contigFalse,short,False,13.532725,0.000000 +PyTorch,mul_scalar,mul_scalar_N64_dtypetorch.qint32_contigTrue,short,False,10.971262,0.000000 +PyTorch,mul_scalar,mul_scalar_N512_dtypetorch.quint8_contigFalse,short,False,79.350580,0.000000 +PyTorch,mul_scalar,mul_scalar_N512_dtypetorch.quint8_contigTrue,short,False,56.108255,0.000000 +PyTorch,mul_scalar,mul_scalar_N512_dtypetorch.qint8_contigFalse,short,False,80.221636,0.000000 +PyTorch,mul_scalar,mul_scalar_N512_dtypetorch.qint8_contigTrue,short,False,54.967161,0.000000 +PyTorch,mul_scalar,mul_scalar_N512_dtypetorch.qint32_contigFalse,short,False,85.677349,0.000000 +PyTorch,mul_scalar,mul_scalar_N512_dtypetorch.qint32_contigTrue,short,False,58.340807,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags10_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,274.988859,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags10_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,314.877017,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags10_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,274.143065,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags10_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,333.170297,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags10_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,276.114808,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags10_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,318.133386,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags120_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,316.446400,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags120_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,351.285540,0.000000 
+PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags120_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,316.018478,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags120_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,351.023262,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags120_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,314.584634,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags120_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,348.879078,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags1000_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,510.666462,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags1000_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,546.541658,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags1000_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,513.146251,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags1000_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,544.085314,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags1000_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,512.262547,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags1000_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,563.350471,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags2300_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,526.527040,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags2300_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,561.490715,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags2300_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,526.299266,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags2300_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,563.797929,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags2300_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,533.919534,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags2300_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,585.499031,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags10_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,77.160832,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags10_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,77.230151,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags10_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,77.935535,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags10_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,77.894121,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags10_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,81.645482,0.000000 
+PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags10_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,81.267530,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags120_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,87.730819,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags120_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,87.759078,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags120_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,88.382237,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags120_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,88.687020,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags120_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,92.216803,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags120_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,92.051609,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags1000_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,318.113337,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags1000_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,316.527647,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags1000_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,311.871957,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags1000_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,316.786788,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags1000_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,318.008949,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags1000_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,318.298942,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags2300_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,309.078271,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags2300_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,309.316080,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags2300_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,309.372130,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags2300_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,311.992863,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags2300_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu_BACKWARD,short,True,312.211778,0.000000 +PyTorch,qatEmbeddingBag,qatEmbeddingBag_embeddingbags2300_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu_BACKWARD,short,True,311.930870,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings10_embedding_dim64_input_size8_cpu,short,False,266.095368,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings10_embedding_dim64_input_size16_cpu,short,False,264.323879,0.000000 
+PyTorch,qatEmbedding,qatEmbedding_num_embeddings10_embedding_dim64_input_size64_cpu,short,False,265.230784,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings120_embedding_dim64_input_size8_cpu,short,False,300.983800,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings120_embedding_dim64_input_size16_cpu,short,False,302.473380,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings120_embedding_dim64_input_size64_cpu,short,False,302.886389,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings1000_embedding_dim64_input_size8_cpu,short,False,497.948795,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings1000_embedding_dim64_input_size16_cpu,short,False,497.101363,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings1000_embedding_dim64_input_size64_cpu,short,False,498.723660,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings2300_embedding_dim64_input_size8_cpu,short,False,516.198427,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings2300_embedding_dim64_input_size16_cpu,short,False,516.910952,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings2300_embedding_dim64_input_size64_cpu,short,False,518.768045,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings10_embedding_dim64_input_size8_cpu_BACKWARD,short,True,64.304382,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings10_embedding_dim64_input_size16_cpu_BACKWARD,short,True,65.962808,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings10_embedding_dim64_input_size64_cpu_BACKWARD,short,True,71.122468,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings120_embedding_dim64_input_size8_cpu_BACKWARD,short,True,73.623478,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings120_embedding_dim64_input_size16_cpu_BACKWARD,short,True,75.755343,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings120_embedding_dim64_input_size64_cpu_BACKWARD,short,True,81.115363,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings1000_embedding_dim64_input_size8_cpu_BACKWARD,short,True,295.989743,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings1000_embedding_dim64_input_size16_cpu_BACKWARD,short,True,296.732952,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings1000_embedding_dim64_input_size64_cpu_BACKWARD,short,True,303.545079,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings2300_embedding_dim64_input_size8_cpu_BACKWARD,short,True,332.342200,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings2300_embedding_dim64_input_size16_cpu_BACKWARD,short,True,333.213785,0.000000 +PyTorch,qatEmbedding,qatEmbedding_num_embeddings2300_embedding_dim64_input_size64_cpu_BACKWARD,short,True,339.762786,0.000000 +PyTorch,QBatchNorm1d,QBatchNorm1d_M1_N256_K3136_cpu_dtypetorch.qint8,short,False,1279.230735,0.000000 +PyTorch,QBatchNorm2d,QBatchNorm2d_M1_N256_K3136_cpu_dtypetorch.qint8,short,False,1143.587020,0.000000 +PyTorch,qcat,qcat_M256_N512_K1_L2_dim0_contigall_dtypetorch.quint8,short,False,229.089037,0.000000 +PyTorch,qcat,qcat_M256_N512_K1_L2_dim0_contigall_dtypetorch.qint8,short,False,229.814037,0.000000 +PyTorch,qcat,qcat_M256_N512_K1_L2_dim0_contigall_dtypetorch.qint32,short,False,919.673338,0.000000 +PyTorch,qcat,qcat_M256_N512_K1_L2_dim0_contigone_dtypetorch.quint8,short,False,301.101660,0.000000 +PyTorch,qcat,qcat_M256_N512_K1_L2_dim0_contigone_dtypetorch.qint8,short,False,300.354370,0.000000 +PyTorch,qcat,qcat_M256_N512_K1_L2_dim0_contigone_dtypetorch.qint32,short,False,996.242370,0.000000 
+PyTorch,qcat,qcat_M256_N512_K1_L2_dim0_contignone_dtypetorch.quint8,short,False,367.358463,0.000000 +PyTorch,qcat,qcat_M256_N512_K1_L2_dim0_contignone_dtypetorch.qint8,short,False,373.531795,0.000000 +PyTorch,qcat,qcat_M256_N512_K1_L2_dim0_contignone_dtypetorch.qint32,short,False,1071.199771,0.000000 +PyTorch,qcat,qcat_M512_N512_K2_L1_dim1_contigall_dtypetorch.quint8,short,False,355.003390,0.000000 +PyTorch,qcat,qcat_M512_N512_K2_L1_dim1_contigall_dtypetorch.qint8,short,False,357.724388,0.000000 +PyTorch,qcat,qcat_M512_N512_K2_L1_dim1_contigall_dtypetorch.qint32,short,False,1591.623679,0.000000 +PyTorch,qcat,qcat_M512_N512_K2_L1_dim1_contigone_dtypetorch.quint8,short,False,458.641811,0.000000 +PyTorch,qcat,qcat_M512_N512_K2_L1_dim1_contigone_dtypetorch.qint8,short,False,458.108343,0.000000 +PyTorch,qcat,qcat_M512_N512_K2_L1_dim1_contigone_dtypetorch.qint32,short,False,1715.952436,0.000000 +PyTorch,qcat,qcat_M512_N512_K2_L1_dim1_contignone_dtypetorch.quint8,short,False,556.800793,0.000000 +PyTorch,qcat,qcat_M512_N512_K2_L1_dim1_contignone_dtypetorch.qint8,short,False,557.022942,0.000000 +PyTorch,qcat,qcat_M512_N512_K2_L1_dim1_contignone_dtypetorch.qint32,short,False,1831.625177,0.000000 +PyTorch,eq,eq_N8_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,10.659249,0.000000 +PyTorch,eq,eq_N8_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,24.488580,0.000000 +PyTorch,eq,eq_N8_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,14.062653,0.000000 +PyTorch,eq,eq_N8_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,29.175123,0.000000 +PyTorch,eq,eq_N8_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,7.031340,0.000000 +PyTorch,eq,eq_N8_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,18.240752,0.000000 +PyTorch,eq,eq_N8_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,10.901555,0.000000 +PyTorch,eq,eq_N8_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,23.333026,0.000000 +PyTorch,eq,eq_N8_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,10.366241,0.000000 +PyTorch,eq,eq_N8_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,23.646604,0.000000 +PyTorch,eq,eq_N8_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,14.343720,0.000000 +PyTorch,eq,eq_N8_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,28.861064,0.000000 +PyTorch,eq,eq_N8_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,6.998121,0.000000 +PyTorch,eq,eq_N8_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,17.624672,0.000000 +PyTorch,eq,eq_N8_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,10.924173,0.000000 +PyTorch,eq,eq_N8_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,23.223008,0.000000 +PyTorch,eq,eq_N8_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantFalse,short,False,10.916533,0.000000 +PyTorch,eq,eq_N8_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantTrue,short,False,24.926139,0.000000 +PyTorch,eq,eq_N8_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantFalse,short,False,14.413789,0.000000 +PyTorch,eq,eq_N8_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantTrue,short,False,29.167968,0.000000 
+PyTorch,eq,eq_N8_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantFalse,short,False,7.286591,0.000000 +PyTorch,eq,eq_N8_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantTrue,short,False,19.297183,0.000000 +PyTorch,eq,eq_N8_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantFalse,short,False,11.087414,0.000000 +PyTorch,eq,eq_N8_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantTrue,short,False,23.674432,0.000000 +PyTorch,eq,eq_N64_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,18.425990,0.000000 +PyTorch,eq,eq_N64_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,33.055810,0.000000 +PyTorch,eq,eq_N64_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,21.737632,0.000000 +PyTorch,eq,eq_N64_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,37.173348,0.000000 +PyTorch,eq,eq_N64_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,11.547812,0.000000 +PyTorch,eq,eq_N64_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,24.831548,0.000000 +PyTorch,eq,eq_N64_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,16.424478,0.000000 +PyTorch,eq,eq_N64_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,28.738332,0.000000 +PyTorch,eq,eq_N64_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,19.230981,0.000000 +PyTorch,eq,eq_N64_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,34.484918,0.000000 +PyTorch,eq,eq_N64_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,22.740766,0.000000 +PyTorch,eq,eq_N64_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,38.301714,0.000000 +PyTorch,eq,eq_N64_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,10.705394,0.000000 +PyTorch,eq,eq_N64_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,24.413391,0.000000 +PyTorch,eq,eq_N64_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,16.401949,0.000000 +PyTorch,eq,eq_N64_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,28.602660,0.000000 +PyTorch,eq,eq_N64_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantFalse,short,False,28.037415,0.000000 +PyTorch,eq,eq_N64_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantTrue,short,False,43.889381,0.000000 +PyTorch,eq,eq_N64_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantFalse,short,False,27.580923,0.000000 +PyTorch,eq,eq_N64_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantTrue,short,False,43.491900,0.000000 +PyTorch,eq,eq_N64_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantFalse,short,False,21.994874,0.000000 +PyTorch,eq,eq_N64_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantTrue,short,False,34.649429,0.000000 +PyTorch,eq,eq_N64_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantFalse,short,False,20.859801,0.000000 +PyTorch,eq,eq_N64_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantTrue,short,False,33.119628,0.000000 +PyTorch,ne,ne_N8_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,10.472581,0.000000 +PyTorch,ne,ne_N8_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,24.114184,0.000000 +PyTorch,ne,ne_N8_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,14.017749,0.000000 
+PyTorch,ne,ne_N8_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,29.735235,0.000000 +PyTorch,ne,ne_N8_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,6.569071,0.000000 +PyTorch,ne,ne_N8_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,17.797276,0.000000 +PyTorch,ne,ne_N8_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,10.891585,0.000000 +PyTorch,ne,ne_N8_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,23.659451,0.000000 +PyTorch,ne,ne_N8_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,10.143022,0.000000 +PyTorch,ne,ne_N8_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,23.786464,0.000000 +PyTorch,ne,ne_N8_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,14.225867,0.000000 +PyTorch,ne,ne_N8_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,29.986286,0.000000 +PyTorch,ne,ne_N8_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,6.614645,0.000000 +PyTorch,ne,ne_N8_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,17.335371,0.000000 +PyTorch,ne,ne_N8_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,11.021240,0.000000 +PyTorch,ne,ne_N8_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,23.611790,0.000000 +PyTorch,ne,ne_N8_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantFalse,short,False,10.667795,0.000000 +PyTorch,ne,ne_N8_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantTrue,short,False,24.338721,0.000000 +PyTorch,ne,ne_N8_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantFalse,short,False,14.562054,0.000000 +PyTorch,ne,ne_N8_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantTrue,short,False,29.746058,0.000000 +PyTorch,ne,ne_N8_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantFalse,short,False,7.040875,0.000000 +PyTorch,ne,ne_N8_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantTrue,short,False,18.537772,0.000000 +PyTorch,ne,ne_N8_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantFalse,short,False,11.289554,0.000000 +PyTorch,ne,ne_N8_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantTrue,short,False,24.121479,0.000000 +PyTorch,ne,ne_N64_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,17.736341,0.000000 +PyTorch,ne,ne_N64_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,32.487414,0.000000 +PyTorch,ne,ne_N64_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,20.927801,0.000000 +PyTorch,ne,ne_N64_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,36.157429,0.000000 +PyTorch,ne,ne_N64_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,11.152495,0.000000 +PyTorch,ne,ne_N64_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,24.151756,0.000000 +PyTorch,ne,ne_N64_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,15.921099,0.000000 +PyTorch,ne,ne_N64_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,28.827231,0.000000 +PyTorch,ne,ne_N64_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,18.198807,0.000000 +PyTorch,ne,ne_N64_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,33.871904,0.000000 
+PyTorch,ne,ne_N64_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,21.828119,0.000000 +PyTorch,ne,ne_N64_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,38.920595,0.000000 +PyTorch,ne,ne_N64_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,11.054162,0.000000 +PyTorch,ne,ne_N64_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,24.071486,0.000000 +PyTorch,ne,ne_N64_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,16.014435,0.000000 +PyTorch,ne,ne_N64_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,29.079400,0.000000 +PyTorch,ne,ne_N64_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantFalse,short,False,28.000709,0.000000 +PyTorch,ne,ne_N64_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantTrue,short,False,42.665661,0.000000 +PyTorch,ne,ne_N64_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantFalse,short,False,26.996536,0.000000 +PyTorch,ne,ne_N64_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantTrue,short,False,42.408350,0.000000 +PyTorch,ne,ne_N64_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantFalse,short,False,22.120757,0.000000 +PyTorch,ne,ne_N64_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantTrue,short,False,34.036985,0.000000 +PyTorch,ne,ne_N64_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantFalse,short,False,20.305630,0.000000 +PyTorch,ne,ne_N64_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantTrue,short,False,33.293711,0.000000 +PyTorch,lt,lt_N8_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,10.989175,0.000000 +PyTorch,lt,lt_N8_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,24.022303,0.000000 +PyTorch,lt,lt_N8_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,14.211976,0.000000 +PyTorch,lt,lt_N8_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,28.225586,0.000000 +PyTorch,lt,lt_N8_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,6.725662,0.000000 +PyTorch,lt,lt_N8_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,18.036751,0.000000 +PyTorch,lt,lt_N8_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,11.195603,0.000000 +PyTorch,lt,lt_N8_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,23.173156,0.000000 +PyTorch,lt,lt_N8_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,10.922803,0.000000 +PyTorch,lt,lt_N8_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,24.063407,0.000000 +PyTorch,lt,lt_N8_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,14.478919,0.000000 +PyTorch,lt,lt_N8_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,28.725090,0.000000 +PyTorch,lt,lt_N8_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,6.556450,0.000000 +PyTorch,lt,lt_N8_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,17.992666,0.000000 +PyTorch,lt,lt_N8_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,11.041052,0.000000 +PyTorch,lt,lt_N8_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,23.128039,0.000000 +PyTorch,lt,lt_N8_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantFalse,short,False,10.908588,0.000000 
+PyTorch,lt,lt_N8_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantTrue,short,False,24.932022,0.000000 +PyTorch,lt,lt_N8_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantFalse,short,False,14.509387,0.000000 +PyTorch,lt,lt_N8_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantTrue,short,False,28.507423,0.000000 +PyTorch,lt,lt_N8_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantFalse,short,False,6.991223,0.000000 +PyTorch,lt,lt_N8_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantTrue,short,False,18.883428,0.000000 +PyTorch,lt,lt_N8_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantFalse,short,False,11.340537,0.000000 +PyTorch,lt,lt_N8_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantTrue,short,False,23.474580,0.000000 +PyTorch,lt,lt_N64_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,17.780582,0.000000 +PyTorch,lt,lt_N64_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,33.483268,0.000000 +PyTorch,lt,lt_N64_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,21.736950,0.000000 +PyTorch,lt,lt_N64_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,37.622393,0.000000 +PyTorch,lt,lt_N64_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,11.501619,0.000000 +PyTorch,lt,lt_N64_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,25.636465,0.000000 +PyTorch,lt,lt_N64_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,16.487000,0.000000 +PyTorch,lt,lt_N64_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,28.538948,0.000000 +PyTorch,lt,lt_N64_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,19.407710,0.000000 +PyTorch,lt,lt_N64_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,34.710407,0.000000 +PyTorch,lt,lt_N64_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,23.001715,0.000000 +PyTorch,lt,lt_N64_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,38.803145,0.000000 +PyTorch,lt,lt_N64_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,11.308907,0.000000 +PyTorch,lt,lt_N64_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,25.126098,0.000000 +PyTorch,lt,lt_N64_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,16.409281,0.000000 +PyTorch,lt,lt_N64_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,28.723077,0.000000 +PyTorch,lt,lt_N64_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantFalse,short,False,28.078608,0.000000 +PyTorch,lt,lt_N64_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantTrue,short,False,43.862870,0.000000 +PyTorch,lt,lt_N64_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantFalse,short,False,28.342684,0.000000 +PyTorch,lt,lt_N64_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantTrue,short,False,45.247717,0.000000 +PyTorch,lt,lt_N64_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantFalse,short,False,22.467307,0.000000 +PyTorch,lt,lt_N64_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantTrue,short,False,35.229839,0.000000 +PyTorch,lt,lt_N64_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantFalse,short,False,20.828508,0.000000 +PyTorch,lt,lt_N64_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantTrue,short,False,34.281815,0.000000 
+PyTorch,gt,gt_N8_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,10.201065,0.000000 +PyTorch,gt,gt_N8_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,24.046987,0.000000 +PyTorch,gt,gt_N8_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,13.518527,0.000000 +PyTorch,gt,gt_N8_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,28.247002,0.000000 +PyTorch,gt,gt_N8_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,6.413535,0.000000 +PyTorch,gt,gt_N8_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,17.443923,0.000000 +PyTorch,gt,gt_N8_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,10.946319,0.000000 +PyTorch,gt,gt_N8_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,23.251914,0.000000 +PyTorch,gt,gt_N8_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,9.841737,0.000000 +PyTorch,gt,gt_N8_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,23.463844,0.000000 +PyTorch,gt,gt_N8_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,13.387307,0.000000 +PyTorch,gt,gt_N8_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,28.580578,0.000000 +PyTorch,gt,gt_N8_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,6.499470,0.000000 +PyTorch,gt,gt_N8_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,17.091755,0.000000 +PyTorch,gt,gt_N8_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,10.880642,0.000000 +PyTorch,gt,gt_N8_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,23.144200,0.000000 +PyTorch,gt,gt_N8_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantFalse,short,False,10.522574,0.000000 +PyTorch,gt,gt_N8_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantTrue,short,False,24.733810,0.000000 +PyTorch,gt,gt_N8_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantFalse,short,False,13.634346,0.000000 +PyTorch,gt,gt_N8_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantTrue,short,False,28.491347,0.000000 +PyTorch,gt,gt_N8_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantFalse,short,False,6.759546,0.000000 +PyTorch,gt,gt_N8_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantTrue,short,False,18.334460,0.000000 +PyTorch,gt,gt_N8_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantFalse,short,False,11.276761,0.000000 +PyTorch,gt,gt_N8_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantTrue,short,False,23.338620,0.000000 +PyTorch,gt,gt_N64_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,17.579850,0.000000 +PyTorch,gt,gt_N64_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,33.150634,0.000000 +PyTorch,gt,gt_N64_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,21.018504,0.000000 +PyTorch,gt,gt_N64_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,37.094236,0.000000 +PyTorch,gt,gt_N64_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,10.386846,0.000000 +PyTorch,gt,gt_N64_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,24.705712,0.000000 +PyTorch,gt,gt_N64_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,16.199474,0.000000 
+PyTorch,gt,gt_N64_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,28.768630,0.000000 +PyTorch,gt,gt_N64_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,18.496909,0.000000 +PyTorch,gt,gt_N64_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,34.266361,0.000000 +PyTorch,gt,gt_N64_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,22.630030,0.000000 +PyTorch,gt,gt_N64_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,38.576213,0.000000 +PyTorch,gt,gt_N64_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,10.491930,0.000000 +PyTorch,gt,gt_N64_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,23.950235,0.000000 +PyTorch,gt,gt_N64_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,15.528805,0.000000 +PyTorch,gt,gt_N64_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,28.809764,0.000000 +PyTorch,gt,gt_N64_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantFalse,short,False,27.852019,0.000000 +PyTorch,gt,gt_N64_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantTrue,short,False,43.631335,0.000000 +PyTorch,gt,gt_N64_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantFalse,short,False,28.047012,0.000000 +PyTorch,gt,gt_N64_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantTrue,short,False,43.522750,0.000000 +PyTorch,gt,gt_N64_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantFalse,short,False,21.437350,0.000000 +PyTorch,gt,gt_N64_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantTrue,short,False,34.323098,0.000000 +PyTorch,gt,gt_N64_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantFalse,short,False,20.572556,0.000000 +PyTorch,gt,gt_N64_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantTrue,short,False,33.726399,0.000000 +PyTorch,le,le_N8_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,10.355769,0.000000 +PyTorch,le,le_N8_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,24.231171,0.000000 +PyTorch,le,le_N8_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,14.381682,0.000000 +PyTorch,le,le_N8_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,27.908206,0.000000 +PyTorch,le,le_N8_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,7.015842,0.000000 +PyTorch,le,le_N8_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,18.156515,0.000000 +PyTorch,le,le_N8_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,10.764506,0.000000 +PyTorch,le,le_N8_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,22.775082,0.000000 +PyTorch,le,le_N8_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,10.645387,0.000000 +PyTorch,le,le_N8_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,23.661967,0.000000 +PyTorch,le,le_N8_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,14.528062,0.000000 +PyTorch,le,le_N8_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,28.619186,0.000000 +PyTorch,le,le_N8_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,6.821544,0.000000 +PyTorch,le,le_N8_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,17.372435,0.000000 
+PyTorch,le,le_N8_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,10.892625,0.000000 +PyTorch,le,le_N8_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,22.654621,0.000000 +PyTorch,le,le_N8_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantFalse,short,False,10.859466,0.000000 +PyTorch,le,le_N8_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantTrue,short,False,24.897908,0.000000 +PyTorch,le,le_N8_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantFalse,short,False,14.472520,0.000000 +PyTorch,le,le_N8_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantTrue,short,False,27.655807,0.000000 +PyTorch,le,le_N8_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantFalse,short,False,7.103746,0.000000 +PyTorch,le,le_N8_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantTrue,short,False,18.891796,0.000000 +PyTorch,le,le_N8_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantFalse,short,False,11.237153,0.000000 +PyTorch,le,le_N8_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantTrue,short,False,23.076524,0.000000 +PyTorch,le,le_N64_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,18.089216,0.000000 +PyTorch,le,le_N64_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,33.345103,0.000000 +PyTorch,le,le_N64_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,21.725297,0.000000 +PyTorch,le,le_N64_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,35.991615,0.000000 +PyTorch,le,le_N64_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,12.072585,0.000000 +PyTorch,le,le_N64_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,24.803279,0.000000 +PyTorch,le,le_N64_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,16.287302,0.000000 +PyTorch,le,le_N64_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,28.200946,0.000000 +PyTorch,le,le_N64_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,19.513103,0.000000 +PyTorch,le,le_N64_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,34.783793,0.000000 +PyTorch,le,le_N64_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,22.548814,0.000000 +PyTorch,le,le_N64_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,37.271383,0.000000 +PyTorch,le,le_N64_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,11.784068,0.000000 +PyTorch,le,le_N64_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,24.427171,0.000000 +PyTorch,le,le_N64_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,16.172816,0.000000 +PyTorch,le,le_N64_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,28.083668,0.000000 +PyTorch,le,le_N64_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantFalse,short,False,28.238695,0.000000 +PyTorch,le,le_N64_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantTrue,short,False,44.109961,0.000000 +PyTorch,le,le_N64_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantFalse,short,False,28.149361,0.000000 +PyTorch,le,le_N64_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantTrue,short,False,41.709949,0.000000 +PyTorch,le,le_N64_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantFalse,short,False,22.886642,0.000000 
+PyTorch,le,le_N64_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantTrue,short,False,34.559269,0.000000 +PyTorch,le,le_N64_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantFalse,short,False,20.791157,0.000000 +PyTorch,le,le_N64_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantTrue,short,False,33.302911,0.000000 +PyTorch,ge,ge_N8_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,10.306199,0.000000 +PyTorch,ge,ge_N8_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,23.398023,0.000000 +PyTorch,ge,ge_N8_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,14.367481,0.000000 +PyTorch,ge,ge_N8_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,29.014630,0.000000 +PyTorch,ge,ge_N8_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,6.389997,0.000000 +PyTorch,ge,ge_N8_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,17.330705,0.000000 +PyTorch,ge,ge_N8_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,10.804766,0.000000 +PyTorch,ge,ge_N8_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,23.171337,0.000000 +PyTorch,ge,ge_N8_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,10.069797,0.000000 +PyTorch,ge,ge_N8_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,23.063348,0.000000 +PyTorch,ge,ge_N8_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,14.393169,0.000000 +PyTorch,ge,ge_N8_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,29.074848,0.000000 +PyTorch,ge,ge_N8_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,6.426396,0.000000 +PyTorch,ge,ge_N8_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,16.922122,0.000000 +PyTorch,ge,ge_N8_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,10.935307,0.000000 +PyTorch,ge,ge_N8_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,23.255825,0.000000 +PyTorch,ge,ge_N8_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantFalse,short,False,10.479719,0.000000 +PyTorch,ge,ge_N8_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantTrue,short,False,24.519697,0.000000 +PyTorch,ge,ge_N8_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantFalse,short,False,14.386574,0.000000 +PyTorch,ge,ge_N8_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantTrue,short,False,29.143988,0.000000 +PyTorch,ge,ge_N8_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantFalse,short,False,6.898638,0.000000 +PyTorch,ge,ge_N8_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantTrue,short,False,18.271767,0.000000 +PyTorch,ge,ge_N8_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantFalse,short,False,10.997651,0.000000 +PyTorch,ge,ge_N8_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantTrue,short,False,23.476497,0.000000 +PyTorch,ge,ge_N64_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,16.836825,0.000000 +PyTorch,ge,ge_N64_dtypetorch.quint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,32.890492,0.000000 +PyTorch,ge,ge_N64_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,20.590077,0.000000 +PyTorch,ge,ge_N64_dtypetorch.quint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,36.788412,0.000000 
+PyTorch,ge,ge_N64_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,9.996323,0.000000 +PyTorch,ge,ge_N64_dtypetorch.quint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,24.700884,0.000000 +PyTorch,ge,ge_N64_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,16.088683,0.000000 +PyTorch,ge,ge_N64_dtypetorch.quint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,28.550079,0.000000 +PyTorch,ge,ge_N64_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantFalse,short,False,18.296114,0.000000 +PyTorch,ge,ge_N64_dtypetorch.qint8_contigFalse_other_scalarFalse_out_variantTrue,short,False,34.263955,0.000000 +PyTorch,ge,ge_N64_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantFalse,short,False,21.947267,0.000000 +PyTorch,ge,ge_N64_dtypetorch.qint8_contigFalse_other_scalarTrue_out_variantTrue,short,False,38.622379,0.000000 +PyTorch,ge,ge_N64_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantFalse,short,False,10.075395,0.000000 +PyTorch,ge,ge_N64_dtypetorch.qint8_contigTrue_other_scalarFalse_out_variantTrue,short,False,24.391116,0.000000 +PyTorch,ge,ge_N64_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantFalse,short,False,15.990073,0.000000 +PyTorch,ge,ge_N64_dtypetorch.qint8_contigTrue_other_scalarTrue_out_variantTrue,short,False,28.557654,0.000000 +PyTorch,ge,ge_N64_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantFalse,short,False,28.126564,0.000000 +PyTorch,ge,ge_N64_dtypetorch.qint32_contigFalse_other_scalarFalse_out_variantTrue,short,False,43.531679,0.000000 +PyTorch,ge,ge_N64_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantFalse,short,False,26.983753,0.000000 +PyTorch,ge,ge_N64_dtypetorch.qint32_contigFalse_other_scalarTrue_out_variantTrue,short,False,43.014786,0.000000 +PyTorch,ge,ge_N64_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantFalse,short,False,21.464556,0.000000 +PyTorch,ge,ge_N64_dtypetorch.qint32_contigTrue_other_scalarFalse_out_variantTrue,short,False,34.336164,0.000000 +PyTorch,ge,ge_N64_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantFalse,short,False,20.083832,0.000000 +PyTorch,ge,ge_N64_dtypetorch.qint32_contigTrue_other_scalarTrue_out_variantTrue,short,False,33.717209,0.000000 +PyTorch,QConv1d,QConv1d_IC128_OC256_kernel3_stride1_N1_L64_cpu,short,False,2474.554141,0.000000 +PyTorch,QConv1d,QConv1d_IC256_OC256_kernel3_stride2_N4_L64_cpu,short,False,10019.689350,0.000000 +PyTorch,QConv2d,QConv2d_IC256_OC256_kernel3_stride1_N1_H16_W16_G1_pad0_cpu,short,False,2819.508730,0.000000 +PyTorch,qembeddingbag_byte_prepack,qembeddingbag_byte_prepack_num_embeddings80_embedding_dim128,short,False,18.134076,0.000000 +PyTorch,qembeddingbag_byte_prepack,qembeddingbag_byte_prepack_num_embeddings80_embedding_dim256,short,False,34.939813,0.000000 +PyTorch,qembeddingbag_byte_prepack,qembeddingbag_byte_prepack_num_embeddings80_embedding_dim512,short,False,65.717219,0.000000 +PyTorch,qembeddingbag_4bit_prepack,qembeddingbag_4bit_prepack_num_embeddings80_embedding_dim128,short,False,36.029054,0.000000 +PyTorch,qembeddingbag_4bit_prepack,qembeddingbag_4bit_prepack_num_embeddings80_embedding_dim256,short,False,66.511117,0.000000 +PyTorch,qembeddingbag_4bit_prepack,qembeddingbag_4bit_prepack_num_embeddings80_embedding_dim512,short,False,128.594099,0.000000 +PyTorch,qembeddingbag_2bit_prepack,qembeddingbag_2bit_prepack_num_embeddings80_embedding_dim128,short,False,35.738603,0.000000 
+PyTorch,qembeddingbag_2bit_prepack,qembeddingbag_2bit_prepack_num_embeddings80_embedding_dim256,short,False,67.034801,0.000000 +PyTorch,qembeddingbag_2bit_prepack,qembeddingbag_2bit_prepack_num_embeddings80_embedding_dim512,short,False,129.472195,0.000000 +PyTorch,qembeddingbag_byte_unpack,qembeddingbag_byte_unpack_num_embeddings80_embedding_dim128,short,False,6.597953,0.000000 +PyTorch,qembeddingbag_byte_unpack,qembeddingbag_byte_unpack_num_embeddings80_embedding_dim256,short,False,9.279742,0.000000 +PyTorch,qembeddingbag_byte_unpack,qembeddingbag_byte_unpack_num_embeddings80_embedding_dim512,short,False,12.878452,0.000000 +PyTorch,qembeddingbag_4bit_unpack,qembeddingbag_4bit_unpack_num_embeddings80_embedding_dim128,short,False,57.690957,0.000000 +PyTorch,qembeddingbag_4bit_unpack,qembeddingbag_4bit_unpack_num_embeddings80_embedding_dim256,short,False,109.143374,0.000000 +PyTorch,qembeddingbag_4bit_unpack,qembeddingbag_4bit_unpack_num_embeddings80_embedding_dim512,short,False,211.718602,0.000000 +PyTorch,qembeddingbag_2bit_unpack,qembeddingbag_2bit_unpack_num_embeddings80_embedding_dim128,short,False,110.866952,0.000000 +PyTorch,qembeddingbag_2bit_unpack,qembeddingbag_2bit_unpack_num_embeddings80_embedding_dim256,short,False,213.131957,0.000000 +PyTorch,qembeddingbag_2bit_unpack,qembeddingbag_2bit_unpack_num_embeddings80_embedding_dim512,short,False,418.880093,0.000000 +PyTorch,qembeddingbag_byte_prepack,qembeddingbag_byte_prepack_num_embeddings80_embedding_dim128_batch_size10,short,False,206.945818,0.000000 +PyTorch,qembeddingbag_byte_prepack,qembeddingbag_byte_prepack_num_embeddings80_embedding_dim256_batch_size10,short,False,363.442792,0.000000 +PyTorch,qembeddingbag_byte_prepack,qembeddingbag_byte_prepack_num_embeddings80_embedding_dim512_batch_size10,short,False,666.987745,0.000000 +PyTorch,qembeddingbag_4bit_prepack,qembeddingbag_4bit_prepack_num_embeddings80_embedding_dim128_batch_size10,short,False,6.759820,0.000000 +PyTorch,qembeddingbag_4bit_prepack,qembeddingbag_4bit_prepack_num_embeddings80_embedding_dim256_batch_size10,short,False,6.655541,0.000000 +PyTorch,qembeddingbag_4bit_prepack,qembeddingbag_4bit_prepack_num_embeddings80_embedding_dim512_batch_size10,short,False,6.737512,0.000000 +PyTorch,qembeddingbag_2bit_prepack,qembeddingbag_2bit_prepack_num_embeddings80_embedding_dim128_batch_size10,short,False,6.743112,0.000000 +PyTorch,qembeddingbag_2bit_prepack,qembeddingbag_2bit_prepack_num_embeddings80_embedding_dim256_batch_size10,short,False,6.652576,0.000000 +PyTorch,qembeddingbag_2bit_prepack,qembeddingbag_2bit_prepack_num_embeddings80_embedding_dim512_batch_size10,short,False,6.841990,0.000000 +PyTorch,qembeddingbag_byte_unpack,qembeddingbag_byte_unpack_num_embeddings80_embedding_dim128_batch_size10,short,False,23.021744,0.000000 +PyTorch,qembeddingbag_byte_unpack,qembeddingbag_byte_unpack_num_embeddings80_embedding_dim256_batch_size10,short,False,38.487234,0.000000 +PyTorch,qembeddingbag_byte_unpack,qembeddingbag_byte_unpack_num_embeddings80_embedding_dim512_batch_size10,short,False,71.024263,0.000000 +PyTorch,qembeddingbag_4bit_unpack,qembeddingbag_4bit_unpack_num_embeddings80_embedding_dim128_batch_size10,short,False,8.177698,0.000000 +PyTorch,qembeddingbag_4bit_unpack,qembeddingbag_4bit_unpack_num_embeddings80_embedding_dim256_batch_size10,short,False,8.039202,0.000000 +PyTorch,qembeddingbag_4bit_unpack,qembeddingbag_4bit_unpack_num_embeddings80_embedding_dim512_batch_size10,short,False,8.332832,0.000000 
+PyTorch,qembeddingbag_2bit_unpack,qembeddingbag_2bit_unpack_num_embeddings80_embedding_dim128_batch_size10,short,False,11.874304,0.000000 +PyTorch,qembeddingbag_2bit_unpack,qembeddingbag_2bit_unpack_num_embeddings80_embedding_dim256_batch_size10,short,False,11.875088,0.000000 +PyTorch,qembeddingbag_2bit_unpack,qembeddingbag_2bit_unpack_num_embeddings80_embedding_dim512_batch_size10,short,False,11.973970,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags10_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,37.749198,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags10_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,37.918866,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags10_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,37.601117,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags10_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,37.524010,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags10_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,37.579205,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags10_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,37.955366,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags10_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,37.884045,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags10_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,38.208370,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags10_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,38.443378,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags10_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,38.740487,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags10_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,38.368374,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags10_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,38.422703,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags120_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,37.686129,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags120_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,37.801677,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags120_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,37.489407,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags120_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,37.679521,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags120_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,37.752840,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags120_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,37.905238,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags120_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,37.819355,0.000000 
+PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags120_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,38.130109,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags120_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,38.408468,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags120_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,38.747029,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags120_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,38.404787,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags120_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,38.502984,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags1000_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,37.756773,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags1000_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,37.893388,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags1000_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,37.831078,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags1000_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,37.867489,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags1000_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,37.857305,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags1000_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,37.989236,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags1000_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,37.809535,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags1000_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,37.960946,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags1000_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,38.544690,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags1000_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,38.844939,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags1000_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,38.371755,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags1000_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,39.108865,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags2300_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,37.655707,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags2300_dim64_modesum_input_size8_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,37.948385,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags2300_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,37.677788,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags2300_dim64_modesum_input_size8_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,38.097931,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags2300_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,37.906198,0.000000 
+PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags2300_dim64_modesum_input_size16_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,38.246369,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags2300_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,37.859952,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags2300_dim64_modesum_input_size16_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,38.499342,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags2300_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetTrue_cpu,short,False,38.788211,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags2300_dim64_modesum_input_size64_offset0_sparseTrue_include_last_offsetFalse_cpu,short,False,38.998297,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags2300_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetTrue_cpu,short,False,38.683481,0.000000 +PyTorch,qEmbeddingBag,qEmbeddingBag_embeddingbags2300_dim64_modesum_input_size64_offset0_sparseFalse_include_last_offsetFalse_cpu,short,False,38.536436,0.000000 +PyTorch,QGroupNormBenchmark,"QGroupNormBenchmark_dims(32,8,16)_num_groups2_dtypetorch.qint8",short,False,58.164334,0.000000 +PyTorch,QGroupNormBenchmark,"QGroupNormBenchmark_dims(32,8,16)_num_groups4_dtypetorch.qint8",short,False,57.796211,0.000000 +PyTorch,QGroupNormBenchmark,"QGroupNormBenchmark_dims(32,8,56,56)_num_groups2_dtypetorch.qint8",short,False,1148.216412,0.000000 +PyTorch,QGroupNormBenchmark,"QGroupNormBenchmark_dims(32,8,56,56)_num_groups4_dtypetorch.qint8",short,False,1148.804126,0.000000 +PyTorch,QInstanceNormBenchmark,"QInstanceNormBenchmark_dims(32,8,16)_dtypetorch.qint8",short,False,57.575234,0.000000 +PyTorch,QInstanceNormBenchmark,"QInstanceNormBenchmark_dims(32,8,56,56)_dtypetorch.qint8",short,False,1147.707670,0.000000 +PyTorch,q_interpolate,q_interpolate_M32_N32_K32_dtypetorch.quint8_modenearest_scale0.5_contigTrue,short,False,7.150264,0.000000 +PyTorch,q_interpolate,q_interpolate_M32_N32_K32_dtypetorch.quint8_modebilinear_scale0.5_contigTrue,short,False,9.218789,0.000000 +PyTorch,q_interpolate,q_interpolate_M32_N32_K32_dtypetorch.quint8_modenearest_scale2.0_contigTrue,short,False,7.490512,0.000000 +PyTorch,q_interpolate,q_interpolate_M32_N32_K32_dtypetorch.quint8_modebilinear_scale2.0_contigTrue,short,False,9.314491,0.000000 +PyTorch,q_interpolate,q_interpolate_M3_N720_K1280_dtypetorch.quint8_modebilinear_scale0.83333_contigTrue,short,False,66.910531,0.000000 +PyTorch,QLayerNormBenchmark,"QLayerNormBenchmark_dims(1,8,16)_dtypetorch.qint8",short,False,15.853110,0.000000 +PyTorch,QLayerNormBenchmark,"QLayerNormBenchmark_dims(8,8,16)_dtypetorch.qint8",short,False,62.647792,0.000000 +PyTorch,QLayerNormBenchmark,"QLayerNormBenchmark_dims(32,8,16)_dtypetorch.qint8",short,False,66.094037,0.000000 +PyTorch,QLayerNormBenchmark,"QLayerNormBenchmark_dims(64,128,56,56)_dtypetorch.qint8",short,False,51655.592280,0.000000 +PyTorch,QLinear,QLinear_N1_IN1_OUT1_cpu,short,False,48.466068,0.000000 +PyTorch,QLinear,QLinear_N4_IN256_OUT128_cpu,short,False,97.047966,0.000000 +PyTorch,QLinear,QLinear_N16_IN512_OUT256_cpu,short,False,92.013699,0.000000 +PyTorch,QDynamicLinear,QDynamicLinear_N1_IN1_OUT1_cpu,short,False,55.162945,0.000000 +PyTorch,QDynamicLinear,QDynamicLinear_N4_IN256_OUT128_cpu,short,False,181.460491,0.000000 +PyTorch,QDynamicLinear,QDynamicLinear_N16_IN512_OUT256_cpu,short,False,186.868091,0.000000 
+PyTorch,MinMaxObserver,MinMaxObserver_C3_M512_N512_dtypetorch.quint8_cpu_qschemetorch.per_tensor_affine,short,False,178.683642,0.000000 +PyTorch,MinMaxObserver,MinMaxObserver_C3_M512_N512_dtypetorch.quint8_cpu_qschemetorch.per_tensor_symmetric,short,False,165.985880,0.000000 +PyTorch,MovingAverageMinMaxObserver,MovingAverageMinMaxObserver_C3_M512_N512_dtypetorch.quint8_cpu_qschemetorch.per_tensor_affine,short,False,209.793412,0.000000 +PyTorch,MovingAverageMinMaxObserver,MovingAverageMinMaxObserver_C3_M512_N512_dtypetorch.quint8_cpu_qschemetorch.per_tensor_symmetric,short,False,199.116115,0.000000 +PyTorch,PerChannelMinMaxObserver,PerChannelMinMaxObserver_C3_M512_N512_dtypetorch.quint8_cpu_qschemetorch.per_channel_affine,short,False,383.567212,0.000000 +PyTorch,PerChannelMinMaxObserver,PerChannelMinMaxObserver_C3_M512_N512_dtypetorch.quint8_cpu_qschemetorch.per_channel_symmetric,short,False,386.658467,0.000000 +PyTorch,MovingAveragePerChannelMinMaxObserver,MovingAveragePerChannelMinMaxObserver_C3_M512_N512_dtypetorch.quint8_cpu_qschemetorch.per_channel_affine,short,False,406.231582,0.000000 +PyTorch,MovingAveragePerChannelMinMaxObserver,MovingAveragePerChannelMinMaxObserver_C3_M512_N512_dtypetorch.quint8_cpu_qschemetorch.per_channel_symmetric,short,False,424.846136,0.000000 +PyTorch,HistogramObserver,HistogramObserver_C3_M512_N512_dtypetorch.quint8_cpu_qschemetorch.per_tensor_affine,short,False,1852.950257,0.000000 +PyTorch,HistogramObserver,HistogramObserver_C3_M512_N512_dtypetorch.quint8_cpu_qschemetorch.per_tensor_symmetric,short,False,1886.575278,0.000000 +PyTorch,HistogramObserverCalculateQparams,HistogramObserverCalculateQparams_C3_M512_N512_dtypetorch.quint8_cpu_qschemetorch.per_tensor_affine,short,False,1916.034661,0.000000 +PyTorch,HistogramObserverCalculateQparams,HistogramObserverCalculateQparams_C3_M512_N512_dtypetorch.quint8_cpu_qschemetorch.per_tensor_symmetric,short,False,1848.436297,0.000000 +PyTorch,QAdaptiveAvgPool2dBenchmark,"QAdaptiveAvgPool2dBenchmark_N4_C3_input_size(224,224)_output_size(112,112)_contigTrue_dtypetorch.qint32",short,False,125.012330,0.000000 +PyTorch,QAdaptiveAvgPool2dBenchmark,"QAdaptiveAvgPool2dBenchmark_N4_C3_input_size(224,224)_output_size(112,112)_contigTrue_dtypetorch.qint8",short,False,120.338743,0.000000 +PyTorch,QAdaptiveAvgPool2dBenchmark,"QAdaptiveAvgPool2dBenchmark_N4_C3_input_size(224,224)_output_size(112,112)_contigTrue_dtypetorch.quint8",short,False,120.237932,0.000000 +PyTorch,QAvgPool2dBenchmark,"QAvgPool2dBenchmark_C1_H3_W3_k(3,3)_s(1,1)_p(0,0)_N2_contigTrue_dtypetorch.qint32",short,False,58.290125,0.000000 +PyTorch,QAvgPool2dBenchmark,"QAvgPool2dBenchmark_C1_H3_W3_k(3,3)_s(1,1)_p(0,0)_N2_contigTrue_dtypetorch.qint8",short,False,56.845484,0.000000 +PyTorch,QAvgPool2dBenchmark,"QAvgPool2dBenchmark_C1_H3_W3_k(3,3)_s(1,1)_p(0,0)_N2_contigTrue_dtypetorch.quint8",short,False,57.068030,0.000000 +PyTorch,QMaxPool2dBenchmark,"QMaxPool2dBenchmark_C1_H3_W3_k(3,3)_s(1,1)_p(0,0)_N2_contigTrue_dtypetorch.qint32",short,False,62.013425,0.000000 +PyTorch,QMaxPool2dBenchmark,"QMaxPool2dBenchmark_C1_H3_W3_k(3,3)_s(1,1)_p(0,0)_N2_contigTrue_dtypetorch.qint8",short,False,61.332599,0.000000 +PyTorch,QMaxPool2dBenchmark,"QMaxPool2dBenchmark_C1_H3_W3_k(3,3)_s(1,1)_p(0,0)_N2_contigTrue_dtypetorch.quint8",short,False,60.981402,0.000000 +PyTorch,QLSTM,QLSTM_I1_H3_NL1_BTrue_DFalse_dtypetorch.qint8,short,False,20708.077910,0.000000 +PyTorch,QLSTM,QLSTM_I1_H3_NL1_BTrue_DTrue_dtypetorch.qint8,short,False,41009.405290,0.000000 
+PyTorch,QLSTM,QLSTM_I5_H7_NL4_BTrue_DFalse_dtypetorch.qint8,short,False,81385.994580,0.000000 +PyTorch,QLSTM,QLSTM_I5_H7_NL4_BTrue_DTrue_dtypetorch.qint8,short,False,162347.641390,0.000000 +PyTorch,QMethodTensorInputCopyBenchmark,QMethodTensorInputCopyBenchmark_M32_N32_dtypetorch.quint8_contigFalse,short,False,0.884224,0.000000 +PyTorch,QMethodTensorInputCopyBenchmark,QMethodTensorInputCopyBenchmark_M32_N32_dtypetorch.quint8_contigTrue,short,False,0.881290,0.000000 +PyTorch,QuantizePerTensor,QuantizePerTensor_C3_M512_N512_dtypetorch.quint8_modeQ,short,False,139.818657,0.000000 +PyTorch,DequantizePerTensor,DequantizePerTensor_C3_M512_N512_dtypetorch.quint8_modeD,short,False,111.856445,0.000000 +PyTorch,QuantizePerChannel,QuantizePerChannel_C3_M512_N512_dtypetorch.quint8_modeQ_axis0,short,False,137.870248,0.000000 +PyTorch,DequantizePerChannel,DequantizePerChannel_C3_M512_N512_dtypetorch.quint8_modeD_axis0,short,False,295.384286,0.000000 +PyTorch,FakeQuantize,FakeQuantize_N1_C3_H512_W512_zero_point_dtypetorch.int32_cpu,short,False,498.468140,0.000000 +PyTorch,learnable_kernel_tensor,learnable_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits4_cpu,short,False,212.106189,0.000000 +PyTorch,learnable_kernel_tensor,learnable_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits8_cpu,short,False,212.103393,0.000000 +PyTorch,original_kernel_tensor,original_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits4_cpu,short,False,210.769552,0.000000 +PyTorch,original_kernel_tensor,original_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits8_cpu,short,False,210.336579,0.000000 +PyTorch,learnable_kernel_tensor,learnable_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits4_cpu_bwdall_BACKWARD,short,True,645.670738,0.000000 +PyTorch,learnable_kernel_tensor,learnable_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits4_cpu_bwd1_BACKWARD,short,True,646.979930,0.000000 +PyTorch,learnable_kernel_tensor,learnable_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits4_cpu_bwd2_BACKWARD,short,True,648.774775,0.000000 +PyTorch,learnable_kernel_tensor,learnable_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits4_cpu_bwd3_BACKWARD,short,True,647.536140,0.000000 +PyTorch,learnable_kernel_tensor,learnable_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits8_cpu_bwdall_BACKWARD,short,True,645.420480,0.000000 +PyTorch,learnable_kernel_tensor,learnable_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits8_cpu_bwd1_BACKWARD,short,True,647.989360,0.000000 +PyTorch,learnable_kernel_tensor,learnable_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits8_cpu_bwd2_BACKWARD,short,True,648.279117,0.000000 +PyTorch,learnable_kernel_tensor,learnable_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits8_cpu_bwd3_BACKWARD,short,True,648.012305,0.000000 +PyTorch,original_kernel_tensor,original_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits4_cpu_bwdall_BACKWARD,short,True,396.607204,0.000000 +PyTorch,original_kernel_tensor,original_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits4_cpu_bwd1_BACKWARD,short,True,396.439610,0.000000 +PyTorch,original_kernel_tensor,original_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits4_cpu_bwd2_BACKWARD,short,True,398.157875,0.000000 +PyTorch,original_kernel_tensor,original_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits4_cpu_bwd3_BACKWARD,short,True,393.582596,0.000000 
+PyTorch,original_kernel_tensor,original_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits8_cpu_bwdall_BACKWARD,short,True,394.932475,0.000000 +PyTorch,original_kernel_tensor,original_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits8_cpu_bwd1_BACKWARD,short,True,398.150060,0.000000 +PyTorch,original_kernel_tensor,original_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits8_cpu_bwd2_BACKWARD,short,True,394.573905,0.000000 +PyTorch,original_kernel_tensor,original_kernel_tensor_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits8_cpu_bwd3_BACKWARD,short,True,389.742169,0.000000 +PyTorch,learnable_kernel_channel,learnable_kernel_channel_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits4_cpu,short,False,462.132270,0.000000 +PyTorch,learnable_kernel_channel,learnable_kernel_channel_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits8_cpu,short,False,460.794395,0.000000 +PyTorch,original_kernel_channel,original_kernel_channel_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits4_cpu,short,False,454.659963,0.000000 +PyTorch,original_kernel_channel,original_kernel_channel_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits8_cpu,short,False,450.819046,0.000000 +PyTorch,learnable_kernel_channel,learnable_kernel_channel_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits4_cpu_bwdall_BACKWARD,short,True,727.548224,0.000000 +PyTorch,learnable_kernel_channel,learnable_kernel_channel_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits4_cpu_bwd1_BACKWARD,short,True,732.767646,0.000000 +PyTorch,learnable_kernel_channel,learnable_kernel_channel_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits4_cpu_bwd2_BACKWARD,short,True,731.549638,0.000000 +PyTorch,learnable_kernel_channel,learnable_kernel_channel_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits4_cpu_bwd3_BACKWARD,short,True,732.523360,0.000000 +PyTorch,learnable_kernel_channel,learnable_kernel_channel_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits8_cpu_bwdall_BACKWARD,short,True,734.845672,0.000000 +PyTorch,learnable_kernel_channel,learnable_kernel_channel_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits8_cpu_bwd1_BACKWARD,short,True,734.484530,0.000000 +PyTorch,learnable_kernel_channel,learnable_kernel_channel_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits8_cpu_bwd2_BACKWARD,short,True,731.358856,0.000000 +PyTorch,learnable_kernel_channel,learnable_kernel_channel_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits8_cpu_bwd3_BACKWARD,short,True,732.279545,0.000000 +PyTorch,original_kernel_channel,original_kernel_channel_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits4_cpu_bwdall_BACKWARD,short,True,392.022089,0.000000 +PyTorch,original_kernel_channel,original_kernel_channel_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits4_cpu_bwd1_BACKWARD,short,True,396.691596,0.000000 +PyTorch,original_kernel_channel,original_kernel_channel_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits8_cpu_bwdall_BACKWARD,short,True,395.044202,0.000000 +PyTorch,original_kernel_channel,original_kernel_channel_N1_C3_H512_W512_zero_point_dtypetorch.int32_nbits8_cpu_bwd1_BACKWARD,short,True,393.618618,0.000000 +PyTorch,q_argsort,q_argsort_M512_N512_dtypetorch.quint8,short,False,498.230444,0.000000 +PyTorch,q_clone,q_clone_M512_N512_dtypetorch.quint8,short,False,54.217228,0.000000 +PyTorch,q_mean,q_mean_M512_N512_dtypetorch.quint8,short,False,98.299090,0.000000 +PyTorch,q_relu,q_relu_M512_N512_dtypetorch.quint8,short,False,50.626535,0.000000 +PyTorch,q_relu_,q_relu__M512_N512_dtypetorch.quint8,short,False,50.900865,0.000000 
+PyTorch,q_sort,q_sort_M512_N512_dtypetorch.quint8,short,False,489.762199,0.000000 +PyTorch,qtopk,qtopk_M512_N512_k5_dtypetorch.quint8,short,False,106.761619,0.000000 +PyTorch,abs,abs_M512_N512_cpu,short,False,57.051424,0.000000 +PyTorch,abs_,abs__M512_N512_cpu,short,False,52.200911,0.000000 +PyTorch,acos,acos_M512_N512_cpu,short,False,163.152278,0.000000 +PyTorch,acos_,acos__M512_N512_cpu,short,False,154.986924,0.000000 +PyTorch,argsort,argsort_M512_N512_cpu,short,False,1293.551670,0.000000 +PyTorch,asin,asin_M512_N512_cpu,short,False,143.466299,0.000000 +PyTorch,asin_,asin__M512_N512_cpu,short,False,138.166554,0.000000 +PyTorch,atan,atan_M512_N512_cpu,short,False,183.999280,0.000000 +PyTorch,atan_,atan__M512_N512_cpu,short,False,178.477300,0.000000 +PyTorch,ceil,ceil_M512_N512_cpu,short,False,53.237791,0.000000 +PyTorch,ceil_,ceil__M512_N512_cpu,short,False,51.146127,0.000000 +PyTorch,clamp,clamp_M512_N512_cpu,short,False,57.982160,0.000000 +PyTorch,clone,clone_M512_N512_cpu,short,False,55.928251,0.000000 +PyTorch,cos,cos_M512_N512_cpu,short,False,153.934110,0.000000 +PyTorch,cos_,cos__M512_N512_cpu,short,False,149.205590,0.000000 +PyTorch,cosh,cosh_M512_N512_cpu,short,False,233.610736,0.000000 +PyTorch,digamma,digamma_M512_N512_cpu,short,False,512.670916,0.000000 +PyTorch,erf,erf_M512_N512_cpu,short,False,248.115065,0.000000 +PyTorch,erf_,erf__M512_N512_cpu,short,False,245.928480,0.000000 +PyTorch,erfc,erfc_M512_N512_cpu,short,False,471.492698,0.000000 +PyTorch,erfc_,erfc__M512_N512_cpu,short,False,466.460295,0.000000 +PyTorch,erfinv,erfinv_M512_N512_cpu,short,False,1359.954587,0.000000 +PyTorch,exp,exp_M512_N512_cpu,short,False,102.685068,0.000000 +PyTorch,exp_,exp__M512_N512_cpu,short,False,98.656667,0.000000 +PyTorch,expm1,expm1_M512_N512_cpu,short,False,224.464036,0.000000 +PyTorch,expm1_,expm1__M512_N512_cpu,short,False,220.063117,0.000000 +PyTorch,floor,floor_M512_N512_cpu,short,False,53.244395,0.000000 +PyTorch,floor_,floor__M512_N512_cpu,short,False,51.672797,0.000000 +PyTorch,frac,frac_M512_N512_cpu,short,False,55.433832,0.000000 +PyTorch,frac_,frac__M512_N512_cpu,short,False,51.270698,0.000000 +PyTorch,gelu,gelu_M512_N512_cpu,short,False,156.736075,0.000000 +PyTorch,hardshrink,hardshrink_M512_N512_cpu,short,False,57.883780,0.000000 +PyTorch,lgamma,lgamma_M512_N512_cpu,short,False,853.460615,0.000000 +PyTorch,log,log_M512_N512_cpu,short,False,154.847541,0.000000 +PyTorch,log10,log10_M512_N512_cpu,short,False,163.334617,0.000000 +PyTorch,log10_,log10__M512_N512_cpu,short,False,157.360735,0.000000 +PyTorch,log1p,log1p_M512_N512_cpu,short,False,163.516254,0.000000 +PyTorch,log1p_,log1p__M512_N512_cpu,short,False,159.639356,0.000000 +PyTorch,log2,log2_M512_N512_cpu,short,False,163.969243,0.000000 +PyTorch,log2_,log2__M512_N512_cpu,short,False,159.835136,0.000000 +PyTorch,log_,log__M512_N512_cpu,short,False,150.952504,0.000000 +PyTorch,logit,logit_M512_N512_cpu,short,False,177.961690,0.000000 +PyTorch,logit_,logit__M512_N512_cpu,short,False,172.351381,0.000000 +PyTorch,neg,neg_M512_N512_cpu,short,False,55.097290,0.000000 +PyTorch,neg_,neg__M512_N512_cpu,short,False,50.983444,0.000000 +PyTorch,reciprocal,reciprocal_M512_N512_cpu,short,False,63.374416,0.000000 +PyTorch,reciprocal_,reciprocal__M512_N512_cpu,short,False,58.360915,0.000000 +PyTorch,relu,relu_M512_N512_cpu,short,False,55.350610,0.000000 +PyTorch,relu_,relu__M512_N512_cpu,short,False,52.531514,0.000000 +PyTorch,round,round_M512_N512_cpu,short,False,54.882808,0.000000 
+PyTorch,round_,round__M512_N512_cpu,short,False,51.705845,0.000000 +PyTorch,rsqrt,rsqrt_M512_N512_cpu,short,False,72.353625,0.000000 +PyTorch,rsqrt_,rsqrt__M512_N512_cpu,short,False,67.110910,0.000000 +PyTorch,sigmoid,sigmoid_M512_N512_cpu,short,False,101.934045,0.000000 +PyTorch,sigmoid_,sigmoid__M512_N512_cpu,short,False,101.207989,0.000000 +PyTorch,sign,sign_M512_N512_cpu,short,False,57.157465,0.000000 +PyTorch,sgn,sgn_M512_N512_cpu,short,False,56.892450,0.000000 +PyTorch,sin,sin_M512_N512_cpu,short,False,129.825713,0.000000 +PyTorch,sin_,sin__M512_N512_cpu,short,False,124.252865,0.000000 +PyTorch,sinh,sinh_M512_N512_cpu,short,False,237.181745,0.000000 +PyTorch,sqrt,sqrt_M512_N512_cpu,short,False,55.643847,0.000000 +PyTorch,sqrt_,sqrt__M512_N512_cpu,short,False,51.970346,0.000000 +PyTorch,square,square_M512_N512_cpu,short,False,56.493474,0.000000 +PyTorch,square_,square__M512_N512_cpu,short,False,53.660946,0.000000 +PyTorch,tan,tan_M512_N512_cpu,short,False,212.381058,0.000000 +PyTorch,tan_,tan__M512_N512_cpu,short,False,209.302840,0.000000 +PyTorch,tanh,tanh_M512_N512_cpu,short,False,254.571910,0.000000 +PyTorch,tanh_,tanh__M512_N512_cpu,short,False,250.419008,0.000000 +PyTorch,trunc,trunc_M512_N512_cpu,short,False,50.202160,0.000000 +PyTorch,trunc_,trunc__M512_N512_cpu,short,False,48.335770,0.000000 +PyTorch,unique,unique_M512_N512_cpu,short,False,18881.017060,0.000000 +PyTorch,zero_,zero__M512_N512_cpu,short,False,48.573353,0.000000 +PyTorch,bernoulli_,bernoulli__M512_N512_cpu,short,False,2761.902873,0.000000 +PyTorch,cauchy_,cauchy__M512_N512_cpu,short,False,6134.592810,0.000000 +PyTorch,digamma_,digamma__M512_N512_cpu,short,False,968.574541,0.000000 +PyTorch,exponential_,exponential__M512_N512_cpu,short,False,4554.747990,0.000000 +PyTorch,normal_,normal__M512_N512_cpu,short,False,1969.108666,0.000000 +PyTorch,random_,random__M512_N512_cpu,short,False,742.022216,0.000000 +PyTorch,sign_,sign__M512_N512_cpu,short,False,53.070620,0.000000 +PyTorch,uniform_,uniform__M512_N512_cpu,short,False,719.128405,0.000000 +PyTorch,half,half_M512_N512_cpu,short,False,56.301074,0.000000 +PyTorch,long,long_M512_N512_cpu,short,False,69.495610,0.000000 diff --git a/benchmarks/operator_benchmark/pt/conv_test.py b/benchmarks/operator_benchmark/pt/conv_test.py index 93b4942cea2b..65baf47e0d67 100644 --- a/benchmarks/operator_benchmark/pt/conv_test.py +++ b/benchmarks/operator_benchmark/pt/conv_test.py @@ -38,12 +38,16 @@ class ConvTranspose1dBenchmark(op_bench.TorchBenchmarkBase): op_bench.generate_pt_test( configs.conv_1d_configs_short + configs.conv_1d_configs_long, Conv1dBenchmark ) -op_bench.generate_pt_test( - configs.convtranspose_1d_configs_short - + configs.conv_1d_configs_short - + configs.conv_1d_configs_long, - ConvTranspose1dBenchmark, -) + + +if not torch.backends.mkldnn.is_acl_available(): + # convtranpose1d crashes with ACL, see https://github.com/pytorch/pytorch/issues/165654 + op_bench.generate_pt_test( + configs.convtranspose_1d_configs_short + + configs.conv_1d_configs_short + + configs.conv_1d_configs_long, + ConvTranspose1dBenchmark, + ) """ From 3af2f0c12accc6bd10ef2b76fb5c51aa0f6b73a3 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Tue, 30 Sep 2025 14:15:23 +0000 Subject: [PATCH 041/123] [inductor] require shape in TritonCSEVariable (#162275) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162275 Approved by: https://github.com/mlazos ghstack dependencies: #164158 --- torch/_inductor/codegen/triton.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff 
--git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index adf4b6609347..a7d29a2fb736 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -951,8 +951,7 @@ class TritonCSEVariable(CSEVariable): # We'll use this to track which masks the variable needs when used for indirect indexing self.mask_vars: OrderedSet[str] = OrderedSet() assert dtype is not None, "TritonCSEVariable must have dtype" - # TODO: uncomment this and fix the few failures left - # assert shape is not None, "TritonCSEVariable must have shape" + assert shape is not None, "TritonCSEVariable must have shape" def update_on_args(self, name, args, kwargs): for arg in args: From 935ccdbe75c9c24c63a1131fecb119fc2eb441f3 Mon Sep 17 00:00:00 2001 From: inventshah <39803835+inventshah@users.noreply.github.com> Date: Fri, 17 Oct 2025 15:35:49 +0000 Subject: [PATCH 042/123] [MPS] Fix internal assertion in torch.linalg.solve for singular matrices (#165254) Fixes #163962 by special casing MPS in the negative status code branch in `_linalg_check_errors`. Checks if info is [`MPSMatrixDecompositionStatus.singular`](https://developer.apple.com/documentation/metalperformanceshaders/mpsmatrixdecompositionstatus/singular) (which has a raw value of -2). I didn't find an official Apple source with this raw value (besides printing the enum value), so I'm not sure if we can (or should) depend on it? Is there a way to directly get the Objective-C enum value in C++? Pull Request resolved: https://github.com/pytorch/pytorch/pull/165254 Approved by: https://github.com/malfet --- .../native/mps/operations/LinearAlgebra.mm | 25 +++++++++++++++++++ test/test_mps.py | 10 ++++++++ 2 files changed, 35 insertions(+) diff --git a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm index 2f490df8d330..d5c68119f673 100644 --- a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm +++ b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm @@ -196,6 +196,28 @@ bool use_metal_mm(const Tensor& self, const Tensor& other, const Tensor& output) other.size(0) > max_stride_size || other.size(1) > max_stride_size); } +void map_mps_decomposition_error_code_to_blas(const Tensor& status) { + const auto& status_flat = status.view(-1); + + for (const auto i : c10::irange(status_flat.size(0))) { + int code = status_flat[i].item(); + switch (code) { + case MPSMatrixDecompositionStatusSuccess: + status_flat[i] = 0; + break; + case MPSMatrixDecompositionStatusNonPositiveDefinite: + case MPSMatrixDecompositionStatusSingular: + status_flat[i] = 2; + break; + case MPSMatrixDecompositionStatusFailure: + status_flat[i] = -1; + break; + default: + TORCH_INTERNAL_ASSERT(false, "Unknown MPSMatrixDecompositionStatus enum value: ", code); + } + } +} + } // anonymous namespace static void linalg_lu_factor_ex_out_mps_impl(const Tensor& A, @@ -487,6 +509,9 @@ static void linalg_solve_out_mps_impl(const Tensor& A, "mpsmatrixdecompositionstatus for details."); } } + + map_mps_decomposition_error_code_to_blas(info); + if (!left) { // If this was a right solve, transpose the result back result.copy_(result_t.transpose(-2, -1).contiguous()); diff --git a/test/test_mps.py b/test/test_mps.py index 341f3338efa1..7346d1d26d44 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -1978,6 +1978,16 @@ class TestMPS(TestCaseMPS): run_linalg_solve_test(32, 10, 10) run_linalg_solve_test(32, 2, 2, 2, 2, 10, 10) + def test_linalg_solve_singular(self): + # Regression test for 
https://github.com/pytorch/pytorch/issues/163962 + + # Explicit singular matrix + A = torch.tensor([[1.0, 2.0], [2.0, 4.0]], device="mps") + b = torch.rand_like(A) + + with self.assertRaisesRegex(RuntimeError, "input matrix is singular"): + torch.linalg.solve(A, b) + def test_linalg_solve_with_broadcasting(self): from functools import partial import torch From 85c5433d38146dbb30ee410c45fc875ea70b673f Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 17 Oct 2025 15:57:01 +0000 Subject: [PATCH 043/123] Revert "Fix `_StridedShard` incorrect split (#165533)" This reverts commit dfc8a1c5ddc8401197e9ab546e03b0f745edc27b. Reverted https://github.com/pytorch/pytorch/pull/165533 on behalf of https://github.com/seemethere due to Causing a merge conflict internally, see D84829161 ([comment](https://github.com/pytorch/pytorch/pull/165533#issuecomment-3416143176)) --- test/distributed/tensor/test_redistribute.py | 17 ---- torch/distributed/tensor/_api.py | 34 +++----- torch/distributed/tensor/placement_types.py | 83 ++++++++++---------- 3 files changed, 52 insertions(+), 82 deletions(-) diff --git a/test/distributed/tensor/test_redistribute.py b/test/distributed/tensor/test_redistribute.py index 1eb0830422f6..8b5d031bccfd 100644 --- a/test/distributed/tensor/test_redistribute.py +++ b/test/distributed/tensor/test_redistribute.py @@ -20,7 +20,6 @@ from torch.distributed.tensor._collective_utils import shard_dim_alltoall from torch.distributed.tensor._dtensor_spec import ShardOrderEntry from torch.distributed.tensor._redistribute import redistribute_local_tensor from torch.distributed.tensor.debug import CommDebugMode -from torch.distributed.tensor.placement_types import _StridedShard from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, parametrize, @@ -1146,22 +1145,6 @@ class DistributeWithDeviceOrderTest(DTensorTestBase): sharded_dt, mesh, tgt_placement, shard_order=None ) - @with_comms - def test_shard_order_same_data_as_strided_shard(self): - device_mesh = init_device_mesh(self.device_type, (4, 2)) - x = torch.randn(8, 4, device=self.device_type) - # specify right-to-left order use _StridedShard - strided_placement = [_StridedShard(-2, split_factor=2), Shard(-2)] - x_strided_dt = distribute_tensor(x, device_mesh, strided_placement) - # specify right-to-left order use ordered shard - x_ordered_dt = self.distribute_tensor( - x, - device_mesh, - placements=[Shard(0), Shard(0)], - shard_order=(ShardOrderEntry(tensor_dim=0, mesh_dims=(1, 0)),), - ) - self.assertEqual(x_ordered_dt.to_local(), x_strided_dt.to_local()) - if __name__ == "__main__": run_tests() diff --git a/torch/distributed/tensor/_api.py b/torch/distributed/tensor/_api.py index 5fd66b2c5f8e..03eec9c7d1d4 100644 --- a/torch/distributed/tensor/_api.py +++ b/torch/distributed/tensor/_api.py @@ -25,7 +25,6 @@ from torch.distributed.tensor._utils import ( normalize_to_torch_size, ) from torch.distributed.tensor.placement_types import ( - _StridedShard, Partial, Placement, Replicate, @@ -777,29 +776,18 @@ def distribute_tensor( # distribute the tensor according to the placements. 
placements = list(placements) for idx, placement in enumerate(placements): - if isinstance(placement, Shard): - placement_dim = ( - placement.dim + tensor.ndim if placement.dim < 0 else placement.dim + if placement.is_shard(): + placement = cast(Shard, placement) + if placement.dim < 0: + # normalize shard placement dim + placement = Shard(placement.dim + tensor.ndim) + placements[idx] = placement + local_tensor = placement._shard_tensor( + local_tensor, device_mesh, idx, src_data_rank ) - if isinstance(placement, _StridedShard): - local_tensor = _StridedShard._make_shard_tensor( - placement_dim, - local_tensor, - device_mesh, - idx, - src_data_rank, - split_factor=placement.split_factor, - ) - placements[idx] = _StridedShard( - placement_dim, split_factor=placement.split_factor - ) - else: - local_tensor = Shard._make_shard_tensor( - placement_dim, local_tensor, device_mesh, idx, src_data_rank - ) - placements[idx] = Shard(placement_dim) - elif isinstance(placement, Replicate): - local_tensor = Replicate._make_replicate_tensor( + elif placement.is_replicate(): + placement = cast(Replicate, placement) + local_tensor = placement._replicate_tensor( local_tensor, device_mesh, idx, src_data_rank ) else: diff --git a/torch/distributed/tensor/placement_types.py b/torch/distributed/tensor/placement_types.py index 5f68ff03ee22..d6b7efadee6e 100644 --- a/torch/distributed/tensor/placement_types.py +++ b/torch/distributed/tensor/placement_types.py @@ -69,8 +69,9 @@ class Shard(Placement): else: return True - def _split_tensor( - self, + @staticmethod + def _make_split_tensor( + dim: int, tensor: torch.Tensor, num_chunks: int, *, @@ -86,31 +87,47 @@ class Shard(Placement): few ranks before calling the collectives (i.e. scatter/all_gather, etc.). This is because collectives usually require equal size tensor inputs """ - assert self.dim <= tensor.ndim, ( - f"Sharding dim {self.dim} greater than tensor ndim {tensor.ndim}" + assert dim <= tensor.ndim, ( + f"Sharding dim {dim} greater than tensor ndim {tensor.ndim}" ) # chunk tensor over dimension `dim` into n slices - tensor_list = list(torch.chunk(tensor, num_chunks, dim=self.dim)) + tensor_list = list(torch.chunk(tensor, num_chunks, dim=dim)) tensor_list = fill_empty_tensor_to_shards( - tensor_list, self.dim, num_chunks - len(tensor_list) + tensor_list, dim, num_chunks - len(tensor_list) ) # compute the chunk size inline with ``torch.chunk`` to calculate padding - full_chunk_size = (tensor.size(self.dim) + num_chunks - 1) // num_chunks + full_chunk_size = (tensor.size(dim) + num_chunks - 1) // num_chunks shard_list: list[torch.Tensor] = [] pad_sizes: list[int] = [] for shard in tensor_list: if with_padding: - pad_size = full_chunk_size - shard.size(self.dim) - shard = pad_tensor(shard, self.dim, pad_size) + pad_size = full_chunk_size - shard.size(dim) + shard = pad_tensor(shard, dim, pad_size) pad_sizes.append(pad_size) if contiguous: shard = shard.contiguous() shard_list.append(shard) return shard_list, pad_sizes + def _split_tensor( + self, + tensor: torch.Tensor, + num_chunks: int, + *, + with_padding: bool = True, + contiguous: bool = True, + ) -> tuple[list[torch.Tensor], list[int]]: + return Shard._make_split_tensor( + self.dim, + tensor, + num_chunks, + with_padding=with_padding, + contiguous=contiguous, + ) + @staticmethod @maybe_run_for_local_tensor def local_shard_size_and_offset( @@ -169,8 +186,9 @@ class Shard(Placement): local_tensor = local_tensor.contiguous() return local_tensor - def _shard_tensor( - self, + @staticmethod + def 
_make_shard_tensor( + dim: int, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int, @@ -192,14 +210,14 @@ class Shard(Placement): if src_data_rank is None: # src_data_rank specified as None explicitly means to skip the # communications, simply split - scatter_list, _ = self._split_tensor( - tensor, num_chunks, with_padding=False, contiguous=True + scatter_list, _ = Shard._make_split_tensor( + dim, tensor, num_chunks, with_padding=False, contiguous=True ) - return self._select_shard(scatter_list, mesh_dim_local_rank) + return Shard._select_shard(scatter_list, mesh_dim_local_rank) - scatter_list, pad_sizes = self._split_tensor( - tensor, num_chunks, with_padding=True, contiguous=True + scatter_list, pad_sizes = Shard._make_split_tensor( + dim, tensor, num_chunks, with_padding=True, contiguous=True ) it = iter(scatter_list) @@ -216,20 +234,17 @@ class Shard(Placement): ) return Shard._maybe_unpad_tensor_with_sizes( - self.dim, output, pad_sizes, mesh_dim_local_rank, True + dim, output, pad_sizes, mesh_dim_local_rank, True ) - @classmethod - def _make_shard_tensor( - cls, - dim: int, + def _shard_tensor( + self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int, src_data_rank: Optional[int] = 0, ) -> torch.Tensor: - shard_placement = cls(dim) - return shard_placement._shard_tensor(tensor, mesh, mesh_dim, src_data_rank) + return Shard._make_shard_tensor(self.dim, tensor, mesh, mesh_dim, src_data_rank) def _reduce_shard_tensor( self, @@ -252,8 +267,8 @@ class Shard(Placement): is_padded = tensor.size(self.dim) % num_chunks != 0 pad_sizes = None if is_padded: - scattered_list, pad_sizes = self._split_tensor( - tensor, num_chunks, with_padding=True, contiguous=True + scattered_list, pad_sizes = Shard._make_split_tensor( + self.dim, tensor, num_chunks, with_padding=True, contiguous=True ) tensor = torch.cat(scattered_list, dim=self.dim) elif not tensor.is_contiguous(): @@ -523,21 +538,6 @@ class _StridedShard(Shard): """human readable representation of the _StridedShard placement""" return f"_S({self.dim}, {self.split_factor})" - @classmethod - def _make_shard_tensor( - cls, - dim: int, - tensor: torch.Tensor, - mesh: DeviceMesh, - mesh_dim: int, - src_data_rank: Optional[int] = 0, - split_factor: int = 1, - ) -> torch.Tensor: - strided_shard_placement = cls(dim=dim, split_factor=split_factor) - return strided_shard_placement._shard_tensor( - tensor, mesh, mesh_dim, src_data_rank - ) - def _split_tensor( self, tensor: torch.Tensor, @@ -704,9 +704,8 @@ class Replicate(Placement): """ return "R" - @classmethod + @staticmethod def _make_replicate_tensor( - cls, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int, From faff826a46c1569eb1c94b0a02299578d1f0e715 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 17 Oct 2025 16:27:59 +0000 Subject: [PATCH 044/123] Revert "[ROCm] new implementation of upsample_bilinear2d_backward (#164572)" This reverts commit 53f9ae0e50d4dcc47f2ca4bf854803f9d4f875ae. 
Reverted https://github.com/pytorch/pytorch/pull/164572 on behalf of https://github.com/seemethere due to Looks like this is failing in our internal builds, will post a suggestion for a fix but want you to double verify that this behavior is correct ([comment](https://github.com/pytorch/pytorch/pull/164572#issuecomment-3416262676)) --- .../ATen/native/cuda/UpSampleBilinear2d.cu | 103 +----------------- 1 file changed, 2 insertions(+), 101 deletions(-) diff --git a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu index 75dde207c528..b891750891d5 100644 --- a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu @@ -127,29 +127,6 @@ __global__ void upsample_bilinear2d_nhwc_out_frame( } } -#ifdef USE_ROCM -// Helper function to compute output pixel range that can contribute to input pixel -template -__device__ __forceinline__ void compute_output_range( - int input_pos, - accscalar_t scale, - int output_size, - bool align_corners, - int& min_output, - int& max_output) { - accscalar_t lo, hi; - if (align_corners) { - lo = static_cast(input_pos - 1) / scale; - hi = static_cast(input_pos + 1) / scale; - } else { - lo = (input_pos - static_cast(0.5)) / scale - static_cast(0.5); - hi = (input_pos + static_cast(1.5)) / scale - static_cast(0.5); - } - min_output = max(0, static_cast(ceil(lo))); - max_output = min(output_size - 1, static_cast(floor(hi))); -} -#endif - // Backward (adjoint) operation 1 <- 2 (accumulates) template C10_LAUNCH_BOUNDS_1(1024) @@ -164,74 +141,8 @@ __global__ void upsample_bilinear2d_backward_out_frame( const bool align_corners, scalar_t* __restrict__ idata, const scalar_t* __restrict__ odata) { - // In C++, integer multiplication, like in standard arithmetic, is generally commutative. - const size_t i_numel = nc * width1 * height1; -#ifdef USE_ROCM - for (size_t index = blockDim.x * blockIdx.x + threadIdx.x; index < i_numel; - index += blockDim.x * gridDim.x) { - // Decode input pixel coordinates - size_t index_temp = index; - const int w1 = index_temp % width1; - index_temp /= width1; - const int h1 = index_temp % height1; - const size_t nc_idx = index_temp / height1; - - accscalar_t grad_sum = 0; - - // Find range of output pixels that could interpolate from this input pixel - int h2_min, h2_max, w2_min, w2_max; - compute_output_range(h1, rheight, height2, align_corners, h2_min, h2_max); - compute_output_range(w1, rwidth, width2, align_corners, w2_min, w2_max); - - // Iterate over potential output pixels - for (int h2 = h2_min; h2 <= h2_max; h2++) { - for (int w2 = w2_min; w2 <= w2_max; w2++) { - // Compute source coordinates for this output pixel - const accscalar_t h1r = area_pixel_compute_source_index( - rheight, h2, align_corners, /*cubic=*/false); - const int h1_base = (int)h1r; - const int h1p = (h1_base < height1 - 1) ? 1 : 0; - const accscalar_t h1lambda = h1r - h1_base; - const accscalar_t h0lambda = static_cast(1) - h1lambda; - - const accscalar_t w1r = area_pixel_compute_source_index( - rwidth, w2, align_corners, /*cubic=*/false); - const int w1_base = (int)w1r; - const int w1p = (w1_base < width1 - 1) ? 
1 : 0; - const accscalar_t w1lambda = w1r - w1_base; - const accscalar_t w0lambda = static_cast(1) - w1lambda; - - // Check if our input pixel participates in this interpolation and accumulate all weights - // At boundaries, h1p=0 or w1p=0 causes some sampling positions to collapse - // to the same pixel, so we need to accumulate weights from all matching positions - accscalar_t weight = 0; - - // Check all four interpolation positions and accumulate weights - if (h1 == h1_base && w1 == w1_base) { - weight += h0lambda * w0lambda; // top-left - } - if (h1 == h1_base && w1 == w1_base + w1p) { - weight += h0lambda * w1lambda; // top-right (may be same as top-left if w1p=0) - } - if (h1 == h1_base + h1p && w1 == w1_base) { - weight += h1lambda * w0lambda; // bottom-left (may be same as top-left if h1p=0) - } - if (h1 == h1_base + h1p && w1 == w1_base + w1p) { - weight += h1lambda * w1lambda; // bottom-right (may collapse to other positions) - } - - if (weight > 0) { - const size_t output_idx = nc_idx * height2 * width2 + h2 * width2 + w2; - grad_sum += weight * static_cast(odata[output_idx]); - } - } - } - - // Write accumulated gradient (no atomics needed) - idata[index] = static_cast(grad_sum); - } -#else const size_t o_numel = nc * width2 * height2; + const size_t i_numel = nc * width1 * height1; for (size_t index = blockDim.x * blockIdx.x + threadIdx.x; index < o_numel; index += blockDim.x * gridDim.x) { size_t index_temp = index; @@ -280,7 +191,6 @@ __global__ void upsample_bilinear2d_backward_out_frame( static_cast(h1lambda * w1lambda * d2val), true); } -#endif } template @@ -477,6 +387,7 @@ static void upsample_bilinear2d_backward_out_cuda_template( // threads are not covering the whole input tensor. grad_input.zero_(); + const size_t num_kernels = nbatch * channels * output_height * output_width; const int num_threads = std::min( at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); @@ -486,12 +397,6 @@ static void upsample_bilinear2d_backward_out_cuda_template( return; } -#ifdef USE_ROCM - constexpr bool use_input = true; -#else - constexpr bool use_input = false; -#endif - AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, grad_output_.scalar_type(), "upsample_bilinear2d_backward_out_frame", [&] { @@ -509,8 +414,6 @@ static void upsample_bilinear2d_backward_out_cuda_template( const accscalar_t rwidth = area_pixel_compute_scale( input_width, output_width, align_corners, scales_w); - const size_t num_kernels = nbatch * channels * output_height * output_width; - upsample_bilinear2d_backward_nhwc_out_frame <<(num_threads)), num_threads, 0, stream>>>( input_height, @@ -541,8 +444,6 @@ static void upsample_bilinear2d_backward_out_cuda_template( const accscalar_t rwidth = area_pixel_compute_scale( input_width, output_width, align_corners, scales_w); - const size_t num_kernels = nbatch * channels * (use_input ? input_height * input_width : output_height * output_width); - upsample_bilinear2d_backward_out_frame <<(num_threads)), num_threads, From bfcdbd0a970e5ce08cecd0aa33dd389819f0ec4f Mon Sep 17 00:00:00 2001 From: "Han, Xu" Date: Fri, 17 Oct 2025 16:37:02 +0000 Subject: [PATCH 045/123] fix wrong accuracy_status when exception. (#165731) When I debug `XPU` accruacy issue, I found the script output wrong accuracy_status. When the `try` block raise an exception, we should process the exception, but not return the `fail_accuracy`. 
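The pattern being introduced is easy to state in isolation. A minimal sketch follows (the helper name `compare_results`, the tolerances, and the `"pass"` string are illustrative stand-ins rather than the benchmark runner's real code, though `eager_two_runs_differ` and the `fail_exception:` prefix do appear in the diff below):

```python
import torch


def compare_results(expected: torch.Tensor, actual: torch.Tensor) -> str:
    """Return an accuracy status string, surfacing comparison errors explicitly."""
    try:
        is_same = torch.allclose(expected, actual, rtol=1e-3, atol=1e-3)
    except Exception as e:
        # torch.allclose itself can raise (e.g. on mismatched shapes); report
        # that reason instead of a misleading "accuracy failed" verdict.
        return f"fail_exception: {e}"
    return "pass" if is_same else "eager_two_runs_differ"
```

The same idea is applied at both comparison sites in the diff below.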
Before fixing, it returned as `fail_accuracy`: image After fixing, it returned the exception message: image Pull Request resolved: https://github.com/pytorch/pytorch/pull/165731 Approved by: https://github.com/Stonepia, https://github.com/chuanqi129, https://github.com/Lucaskabela --- benchmarks/dynamo/common.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index b81f8a9dbd24..f3b75e9f72ea 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -2284,9 +2284,11 @@ class BenchmarkRunner: ) ): is_same = False - except Exception: + except Exception as e: # Sometimes torch.allclose may throw RuntimeError - is_same = False + exception_string = str(e) + accuracy_status = f"fail_exception: {exception_string}" + return record_status(accuracy_status, dynamo_start_stats=start_stats) if not is_same: accuracy_status = "eager_two_runs_differ" @@ -2403,9 +2405,11 @@ class BenchmarkRunner: force_max_multiplier=force_max_multiplier, ): is_same = False - except Exception: + except Exception as e: # Sometimes torch.allclose may throw RuntimeError - is_same = False + exception_string = str(e) + accuracy_status = f"fail_exception: {exception_string}" + return record_status(accuracy_status, dynamo_start_stats=start_stats) if not is_same: if self.args.skip_accuracy_check: From 1dc9a05d0323ee3c7a20945c62463959d40f1a51 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 16 Oct 2025 17:03:02 -0700 Subject: [PATCH 046/123] [dynamo][user_defined] Replace UserFunctionVariable with VariableTracker build (#165706) Audit: To prevent future issues with functools.partial or callable objects. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165706 Approved by: https://github.com/Lucaskabela ghstack dependencies: #165683 --- torch/_dynamo/variables/user_defined.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/torch/_dynamo/variables/user_defined.py b/torch/_dynamo/variables/user_defined.py index c17a1b9392d2..530189f7f2ab 100644 --- a/torch/_dynamo/variables/user_defined.py +++ b/torch/_dynamo/variables/user_defined.py @@ -293,9 +293,8 @@ class UserDefinedClassVariable(UserDefinedVariable): return VariableTracker.build(tx, obj.__get__(self.value), source) elif isinstance(obj, classmethod): if isinstance(obj.__func__, property): - return variables.UserFunctionVariable(obj.__func__.fget).call_function( - tx, [self], {} - ) + fget_vt = VariableTracker.build(tx, obj.__func__.fget) + return fget_vt.call_function(tx, [self], {}) return variables.UserMethodVariable(obj.__func__, self, source=source) elif isinstance(obj, types.ClassMethodDescriptorType): # e.g.: inspect.getattr_static(dict, "fromkeys") @@ -1789,7 +1788,7 @@ class SourcelessGraphModuleVariable(UserDefinedObjectVariable): args: "list[VariableTracker]", kwargs: "dict[str, VariableTracker]", ) -> "VariableTracker": - fn_variable = variables.UserFunctionVariable(self.value.forward.__func__) + fn_variable = VariableTracker.build(tx, self.value.forward.__func__) args = [self] + args return tx.inline_user_function_return( fn_variable, From 630520b346b8883db7821562e589ccde7d12687a Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 16 Oct 2025 17:03:05 -0700 Subject: [PATCH 047/123] [dynamo][misc] Replace UserFunctionVariable with VariableTracker build (#165707) Audit: To prevent future issues with functools.partial or callable objects. 
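To make the audit's concern concrete: `functools.partial` objects and callable class instances are not `types.FunctionType`, so a variable class written around plain functions (which is what `UserFunctionVariable` targets) can mishandle them, whereas the generic `VariableTracker.build` dispatches to an appropriate variable type. A small standalone illustration, using made-up names and no Dynamo internals:

```python
import functools
import types


def f(x, y):
    return x + y


class Adder:
    def __call__(self, x):
        return x + 1


print(isinstance(f, types.FunctionType))                        # True
print(isinstance(functools.partial(f, 1), types.FunctionType))  # False
print(isinstance(Adder(), types.FunctionType))                  # False
# Both are still callable, so they need to be routed to a different variable type.
print(callable(functools.partial(f, 1)), callable(Adder()))     # True True
```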
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165707 Approved by: https://github.com/Lucaskabela ghstack dependencies: #165683, #165706 --- torch/_dynamo/variables/misc.py | 46 ++++++++++++++++----------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/torch/_dynamo/variables/misc.py b/torch/_dynamo/variables/misc.py index 690357e55ab3..2b1cbdbd3488 100644 --- a/torch/_dynamo/variables/misc.py +++ b/torch/_dynamo/variables/misc.py @@ -200,9 +200,10 @@ class SuperVariable(VariableTracker): and not (args or kwargs) ): with do_not_convert_to_tracable_parameter(): - return variables.UserFunctionVariable( - unpatched_nn_module_init, source=source - ).call_function(tx, [self.objvar] + args, kwargs) + fn_vt = VariableTracker.build( + tx, unpatched_nn_module_init, source=source + ) + return fn_vt.call_function(tx, [self.objvar] + args, kwargs) else: unimplemented_v2( gb_type="Unsupported super().__init__() call", @@ -230,9 +231,8 @@ class SuperVariable(VariableTracker): elif isinstance(inner_fn, staticmethod) and isinstance( inner_fn.__func__, types.FunctionType ): - return variables.UserFunctionVariable( - inner_fn.__func__, source=source - ).call_function(tx, args, kwargs) + fn_vt = VariableTracker.build(tx, inner_fn.__func__, source=source) + return fn_vt.call_function(tx, args, kwargs) elif isinstance(inner_fn, classmethod) and isinstance( inner_fn.__func__, types.FunctionType ): @@ -255,13 +255,13 @@ class SuperVariable(VariableTracker): tx, self.objvar.value_type, cls_source ) - return variables.UserFunctionVariable( - inner_fn.__func__, source=AttrSource(source, "__func__") - ).call_function(tx, [cls_variable, *args], kwargs) + fn_vt = VariableTracker.build( + tx, inner_fn.__func__, source=AttrSource(source, "__func__") + ) + return fn_vt.call_function(tx, [cls_variable, *args], kwargs) elif isinstance(inner_fn, types.FunctionType): - return variables.UserFunctionVariable( - inner_fn, source=source - ).call_function(tx, [self.objvar] + args, kwargs) + fn_vt = VariableTracker.build(tx, inner_fn, source=source) + return fn_vt.call_function(tx, [self.objvar] + args, kwargs) elif isinstance(inner_fn, types.MethodType): return variables.UserMethodVariable( inner_fn.__func__, self.objvar, source=source @@ -574,10 +574,8 @@ class ComptimeVariable(VariableTracker): from ..comptime import comptime # To support the comptime.print_graph convenience accessors - from .functions import UserFunctionVariable - - return UserFunctionVariable( - getattr(comptime, name), source=AttrSource(self.source, name) + return VariableTracker.build( + tx, getattr(comptime, name), source=AttrSource(self.source, name) ) def call_function( @@ -771,9 +769,8 @@ class AutogradFunctionVariable(VariableTracker): sig = inspect.signature(fn) if len(args) - 1 == len(sig._parameters): args = args[1:] # Don't use context - return variables.UserFunctionVariable(fn, source=source).call_function( - tx, args, kwargs - ) + fn_vt = VariableTracker.build(tx, fn, source=source) + return fn_vt.call_function(tx, args, kwargs) elif isinstance(fn, types.MethodType): return variables.UserMethodVariable( fn.__func__, @@ -799,9 +796,8 @@ class AutogradFunctionVariable(VariableTracker): assert isinstance(fn, types.FunctionType) fn_source = AttrSource(self.source, "backward") - return variables.UserFunctionVariable(fn, source=fn_source).call_function( - tx, args, kwargs - ) + fn_vt = VariableTracker.build(tx, fn, source=fn_source) + return fn_vt.call_function(tx, args, kwargs) def call_function(self, tx: 
"InstructionTranslator", args, kwargs): return AutogradFunctionVariable(self.fn_cls) @@ -1026,10 +1022,12 @@ class AutogradEngineVariable(UserDefinedObjectVariable): assert tx.one_graph or tx.error_on_graph_break, ( "queue_callback() is only supported when Compiled Autograd is enabled with fullgraph=True" ) - return variables.UserFunctionVariable( + fn_vt = VariableTracker.build( + tx, torch._dynamo.external_utils.FakeCompiledAutogradEngine.queue_callback, source=self.source, - ).call_function( + ) + return fn_vt.call_function( tx, (tx.output.side_effects.get_ca_final_callbacks_var(), *args), kwargs, From 2928c5c5724bec7da91f5a3b24bbd15d5658a0cc Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 17 Oct 2025 17:13:04 +0000 Subject: [PATCH 048/123] Revert "Pyrefly suppressions 2 (#165692)" This reverts commit 43d78423ac224cce432bf34ed9627035169d5433. Reverted https://github.com/pytorch/pytorch/pull/165692 on behalf of https://github.com/seemethere due to This is causing merge conflicts when attempting to land internally, see D84890919 for more details ([comment](https://github.com/pytorch/pytorch/pull/165692#issuecomment-3416397240)) --- pyrefly.toml | 4 +--- torch/_inductor/codegen/common.py | 1 - torch/_inductor/codegen/cpp_gemm_template.py | 2 -- torch/_inductor/codegen/cpp_wrapper_gpu.py | 1 - torch/_inductor/codegen/mps.py | 2 -- torch/_inductor/codegen/simd.py | 1 - torch/_inductor/codegen/wrapper_fxir.py | 1 - torch/_inductor/runtime/autotune_cache.py | 8 -------- torch/_inductor/runtime/benchmarking.py | 2 -- .../runtime/caching/implementations.py | 1 - .../runtime/coordinate_descent_tuner.py | 11 ++++------- torch/_inductor/runtime/hints.py | 2 -- torch/_inductor/runtime/runtime_utils.py | 5 ----- torch/_inductor/runtime/static_cuda_launcher.py | 17 ----------------- torch/fx/experimental/proxy_tensor.py | 1 - 15 files changed, 5 insertions(+), 54 deletions(-) diff --git a/pyrefly.toml b/pyrefly.toml index 88054d605258..ad74e4df084c 100644 --- a/pyrefly.toml +++ b/pyrefly.toml @@ -22,10 +22,8 @@ project-includes = [ project-excludes = [ # ==== below will be enabled directory by directory ==== # ==== to test Pyrefly on a specific directory, simply comment it out ==== + "torch/_inductor/runtime", "torch/_inductor/codegen/triton.py", - "torch/_inductor/runtime/triton_helpers.py", - "torch/_inductor/runtime/triton_heuristics.py", - "torch/_inductor/runtime/halide_helpers.py", # formatting issues, will turn on after adjusting where suppressions can be # in import statements "torch/linalg/__init__.py", diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py index 743baec01dfa..36ded3aea2fe 100644 --- a/torch/_inductor/codegen/common.py +++ b/torch/_inductor/codegen/common.py @@ -1739,7 +1739,6 @@ class KernelArgs: for outer, inner in chain( # pyrefly: ignore # bad-argument-type self.input_buffers.items(), - # pyrefly: ignore # bad-argument-type self.output_buffers.items(), ): if outer in self.inplace_buffers or isinstance(inner, RemovedArg): diff --git a/torch/_inductor/codegen/cpp_gemm_template.py b/torch/_inductor/codegen/cpp_gemm_template.py index cb17b5a7deb0..9b26105bab10 100644 --- a/torch/_inductor/codegen/cpp_gemm_template.py +++ b/torch/_inductor/codegen/cpp_gemm_template.py @@ -1480,7 +1480,6 @@ class CppGemmTemplate(CppTemplate): gemm_output_buffer = ir.Buffer( # pyrefly: ignore # missing-attribute name=gemm_output_name, - # pyrefly: ignore # missing-attribute layout=template_buffer.layout, ) current_input_buffer = gemm_output_buffer @@ 
-1504,7 +1503,6 @@ class CppGemmTemplate(CppTemplate): current_input_buffer = ir.Buffer( # pyrefly: ignore # missing-attribute name=buffer_name, - # pyrefly: ignore # missing-attribute layout=template_buffer.layout, ) diff --git a/torch/_inductor/codegen/cpp_wrapper_gpu.py b/torch/_inductor/codegen/cpp_wrapper_gpu.py index dd4a3a984d34..d1ddc7e1cd40 100644 --- a/torch/_inductor/codegen/cpp_wrapper_gpu.py +++ b/torch/_inductor/codegen/cpp_wrapper_gpu.py @@ -824,7 +824,6 @@ class CppWrapperGpu(CppWrapperCpu): call_args, arg_types = self.prepare_triton_wrapper_args( # pyrefly: ignore # bad-argument-type call_args, - # pyrefly: ignore # bad-argument-type arg_types, ) wrapper_name = f"call_{kernel_name}" diff --git a/torch/_inductor/codegen/mps.py b/torch/_inductor/codegen/mps.py index fb3939531b71..a74506d7247a 100644 --- a/torch/_inductor/codegen/mps.py +++ b/torch/_inductor/codegen/mps.py @@ -683,7 +683,6 @@ class MetalKernel(SIMDKernel): # pyrefly: ignore # missing-argument t for t in self.range_tree_nodes.values() - # pyrefly: ignore # missing-argument if t.is_reduction ) cmp_op = ">" if reduction_type == "argmax" else "<" @@ -866,7 +865,6 @@ class MetalKernel(SIMDKernel): # pyrefly: ignore # missing-argument t.numel for t in self.range_trees - # pyrefly: ignore # missing-argument if t.is_reduction ) # If using dynamic shapes, set the threadgroup size to be the diff --git a/torch/_inductor/codegen/simd.py b/torch/_inductor/codegen/simd.py index 79d0b603220a..e2294f05ddca 100644 --- a/torch/_inductor/codegen/simd.py +++ b/torch/_inductor/codegen/simd.py @@ -968,7 +968,6 @@ class SIMDKernel(Kernel[CSEVariableType], Generic[CSEVariableType]): # pyrefly: ignore # missing-argument t for t in self.range_trees - # pyrefly: ignore # missing-argument if not t.is_reduction or self.inside_reduction ] diff --git a/torch/_inductor/codegen/wrapper_fxir.py b/torch/_inductor/codegen/wrapper_fxir.py index e123f9592770..72c8e0335508 100644 --- a/torch/_inductor/codegen/wrapper_fxir.py +++ b/torch/_inductor/codegen/wrapper_fxir.py @@ -1004,7 +1004,6 @@ class FxConverter: # pyrefly: ignore # missing-attribute call_kwargs[key] for key in signature - # pyrefly: ignore # missing-attribute if key not in cfg.kwargs ] diff --git a/torch/_inductor/runtime/autotune_cache.py b/torch/_inductor/runtime/autotune_cache.py index 63d7a52ff7d7..3c55a9cd1b08 100644 --- a/torch/_inductor/runtime/autotune_cache.py +++ b/torch/_inductor/runtime/autotune_cache.py @@ -275,11 +275,8 @@ class AutotuneCache: triton_cache_hash: str | None = None, ) -> None: data = { - # pyrefly: ignore # missing-attribute **config.kwargs, - # pyrefly: ignore # missing-attribute "num_warps": config.num_warps, - # pyrefly: ignore # missing-attribute "num_stages": config.num_stages, "configs_hash": self.configs_hash, "found_by_coordesc": found_by_coordesc, @@ -573,20 +570,15 @@ def _load_cached_autotuning( ) # Create the triton_config with the appropriate arguments - # pyrefly: ignore # bad-argument-count triton_config = Config(best_config, **config_args) - # pyrefly: ignore # missing-attribute triton_config.found_by_coordesc = True return triton_config matching_configs = [ cfg for cfg in configs - # pyrefly: ignore # missing-attribute if all(val == best_config.get(key) for key, val in cfg.kwargs.items()) - # pyrefly: ignore # missing-attribute and cfg.num_warps == best_config.get("num_warps") - # pyrefly: ignore # missing-attribute and cfg.num_stages == best_config.get("num_stages") ] if len(matching_configs) != 1: diff --git 
a/torch/_inductor/runtime/benchmarking.py b/torch/_inductor/runtime/benchmarking.py index ee504b1a0575..698484658ddd 100644 --- a/torch/_inductor/runtime/benchmarking.py +++ b/torch/_inductor/runtime/benchmarking.py @@ -123,7 +123,6 @@ class Benchmarker: - The runtime of `fn(*fn_args, **fn_kwargs)`, in milliseconds. """ inferred_device = None - # pyrefly: ignore # bad-assignment for arg_or_kwarg in chain(fn_args, fn_kwargs.values()): if not isinstance(arg_or_kwarg, torch.Tensor): continue @@ -197,7 +196,6 @@ class TritonBenchmarker(Benchmarker): @may_distort_benchmarking_result @time_and_count - # pyrefly: ignore # bad-override def benchmark_gpu( self: Self, _callable: Callable[[], Any], diff --git a/torch/_inductor/runtime/caching/implementations.py b/torch/_inductor/runtime/caching/implementations.py index 8292b957f562..abc113caae93 100644 --- a/torch/_inductor/runtime/caching/implementations.py +++ b/torch/_inductor/runtime/caching/implementations.py @@ -190,7 +190,6 @@ class _OnDiskCacheImpl(_CacheImpl): Defaults to empty string if not specified. """ self._cache_dir: Path = self._base_dir / (sub_dir or "") - # pyrefly: ignore # bad-assignment self._flock: FileLock = FileLock(str(self._cache_dir / "dir.lock")) @property diff --git a/torch/_inductor/runtime/coordinate_descent_tuner.py b/torch/_inductor/runtime/coordinate_descent_tuner.py index 30e0acfca4fe..faa2b06bcaf1 100644 --- a/torch/_inductor/runtime/coordinate_descent_tuner.py +++ b/torch/_inductor/runtime/coordinate_descent_tuner.py @@ -186,7 +186,6 @@ class CoordescTuner: def check_all_tuning_directions( self, - # pyrefly: ignore # missing-attribute func: Callable[["triton.Config"], float], best_config, best_timing, @@ -256,12 +255,10 @@ class CoordescTuner: def autotune( self, - func: Callable[ - ["triton.Config"], float # pyrefly: ignore # missing-attribute - ], - baseline_config: "triton.Config", # pyrefly: ignore # missing-attribute - baseline_timing: float | None = None, # pyrefly: ignore # missing-attribute - ) -> "triton.Config": # pyrefly: ignore # missing-attribute + func: Callable[["triton.Config"], float], + baseline_config: "triton.Config", + baseline_timing: float | None = None, + ) -> "triton.Config": if baseline_timing is None: baseline_timing = self.call_func(func, baseline_config) diff --git a/torch/_inductor/runtime/hints.py b/torch/_inductor/runtime/hints.py index 71ba05011e41..1cff04d04079 100644 --- a/torch/_inductor/runtime/hints.py +++ b/torch/_inductor/runtime/hints.py @@ -88,13 +88,11 @@ if has_triton_package(): divisible_by_16=None, equal_to_1=None, ): - # pyrefly: ignore # not-iterable return {(x,): [["tt.divisibility", 16]] for x in divisible_by_16} else: # Define a namedtuple as a fallback when AttrsDescriptor is not available AttrsDescriptorWrapper = collections.namedtuple( # type: ignore[no-redef, name-match] - # pyrefly: ignore # invalid-argument "AttrsDescriptor", ["divisible_by_16", "equal_to_1"], defaults=[(), ()], diff --git a/torch/_inductor/runtime/runtime_utils.py b/torch/_inductor/runtime/runtime_utils.py index 30087d95663a..21cd5987f8f4 100644 --- a/torch/_inductor/runtime/runtime_utils.py +++ b/torch/_inductor/runtime/runtime_utils.py @@ -68,11 +68,8 @@ def triton_config_to_hashable(cfg: Config) -> Hashable: Convert triton config to a tuple that can uniquely identify it. We can use the return value as a dictionary key. 
""" - # pyrefly: ignore # missing-attribute items = sorted(cfg.kwargs.items()) - # pyrefly: ignore # missing-attribute items.append(("num_warps", cfg.num_warps)) - # pyrefly: ignore # missing-attribute items.append(("num_stages", cfg.num_stages)) return tuple(items) @@ -106,7 +103,6 @@ def get_max_y_grid() -> int: try: - # pyrefly: ignore # import-error import colorama HAS_COLORAMA = True @@ -118,7 +114,6 @@ except ModuleNotFoundError: if HAS_COLORAMA: def _color_text(msg: str, color: str) -> str: - # pyrefly: ignore # missing-attribute return getattr(colorama.Fore, color.upper()) + msg + colorama.Fore.RESET else: diff --git a/torch/_inductor/runtime/static_cuda_launcher.py b/torch/_inductor/runtime/static_cuda_launcher.py index e7d4705740e5..a5e511052b28 100644 --- a/torch/_inductor/runtime/static_cuda_launcher.py +++ b/torch/_inductor/runtime/static_cuda_launcher.py @@ -34,29 +34,21 @@ class StaticallyLaunchedCudaKernel: """ def __init__(self, kernel: CompiledKernel) -> None: - # pyrefly: ignore # missing-attribute self.name = kernel.src.fn.__name__ - # pyrefly: ignore # missing-attribute self.cubin_raw = kernel.asm.get("cubin", None) - # pyrefly: ignore # missing-attribute self.cubin_path = kernel._cubin_path # Used by torch.compile to filter constants in older triton versions - # pyrefly: ignore # missing-attribute self.arg_names = kernel.src.fn.arg_names # Const exprs that are declared by the triton kernel directly # Used to generate the kernel launcher's def args - # pyrefly: ignore # missing-attribute self.declared_constexprs = kernel.src.fn.constexprs - # pyrefly: ignore # missing-attribute self.hash = kernel.hash if triton_knobs is None: - # pyrefly: ignore # missing-attribute launch_enter = kernel.__class__.launch_enter_hook - # pyrefly: ignore # missing-attribute launch_exit = kernel.__class__.launch_exit_hook else: launch_enter = triton_knobs.runtime.launch_enter_hook @@ -78,15 +70,12 @@ class StaticallyLaunchedCudaKernel: raise NotImplementedError( "We don't support launch enter or launch exit hooks" ) - # pyrefly: ignore # missing-attribute self.num_warps = kernel.metadata.num_warps self.shared = ( - # pyrefly: ignore # missing-attribute kernel.shared if hasattr(kernel, "shared") else kernel.metadata.shared ) def needs_scratch_arg(scratch_name: str, param_name: str) -> bool: - # pyrefly: ignore # missing-attribute if hasattr(kernel.metadata, param_name): if getattr(kernel.metadata, param_name) > 0: raise NotImplementedError( @@ -102,7 +91,6 @@ class StaticallyLaunchedCudaKernel: # same situation for profile scratch - triton-lang/triton#7258 self.has_profile_scratch = needs_scratch_arg("Profile", "profile_scratch_size") - # pyrefly: ignore # missing-attribute self.arg_tys = self.arg_ty_from_signature(kernel.src) self.function: int | None = None # Loaded by load_kernel(on the parent process) num_ctas = 1 @@ -182,7 +170,6 @@ class StaticallyLaunchedCudaKernel: def arg_ty_from_signature(self, src: ASTSource) -> str: def index_key(i: Any) -> int: if isinstance(i, str): - # pyrefly: ignore # missing-attribute return src.fn.arg_names.index(i) elif isinstance(i, tuple): # In triton 3.3, src.fn.constants has tuples as a key @@ -190,7 +177,6 @@ class StaticallyLaunchedCudaKernel: else: return i - # pyrefly: ignore # missing-attribute signature = {index_key(key): value for key, value in src.signature.items()} # Triton uses these as the main way to filter out constants passed to their cubin constants = [index_key(key) for key in getattr(src, "constants", dict())] @@ -212,7 +198,6 @@ 
class StaticallyLaunchedCudaKernel: if ty == "constexpr" or i in constants: pass else: - # pyrefly: ignore # bad-argument-type params.append(self.extract_type(ty)) return "".join(params) @@ -250,7 +235,6 @@ class StaticallyLaunchedCudaKernel: if has_scratch: arg_tys = arg_tys + "O" args = (*args, None) - # pyrefly: ignore # bad-argument-type assert len(args) == len(arg_tys) # TODO: can handle grid functions here or in C++, so @@ -263,7 +247,6 @@ class StaticallyLaunchedCudaKernel: self.num_warps, self.shared, arg_tys, - # pyrefly: ignore # bad-argument-type args, stream, ) diff --git a/torch/fx/experimental/proxy_tensor.py b/torch/fx/experimental/proxy_tensor.py index 28a60bafcac8..805d59008e02 100644 --- a/torch/fx/experimental/proxy_tensor.py +++ b/torch/fx/experimental/proxy_tensor.py @@ -421,7 +421,6 @@ def get_proxy_slot( else: # Attempt to build it from first principles. _build_proxy_for_sym_expr(tracer, obj.node.expr, obj) - # pyrefly: ignore # no-matching-overload value = tracker.get(obj) if value is None: From 080365b7d82a3c99c995cab6dc912b7dfe22aa41 Mon Sep 17 00:00:00 2001 From: Turner Richmond Date: Fri, 17 Oct 2025 17:35:14 +0000 Subject: [PATCH 049/123] Escaped html tags name and target to appear as strings (#165543) Fixes small typo in markdown documentation file - Added escape characters to precede tag pattern. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165543 Approved by: https://github.com/mikaylagawarecki --- docs/source/export/ir_spec.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/export/ir_spec.md b/docs/source/export/ir_spec.md index 562cae1e337f..879df6ee04a0 100644 --- a/docs/source/export/ir_spec.md +++ b/docs/source/export/ir_spec.md @@ -158,11 +158,11 @@ This format captures everything present in the Node class, with the exception of Concretely: -- **** is the name of the node as it would appear in `node.name`. -- **** is the `node.op` field, which must be one of these: +- **\** is the name of the node as it would appear in `node.name`. +- **\** is the `node.op` field, which must be one of these: ``, ``, ``, or ``. -- **** is the target of the node as `node.target`. The meaning of this +- **\** is the target of the node as `node.target`. The meaning of this field depends on `op_name`. - **args1, … args 4…** are what is listed in the `node.args` tuple. If a value in the list is an {class}`torch.fx.Node`, then it will be especially From 45afaf08a14ab760d86ea80dea6d50cec8626513 Mon Sep 17 00:00:00 2001 From: Pian Pawakapan Date: Thu, 16 Oct 2025 22:42:29 -0700 Subject: [PATCH 050/123] [DebugMode][2/N] add nn.Module tracking (#165498) Uses ModTracker to record nn.Module entries, much like CommDebugMode. 
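For reference, a minimal way to produce such a trace (the module here is arbitrary; only the `record_nn_module` flag and the `debug_string()` accessor come from this patch and its test):

```python
import torch
from torch.utils._debug_mode import DebugMode

mod = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU())
x = torch.randn(2, 4)

# Run the module under DebugMode with nn.Module tracking enabled,
# then dump the hierarchical trace of module entries and aten ops.
with DebugMode(record_nn_module=True) as debug_mode:
    mod(x)

print(debug_mode.debug_string())
```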
Can be switched on with `DebugMode(record_nn_module=True)`: ``` [nn.Mod] Bar [nn.Mod] Bar.abc [nn.Mod] Bar.abc.l1 aten::t(t: f32[4, 4]) aten::addmm(t: f32[4], t: f32[4, 4], t: f32[4, 4]) [nn.Mod] Bar.abc.l2 aten::t(t: f32[4, 4]) aten::addmm(t: f32[4], t: f32[4, 4], t: f32[4, 4]) [nn.Mod] Bar.xyz aten::t(t: f32[4, 4]) aten::addmm(t: f32[4], t: f32[4, 4], t: f32[4, 4])""" ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165498 Approved by: https://github.com/SherlockNoMad ghstack dependencies: #165376 --- .../tensor/debug/test_debug_mode.py | 40 +++++++++++++++++ torch/utils/_debug_mode.py | 45 ++++++++++++++++++- 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/test/distributed/tensor/debug/test_debug_mode.py b/test/distributed/tensor/debug/test_debug_mode.py index aab91ddebe94..20da99f52eb0 100644 --- a/test/distributed/tensor/debug/test_debug_mode.py +++ b/test/distributed/tensor/debug/test_debug_mode.py @@ -330,6 +330,46 @@ class TestDTensorDebugMode(TestCase): f(x) self.assertEqual(len(debug_mode.debug_string()), 0) + def test_nn_module(self): + class Foo(torch.nn.Module): + def __init__(self): + super().__init__() + self.l1 = torch.nn.Linear(4, 4) + self.l2 = torch.nn.Linear(4, 4) + + def forward(self, x): + return self.l2(self.l1(x)) + + class Bar(torch.nn.Module): + def __init__(self): + super().__init__() + self.abc = Foo() + self.xyz = torch.nn.Linear(4, 4) + + def forward(self, x): + return self.xyz(self.abc(x)) + + mod = Bar() + inp = torch.randn(4, 4) + with DebugMode(record_nn_module=True) as debug_mode: + _ = mod(inp) + + self.assertExpectedInline( + debug_mode.debug_string(), + """\ + [nn.Mod] Bar + [nn.Mod] Bar.abc + [nn.Mod] Bar.abc.l1 + aten::t(t: f32[4, 4]) + aten::addmm(t: f32[4], t: f32[4, 4], t: f32[4, 4]) + [nn.Mod] Bar.abc.l2 + aten::t(t: f32[4, 4]) + aten::addmm(t: f32[4], t: f32[4, 4], t: f32[4, 4]) + [nn.Mod] Bar.xyz + aten::t(t: f32[4, 4]) + aten::addmm(t: f32[4], t: f32[4, 4], t: f32[4, 4])""", + ) + instantiate_parametrized_tests(TestDTensorDebugMode) diff --git a/torch/utils/_debug_mode.py b/torch/utils/_debug_mode.py index 29b74aab5ee3..2c87aa8f1c4d 100644 --- a/torch/utils/_debug_mode.py +++ b/torch/utils/_debug_mode.py @@ -1,6 +1,6 @@ # mypy: allow-untyped-defs import contextlib -from typing import Optional +from typing import Optional, TYPE_CHECKING import torch from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode @@ -13,6 +13,10 @@ from torch.utils._python_dispatch import ( from torch.utils._pytree import tree_map +if TYPE_CHECKING: + from torch.distributed._tools.mod_tracker import ModTracker + + __all__ = ["DebugMode", "get_active_debug_mode"] REDISTRIBUTE_FUNC = "redistribute_input" @@ -139,6 +143,17 @@ class _RedistributeCall(_DebugCall): return f"{REDISTRIBUTE_FUNC}({arg_str}, {placement_str})" +class _NNModuleCall(_DebugCall): + """Designates entering an nn.Module's forward method""" + + def __init__(self, module_name: str, call_depth: int): + super().__init__(call_depth) + self.module_name = module_name + + def render(self, attributes: list[str]) -> str: + return f"[nn.Mod] {self.module_name}" + + class DebugMode(TorchDispatchMode): def __init__( self, @@ -147,6 +162,7 @@ class DebugMode(TorchDispatchMode): record_faketensor=False, record_realtensor=True, record_tensor_attributes=None, + record_nn_module=False, ): super().__init__() import torch.distributed.tensor # noqa: F401 @@ -157,6 +173,12 @@ class DebugMode(TorchDispatchMode): self.record_realtensor = record_realtensor 
self.record_tensor_attributes = record_tensor_attributes or [] + self.record_nn_module = record_nn_module + + self.module_tracker: Optional[ModTracker] = None + if self.record_nn_module: + self.module_tracker_setup() + self.operators = [] self.call_depth = 0 @@ -211,14 +233,35 @@ class DebugMode(TorchDispatchMode): torch._C._push_on_torch_function_stack(self) super().__enter__() + if self.record_nn_module: + self.module_tracker.__enter__() # type: ignore[attribute, union-attr] return self # pyrefly: ignore # bad-override def __exit__(self, *args): super().__exit__(*args) + if self.record_nn_module: + self.module_tracker.__exit__() # type: ignore[attribute, union-attr] if self.record_torchfunction: torch._C._pop_torch_function_stack() + def module_tracker_setup(self): + from torch.distributed._tools.mod_tracker import ModTracker + + self.module_tracker = ModTracker() + + # module pre-fw hook: record module call + def pre_fw_hook(module, input): + fqn = self.module_tracker._get_mod_name(module) # type: ignore[attribute, union-attr] + self.operators.append(_NNModuleCall(fqn, self.call_depth + 1)) + self.call_depth += 1 + + # module post-fw hook: decrement call depth + def post_fw_hook(module, input, output): + self.call_depth -= 1 + + self.module_tracker.register_user_hooks(pre_fw_hook, post_fw_hook) + @contextlib.contextmanager def record_redistribute_calls( self, From da8517fa634e9922e3299e14b86428bcbf2b373d Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Fri, 17 Oct 2025 17:41:16 +0000 Subject: [PATCH 051/123] [ROCm][CI] upgrade wheels to 7.0.2 and 6.4.4 patch release (#165756) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165756 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily --- .ci/docker/libtorch/build.sh | 6 +++++- .ci/docker/manywheel/build.sh | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.ci/docker/libtorch/build.sh b/.ci/docker/libtorch/build.sh index 8447eb0d8331..c40896cb5499 100755 --- a/.ci/docker/libtorch/build.sh +++ b/.ci/docker/libtorch/build.sh @@ -39,9 +39,13 @@ case ${DOCKER_TAG_PREFIX} in DOCKER_GPU_BUILD_ARG="" ;; rocm*) + # we want the patch version of 7.0 instead + if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then + GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2" + fi # we want the patch version of 6.4 instead if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then - GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2" + GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4" fi BASE_TARGET=rocm GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh index 99f03f5c8636..b4b505997303 100755 --- a/.ci/docker/manywheel/build.sh +++ b/.ci/docker/manywheel/build.sh @@ -75,9 +75,13 @@ case ${image} in DOCKERFILE_SUFFIX="_cuda_aarch64" ;; manylinux2_28-builder:rocm*) + # we want the patch version of 7.0 instead + if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then + GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2" + fi # we want the patch version of 6.4 instead if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then - GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2" + GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4" fi TARGET=rocm_final MANY_LINUX_VERSION="2_28" From cff1b207717b84b6ac3fdc95fc5ac91cc3802b63 Mon Sep 17 00:00:00 2001 From: jmaczan Date: Fri, 17 Oct 2025 17:44:43 +0000 Subject: [PATCH 052/123] Patch the flex_attention._get_mod_type to not use inspect.signature when computing num_positional_args (an alternative fix for flex attention graph break on create_block_mask) (#164923) The initial fix for 
inspect.signature uses not a right approach (https://github.com/pytorch/pytorch/pull/164349#pullrequestreview-3306614010). As @williamwen42 suggests (https://github.com/pytorch/pytorch/pull/164349#issuecomment-3379222885) we can just for now get rid of `inspect.signature` call in flex_attention to resolve this high priority issue (https://github.com/pytorch/pytorch/issues/164247#issuecomment-3378673179). In this PR I did exactly this - limited the scope of fix to just computing `num_positional_args` in `flex_attention._get_mod_type` based on properties returned by `NestedUserFunctionVariable.const_getattr` (some were missing so I added them) Fixes #164247 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164923 Approved by: https://github.com/williamwen42 --- test/dynamo/test_repros.py | 63 +++++++++++++++++++ .../TestScript.test_python_frontend | 0 .../TestScript.test_python_frontend_py3 | 0 torch/_dynamo/variables/functions.py | 14 ++++- torch/nn/attention/flex_attention.py | 19 ++++-- 5 files changed, 90 insertions(+), 6 deletions(-) delete mode 100644 test/dynamo_expected_failures/TestScript.test_python_frontend delete mode 100644 test/dynamo_expected_failures/TestScript.test_python_frontend_py3 diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py index db950037a194..47692a4fa81b 100644 --- a/test/dynamo/test_repros.py +++ b/test/dynamo/test_repros.py @@ -46,6 +46,7 @@ from torch._dynamo.backends.debugging import ExplainWithBackend from torch._dynamo.debug_utils import same_two_models from torch._dynamo.testing import ( CompileCounter, + CompileCounterWithBackend, EagerAndRecordGraphs, rand_strided, same, @@ -54,6 +55,7 @@ from torch._dynamo.testing import ( ) from torch._inductor.utils import fresh_cache from torch.nn import functional as F +from torch.nn.attention.flex_attention import create_block_mask, flex_attention from torch.profiler import profile, ProfilerActivity from torch.testing._internal.common_cuda import ( PLATFORM_SUPPORTS_FLASH_ATTENTION, @@ -7369,6 +7371,67 @@ def forward(self, s77 : torch.SymInt, s27 : torch.SymInt, L_x_ : torch.Tensor): ) self.assertEqual(explain_output.break_reasons[0].reason, expected_msg) + @parametrize("backend", ["eager", "inductor"]) + def test_issue164247(self, backend: str): + if backend == "inductor" and torch._dynamo.config.dynamic_shapes: + raise unittest.SkipTest( + "Skip only in dynamic-shapes wrapper (known issue #157612)" + ) + + class MixedFakeModeModel(nn.Module): + def __init__(self, dim=64): + super().__init__() + self.dim = dim + self.lin = torch.nn.Linear(64, 64) + + def forward(self, x): + batch_size, seq_len, _ = x.shape + + # Process input first - this creates fake tensors in export's fake mode + processed = self.lin(x) + + # Create some computation that depends on processed tensor + intermediate = processed.sum(dim=-1).detach() # Shape: (batch, seq_len) + + def dynamic_mask_function(batch_idx, head_idx, q_idx, kv_idx): + threshold = intermediate[ + batch_idx, q_idx % seq_len + ] # Access the captured tensor + return (kv_idx <= q_idx) & (threshold > 0) + + block_mask = create_block_mask( + mask_mod=dynamic_mask_function, + B=batch_size, + H=None, + Q_LEN=seq_len, + KV_LEN=seq_len, + device=x.device, + _compile=False, + ) + q = processed.view(batch_size, 1, seq_len, self.dim) + k = processed.view(batch_size, 1, seq_len, self.dim) + v = processed.view(batch_size, 1, seq_len, self.dim) + + out = torch.compile(flex_attention)(q, k, v, block_mask=block_mask) + out = flex_attention(q, k, v, 
block_mask=block_mask) + + return out + + backend_counter = CompileCounterWithBackend(backend) + model = MixedFakeModeModel() + compiled = torch.compile(model, backend=backend_counter, fullgraph=True) + + if backend == "inductor": + # A known InductorError Issue https://github.com/pytorch/pytorch/issues/157612 + with self.assertRaises(RuntimeError): + compiled(torch.randn(2, 128, 64)) + else: + compiled(torch.randn(2, 128, 64)) + + # One graph, so no graph breaks + self.assertEqual(backend_counter.frame_count, 1) + self.assertEqual(len(backend_counter.graphs), 1) + # https://github.com/pytorch/pytorch/issues/164990 def test_guard_same_frame_fail_message(self): import torch._dynamo.guards as g diff --git a/test/dynamo_expected_failures/TestScript.test_python_frontend b/test/dynamo_expected_failures/TestScript.test_python_frontend deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/test/dynamo_expected_failures/TestScript.test_python_frontend_py3 b/test/dynamo_expected_failures/TestScript.test_python_frontend_py3 deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/torch/_dynamo/variables/functions.py b/torch/_dynamo/variables/functions.py index 7d534de073c9..4911ded6e333 100644 --- a/torch/_dynamo/variables/functions.py +++ b/torch/_dynamo/variables/functions.py @@ -1320,9 +1320,21 @@ class NestedUserFunctionVariable(BaseUserFunctionVariable): def const_getattr(self, tx, name): if name == "__name__": - return self.fn_name.as_python_constant() + return self.get_name() + if name == "__code__": + return self.get_code() + if name == "__defaults__": + d = getattr(self, "defaults", None) + return d.as_python_constant() if d else None return super().const_getattr(tx, name) + def call_obj_hasattr(self, tx: "InstructionTranslator", name): + if name == "__code__": + return variables.ConstantVariable.create(hasattr(self, "code")) + if name == "__defaults__": + return variables.ConstantVariable.create(hasattr(self, "defaults")) + return super().call_obj_hasattr(tx, name) + def has_self(self): return False diff --git a/torch/nn/attention/flex_attention.py b/torch/nn/attention/flex_attention.py index a608020f30f3..0a4acdd7a232 100644 --- a/torch/nn/attention/flex_attention.py +++ b/torch/nn/attention/flex_attention.py @@ -267,11 +267,20 @@ def _get_mod_type(fn: Callable) -> _ModificationType: considered as a score_mod function. If the function has 4 positional arguments, it is considered as a mask function. """ - num_positional_args = sum( - 1 - for param in inspect.signature(fn).parameters.values() - if param.default is inspect.Parameter.empty - ) + if hasattr(fn, "__code__"): + code = fn.__code__ + num_positional_total = code.co_argcount + defaults = () + if hasattr(fn, "__defaults__"): + defaults = fn.__defaults__ or () + num_defaults = len(defaults) + num_positional_args = num_positional_total - num_defaults + else: + num_positional_args = sum( + 1 + for param in inspect.signature(fn).parameters.values() + if param.default is inspect.Parameter.empty + ) assert num_positional_args == 5 or num_positional_args == 4 if num_positional_args == 5: return _ModificationType.SCORE_MOD From dd3b48e85dd51ccbec8128159947a719902344c6 Mon Sep 17 00:00:00 2001 From: James Wu Date: Tue, 14 Oct 2025 14:24:23 -0700 Subject: [PATCH 053/123] Fix bug with serialization after AOTAutogradCache hit (#165474) Fixes #165447 On AOTAutogradCache load, the serialization function we pick is just lambda: self, because the object itself is an AOTAutogradCacheEntry. 
However, this isn't safe, because `wrap_post_compile` will make `self` unserializable, since it needs to load triton kernels and stuff! So instead, on AOTAutogradCache load, we preserve the bytes that were used to load the object to begin with, and return that object on a call to serialize(). This effectively makes it so that we save a copy of the pre-hydrated artifact, without needing to do an eager copy until someone actually calls `serialize`. Test Plan: Run ```py import torch class M(torch.nn.Module): def __init__(self): super().__init__() self.linear1 = torch.nn.Linear(2, 4) self.relu = torch.nn.ReLU() self.linear2 = torch.nn.Linear(4, 8) def forward(self, x): return self.linear2(self.relu(self.linear1(x))) device = "cuda" m = M().to(device) sample_inputs = (torch.randn(2, 2, device=device),) eager_out = m(*sample_inputs) with torch._dynamo.config.patch("enable_aot_compile", True): compiled_fn_path = "./m.pt" compiled_fn = torch.compile( m, fullgraph=True ).forward.aot_compile((sample_inputs, {})) compiled_fn.save_compiled_function(compiled_fn_path) torch._dynamo.reset() with torch.compiler.set_stance("fail_on_recompile"): with open(compiled_fn_path, "rb") as f: loaded_fn = torch.compiler.load_compiled_function(f) assert loaded_fn is not None compiled_out = loaded_fn(m, *sample_inputs) assert torch.allclose(eager_out, compiled_out) ``` twice, see that it succeeds. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165474 Approved by: https://github.com/yiming0416, https://github.com/zhxchen17 --- .../_aot_autograd/autograd_cache.py | 30 +++++++++++++------ torch/_inductor/standalone_compile.py | 5 ++-- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/torch/_functorch/_aot_autograd/autograd_cache.py b/torch/_functorch/_aot_autograd/autograd_cache.py index f3d6842318ad..0ac2407269ac 100644 --- a/torch/_functorch/_aot_autograd/autograd_cache.py +++ b/torch/_functorch/_aot_autograd/autograd_cache.py @@ -17,7 +17,7 @@ import time import traceback from abc import ABC, abstractmethod from collections.abc import Callable -from copy import copy +from copy import copy, deepcopy from dataclasses import dataclass from typing import Any, Generic, Optional, TYPE_CHECKING, TypeVar, Union from typing_extensions import override @@ -963,10 +963,6 @@ class GenericAOTAutogradCacheEntry(Generic[TForward, TBackward]): ) # Add serialization function back onto object - compiled_function = SerializableCompiledFunction( - compiled_function, lambda: self - ) - compiled_function, _ = post_compile( self.dispatch_wrappers, compiled_function, @@ -1055,6 +1051,9 @@ def deserialize_bundled_cache_entry(entry: BundledAOTAutogradCacheEntry) -> Call # so we don't have a place to track cudagraphs here. 
cudagraphs = BoxedBool(torch._inductor.config.triton.cudagraphs) boxed_forward_device_index = BoxedDeviceIndex(None) + # We need to make a clean copy of the cache entry + # in case it needs to be serialized again + serializable_copy = deepcopy(entry) compiled_fn = entry.wrap_post_compile( [], entry.sanitized_aot_config, @@ -1063,6 +1062,8 @@ def deserialize_bundled_cache_entry(entry: BundledAOTAutogradCacheEntry) -> Call "boxed_forward_device_index": boxed_forward_device_index, }, ) + # Ensure the deserialized cache entry is still serializable + compiled_fn = SerializableCompiledFunction(compiled_fn, lambda: serializable_copy) # TODO: this ignores flat_params, which can exist # if inline_builtin_nn_modules=False @@ -1155,13 +1156,19 @@ class AOTAutogradCache(GuardedCache[GenericAOTAutogradCacheEntry]): cache_key, debug_lines = autograd_cache_key( gm, args, aot_config, fx_config ) - entry: Optional[GenericAOTAutogradCacheEntry] = ( + result: Optional[tuple[GenericAOTAutogradCacheEntry, bytes]] = ( AOTAutogradCache._lookup( cache_key, local, remote, args, cache_info, aot_config ) ) - if entry is not None: + if result is not None: + (entry, pickled_content) = result compiled_fn = entry.wrap_post_compile(args, aot_config, fx_config) + # Make the compiled_fn serializable, where the serialize function just + # makes a copy of the original entry before post compile via the pickled content + compiled_fn = SerializableCompiledFunction( + compiled_fn, lambda: pickle.loads(pickled_content) + ) log.info("AOTAutograd cache hit for key %s", cache_key) counters["aot_autograd"]["autograd_cache_hit"] += 1 @@ -1321,7 +1328,7 @@ class AOTAutogradCache(GuardedCache[GenericAOTAutogradCacheEntry]): args: list[Any], cache_info: dict[str, Any], aot_config: Optional[AOTConfig], - ) -> Optional[GenericAOTAutogradCacheEntry]: + ) -> Optional[tuple[GenericAOTAutogradCacheEntry, bytes]]: """Given a key generated by AOTAutogradCachePickler, look up its location in the cache.""" remote_cache: Optional[RemoteCache[JsonDataTy]] = None if remote: @@ -1330,6 +1337,7 @@ class AOTAutogradCache(GuardedCache[GenericAOTAutogradCacheEntry]): symints = AOTAutogradCache._filter_backed_symints(args) hints = [hint_int(s) for s in symints] entry = None + pickled_content = None try: ( entry, @@ -1363,7 +1371,11 @@ class AOTAutogradCache(GuardedCache[GenericAOTAutogradCacheEntry]): log.info("AOTAutograd cache unable to load compiled graph: %s", e) if config.strict_autograd_cache: raise e - return entry + if entry is not None: + assert pickled_content is not None + return (entry, pickled_content) + else: + return None @staticmethod def _write_to_local_cache(key: str, content: bytes): diff --git a/torch/_inductor/standalone_compile.py b/torch/_inductor/standalone_compile.py index 26042535bc29..0d21b06f7182 100644 --- a/torch/_inductor/standalone_compile.py +++ b/torch/_inductor/standalone_compile.py @@ -158,7 +158,7 @@ class CompiledArtifact: AOTAutogradCache, ) - entry = AOTAutogradCache._lookup( + result = AOTAutogradCache._lookup( key, local=True, remote=False, @@ -167,7 +167,8 @@ class CompiledArtifact: aot_config=None, ) - assert entry is not None + assert result is not None + (entry, _) = result from .compile_fx import _CompileFxKwargs From 39e0a832c9898b013314ceee189643410ff8ed11 Mon Sep 17 00:00:00 2001 From: Simon Layton Date: Fri, 17 Oct 2025 05:44:35 -0700 Subject: [PATCH 054/123] Fix B200 test fails in scaled_mm (#165747) Summary: PR #165528 changes some scale/swizzle inference behavior in scaled_mm tests - mxfp8 tests on 
Blackwell can get incorrectly classified, resulting in failures. Fix the scale/swizzle inference code to prevent this. Fixes https://github.com/pytorch/pytorch/issues/165743 Test Plan: ``` pytest -svv test/test_scaled_matmul_cuda.py ``` Reviewers: @jagadish-amd @jeffdaily @drisspg Subscribers: @Aidyn-A Tasks: Tags: Signed-off-by: Simon Layton Pull Request resolved: https://github.com/pytorch/pytorch/pull/165747 Approved by: https://github.com/eqy, https://github.com/drisspg, https://github.com/jeffdaily --- test/test_scaled_matmul_cuda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_scaled_matmul_cuda.py b/test/test_scaled_matmul_cuda.py index c0b96595de6e..d57b1535d02f 100644 --- a/test/test_scaled_matmul_cuda.py +++ b/test/test_scaled_matmul_cuda.py @@ -154,8 +154,8 @@ def infer_scale_swizzle(mat, scale): # MXFP4 w/o swizzle if ( - scale.numel() == 2 * math.ceil(mat.shape[0] // 32) * mat.shape[1] - or scale.numel() == 2 * math.ceil(mat.shape[1] // 32) * mat.shape[0] + (scale.numel() == 2 * math.ceil(mat.shape[0] // 32) * mat.shape[1] + or scale.numel() == 2 * math.ceil(mat.shape[1] // 32) * mat.shape[0]) and mat.dtype == torch.float4_e2m1fn_x2 and scale.dtype == torch.float8_e8m0fnu ): From a032510db38e8331afa08f7635d146f9cefdd0ab Mon Sep 17 00:00:00 2001 From: Bruce Chang Date: Fri, 17 Oct 2025 17:55:00 +0000 Subject: [PATCH 055/123] shrink_group implementation to expose ncclCommShrink API (#164518) Closes #164529 To expose the new [ncclCommShrink](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcommshrink) API to PyTorch. This is useful when you need to exclude certain GPUs or nodes from a collective operation, for example in fault tolerance scenarios or when dynamically adjusting resource utilization. For more info: [Shrinking a communicator](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/communicators.html#shrinking-a-communicator) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164518 Approved by: https://github.com/Skylion007, https://github.com/syed-ahmed, https://github.com/kwen2501 --- docs/source/distributed.md | 4 + test/distributed/logging_utils.py | 43 ++ test/distributed/test_c10d_nccl.py | 640 +++++++++++++++++- torch/csrc/distributed/c10d/Backend.hpp | 17 + torch/csrc/distributed/c10d/NCCLUtils.cpp | 59 ++ torch/csrc/distributed/c10d/NCCLUtils.hpp | 12 + .../distributed/c10d/ProcessGroupNCCL.cpp | 135 +++- .../distributed/c10d/ProcessGroupNCCL.hpp | 21 + torch/csrc/distributed/c10d/init.cpp | 11 + torch/distributed/distributed_c10d.py | 515 ++++++++++++++ torch/testing/_internal/common_distributed.py | 48 ++ 11 files changed, 1503 insertions(+), 2 deletions(-) create mode 100644 test/distributed/logging_utils.py diff --git a/docs/source/distributed.md b/docs/source/distributed.md index 5da02bb8a194..69df7be1fa80 100644 --- a/docs/source/distributed.md +++ b/docs/source/distributed.md @@ -394,6 +394,10 @@ an opaque group handle that can be given as a `group` argument to all collective .. autofunction:: new_group ``` +```{eval-rst} +.. autofunction:: torch.distributed.distributed_c10d.shrink_group +``` + ```{eval-rst} .. 
autofunction:: get_group_rank ``` diff --git a/test/distributed/logging_utils.py b/test/distributed/logging_utils.py new file mode 100644 index 000000000000..09a0adccfd80 --- /dev/null +++ b/test/distributed/logging_utils.py @@ -0,0 +1,43 @@ +import logging +import time + + +_start_time = time.time() +_logger = logging.getLogger(__name__) + + +def _ts(): + return time.time() - _start_time + + +def configure(level=logging.INFO, force=False): + try: + logging.basicConfig( + level=level, + format="%(asctime)s %(name)s %(levelname)s: %(message)s", + force=force, + ) + except TypeError: + logging.basicConfig( + level=level, format="%(asctime)s %(name)s %(levelname)s: %(message)s" + ) + + +def log_test_info(rank, message): + _logger.info("[%7.3fs][Rank %s] %s", _ts(), rank, message) + + +def log_test_success(rank, message): + _logger.info("[%7.3fs][Rank %s] ✅ %s", _ts(), rank, message) + + +def log_test_validation(rank, message): + _logger.info("[%7.3fs][Rank %s] ✓ %s", _ts(), rank, message) + + +def log_test_warning(rank, message): + _logger.warning("[%7.3fs][Rank %s] ⚠️ %s", _ts(), rank, message) + + +def log_test_error(rank, message): + _logger.error("[%7.3fs][Rank %s] ✗ %s", _ts(), rank, message) diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 7410255d27a8..0f518fab62cf 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -2,6 +2,7 @@ import copy import json +import logging import os import pickle import random @@ -21,6 +22,7 @@ from unittest import mock, SkipTest import torch import torch.distributed as c10d import torch.distributed._functional_collectives as _functional_collectives +from torch.distributed.distributed_c10d import SHRINK_ABORT as NCCL_SHRINK_ABORT if not c10d.is_available() or not c10d.is_nccl_available(): @@ -47,12 +49,15 @@ from torch._C._distributed_c10d import ErrorType, OpType, WorkResult from torch.nn.parallel import DistributedDataParallel from torch.testing._internal.common_cuda import _get_torch_rocm_version, TEST_MULTIGPU from torch.testing._internal.common_distributed import ( + get_required_world_size, get_timeout, init_multigpu_helper, MultiProcessTestCase, requires_multicast_support, requires_nccl, + requires_nccl_shrink, requires_nccl_version, + requires_world_size, skip_if_lt_x_gpu, skip_if_rocm_multiprocess, sm_is_or_higher_than, @@ -87,6 +92,17 @@ BFLOAT16_AVAILABLE = torch.cuda.is_available() and ( torch.version.cuda is not None or torch.version.hip is not None ) +from logging_utils import ( + configure as _log_configure, + log_test_info, + log_test_success, + log_test_validation, + log_test_warning, +) + + +_log_configure(level=logging.INFO, force=True) + class RendezvousEnvTest(TestCase): @retry_on_connect_failures @@ -317,7 +333,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase): @property def world_size(self): - return 2 + return get_required_world_size(self, 2) @property def rank_to_GPU(self): @@ -1255,6 +1271,628 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase): pg_2 = c10d.new_group([0, 1]) self.assertEqual(pg_2.group_desc, "undefined") + @requires_nccl_shrink() + @requires_world_size(2) + def test_shrink_group_basic(self): + """Test basic shrink_group functionality.""" + self._perform_shrink_test([1], "Basic shrink test") + + @requires_nccl_shrink() + @requires_world_size(2) + def test_shrink_group_validation(self): + """Test input validation in shrink_group.""" + device, pg = self._setup_shrink_test("validation") + + def _test_invalid_input(ranks, 
description, expected_exception): + """Helper to test invalid inputs.""" + try: + c10d.shrink_group(ranks) + self.fail(f"Expected {expected_exception.__name__} for {description}") + except expected_exception: + log_test_validation(self.rank, f"✓ {description}") + except Exception: + if expected_exception == Exception: # Accept any exception + log_test_validation(self.rank, f"✓ {description}") + else: + raise + + # Test cases + _test_invalid_input([], "Empty exclusion list", ValueError) + if self.world_size > 1: + _test_invalid_input([0, 0, 1], "Duplicate ranks", Exception) + _test_invalid_input([self.world_size + 1], "Out of bounds rank", Exception) + + log_test_success(self.rank, "All validation tests passed") + dist.destroy_process_group() + + @requires_nccl_shrink() + @requires_world_size(2) + def test_shrink_group_backend_properties(self): + """Test that backend properties are preserved after shrinking.""" + + test_name = "Backend Properties Test" + ranks_to_exclude = [0] + + # Reuse _setup_shrink_test for complete setup (device, environment, and process group) + device, pg = self._setup_shrink_test("backend_properties") + + # Follow _perform_shrink_test pattern from here + log_test_info(self.rank, f"{test_name} (world_size={self.world_size})") + + is_excluded = self.rank in ranks_to_exclude + log_test_info( + self.rank, + f"Excluding ranks: {ranks_to_exclude}, am_excluded: {is_excluded}", + ) + + # Store original backend property values (not references) before shrinking + original_timeout = None + original_high_priority = None + if not is_excluded: + original_backend = pg._get_backend(device) + original_timeout = original_backend.options._timeout + original_high_priority = original_backend.options.is_high_priority_stream + log_test_info( + self.rank, + f"Storing original backend properties: timeout={original_timeout}, high_priority={original_high_priority}", + ) + + if is_excluded: + log_test_info( + self.rank, + f"Excluded rank {self.rank} - setup complete, skipping shrink operation", + ) + dist.destroy_process_group() # hang without it + return + + # Only non-excluded ranks proceed with shrink (same as _perform_shrink_test) + log_test_info(self.rank, "Non-excluded rank calling shrink_group") + shrunk_pg = c10d.shrink_group(ranks_to_exclude) + + # Reuse _validate_shrunk_group helper (same as _perform_shrink_test) + expected_size = self.world_size - len(ranks_to_exclude) + _ = self._validate_shrunk_group(shrunk_pg, expected_size, test_name) + + # Add custom backend properties validation + new_backend = shrunk_pg._get_backend(device) + log_test_info(self.rank, "Validating backend properties are preserved") + + new_timeout = new_backend.options._timeout + new_high_priority = new_backend.options.is_high_priority_stream + + log_test_info( + self.rank, + f"Timeout comparison - original: {original_timeout}, new: {new_timeout}", + ) + self.assertEqual( + original_timeout, new_timeout, f"{test_name}: timeout not preserved" + ) + + log_test_info( + self.rank, + f"High priority stream comparison - original: {original_high_priority}, new: {new_high_priority}", + ) + self.assertEqual( + original_high_priority, + new_high_priority, + f"{test_name}: high_priority_stream not preserved", + ) + + log_test_validation( + self.rank, f"{test_name}: Backend properties preserved successfully" + ) + log_test_success( + self.rank, f"{test_name} successful (shrink + backend validation)" + ) + + # Cleanup (same as _perform_shrink_test) + dist.destroy_process_group() + + @requires_nccl_shrink() + 
@requires_world_size(2) + def test_shrink_group_multiple_comms(self): + """Test shrink_group with multiple communicators and subgroup invalidation.""" + + device, pg = self._setup_shrink_test("multiple_comms") + + # Create subgroup [0, 1] and test shrinking it + subgroup = c10d.new_group([0, 1]) + if self.rank <= 1: + # Shrink subgroup: exclude rank 1 + if self.rank == 0: # Only rank 0 remains + shrunk_subgroup = c10d.shrink_group([1], group=subgroup) + self.assertEqual(shrunk_subgroup.size(), 1) + # Test communication on shrunk subgroup + tensor = torch.full((1,), self.rank).cuda(device) + c10d.all_reduce(tensor, group=shrunk_subgroup) + self.assertEqual(tensor.item(), 0) # Only rank 0 + log_test_success(self.rank, "Subgroup shrinking successful") + + dist.barrier() # Sync before default group test + + # Shrink default group: exclude last rank + ranks_to_exclude = [self.world_size - 1] + if self.rank not in ranks_to_exclude: + shrunk_default = c10d.shrink_group(ranks_to_exclude) + expected_size = self.world_size - 1 + self.assertEqual(shrunk_default.size(), expected_size) + + # Test collective on shrunk default group + tensor = torch.full((1,), self.rank).cuda(device) + c10d.all_reduce(tensor, group=shrunk_default) + expected_sum = sum( + range(self.world_size - 1) + ) # 0 + 1 + ... + (world_size-2) + self.assertEqual(tensor.item(), expected_sum) + log_test_success(self.rank, "Default group shrinking successful") + + # Note: After shrinking default group, the old subgroup is invalid + # due to global rank reassignment + + dist.destroy_process_group() + + def _test_shrink_group_with_flag(self, shrink_flag, flag_name, rank_to_exclude): + """Helper method to test shrink_group with a specific flag.""" + if self.world_size < 2: + log_test_info(self.rank, f"Skipping (needs ≥2 GPUs, got {self.world_size})") + return + ranks_to_exclude = [rank_to_exclude] + log_test_info(self.rank, f"Using {flag_name} flag (value: {shrink_flag})") + if flag_name == "NCCL_SHRINK_ABORT": + log_test_info( + self.rank, + "ABORT flag will terminate ongoing operations before shrinking", + ) + + self._perform_shrink_test( + ranks_to_exclude, f"{flag_name} flag test", shrink_flags=shrink_flag + ) + + @requires_nccl_shrink() + @requires_world_size(2) + def test_shrink_group_flags(self): + """Test shrink_group with different shrink flags.""" + # Test ABORT flags + log_test_info(self.rank, "Testing NCCL_SHRINK_ABORT flag") + self._test_shrink_group_with_flag(NCCL_SHRINK_ABORT, "NCCL_SHRINK_ABORT", 1) + + @requires_nccl_shrink() + @requires_world_size(2) + def test_shrink_group_nccl_config(self): + """Verify that passing NCCL config via pg_options influences the shrunk group's backend options.""" + device, pg = self._setup_shrink_test("config") + if self.rank == self.world_size - 1: + # excluded rank should not call shrink_group + dist.destroy_process_group() + return + + # Prepare pg_options with NCCL config overrides + # Capture parent's current backend options to ensure we can prove override vs inherit + parent_backend = pg._get_backend(torch.device("cuda")) + parent_hp = parent_backend.options.is_high_priority_stream + parent_blocking = parent_backend.options.config.blocking + + # Choose overrides that differ from the parent (flip where possible) + override_hp = not parent_hp + if parent_blocking in (0, 1): + override_blocking = 1 - parent_blocking + else: + # If undefined or unexpected, set to 1 which is a concrete value + override_blocking = 1 + + opts = c10d.ProcessGroupNCCL.Options() + opts.is_high_priority_stream = 
override_hp + opts.config.blocking = override_blocking + + shrunk_pg = c10d.shrink_group([self.world_size - 1], pg_options=opts) + + # Validate backend options propagated + backend = shrunk_pg._get_backend(torch.device("cuda")) + # is_high_priority_stream should exactly match our override and differ from parent + self.assertEqual(backend.options.is_high_priority_stream, override_hp) + self.assertNotEqual(backend.options.is_high_priority_stream, parent_hp) + # config is a struct; check representative field and difference from parent when meaningful + self.assertEqual(backend.options.config.blocking, override_blocking) + if parent_blocking in (0, 1): + self.assertNotEqual(backend.options.config.blocking, parent_blocking) + + dist.destroy_process_group() + + @requires_nccl_shrink() + @requires_world_size(2) + def test_shrink_group_performance(self): + """Test shrink_group performance and regression detection.""" + import time + + ranks_to_exclude = self._get_default_ranks_to_exclude() + is_excluded = self.rank in ranks_to_exclude + + if not ranks_to_exclude: + log_test_info(self.rank, "Skipping performance test (world_size=1)") + return + + log_test_info(self.rank, f"Performance test with {self.world_size} processes") + device, pg = self._setup_shrink_test("performance") + + if not is_excluded: + log_test_info(self.rank, "Measuring shrink_group performance") + start_time = time.time() + shrunk_pg = c10d.shrink_group(ranks_to_exclude) + end_time = time.time() + + elapsed_time = end_time - start_time + log_test_info(self.rank, f"shrink_group: {elapsed_time:.3f}s") + + # Regression check: should complete within reasonable time + self.assertLess( + elapsed_time, + 30.0, + f"shrink_group took {elapsed_time:.3f}s, possible regression", + ) + + # Test collective performance + expected_size = self.world_size - len(ranks_to_exclude) + self._validate_shrunk_group(shrunk_pg, expected_size, "performance") + + collective_start = time.time() + _ = self._test_collective_on_shrunk_group( + shrunk_pg, device, ranks_to_exclude, "performance" + ) + collective_time = time.time() - collective_start + + log_test_info(self.rank, f"all_reduce: {collective_time:.3f}s") + log_test_success(self.rank, "Performance test passed") + else: + log_test_info(self.rank, "Excluded rank - waiting") + + dist.destroy_process_group() + + @requires_nccl_shrink() + @requires_world_size(4) + def test_shrink_group_multiple_exclusions(self): + """Test shrink_group with multiple ranks excluded at once.""" + # Scale exclusions with world size + ranks_to_exclude = list(range(2, self.world_size, 2)) # Every other rank from 2 + + self._perform_shrink_test(ranks_to_exclude, "Multiple exclusions test") + + @requires_nccl_shrink() + @requires_world_size(3) + def test_shrink_group_multiple_iterations(self): + """Test multiple shrink operations in sequence.""" + log_test_info( + self.rank, + f"Starting test_shrink_group_multiple_iterations with world_size={self.world_size}", + ) + + store = c10d.FileStore(self.file_name, self.world_size) + device = torch.device(f"cuda:{self.rank}") + _ = self._create_process_group_nccl(store, self.opts(), device_id=device) + + # Track current effective world size throughout shrinking operations + current_world_size = self.world_size + log_test_info(self.rank, f"Initial world_size: {current_world_size}") + + # First shrinking: exclude the last rank(s) + first_exclusion = [self.world_size - 1] + if self.world_size >= 6: + first_exclusion.append( + self.world_size - 2 + ) # Exclude last two ranks for larger sizes + + 
log_test_info(self.rank, f"First shrinking: excluding ranks {first_exclusion}") + + if self.rank not in first_exclusion: + # Only non-excluded ranks should call shrink_group + first_pg = c10d.shrink_group(first_exclusion) + self.assertIsNotNone(first_pg) + # IMPORTANT: Update world size after first shrinking + current_world_size = first_pg.size() + expected_first_size = self.world_size - len(first_exclusion) + log_test_info( + self.rank, + f"After first shrinking: world_size {self.world_size} -> {current_world_size}", + ) + self.assertEqual(first_pg.size(), expected_first_size) + + # Second shrinking: exclude another rank from the remaining group + # Choose a rank that's in the middle range + if current_world_size >= 3: + second_exclusion = [ + current_world_size - 1 + ] # Exclude the new "last" rank + log_test_info( + self.rank, + f"Second shrinking from group of size {current_world_size}: excluding ranks {second_exclusion}", + ) + + if self.rank not in second_exclusion: + # Only non-excluded ranks should call shrink_group for second iteration + second_pg = c10d.shrink_group(second_exclusion, group=first_pg) + self.assertIsNotNone(second_pg) + # IMPORTANT: Update world size after second shrinking + final_world_size = second_pg.size() + expected_final_size = current_world_size - len(second_exclusion) + log_test_info( + self.rank, + f"After second shrinking: world_size {current_world_size} -> {final_world_size}", + ) + self.assertEqual(second_pg.size(), expected_final_size) + + # Test collective on final group + tensor = torch.full((1,), self.rank).cuda(device) + log_test_info( + self.rank, + f"Performing all_reduce on final group (size {final_world_size}) with tensor: {tensor.item()}", + ) + c10d.all_reduce(tensor, group=second_pg) + log_test_info( + self.rank, + f"Final all_reduce completed, result: {tensor.item()}", + ) + + # Calculate expected sum of remaining ranks + all_excluded = set(first_exclusion + second_exclusion) + remaining_ranks = [ + r for r in range(self.world_size) if r not in all_excluded + ] + expected_sum = sum(remaining_ranks) + log_test_info( + self.rank, + f"Remaining ranks: {remaining_ranks}, expected sum: {expected_sum}, actual: {tensor.item()}", + ) + self.assertEqual(tensor.item(), expected_sum) + log_test_info(self.rank, "Final verification passed") + else: + log_test_info( + self.rank, + "This rank excluded in second shrinking, not calling shrink_group", + ) + else: + log_test_info( + self.rank, "Skipping second shrinking (remaining group too small)" + ) + else: + log_test_info( + self.rank, + "This rank excluded in first shrinking, not calling shrink_group", + ) + + log_test_info(self.rank, "Destroying process group") + dist.destroy_process_group() + log_test_info(self.rank, "test_shrink_group_multiple_iterations completed") + + # Helper methods for optimized shrink group tests + def _setup_shrink_test(self, test_suffix, world_size=None, warmup=True): + """Common setup for shrink group tests.""" + os.environ["TORCH_NCCL_USE_COMM_NONBLOCKING"] = "1" + world_size = world_size or self.world_size + store = c10d.FileStore(self.file_name + f"_{test_suffix}", world_size) + device = torch.device(f"cuda:{self.rank}") + c10d.init_process_group( + "nccl", + world_size=world_size, + rank=self.rank, + store=store, + pg_options=self.opts(), + device_id=device, + ) + pg = c10d.distributed_c10d._get_default_group() + + if warmup: + c10d.all_reduce(torch.ones(1).cuda(device), group=pg) + + return device, pg + + def _validate_shrunk_group(self, shrunk_pg, expected_size, 
test_name=""): + """Validate properties of a shrunk process group.""" + self.assertIsNotNone(shrunk_pg, f"{test_name}: shrunk_pg should not be None") + actual_size = shrunk_pg.size() + self.assertEqual( + actual_size, expected_size, f"{test_name}: group size mismatch" + ) + + new_rank = shrunk_pg.rank() + self.assertTrue( + 0 <= new_rank < expected_size, f"{test_name}: invalid new rank {new_rank}" + ) + + log_test_info( + self.rank, + f"{test_name}: world_size {self.world_size} -> {actual_size}, rank {self.rank} -> {new_rank}", + ) + return new_rank + + def _test_collective_on_shrunk_group( + self, shrunk_pg, device, ranks_to_exclude, test_name="" + ): + """Test collective communication on shrunk group and verify correctness.""" + test_tensor = torch.full((1,), self.rank, device=device, dtype=torch.float32) + c10d.all_reduce(test_tensor, group=shrunk_pg) + + result = test_tensor.item() + expected_sum = sum( + r for r in range(self.world_size) if r not in ranks_to_exclude + ) + + self.assertEqual( + result, expected_sum, f"{test_name}: collective result mismatch" + ) + log_test_info( + self.rank, f"{test_name}: collective passed ({result} == {expected_sum})" + ) + return result + + def _perform_shrink_test( + self, ranks_to_exclude, test_name, shrink_flags=0, with_collective=True + ): + """Complete shrink test flow: setup, shrink, validate, test collective, cleanup. + + Consistent API: All ranks perform setup to initialize distributed environment. + ONLY non-excluded ranks call shrink_group() for both default and non-default groups. + Excluded ranks perform setup, then exit without calling shrink_group() or waiting. + """ + log_test_info(self.rank, f"{test_name} (world_size={self.world_size})") + + is_excluded = self.rank in ranks_to_exclude + log_test_info( + self.rank, + f"Excluding ranks: {ranks_to_exclude}, am_excluded: {is_excluded}", + ) + + # All ranks (including excluded ones) perform setup to initialize distributed environment + device, pg = self._setup_shrink_test(test_name.lower().replace(" ", "_")) + is_default_group = pg == c10d.distributed_c10d._get_default_group() + + if is_excluded: + log_test_info( + self.rank, + f"Excluded rank {self.rank} - setup complete, skipping shrink operation", + ) + if shrink_flags & NCCL_SHRINK_ABORT: + log_test_info(self.rank, f"Using abort for excluded rank {self.rank}") + pg._get_backend(torch.device(device)).abort() + log_test_info( + self.rank, f"cleanup resources for excluded rank {self.rank}" + ) + dist.destroy_process_group() + log_test_info(self.rank, f"Excluded rank {self.rank} - exit") + else: + log_test_info( + self.rank, f"Using regular destroy for excluded rank {self.rank}" + ) + dist.destroy_process_group() + return None + + # Only non-excluded ranks proceed with shrink + log_test_info( + self.rank, + f"Non-excluded rank calling shrink_group (default_group={is_default_group})", + ) + shrunk_pg = c10d.shrink_group(ranks_to_exclude, shrink_flags=shrink_flags) + log_test_info( + self.rank, + f"Non-excluded rank calling shrink_group (default_group={is_default_group}) done", + ) + + # Non-excluded ranks: validate and test the new group + expected_size = self.world_size - len(ranks_to_exclude) + _ = self._validate_shrunk_group(shrunk_pg, expected_size, test_name) + + if with_collective: + _ = self._test_collective_on_shrunk_group( + shrunk_pg, device, ranks_to_exclude, test_name + ) + log_test_success(self.rank, f"{test_name} successful (shrink + collective)") + else: + log_test_success(self.rank, f"{test_name} successful (shrink only)") 
+ + dist.destroy_process_group() + return shrunk_pg + + def _get_default_ranks_to_exclude(self): + """Get default ranks to exclude based on world size.""" + if self.world_size <= 1: + return [] + return [self.world_size - 1] # Exclude last rank by default + + @requires_nccl_shrink() + @requires_world_size(3) + def test_shrink_group_vs_abort_reinit_performance(self): + """Compare performance of shrink_group vs traditional abort+reinit (simplified for reliability).""" + log_test_info(self.rank, "=== TEST 1: abort+reinit ===") + + device, pg1 = self._setup_shrink_test("_perf_reinit") + torch.cuda.synchronize(device) + + # Test 1: Traditional abort + reinit + start_time = time.perf_counter() + dist.destroy_process_group() + + device, new_pg = self._setup_shrink_test("perf_shrink_test1") + reinit_time = time.perf_counter() - start_time + + # Test collective with original rank values for fair comparison (non-blocking mode) + test_tensor = torch.full((1,), self.rank, device=device, dtype=torch.float32) + work = c10d.all_reduce(test_tensor, group=new_pg, async_op=True) + work.wait() + + torch.cuda.synchronize(device) + + # Verify correctness + expected_sum = sum(r for r in range(self.world_size)) + self.assertEqual(test_tensor.item(), expected_sum, "Reinit collective failed") + + log_test_info(self.rank, f"abort+reinit: {reinit_time:.4f}s") + dist.destroy_process_group(new_pg) + + # Test 2: shrink_group with NCCL_SHRINK_ABORT + log_test_info(self.rank, "=== TEST 2: shrink_group ===") + + ranks_to_exclude = [self.world_size - 1] + is_excluded = self.rank in ranks_to_exclude + log_test_info( + self.rank, + f"Excluding ranks: {ranks_to_exclude}, am_excluded: {is_excluded}", + ) + + device, pg1 = self._setup_shrink_test("perf_shrink_test2") # Unique suffix + + shrink_time = 0 + if not is_excluded: + torch.cuda.synchronize(device) # Ensure accurate timing + start_time = time.perf_counter() + shrunk_pg = c10d.shrink_group( + ranks_to_exclude, shrink_flags=NCCL_SHRINK_ABORT + ) + c10d.all_reduce(torch.ones(1).cuda(device), group=shrunk_pg) + shrink_time = time.perf_counter() - start_time + + # Test collective communication on shrunk group (non-blocking mode) + test_tensor = torch.full( + (1,), self.rank, device=device, dtype=torch.float32 + ) + work = c10d.all_reduce(test_tensor, group=shrunk_pg, async_op=True) + work.wait() + + # Verify correctness + expected_sum = sum( + r for r in range(self.world_size) if r not in ranks_to_exclude + ) + self.assertEqual( + test_tensor.item(), + expected_sum, + "shrink_test: collective result mismatch", + ) + + torch.cuda.synchronize(device) # Ensure operations complete + log_test_info(self.rank, f"shrink_group: {shrink_time:.4f}s") + dist.destroy_process_group() + else: + log_test_info(self.rank, "Excluded from shrink test - exiting immediately") + dist.destroy_process_group() + return + + # Performance analysis (only for participating ranks) + if shrink_time > 0 and reinit_time > 0: + speedup = reinit_time / shrink_time + time_saved = reinit_time - shrink_time + + log_test_info(self.rank, "=== PERFORMANCE RESULTS ===") + log_test_info(self.rank, f"shrink_group: {shrink_time:.4f}s") + log_test_info(self.rank, f"abort+reinit: {reinit_time:.4f}s") + log_test_info(self.rank, f"time_saved: {time_saved:+.4f}s") + log_test_info(self.rank, f"speedup: {speedup:.2f}x") + + if speedup > 1.1: + log_test_success(self.rank, "shrink_group significantly faster") + elif speedup > 0.9: + log_test_info(self.rank, "≈ comparable performance") + else: + log_test_warning(self.rank, 
"abort+reinit faster") + + log_test_info(self.rank, "Performance test completed") + @requires_nccl() @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") def test_deterministic_mode_no_break(self): diff --git a/torch/csrc/distributed/c10d/Backend.hpp b/torch/csrc/distributed/c10d/Backend.hpp index 655e0a5578c2..1ebf9394e064 100644 --- a/torch/csrc/distributed/c10d/Backend.hpp +++ b/torch/csrc/distributed/c10d/Backend.hpp @@ -79,6 +79,23 @@ class TORCH_API Backend : public torch::CustomClassHolder { return false; } + virtual bool supportsShrinking() const { + return false; + } + + // Shrink the backend by excluding specified ranks. Backends that support + // communicator shrinking should override this and return a new backend + // instance representing the shrunken group. Backends may use opts_override + // to supply backend-specific options for the new group. + virtual c10::intrusive_ptr shrink( + const std::vector& /*ranks_to_exclude*/, + int /*shrink_flags*/ = 0, + const c10::intrusive_ptr& /*opts_override*/ = nullptr) { + TORCH_CHECK( + false, + c10::str("Backend ", getBackendName(), " does not support shrink")); + } + virtual void setTimeout(std::chrono::milliseconds timeout) { TORCH_CHECK( false, diff --git a/torch/csrc/distributed/c10d/NCCLUtils.cpp b/torch/csrc/distributed/c10d/NCCLUtils.cpp index 8074cc98a04f..a41f654b9ae2 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.cpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.cpp @@ -259,6 +259,65 @@ std::shared_ptr NCCLComm::split( } #endif +#ifdef NCCL_HAS_COMM_SHRINK +std::shared_ptr NCCLComm::shrink( + NCCLComm* source, + std::vector& ranks_to_exclude, + ncclConfig_t* config, + int shrinkFlags) { + // Preconditions are validated in ProcessGroupNCCL::shrink + + LOG(INFO) << "Rank " << source->rank_ << ": shrinking comm " << source->repr() + << " excluding " << ranks_to_exclude.size() << " ranks"; + + at::cuda::OptionalCUDAGuard gpuGuard(source->deviceIndex_); + auto comm = std::make_shared(); + + // This call will block until the source communicator is initialized + auto sourceComm = source->getNcclComm(); + + C10D_NCCL_CHECK_NONBLOCKING( + ncclCommShrink( + sourceComm, + ranks_to_exclude.data(), + ranks_to_exclude.size(), + reinterpret_cast(&(comm->ncclComm_)), + config, + shrinkFlags), + source->getNcclCommFailureReason()); + + // Wait for the child communicator to be ready + source->waitReady(true); + comm->initialized_ = true; + + // NCCL automatically assigns rank during shrink - query it efficiently + int assigned_rank; + try { + C10D_NCCL_CHECK( + ncclCommUserRank(comm->ncclComm_, &assigned_rank), std::nullopt); + comm->rank_ = assigned_rank; + } catch (const std::exception& e) { + // Fallback: if ncclCommUserRank fails, we can't determine the rank + LOG(ERROR) << "Failed to query NCCL-assigned rank: " << e.what(); + throw; + } + + // Child comm should be on the same device as parent comm + comm->deviceIndex_ = source->deviceIndex_; + if (config != nullptr) { + comm->nonBlocking_ = config->blocking == 0; + } else { + // Inherit parent behavior if no config provided + comm->nonBlocking_ = source->nonBlocking_; + } + + LOG(INFO) << "Rank " << source->rank_ << ": created shrunken comm " + << comm->repr() << " with NCCL-assigned rank " << assigned_rank; + + return comm; +} +#endif + void NCCLComm::finalize() { LockType lock(mutex_); if (aborted_) { diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp index fdd50f69ef3d..142633b82374 100644 --- 
a/torch/csrc/distributed/c10d/NCCLUtils.hpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp @@ -90,6 +90,10 @@ static_assert( #define NCCL_HAS_NVLS_CTAS #endif +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 27, 0) +#define NCCL_HAS_COMM_SHRINK +#endif + // Macro to throw on a non-successful NCCL return value. #define C10D_NCCL_CHECK(cmd, failureReason) \ do { \ @@ -294,6 +298,14 @@ class NCCLComm { ncclConfig_t& config); #endif // NCCL_HAS_COMM_SPLIT +#ifdef NCCL_HAS_COMM_SHRINK + static std::shared_ptr shrink( + NCCLComm* source, + std::vector& ranks_to_exclude, + ncclConfig_t* config, + int shrinkFlags = 0); +#endif // NCCL_HAS_COMM_SHRINK + #if (defined(IS_NCCLX) || defined(USE_ROCM)) && defined(NCCL_COMM_DUMP) std::unordered_map ncclCommDump(); #endif diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index 9b615b9f16b0..1a63128f8ddf 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -165,7 +165,7 @@ ncclRedOpRAII getNcclReduceOp( } // Get a key string from device -inline std::string getKeyFromDevice(at::Device& device) { +inline std::string getKeyFromDevice(const at::Device& device) { return std::to_string(device.index()); } @@ -5838,6 +5838,139 @@ at::Tensor ProcessGroupNCCL::allocateTensor( return tensor; } +#ifdef NCCL_HAS_COMM_SHRINK +c10::intrusive_ptr ProcessGroupNCCL::shrink( + const std::vector& ranks_to_exclude, + int shrink_flags, + const c10::intrusive_ptr& opts_override) { + // Runtime version check with better error message + auto runtime_version = torch::cuda::nccl::version(); + TORCH_CHECK( + runtime_version >= NCCL_VERSION(2, 27, 0), + "ProcessGroupNCCL::shrink requires NCCL version 2.27.0 or later. " + "Found version: ", + runtime_version); + + // Early validation with detailed error messages + TORCH_CHECK_VALUE( + !ranks_to_exclude.empty(), "ranks_to_exclude cannot be empty"); + TORCH_CHECK_VALUE( + static_cast(ranks_to_exclude.size()) < size_, + "Cannot exclude all ranks (", + ranks_to_exclude.size(), + " >= ", + size_, + ")"); + + // Validate ranks and convert to int efficiently + std::vector int_ranks_to_exclude; + int_ranks_to_exclude.reserve(ranks_to_exclude.size()); + for (int64_t rank : ranks_to_exclude) { + TORCH_CHECK_VALUE( + rank >= 0 && rank < size_, + "Invalid rank ", + rank, + " for group size ", + size_); + int_ranks_to_exclude.push_back(static_cast(rank)); + } + + // Get primary communicator with better error context + auto primary_device_index = guessDeviceId(); + auto primary_device = at::Device(at::kCUDA, primary_device_index); + const auto primary_key = getKeyFromDevice(primary_device); + + std::shared_ptr primary_comm = getNCCLComm(primary_key); + TORCH_CHECK( + primary_comm, + "Primary NCCL communicator for device ", + primary_device, + " (key: ", + primary_key, + ") is not initialized"); + + // Cache device index before shrink operation + at::DeviceIndex parent_device_index = primary_comm->getDeviceIndex(); + + ncclConfig_t* config = nullptr; + // Default to inheriting from parent options + bool high_priority_stream = options_->is_high_priority_stream; + if (opts_override) { + auto nccl_opts = + c10::static_intrusive_pointer_cast( + opts_override); + config = &nccl_opts->config; + // If user provided override options, honor is_high_priority_stream as well + high_priority_stream = nccl_opts->is_high_priority_stream; + } + + std::shared_ptr shrunk_comm = NCCLComm::shrink( + primary_comm.get(), + int_ranks_to_exclude, + 
(config != nullptr ? config : &options_->config), + shrink_flags); + + // Calculate new size and get NCCL-assigned rank + int new_size = size_ - static_cast(ranks_to_exclude.size()); + int new_rank = shrunk_comm->rank_; + + // Create new ProcessGroupNCCL with optimized options cloning + auto new_store = store_->clone(); + auto new_opts = ProcessGroupNCCL::Options::create(high_priority_stream); + new_opts->timeout = options_->timeout; + if (config != nullptr) { + new_opts->config = *config; + } else { + new_opts->config = options_->config; + } + + auto new_pg = c10::make_intrusive( + new_store, new_rank, new_size, new_opts); + + // Set up the new process group with optimized device setup + new_pg->initializeDeviceStateForComm( + at::Device(at::kCUDA, parent_device_index), shrunk_comm); + + return c10::static_intrusive_pointer_cast(new_pg); +} + +#else // !NCCL_HAS_COMM_SHRINK +// Backend interface override: raise consistent error when shrink is +// unsupported. +c10::intrusive_ptr ProcessGroupNCCL::shrink( + const std::vector& /*ranks_to_exclude*/, + int /*shrink_flags*/, + const c10::intrusive_ptr& /*opts_override*/) { + TORCH_CHECK( + false, + "ProcessGroupNCCL::shrink requires NCCL version 2.27.0 or later, " + "but PyTorch was built with an older version or without NCCL shrink support."); +} + +#endif // NCCL_HAS_COMM_SHRINK + +void ProcessGroupNCCL::initializeDeviceStateForComm( + const at::Device& device, + std::shared_ptr comm) { + const auto key = getKeyFromDevice(device); + std::unique_lock lock(mutex_); + at::cuda::OptionalCUDAGuard gpuGuard(device); + + bool force_high = getCvarBool(TORCH_NCCL_HIGH_PRIORITY, false); + auto stream = at::cuda::getStreamFromPool( + options_->is_high_priority_stream || force_high); + + devNCCLCommMap_[key] = comm; + ncclStreams_.emplace(key, stream); + ncclEvents_.emplace(key, at::cuda::CUDAEvent(cudaEventDisableTiming)); + usedDeviceIdxs_.insert(device.index()); + + if (shouldAllCommunicatorsRegisterAllTensors()) { + std::lock_guard map_lock(ncclCommMemPoolMapMutex); + ncclCommMemPoolMap.emplace(std::move(comm), MemPoolSet{}); + } +} + } // namespace c10d #endif // USE_C10D_NCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index 286eab14d1a8..2ead1a107394 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -997,6 +997,21 @@ class TORCH_API ProcessGroupNCCL : public Backend { ErrorType getError() override; + bool supportsShrinking() const override { +#ifdef NCCL_HAS_COMM_SHRINK + return true; +#else + return false; +#endif + } + + // Backend-style shrink override that returns a Backend instance. + c10::intrusive_ptr shrink( + const std::vector& ranks_to_exclude, + int shrink_flags = 0, + const c10::intrusive_ptr& opts_override = + nullptr) override; + std::shared_ptr getMemAllocator() override; // Allocate tensor from communication-optimized memory pool @@ -1065,6 +1080,12 @@ class TORCH_API ProcessGroupNCCL : public Backend { int p2pRank = 0, bool isSendRecvSelf = false); + // Initialize device-specific state (comm, stream, event, bookkeeping) for a + // given communicator on this process group instance. + void initializeDeviceStateForComm( + const at::Device& device, + std::shared_ptr comm); + // Wrapper method which can be overridden for tests. 
virtual std::exception_ptr checkForNCCLErrors( std::shared_ptr& ncclComm); diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index bdf2576efbe7..f7d60e0cb62d 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -2730,12 +2730,23 @@ Arguments: "supports_time_estimate", &::c10d::Backend::supportsTimeEstimation, "(test whether the backend supports collective time estimation)") + .def_property_readonly( + "supports_shrinking", + &::c10d::Backend::supportsShrinking, + "(test whether the backend supports communicator shrinking)") .def( "set_timeout", &::c10d::Backend::setTimeout, py::arg("timeout"), py::call_guard(), R"(Sets the default timeout for all future operations.)") + .def( + "shrink", + &::c10d::Backend::shrink, + py::arg("ranks_to_exclude"), + py::arg("shrink_flags") = 0, + py::arg("opts_override") = nullptr, + py::call_guard()) .def( "broadcast", &::c10d::Backend::broadcast, diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index ea194a6ebe9a..0652024365de 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -130,6 +130,7 @@ __all__ = [ "reduce_scatter_tensor", "get_node_local_rank", "split_group", + "shrink_group", ] _MPI_AVAILABLE = True @@ -5696,3 +5697,517 @@ def _get_process_group_name(pg: ProcessGroup) -> str: def _get_process_group_store(pg: ProcessGroup) -> Store: return _world.pg_map[pg][1] + + +# Shrink flags for process group backends +SHRINK_DEFAULT = 0x00 +SHRINK_ABORT = 0x01 + + +@_time_logger +def shrink_group( + ranks_to_exclude: list[int], + group: Optional[ProcessGroup] = None, + shrink_flags: int = SHRINK_DEFAULT, + pg_options: Optional[Any] = None, +) -> ProcessGroup: + """ + Shrinks a process group by excluding specified ranks. + + Creates and returns a new, smaller process group comprising only the ranks + from the original group that were not in the ``ranks_to_exclude`` list. + + Args: + ranks_to_exclude (List[int]): A list of ranks from the original + ``group`` to exclude from the new group. + group (ProcessGroup, optional): The process group to shrink. If ``None``, + the default process group is used. Defaults to ``None``. + shrink_flags (int, optional): Flags to control the shrinking behavior. + Can be ``SHRINK_DEFAULT`` (default) or ``SHRINK_ABORT``. + ``SHRINK_ABORT`` will attempt to terminate ongoing operations + in the parent communicator before shrinking. + Defaults to ``SHRINK_DEFAULT``. + pg_options (ProcessGroupOptions, optional): Backend-specific options to apply + to the shrunken process group. If provided, the backend will use + these options when creating the new group. If omitted, the new group + inherits defaults from the parent. + + Returns: + ProcessGroup: a new group comprised of the remaining ranks. If the + default group was shrunk, the returned group becomes the new default group. + + Raises: + TypeError: if the group’s backend does not support shrinking. + ValueError: if ``ranks_to_exclude`` is invalid (empty, out of bounds, + duplicates, or excludes all ranks). + RuntimeError: if an excluded rank calls this function or the backend + fails the operation. + + Notes: + - Only non-excluded ranks should call this function; excluded ranks + must not participate in the shrink operation. + - Shrinking the default group destroys all other process groups since + rank reassignment makes them inconsistent. 
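+
+    Example (minimal sketch, assuming a healthy 4-rank default group from which
+    rank 3 is dropped; only the surviving ranks 0-2 run these lines, and
+    ``new_pg`` / ``t`` are placeholder names)::
+
+        >>> import torch
+        >>> import torch.distributed as dist
+        >>> from torch.distributed.distributed_c10d import SHRINK_ABORT
+        >>> new_pg = dist.shrink_group([3], shrink_flags=SHRINK_ABORT)
+        >>> t = torch.ones(1, device="cuda")
+        >>> dist.all_reduce(t, group=new_pg)  # collective over the 3 remaining ranks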
+ """ + # Step 1: Validate input parameters with comprehensive error checking + _validate_shrink_inputs(ranks_to_exclude, shrink_flags) + + # Step 2: Get target group and essential properties + target_group_info = _prepare_shrink_target_group(group) + + # Step 3: Validate backend requirements and availability + backend_impl = _validate_shrink_backend_requirements(target_group_info) + + # Step 4: Validate ranks against group and check for duplicates + excluded_ranks_set = _validate_and_process_excluded_ranks( + ranks_to_exclude, target_group_info + ) + + # Step 5: Execute the actual shrink operation (backend-specific) + new_backend = backend_impl.shrink( + sorted(excluded_ranks_set), + shrink_flags, + pg_options if pg_options is not None else None, + ) + + # Step 6: Handle cleanup and creation of new process group + target_group_info["pg_options_override"] = pg_options + return _finalize_shrunk_group(target_group_info, excluded_ranks_set, new_backend) + + +def _validate_shrink_inputs(ranks_to_exclude: list[int], shrink_flags: int) -> None: + """Validate input parameters for shrink_group.""" + if not isinstance(ranks_to_exclude, list): + raise TypeError( + f"ranks_to_exclude must be a list, but got {type(ranks_to_exclude).__name__}. " + f"Example: [1, 3, 5] to exclude ranks 1, 3, and 5." + ) + + if not ranks_to_exclude: + raise ValueError( + "ranks_to_exclude cannot be empty. To shrink a group, you must specify at least " + "one rank to exclude. Example: [failed_rank_id]" + ) + + # Validate shrink_flags with clear explanation of valid values + valid_flags = [SHRINK_DEFAULT, SHRINK_ABORT] + if not isinstance(shrink_flags, int) or shrink_flags not in valid_flags: + raise ValueError( + f"Invalid shrink_flags value: {shrink_flags}. Must be one of: " + f"SHRINK_DEFAULT ({SHRINK_DEFAULT}) or SHRINK_ABORT ({SHRINK_ABORT}). " + f"Use SHRINK_ABORT to abort ongoing operations before shrinking." + ) + + +def _prepare_shrink_target_group(group: Optional[ProcessGroup]) -> dict: + """Prepare and validate the target group for shrinking.""" + target_pg = group if group is not None else _get_default_group() + + # Cache frequently accessed properties to avoid repeated calls + group_size = int(target_pg.size()) + group_info = { + "process_group": target_pg, + "is_default_group": (target_pg == _get_default_group()), + "group_size": group_size, + "current_rank": target_pg.rank(), + "group_name": _get_process_group_name(target_pg), + } + + # Validate that we have a valid process group + if group_size <= 1: + raise ValueError( + f"Cannot shrink a process group with size {group_size}. " + f"Group must have at least 2 ranks to support shrinking." + ) + + return group_info + + +def _validate_shrink_backend_requirements(group_info: dict) -> Any: + """Return the backend implementation for the target group or raise if unsupported.""" + target_pg = group_info["process_group"] + group_name = group_info["group_name"] + + # Get the group's backend directly via ProcessGroup API. Prefer a bound device if present, + # otherwise try CUDA then fall back to CPU. 
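A minimal sketch of the lower-level capability check this helper relies on, using the `supports_shrinking` property and `shrink` binding added in init.cpp earlier in this patch (assumes a CUDA-backed default group; the excluded rank is hypothetical, and `shrink_group` above remains the intended entry point):

    import torch
    from torch.distributed.distributed_c10d import _get_default_group

    backend = _get_default_group()._get_backend(torch.device("cuda"))
    if backend.supports_shrinking:            # False unless built with NCCL >= 2.27
        new_backend = backend.shrink([2], 0)  # returns a shrunken Backend, not a ProcessGroup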
+ try: + preferred_device = getattr(target_pg, "bound_device_id", None) + if preferred_device is not None: + backend_impl = target_pg._get_backend(preferred_device) + else: + # Try CUDA first if available, else CPU + try: + backend_impl = target_pg._get_backend(torch.device("cuda")) + except Exception: + backend_impl = target_pg._get_backend(torch.device("cpu")) + except RuntimeError as e: + raise RuntimeError( + f"Cannot access device backend for process group '{group_name}'. " + f"Ensure the process group was initialized with a compatible device backend and devices are available." + ) from e + + try: + supports = bool(backend_impl.supports_shrinking) + except Exception: + supports = False + if not supports: + raise TypeError( + f"Process group backend for '{group_name}' does not support shrinking operations." + ) + + return backend_impl + + +def _validate_and_process_excluded_ranks( + ranks_to_exclude: list[int], group_info: dict +) -> set: + """Validate excluded ranks and convert to set for efficient operations.""" + group_size = group_info["group_size"] + current_rank = group_info["current_rank"] + + # Use set for O(1) duplicate detection and membership testing + excluded_ranks_set = set() + + # Validate each rank with detailed error messages + for i, rank in enumerate(ranks_to_exclude): + if not isinstance(rank, int): + raise TypeError( + f"All elements in ranks_to_exclude must be integers. " + f"Element at index {i} is {type(rank).__name__}: {rank}" + ) + + if not (0 <= rank < group_size): + raise ValueError( + f"Rank {rank} at index {i} is out of bounds for group size {group_size}. " + f"Valid ranks are in range [0, {group_size - 1}]." + ) + + if rank in excluded_ranks_set: + raise ValueError( + f"Duplicate rank {rank} found in ranks_to_exclude at index {i}. " + f"Each rank can only be excluded once." + ) + + excluded_ranks_set.add(rank) + + # Ensure we don't exclude all ranks + if len(excluded_ranks_set) >= group_size: + raise ValueError( + f"Cannot exclude all {group_size} ranks from process group. " + f"At least one rank must remain. Excluding {len(excluded_ranks_set)} ranks." + ) + + # Critical check: current rank should not be in excluded list + if current_rank in excluded_ranks_set: + raise RuntimeError( + f"Current rank {current_rank} is in the exclusion list and should not call shrink_group(). " + f"Only non-excluded ranks should participate in the shrinking operation. " + f"Excluded ranks should terminate their processes instead." 
+ ) + + return excluded_ranks_set + + +def _finalize_shrunk_group( + group_info: dict, excluded_ranks_set: set, new_backend +) -> ProcessGroup: + """Clean up old group and create new shrunk process group.""" + target_pg = group_info["process_group"] + is_default_group = group_info["is_default_group"] + + # Handle default group dependencies - destroy other groups first + if is_default_group: + _destroy_all_other_groups(exclude_group=target_pg) + + # Gather original group metadata before cleanup + original_group_metadata = _extract_group_metadata(target_pg) + + # Calculate remaining ranks efficiently + original_ranks = get_process_group_ranks(target_pg) + remaining_ranks = [ + rank for rank in original_ranks if rank not in excluded_ranks_set + ] + + # Clean up the original group + _cleanup_original_group(target_pg, is_default_group) + + # Create and configure the new process group + new_pg = _create_shrunk_process_group( + new_backend, remaining_ranks, original_group_metadata, is_default_group + ) + + # Register the new group in global state + if is_default_group: + _update_default_pg(new_pg) + + # Update global state with new group information + rank_mapping = { + global_rank: group_rank + for group_rank, global_rank in enumerate(remaining_ranks) + } + _update_process_group_global_state( + pg=new_pg, + backend_name=original_group_metadata["backend_name"], + store=original_group_metadata["store"], + group_name=original_group_metadata["new_group_name"], + backend_config=original_group_metadata["backend_config"], + rank_mapping=rank_mapping, + ) + + return new_pg + + +def _extract_group_metadata(target_pg: ProcessGroup) -> dict: + """Extract metadata from the original group before cleanup.""" + original_backend_name, original_store = _world.pg_map[target_pg] + original_backend_config = _world.pg_backend_config.get(target_pg, "") + original_group_name = _get_process_group_name(target_pg) + + # Extract device binding information before cleanup to avoid accessing destroyed group + bound_device_id = None + if hasattr(target_pg, "bound_device_id"): + bound_device_id = target_pg.bound_device_id + + # Generate new group name for the shrunk group; hash for uniqueness across backends + remaining_ranks = list(get_process_group_ranks(target_pg)) + new_group_name = _process_group_name(remaining_ranks, use_hashed_name=True) + + return { + "backend_name": original_backend_name, + "store": original_store, + "backend_config": original_backend_config, + "original_group_name": original_group_name, + "new_group_name": new_group_name, + "bound_device_id": bound_device_id, # Safe to access after cleanup + } + + +def _cleanup_original_group(target_pg: ProcessGroup, is_default_group: bool) -> None: + """Clean up the original process group safely.""" + try: + destroy_process_group(target_pg) + except Exception as e: + group_type = "default" if is_default_group else "non-default" + logger.warning("Failed to destroy %s group during shrinking: %s", group_type, e) + + # Ensure global state cleanup even if destroy_process_group fails + _cleanup_process_group_global_state(target_pg) + + +def _create_shrunk_process_group( + new_backend, remaining_ranks: list[int], metadata: dict, is_default_group: bool +) -> ProcessGroup: + """Create and configure the new shrunk process group.""" + # Create new group properties + new_group_rank = new_backend.rank() + new_group_size = new_backend.size() + group_name = metadata["new_group_name"] + + # Generate descriptive group description + if is_default_group: + group_desc = 
"default:shrunken" + else: + group_desc = f"{metadata['original_group_name']}:shrunk" + + # Create process group with new communicator (clone the parent store like split does) + prefix_store = PrefixStore(f"{group_name}/", metadata["store"].clone()) + new_pg = ProcessGroup(prefix_store, new_group_rank, new_group_size) + + # Configure backend using the device type of the new backend's bound device if available, + # otherwise derive from the original group's bound device or fall back to CPU. + backend_device = metadata.get("bound_device_id") + if backend_device is None: + # Default to CPU if no bound device is present + backend_device = torch.device("cpu") + + # Choose backend enum based on device type + if backend_device.type == "cuda": + backend_type = ProcessGroup.BackendType.NCCL + else: + backend_type = ProcessGroup.BackendType.GLOO + + new_pg._register_backend(backend_device, backend_type, new_backend) + new_pg._set_default_backend(backend_type) + + # Inherit device binding from original group if it was bound + bound_device_id = metadata.get("bound_device_id") + if bound_device_id is not None: + new_pg.bound_device_id = bound_device_id + + # Set group metadata + new_pg._set_group_name(group_name) + new_pg._set_group_desc(group_desc) + + # Persist backend configuration overrides (if provided via shrink_group) + backend_config_override = metadata.get("backend_config") + if backend_config_override is not None: + # Store for introspection/debugging and potential backend hooks + _world.pg_backend_config[new_pg] = backend_config_override + + return new_pg + + +def _destroy_all_other_groups(exclude_group: Optional[ProcessGroup] = None) -> None: + """ + Destroy all process groups except the excluded group and clean up all global state. + + This is necessary when shrinking the default group because global ranks + are reassigned by NCCL, making all existing process groups inconsistent. + + Note: Uses abort for non-collective cleanup since excluded ranks may not + participate in collective operations. Backend cleanup is handled independently per group. + + Args: + exclude_group (ProcessGroup, optional): Process group to exclude from destruction. + If None, destroys all process groups. + """ + # Get list of groups to destroy (avoid modifying dict while iterating) + groups_to_destroy = [] + for pg in list(_world.pg_group_ranks.keys()): + if exclude_group is not None and pg == exclude_group: + continue + groups_to_destroy.append(pg) + + # Warn user about automatic destruction + if groups_to_destroy: + group_names = [_get_process_group_name(pg) for pg in groups_to_destroy] + logger.warning( + "Shrinking default group will destroy %d other process groups: %s. 
" + "This is necessary because shrinking the default group reassigns global ranks, " + "making existing groups inconsistent.", + len(groups_to_destroy), + ", ".join(group_names), + ) + + # Destroy each group and clean up global state + for pg in groups_to_destroy: + try: + # First call abort_process_group which handles the C++ cleanup non-collectively + _abort_process_group(pg) + except Exception as e: + # Log but don't fail - some groups might already be destroyed + logger.warning( + "Failed to abort process group %s: %s", + _get_process_group_name(pg), + e, + ) + + # Ensure all global state is cleaned up even if _abort_process_group fails + # or doesn't clean up everything + _cleanup_process_group_global_state(pg) + + +def _cleanup_process_group_global_state(pg: ProcessGroup) -> None: + """ + Clean up all global state associated with a process group. + + This function ensures complete cleanup of process group state from all + global dictionaries and registries, even if destroy_process_group fails + or doesn't clean up everything. This is critical when destroying multiple + groups to prevent inconsistent state. + + The cleanup removes the process group from: + - _world.pg_map (backend and store mapping) + - _world.pg_names (group name mapping) + - _world.pg_group_ranks (rank mappings) + - _world.pg_backend_config (backend configuration) + - _world.tags_to_pg and _world.pg_to_tag (tag mappings) + - _world.pg_coalesce_state (coalescing state) + - C++ internal registries via _unregister_process_group + + Args: + pg (ProcessGroup): The process group to clean up. + """ + try: + # Clean up main process group mappings + _world.pg_map.pop(pg, None) + _world.pg_group_ranks.pop(pg, None) + _world.pg_backend_config.pop(pg, None) + + # Clean up process group name mapping + group_name = _world.pg_names.pop(pg, None) + + # Clean up tag mappings + pg_tag = _world.pg_to_tag.pop(pg, None) + if pg_tag is not None and pg_tag in _world.tags_to_pg: + try: + _world.tags_to_pg[pg_tag].remove(pg) + # Remove the tag entry if list is empty + if not _world.tags_to_pg[pg_tag]: + _world.tags_to_pg.pop(pg_tag, None) + except (ValueError, KeyError): + # Process group was already removed from the list + pass + + # Clean up any registered process group names using C++ unregister function + if group_name is not None: + try: + _unregister_process_group(group_name) + except Exception: + # Process group name might not be registered or already unregistered + pass + + # Clean up coalesce state if present + _world.pg_coalesce_state.pop(pg, None) + + except Exception as e: + # Log cleanup failures but don't propagate - we want to continue with other cleanups + logger.warning("Failed to fully clean up global state for process group: %s", e) + + +def _update_process_group_global_state( + pg: ProcessGroup, + backend_name: str, + store: Store, + group_name: str, + backend_config: str, + rank_mapping: Optional[dict[int, int]] = None, + pg_tag: Optional[str] = None, + user_tag: Optional[str] = None, +) -> None: + """ + Update all global state dictionaries for a process group. + + This helper function consolidates the common pattern of updating multiple + global state dictionaries when creating or modifying process groups. + + Args: + pg (ProcessGroup): The process group to update state for. + backend_name (str): Backend name for pg_map. + store (Store): Store instance for pg_map. + group_name (str): Group name for pg_names and registration. + backend_config (str): Backend configuration string. 
+ rank_mapping (Dict[int, int], optional): Global rank to group rank mapping. + If None, skips updating pg_group_ranks. + pg_tag (str, optional): Process group tag. If None, defaults to f"ptd:{group_name}". + user_tag (str, optional): User-provided tag for special tag handling. + If provided, creates "user:{user_tag}" tag and also adds to default "". + """ + # Update main process group mappings + _world.pg_map[pg] = (backend_name, store) + _world.pg_names[pg] = group_name + _world.pg_backend_config[pg] = backend_config + + # Register the process group name + _register_process_group(group_name, pg) + + # Update rank mapping if provided + if rank_mapping is not None: + _world.pg_group_ranks[pg] = rank_mapping + + # Handle tag management + if pg_tag is None: + pg_tag = f"ptd:{group_name}" + + if user_tag is not None: + # Special handling for user-provided tags + # Add to default "" tag first + _world.tags_to_pg.setdefault("", []).append(pg) + # Then create user-specific tag + user_pg_tag = f"user:{user_tag}" + _world.tags_to_pg.setdefault(user_pg_tag, []).append(pg) + _world.pg_to_tag[pg] = user_pg_tag + else: + # Standard process group tag + _world.tags_to_pg.setdefault(pg_tag, []).append(pg) + _world.pg_to_tag[pg] = pg_tag diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index 17a317463cb5..8ce17367b86b 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -228,6 +228,47 @@ def skip_if_lt_x_gpu(x): return decorator +def requires_world_size(n: int): + """ + Decorator to request a specific world size for a test. The test harness can + read this attribute to set the number of ranks to spawn. If there are fewer + than `n` CUDA devices available, the test should be skipped by the harness. + + Usage: + @require_world_size(3) + def test_something(self): + ... + """ + + def decorator(func): + func._required_world_size = n + available = torch.cuda.device_count() + return unittest.skipUnless( + available >= n, f"requires {n} GPUs, found {available}" + )(func) + + return decorator + + +def get_required_world_size(obj: Any, default: int) -> int: + """ + Returns the requested world size for the currently running unittest method on `obj` + if annotated via `@require_world_size(n)`, else returns `default`. + """ + try: + # Try MultiProcessTestCase helper first, then unittest fallback + test_name = ( + obj._current_test_name() # type: ignore[attr-defined] + if hasattr(obj, "_current_test_name") and callable(obj._current_test_name) + else obj._testMethodName + ) + fn = getattr(obj, test_name) + value = fn._required_world_size + return int(value) + except Exception: + return default + + # This decorator helps avoiding initializing cuda while testing other backends def nccl_skip_if_lt_x_gpu(backend, x): def decorator(func): @@ -355,6 +396,13 @@ def requires_nccl_version(version, msg): ) +def requires_nccl_shrink(): + """ + Require NCCL shrink support (NCCL available and version >= 2.27). 
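Combined with `requires_world_size` above, a shrink test might be declared roughly as follows (a sketch; the class name and test body are hypothetical):

    from torch.testing._internal.common_distributed import (
        MultiProcessTestCase,
        requires_nccl_shrink,
        requires_world_size,
    )

    class NCCLShrinkTest(MultiProcessTestCase):
        @requires_nccl_shrink()
        @requires_world_size(4)
        def test_shrink_one_rank(self):
            ...  # harness spawns 4 ranks; skipped with fewer than 4 GPUs or NCCL < 2.27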
+ """ + return requires_nccl_version((2, 27), "Need NCCL 2.27+ for shrink_group") + + def requires_nccl(): return skip_but_pass_in_sandcastle_if( not c10d.is_nccl_available(), From 58879bfafa8336b7ededccfb8b9f3f34c42b8abe Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Fri, 17 Oct 2025 13:39:45 +0000 Subject: [PATCH 056/123] [DeviceMesh] Prefer using _layout over _mesh for all sorts of things (#165554) The goal of this PR is to avoid storing the explicit `mesh` Tensor inside each DeviceMesh, and instead compute it on-the-fly when the end user needs it, and try to replace all of its internal usages with `_layout` and the newly-introduced `_global_rank_permutation` Tensor. The name of this attribute is up for debate. The advantage of the `_global_rank_permutation` Tensor is that it is _the same_ Tensor for the root mesh and all its children, so it doesn't need to be copied/reallocated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165554 Approved by: https://github.com/fduwjj --- test/distributed/_pycute/test_int_tuple.py | 3 + test/distributed/test_device_mesh.py | 6 +- torch/distributed/_local_tensor/__init__.py | 26 +---- torch/distributed/_mesh_layout.py | 32 +++--- torch/distributed/_pycute/int_tuple.py | 4 +- torch/distributed/device_mesh.py | 119 +++++++++++--------- 6 files changed, 93 insertions(+), 97 deletions(-) diff --git a/test/distributed/_pycute/test_int_tuple.py b/test/distributed/_pycute/test_int_tuple.py index 27cebf30bd57..b6fb10394c5b 100644 --- a/test/distributed/_pycute/test_int_tuple.py +++ b/test/distributed/_pycute/test_int_tuple.py @@ -164,6 +164,9 @@ class TestIntTuple(TestCase): crd2idx(4, ((2, 2, 2), (2, 2, 2)), ((1, 16, 4), (8, 2, 32))), 8 ) # 4 -> (1,0,0) -> 1*8 = 8 + # Test with zero-length shape and strides + self.assertEqual(crd2idx(0, (), ()), 0) # 0 -> () -> sum([]) = 0 + def test_idx2crd_basic(self): # Test basic int/int case self.assertEqual(idx2crd(2, 5, 1), 2) diff --git a/test/distributed/test_device_mesh.py b/test/distributed/test_device_mesh.py index d79452ed5905..0ed4651d3ec5 100644 --- a/test/distributed/test_device_mesh.py +++ b/test/distributed/test_device_mesh.py @@ -1664,14 +1664,14 @@ class CuTeLayoutTest(TestCase): def test_remap_to_tensor(self): """Test the remap_to_tensor method for various scenarios.""" # Test 1: Consecutive ranks, full world - should return logical groups directly - original_mesh = torch.tensor([[0, 1], [2, 3]], dtype=torch.int) + original_mesh = torch.tensor([0, 1, 2, 3], dtype=torch.int) layout1 = _Layout((2, 2), (2, 1)) # row-major 2x2 result1 = layout1.remap_to_tensor(original_mesh) expected1 = torch.tensor([[[0, 1], [2, 3]]], dtype=torch.int) self.assertEqual(result1, expected1) # Test 2: Non-consecutive ranks - should map to actual ranks - original_mesh = torch.tensor([[10, 20], [30, 40]], dtype=torch.int) + original_mesh = torch.tensor([10, 20, 30, 40], dtype=torch.int) layout2 = _Layout((2, 2), (2, 1)) result2 = layout2.remap_to_tensor(original_mesh) expected2 = torch.tensor([[[10, 20], [30, 40]]], dtype=torch.int) @@ -1692,7 +1692,7 @@ class CuTeLayoutTest(TestCase): self.assertEqual(result5, expected5) # Test 6: Tensor Cute representation of a 2D mesh - original_mesh = torch.tensor([[0, 2], [1, 3]], dtype=torch.int) + original_mesh = torch.tensor([0, 2, 1, 3], dtype=torch.int) layout6 = _Layout((2, 2), (1, 2)) # column-major style result6 = layout6.remap_to_tensor(original_mesh) expected6 = torch.tensor([[[0, 1], [2, 3]]], dtype=torch.int) diff --git 
a/torch/distributed/_local_tensor/__init__.py b/torch/distributed/_local_tensor/__init__.py index 4ac1dd4a0a0c..d9eb7b47e9a3 100644 --- a/torch/distributed/_local_tensor/__init__.py +++ b/torch/distributed/_local_tensor/__init__.py @@ -707,27 +707,13 @@ class _LocalDeviceMesh: lm = local_tensor_mode() assert lm is not None, "Unexpectedly not in LocalTensorMode" - root_mesh = self._get_root_mesh() - submesh_dims = self.mesh_dim_names - coords: list[dict[int, int]] = [{} for _ in range(self.ndim)] - old_get_rank = DeviceMesh.get_rank # type: ignore[assignment] - try: - for r in lm.ranks: - DeviceMesh.get_rank = lambda self: r # type: ignore[method-assign] - submesh = ( - root_mesh - if submesh_dims is None - else root_mesh.__getitem__(submesh_dims) - ) - rank_coords = (submesh.mesh == r).nonzero().tolist() - assert len(rank_coords) in (0, 1) - if len(rank_coords) == 0: - continue - for d, c in enumerate(rank_coords[0]): - coords[d][r] = c - finally: - DeviceMesh.get_rank = old_get_rank # type: ignore[method-assign] + for r in lm.ranks: + rank_tensor = self._layout.remap_to_tensor(self._rank_map) + rank_coords = (rank_tensor == r).nonzero().tolist() + assert len(rank_coords) == 1 + for d, c in enumerate(rank_coords[0][1:]): + coords[d][r] = c out = [torch.SymInt(LocalIntNode(c)) for c in coords] diff --git a/torch/distributed/_mesh_layout.py b/torch/distributed/_mesh_layout.py index 7c0516b0e425..0e620c643765 100644 --- a/torch/distributed/_mesh_layout.py +++ b/torch/distributed/_mesh_layout.py @@ -301,10 +301,7 @@ class _MeshLayout(Layout): ranks = self.all_ranks_from_zero() return len(ranks) == len(set(ranks)) - def remap_to_tensor( - self, - mesh_tensor: torch.Tensor, - ) -> torch.Tensor: + def remap_to_tensor(self, rank_map: torch.Tensor) -> torch.Tensor: """ Leverage layout as an index for mesh tensor that re-maps the indexes after layout transformation to actual device ranks. @@ -316,10 +313,7 @@ class _MeshLayout(Layout): can be treated as a view or subset of mesh tensor, we do need to use the actual view or sub-tensor for DeviceMesh and its backend creation. - The shape of the `mesh_tensor` can be any size because users can define a device mesh with any - shapes. But we can further refactor the code so that internally we can only support 1D mesh tensor - and reconstruct the mesh tensor with the shape of the layout when accessed by users. - #TODO: Only support 1D mesh tensor stored internally and reconstruct the mesh tensor via layout. + The shape of the `rank_map` must be 1D and contiguous. 
Examples: @@ -336,18 +330,18 @@ class _MeshLayout(Layout): Return: [[[10,30],[20,40]]] Args: - mesh_tensor: The concrete mesh tensor with actual device ranks + rank_map: The concrete mesh tensor with actual device ranks Returns: - torch.Tensor: A tensor representing the actual device allocation from mesh_tensor + torch.Tensor: A tensor representing the actual device allocation from rank_map """ - complement_layout = self.complement(mesh_tensor.numel()) + assert rank_map.ndim == 1 + assert rank_map.is_contiguous() + assert rank_map.numel() >= self.cosize() - return ( - mesh_tensor.flatten() - .as_strided( - flatten(complement_layout.sizes) + flatten(self.sizes), - flatten(complement_layout.strides) + flatten(self.strides), - ) - .reshape(-1, *(self[i].numel() for i in range(len(self)))) - ) + complement_layout = self.complement(rank_map.numel()) + + return rank_map.as_strided( + flatten(complement_layout.sizes) + flatten(self.sizes), + flatten(complement_layout.strides) + flatten(self.strides), + ).reshape(-1, *self.top_level_sizes) diff --git a/torch/distributed/_pycute/int_tuple.py b/torch/distributed/_pycute/int_tuple.py index 5a3ad707e785..008b67cf6f96 100644 --- a/torch/distributed/_pycute/int_tuple.py +++ b/torch/distributed/_pycute/int_tuple.py @@ -198,7 +198,9 @@ def crd2idx( for i in range(len(shape) - 1, 0, -1): result += crd2idx(crd % product(shape[i]), shape[i], stride[i]) crd = crd // product(shape[i]) - return result + crd2idx(crd, shape[0], stride[0]) + if len(shape) > 0: + result += crd2idx(crd, shape[0], stride[0]) + return result else: # "int" "int" "int" assert not is_tuple(shape) and not is_tuple(stride) return crd * stride # all are ints after type checks diff --git a/torch/distributed/device_mesh.py b/torch/distributed/device_mesh.py index cfc991242e06..a2ba7efb955e 100644 --- a/torch/distributed/device_mesh.py +++ b/torch/distributed/device_mesh.py @@ -173,7 +173,7 @@ else: """ _device_type: str - _mesh: torch.Tensor + _rank_map: torch.Tensor _mesh_dim_names: Optional[tuple[str, ...]] _layout: _MeshLayout _root_mesh: Optional["DeviceMesh"] = None @@ -190,46 +190,49 @@ else: _init_backend: bool = True, _rank: Optional[int] = None, _layout: Optional[_MeshLayout] = None, + _root_mesh: Optional["DeviceMesh"] = None, ) -> None: self._device_type = device_type if isinstance(mesh, torch.Tensor) and mesh.device.type != "cpu": raise ValueError(f"`mesh` must be a CPU tensor, got {mesh}") - self._mesh = ( + mesh_tensor = ( mesh.detach().to(dtype=torch.int).contiguous() if isinstance(mesh, torch.Tensor) else torch.tensor(mesh, device="cpu", dtype=torch.int) ) + self._rank_map = ( + _root_mesh._rank_map + if _root_mesh is not None + else mesh_tensor.flatten() + ) self._mesh_dim_names = tuple(mesh_dim_names) if mesh_dim_names else None - if backend_override is None: - backend_override = ((None, None),) * self.mesh.ndim - elif len(backend_override) != self.mesh.ndim: - raise ValueError( - f"backend_override should have the same length as the number of mesh dimensions, " - f"but got {len(backend_override)} and {self.mesh.ndim}." - ) # Internal bookkeeping for the device mesh. self._layout = ( _layout if _layout - else _MeshLayout(self.mesh.size(), self.mesh.stride()) + else _MeshLayout(mesh_tensor.size(), mesh_tensor.stride()) ) + self._root_mesh = _root_mesh assert self._layout.check_non_overlap(), ( "Please use a non-overlapping layout when creating a DeviceMesh." ) # Because we still need to support slicing of flattened dim from root mesh, so we don't check stride here. 
- assert self._layout.top_level_sizes == self.mesh.size(), ( + assert self._layout.top_level_sizes == mesh_tensor.size(), ( "Please use a valid layout when creating a DeviceMesh." - f"The layout {self._layout} is not consistent with the mesh size {self.mesh.size()}." + f"The layout {self._layout} is not consistent with the mesh size {mesh_tensor.size()}." ) - # private field to pre-generate DeviceMesh's hash - self._flatten_mesh_list = tuple(self.mesh.flatten().tolist()) - self._thread_id = None - # Initialize instance-specific flatten mapping - self._flatten_mapping = {} + if backend_override is None: + backend_override = ((None, None),) * len(self._layout) + elif len(backend_override) != len(self._layout): + raise ValueError( + f"backend_override should have the same length as the number of mesh dimensions, " + f"but got {len(backend_override)} and {len(self._layout)}." + ) # Skip process group initialization if xla device or init backend is False # TODO(yeounoh) implement DeviceMesh backend and register XLA backend. + self._thread_id = None if device_type != "xla": # always try to create default (world) pg, even if it is not initialized # already. The world pg is used for device mesh identity (rank) on each @@ -252,6 +255,11 @@ else: rank_coords[0].tolist() if rank_coords.size(0) > 0 else None ) + # private field to pre-generate DeviceMesh's hash + self._flatten_mesh_list = tuple(self.mesh.flatten().tolist()) + # Initialize instance-specific flatten mapping + self._flatten_mapping = {} + @property def device_type(self) -> str: """Returns the device type of the mesh.""" @@ -260,7 +268,17 @@ else: @property def mesh(self) -> torch.Tensor: """Returns the tensor representing the layout of devices.""" - return self._mesh + full_mesh = self._layout.remap_to_tensor(self._rank_map) + if full_mesh.size(0) == 1: + return full_mesh[0] + my_coords = (full_mesh == get_rank()).nonzero() + if my_coords.size(0) > 0: + return full_mesh[my_coords[0, 0]] + raise RuntimeError( + "In order to get the mesh Tensor of a DeviceMesh it needs to " + "either have all its original dimensions (e.g., no slicing) " + "or it needs to contain the local rank" + ) @property def mesh_dim_names(self) -> Optional[tuple[str, ...]]: @@ -275,9 +293,9 @@ else: init_process_group() world_size = get_world_size() - if self.mesh.numel() > world_size: + if self._layout.numel() > world_size: raise RuntimeError( - f"Mesh should not be bigger than default world size {world_size}, but found {self.mesh.numel()} ranks!" + f"Mesh should not be bigger than default world size {world_size}, but found {self._layout.numel()} ranks!" ) # ONLY set the device if the current device is not initialized, if user already @@ -328,8 +346,8 @@ else: default_group = _get_default_group() if ( - self.mesh.ndim == 1 - and self.mesh.numel() == get_world_size() + len(self._layout) == 1 + and self._layout.numel() == get_world_size() and backend_override[0] == (None, None) ): # Append the default pg to the first dim groups only if the default pg is compatible with `self._device_type`. 
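The split between `_rank_map` and `_layout` used above can be seen concretely in the `remap_to_tensor` tests added by this same patch; a condensed sketch (values taken from those tests):

    import torch
    from torch.distributed._mesh_layout import _MeshLayout

    layout = _MeshLayout((2, 2), (2, 1))                    # sizes, strides (row-major 2x2)
    rank_map = torch.tensor([0, 1, 2, 3], dtype=torch.int)  # flat map of global ranks
    mesh = layout.remap_to_tensor(rank_map)                 # shape (1, 2, 2)
    assert torch.equal(mesh[0], torch.tensor([[0, 1], [2, 3]], dtype=torch.int))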
@@ -348,11 +366,11 @@ else: dim_group_names.append(dim_group.group_name) else: # create sub pgs base on the mesh argument specified - for dim in range(self.mesh.ndim): + for dim in range(len(self._layout)): # swap the current dim to the last dim # then reshape to flatten out other dims - pg_ranks_by_dim = self.mesh.swapdims(-1, dim).reshape( - -1, self.mesh.size(dim) + pg_ranks_by_dim = ( + self._layout[dim].nest().remap_to_tensor(self._rank_map) ) backend, pg_options = backend_override[dim] # We need to explicitly pass in timeout when specified in option, otherwise @@ -448,14 +466,14 @@ else: def __repr__(self) -> str: device_mesh_repr = ( - f"({', '.join(f'{k}={v}' for k, v in zip(self._mesh_dim_names, self._mesh.shape))})" + f"({', '.join(f'{k}={v}' for k, v in zip(self._mesh_dim_names, self._layout.top_level_sizes))})" if self._mesh_dim_names - else f"{tuple(self._mesh.shape)}" + else f"{self._layout.top_level_sizes}" ) - device_mesh_repr = f"DeviceMesh({device_mesh_repr}, '{self.device_type}', stride={self._mesh.stride()}" + device_mesh_repr = f"DeviceMesh({device_mesh_repr}, '{self.device_type}', stride={self._layout.strides}" # We only print the mesh tensor if the debug mode is turned on. if os.environ.get("TORCH_DISTRIBUTED_DEBUG", "") == "DETAIL": - device_mesh_repr += f", Mesh: {self._mesh.tolist()}" + device_mesh_repr += f", Mesh: {self.mesh.tolist()}" return f"{device_mesh_repr})" def __hash__(self): @@ -465,7 +483,7 @@ else: self._hash = hash( ( self._flatten_mesh_list, - self._mesh.shape, + self._layout, self._device_type, self._mesh_dim_names, self._thread_id, @@ -481,7 +499,7 @@ else: return False return ( self._flatten_mesh_list == other._flatten_mesh_list - and self._mesh.shape == other._mesh.shape + and self._layout == other._layout and self._device_type == other._device_type and self._mesh_dim_names == other._mesh_dim_names and self._thread_id == other._thread_id @@ -573,16 +591,16 @@ else: if not hasattr(self, "_dim_group_names"): raise RuntimeError("DeviceMesh process groups not initialized!") - if self.mesh.ndim > 1 and mesh_dim is None: + if len(self._layout) > 1 and mesh_dim is None: raise RuntimeError( - f"Found the DeviceMesh have {self.mesh.ndim} dimensions", + f"Found the DeviceMesh have {len(self._layout)} dimensions", "Optional kwarg `mesh_dim` needs to be specified when device_mesh.ndim > 1.", "If you want to get the list of all the ProcessGroups in the DeviceMesh," "please use `get_all_groups()` instead.", ) # Quick return if the current device_mesh is a 1D mesh. - if self.mesh.ndim == 1 and mesh_dim is None: + if len(self._layout) == 1 and mesh_dim is None: return not_none(_resolve_process_group(self._dim_group_names[0])) root_mesh = self._get_root_mesh() @@ -608,7 +626,7 @@ else: Returns: A list of :class:`ProcessGroup` object. """ - return [self.get_group(i) for i in range(self.mesh.ndim)] + return [self.get_group(i) for i in range(len(self._layout))] def _create_sub_mesh( self, @@ -635,9 +653,7 @@ else: ] ) cur_rank = self.get_rank() - pg_ranks_by_dim = layout.remap_to_tensor( - root_mesh.mesh, - ) + pg_ranks_by_dim = layout.remap_to_tensor(root_mesh._rank_map) res_submesh = DeviceMesh._create_mesh_from_ranks( self._device_type, pg_ranks_by_dim, @@ -692,9 +708,7 @@ else: cur_rank = root_mesh.get_rank() # Due to the limitation of ProcessGroup api, we need to start from root mesh so that all ranks call the # new_group api to avoid potential hang. 
- pg_ranks_by_dim = flattened_mesh_layout.remap_to_tensor( - root_mesh.mesh, - ) + pg_ranks_by_dim = flattened_mesh_layout.remap_to_tensor(root_mesh._rank_map) res_flattened_mesh = DeviceMesh._create_mesh_from_ranks( root_mesh._device_type, pg_ranks_by_dim.flatten( @@ -833,9 +847,7 @@ else: """ mesh_dim = self._get_mesh_dim_by_name(mesh_dim_name) layout = self._layout[mesh_dim] - pg_ranks_by_dim = layout.remap_to_tensor( - self.mesh, - ) + pg_ranks_by_dim = layout.remap_to_tensor(self._rank_map) cur_rank = self.get_rank() res_submeshes = [] for mesh_1d in pg_ranks_by_dim: @@ -896,6 +908,7 @@ else: backend_override=backend_override, _init_backend=_init_backend, _layout=_layout, + _root_mesh=_root_mesh, ) if cur_rank in mesh_nd: res_mesh = mesh @@ -904,8 +917,6 @@ else: f"Current rank {cur_rank} not found in any mesh, " f"input {pg_ranks_by_dim} does not contain all ranks in the world" ) - if _root_mesh is not None: - res_mesh._root_mesh = _root_mesh return res_mesh @staticmethod @@ -1004,15 +1015,17 @@ else: return device_mesh def size(self, mesh_dim: Optional[int] = None) -> int: - return self.mesh.numel() if mesh_dim is None else self.mesh.size(mesh_dim) + if mesh_dim is not None: + return self._layout[mesh_dim].numel() + return self._layout.numel() @property def ndim(self) -> int: - return self.mesh.ndim + return len(self._layout) @property def shape(self) -> tuple[int, ...]: - return tuple(self.mesh.shape) + return self._layout.top_level_sizes def get_rank(self) -> int: """ @@ -1051,7 +1064,7 @@ else: """ if self.ndim > 1 and mesh_dim is None: raise RuntimeError( - f"Found the DeviceMesh have {self.mesh.ndim} dimensions", + f"Found the DeviceMesh have {len(self._layout)} dimensions", "Optional kwarg `mesh_dim` needs to be specified when device_mesh.ndim > 1.", ) elif mesh_dim is None: @@ -1115,9 +1128,7 @@ else: root_mesh = self._get_root_mesh() cur_rank = self.get_rank() unflattened_layout = self._layout.unflatten(dim, mesh_sizes) - pg_ranks_by_dim = unflattened_layout.remap_to_tensor( - root_mesh.mesh, - ) + pg_ranks_by_dim = unflattened_layout.remap_to_tensor(root_mesh._rank_map) unflattened_mesh_dim_names = list(not_none(self.mesh_dim_names)) unflattened_mesh_dim_names[dim : dim + 1] = list(mesh_dim_names) res_mesh = DeviceMesh._create_mesh_from_ranks( @@ -1141,7 +1152,7 @@ else: tuple(unflattened_layout.strides[dim : dim + unflatten_length]), # type: ignore[index] ) unflatten_pg_ranks_by_dim = unflatten_layout.remap_to_tensor( - root_mesh.mesh, + root_mesh._rank_map ) unflatten_submesh = DeviceMesh._create_mesh_from_ranks( self.device_type, From d659bbde625e10969722cd51e60d42cda00872e1 Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Fri, 17 Oct 2025 13:39:45 +0000 Subject: [PATCH 057/123] [DeviceMesh] Introduce private constructor instead of _create_mesh_from_ranks (#165555) The refactoring of DeviceMesh is heavily constrained by the signature of its constructor, which is a public API which contains some "legacy" concepts which we'd love to get rid of, such as an explicit/materialized `mesh` Tensor. In other languages the solution to this would be to add a private overload of the constructor. Python doesn't natively allow this, but in this PR I managed to build something that approximates it. This new private constructor basically only takes `_layout`, `_global_rank_permutation`, and `mesh_dim_names`. With such a constructor we can effectively simplify a lot of callsites and get rid of the `_create_mesh_from_ranks` helper method. 
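A rough sketch of that private path, using the keyword-only parameters that actually appear in the diff (`_rank_map` is what this message calls `_global_rank_permutation`; `root` and `sub_layout` are placeholders):

    submesh = DeviceMesh(
        "cuda",
        _layout=sub_layout,           # a _MeshLayout describing the sub-mesh
        _rank_map=root._rank_map,     # 1-D rank tensor shared with the root mesh
        mesh_dim_names=("tp",),
        _root_mesh=root,
        _init_backend=False,
    )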
That's a good thing because it was instantiating many DeviceMeshes in a for loop, which always felt unnecessary. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165555 Approved by: https://github.com/fduwjj, https://github.com/fegin ghstack dependencies: #165554 --- torch/distributed/_mesh_layout.py | 10 +- torch/distributed/_pycute/__init__.py | 1 + torch/distributed/_pycute/int_tuple.py | 8 ++ torch/distributed/device_mesh.py | 173 ++++++++----------------- 4 files changed, 68 insertions(+), 124 deletions(-) diff --git a/torch/distributed/_mesh_layout.py b/torch/distributed/_mesh_layout.py index 0e620c643765..3a76d0079ca0 100644 --- a/torch/distributed/_mesh_layout.py +++ b/torch/distributed/_mesh_layout.py @@ -17,6 +17,7 @@ from torch.distributed._pycute import ( is_int, is_tuple, Layout, + match_structure, suffix_product, ) @@ -48,14 +49,9 @@ class _MeshLayout(Layout): raise TypeError(f"shape must be a tuple or int, got {type(self.shape)}") if not is_tuple(self.stride) and not is_int(self.stride): raise TypeError(f"stride must be a tuple or int, got {type(self.stride)}") - if ( - is_tuple(self.shape) - and is_tuple(self.stride) - and len(flatten(self.shape)) != len(flatten(self.stride)) - ): + if not match_structure(self.shape, self.stride): raise ValueError( - f"sizes {len(flatten(self.shape))} and " - f"strides {len(flatten(self.stride))} must have the same length" + f"sizes {self.shape} and strides {self.stride} don't match" ) @property diff --git a/torch/distributed/_pycute/__init__.py b/torch/distributed/_pycute/__init__.py index 9dbd35a44533..a6d28d8f2712 100644 --- a/torch/distributed/_pycute/__init__.py +++ b/torch/distributed/_pycute/__init__.py @@ -41,6 +41,7 @@ from .int_tuple import ( IntTuple, is_int, is_tuple, + match_structure, product, shape_div, signum, diff --git a/torch/distributed/_pycute/int_tuple.py b/torch/distributed/_pycute/int_tuple.py index 008b67cf6f96..72e898b16e15 100644 --- a/torch/distributed/_pycute/int_tuple.py +++ b/torch/distributed/_pycute/int_tuple.py @@ -54,6 +54,14 @@ def is_tuple(x: object) -> TypeIs[tuple]: return isinstance(x, tuple) +def match_structure(a: IntTuple, b: IntTuple) -> bool: + if is_int(a) and is_int(b): + return True + if is_tuple(a) and is_tuple(b): + return len(a) == len(b) and all(match_structure(x, y) for x, y in zip(a, b)) + return False + + def flatten(t: IntTuple) -> tuple[int, ...]: if is_tuple(t): if len(t) == 0: diff --git a/torch/distributed/device_mesh.py b/torch/distributed/device_mesh.py index a2ba7efb955e..b19e297b1bb0 100644 --- a/torch/distributed/device_mesh.py +++ b/torch/distributed/device_mesh.py @@ -1,7 +1,6 @@ # mypy: allow-untyped-defs # Copyright (c) Meta Platforms, Inc. 
and affiliates import logging -import math import os import threading import warnings @@ -12,7 +11,7 @@ from typing import Optional, TYPE_CHECKING, Union import torch from torch.distributed import is_available from torch.distributed._mesh_layout import _MeshLayout -from torch.distributed._pycute import is_int +from torch.distributed._pycute import is_int, suffix_product from torch.utils._typing_utils import not_none @@ -183,45 +182,52 @@ else: def __init__( self, device_type: str, - mesh: Union[torch.Tensor, "ArrayLike"], + mesh: Optional[Union[torch.Tensor, "ArrayLike"]] = None, *, mesh_dim_names: Optional[tuple[str, ...]] = None, backend_override: Optional[tuple[BackendConfig, ...]] = None, _init_backend: bool = True, _rank: Optional[int] = None, _layout: Optional[_MeshLayout] = None, + _rank_map: Optional[torch.Tensor] = None, _root_mesh: Optional["DeviceMesh"] = None, ) -> None: - self._device_type = device_type - if isinstance(mesh, torch.Tensor) and mesh.device.type != "cpu": - raise ValueError(f"`mesh` must be a CPU tensor, got {mesh}") - mesh_tensor = ( - mesh.detach().to(dtype=torch.int).contiguous() - if isinstance(mesh, torch.Tensor) - else torch.tensor(mesh, device="cpu", dtype=torch.int) - ) - self._rank_map = ( - _root_mesh._rank_map - if _root_mesh is not None - else mesh_tensor.flatten() - ) - self._mesh_dim_names = tuple(mesh_dim_names) if mesh_dim_names else None - # Internal bookkeeping for the device mesh. - self._layout = ( - _layout - if _layout - else _MeshLayout(mesh_tensor.size(), mesh_tensor.stride()) - ) - self._root_mesh = _root_mesh - assert self._layout.check_non_overlap(), ( + if mesh is not None: + if _layout is not None or _rank_map is not None: + raise TypeError( + "Cannot provide _layout and/or _rank_map if passing explicit mesh" + ) + if isinstance(mesh, torch.Tensor) and mesh.device.type != "cpu": + raise ValueError(f"`mesh` must be a CPU tensor, got {mesh}") + mesh_tensor = ( + mesh.detach().to(dtype=torch.int).contiguous() + if isinstance(mesh, torch.Tensor) + else torch.tensor(mesh, device="cpu", dtype=torch.int) + ) + _layout = _MeshLayout(mesh_tensor.size(), mesh_tensor.stride()) + _rank_map = mesh_tensor.flatten() + else: + if _layout is None or _rank_map is None: + raise TypeError( + "The mesh argument is required except for PRIVATE USAGE ONLY!" + ) + + assert _layout.check_non_overlap(), ( "Please use a non-overlapping layout when creating a DeviceMesh." ) - # Because we still need to support slicing of flattened dim from root mesh, so we don't check stride here. - assert self._layout.top_level_sizes == mesh_tensor.size(), ( - "Please use a valid layout when creating a DeviceMesh." - f"The layout {self._layout} is not consistent with the mesh size {mesh_tensor.size()}." 
+ assert _rank_map.ndim == 1, "The rank map must be 1-dimensional" + assert _rank_map.is_contiguous(), "The rank map must be contiguous" + assert _rank_map.numel() >= _layout.cosize(), ( + f"The rank map contains {_rank_map.numel()} element, " + f"which isn't large enough for layout {_layout}" ) + self._device_type = device_type + self._layout = _layout + self._rank_map = _rank_map + self._mesh_dim_names = tuple(mesh_dim_names) if mesh_dim_names else None + self._root_mesh = _root_mesh + if backend_override is None: backend_override = ((None, None),) * len(self._layout) elif len(backend_override) != len(self._layout): @@ -652,16 +658,13 @@ else: not_none(flatten_mesh._mesh_dim_names).index(name) ] ) - cur_rank = self.get_rank() - pg_ranks_by_dim = layout.remap_to_tensor(root_mesh._rank_map) - res_submesh = DeviceMesh._create_mesh_from_ranks( + res_submesh = DeviceMesh( self._device_type, - pg_ranks_by_dim, - cur_rank, - submesh_dim_names, - _init_backend=False, _layout=layout, + _rank_map=root_mesh._rank_map, + mesh_dim_names=submesh_dim_names, _root_mesh=root_mesh, + _init_backend=False, ) res_submesh._dim_group_names = slice_dim_group_name return res_submesh @@ -705,20 +708,13 @@ else: f"Please specify another valid mesh_dim_name." ) - cur_rank = root_mesh.get_rank() - # Due to the limitation of ProcessGroup api, we need to start from root mesh so that all ranks call the - # new_group api to avoid potential hang. - pg_ranks_by_dim = flattened_mesh_layout.remap_to_tensor(root_mesh._rank_map) - res_flattened_mesh = DeviceMesh._create_mesh_from_ranks( + res_flattened_mesh = DeviceMesh( root_mesh._device_type, - pg_ranks_by_dim.flatten( - start_dim=1 - ), # this is needed for flatten non-contiguous mesh dims. - cur_rank, - (mesh_dim_name,), - (backend_override,), _layout=flattened_mesh_layout, + _rank_map=root_mesh._rank_map, + mesh_dim_names=(mesh_dim_name,), _root_mesh=root_mesh, + backend_override=(backend_override,), ) root_mesh._flatten_mapping[mesh_dim_name] = res_flattened_mesh @@ -866,59 +862,6 @@ else: return res_submeshes - @staticmethod - def _create_mesh_from_ranks( - device_type: str, - pg_ranks_by_dim: torch.Tensor, - cur_rank: int, - mesh_dim_names: tuple[str, ...], - backend_override: Optional[tuple[BackendConfig, ...]] = None, - _init_backend: bool = True, - _layout: Optional[_MeshLayout] = None, - _root_mesh: Optional["DeviceMesh"] = None, - ) -> "DeviceMesh": - """ - Helper method to create a DeviceMesh from tensor `pg_ranks_by_dim`. This is due to - the constraint of ProcessGroup API that all ranks have to call the PG creation API - even if the rank is not in that PG. - We will create a potentially very large number of DeviceMesh objects - (e.g., on 1024 GPUs with TP=2, this could be up to 512 DeviceMeshes), only to throw - them all away except when the mesh contains the current rank. - - #TODO: Further refactor this method once we relax the ProcessGroup API constraint. - - Args: - device_type: The device type of the mesh. - pg_ranks_by_dim: all ranks within the worlds organized by dimensions. - cur_rank: The current global rank in the mesh. - mesh_dim_names: Mesh dimension names. - backend_override: Optional backend override for the mesh. - _init_backend: Whether to initialize the backend of the mesh. - _layout: Optional layout for the mesh. - - Returns: - The DeviceMesh containing the current rank. 
- """ - res_mesh = None - for mesh_nd in pg_ranks_by_dim: - mesh = DeviceMesh( - device_type, - mesh_nd, - mesh_dim_names=mesh_dim_names, - backend_override=backend_override, - _init_backend=_init_backend, - _layout=_layout, - _root_mesh=_root_mesh, - ) - if cur_rank in mesh_nd: - res_mesh = mesh - if res_mesh is None: - raise RuntimeError( - f"Current rank {cur_rank} not found in any mesh, " - f"input {pg_ranks_by_dim} does not contain all ranks in the world" - ) - return res_mesh - @staticmethod def from_group( group: Union[ProcessGroup, list[ProcessGroup]], @@ -1126,19 +1069,16 @@ else: ] = ((None, None),), ) -> "DeviceMesh": root_mesh = self._get_root_mesh() - cur_rank = self.get_rank() unflattened_layout = self._layout.unflatten(dim, mesh_sizes) - pg_ranks_by_dim = unflattened_layout.remap_to_tensor(root_mesh._rank_map) unflattened_mesh_dim_names = list(not_none(self.mesh_dim_names)) unflattened_mesh_dim_names[dim : dim + 1] = list(mesh_dim_names) - res_mesh = DeviceMesh._create_mesh_from_ranks( + res_mesh = DeviceMesh( self.device_type, - pg_ranks_by_dim, - cur_rank, - tuple(unflattened_mesh_dim_names), - _init_backend=False, _layout=unflattened_layout, + _rank_map=root_mesh._rank_map, + mesh_dim_names=tuple(unflattened_mesh_dim_names), _root_mesh=root_mesh, + _init_backend=False, ) # If original mesh has initiated its backend, we need to initialize the backend @@ -1151,14 +1091,11 @@ else: tuple(unflattened_layout.sizes[dim : dim + unflatten_length]), # type: ignore[index] tuple(unflattened_layout.strides[dim : dim + unflatten_length]), # type: ignore[index] ) - unflatten_pg_ranks_by_dim = unflatten_layout.remap_to_tensor( - root_mesh._rank_map - ) - unflatten_submesh = DeviceMesh._create_mesh_from_ranks( + unflatten_submesh = DeviceMesh( self.device_type, - unflatten_pg_ranks_by_dim, - cur_rank, - mesh_dim_names, + _layout=unflatten_layout, + _rank_map=root_mesh._rank_map, + mesh_dim_names=mesh_dim_names, backend_override=backend_override, ) dim_group_names = [] @@ -1360,13 +1297,15 @@ else: "If you maintained a 'torch.device' object, it's recommended to pass in 'device.type'.", ) - # Always initialize the mesh's tensor on CPU, regardless of what the + layout = _MeshLayout(tuple(mesh_shape), suffix_product(tuple(mesh_shape))) + # Always initialize the (identity) rank map on CPU, regardless of what the # external device type has been set to be (e.g. meta) with torch.device("cpu"): - mesh = torch.arange(math.prod(mesh_shape), dtype=torch.int).view(mesh_shape) + rank_map = torch.arange(layout.numel(), dtype=torch.int) device_mesh = DeviceMesh( device_type=device_type, - mesh=mesh, + _layout=layout, + _rank_map=rank_map, mesh_dim_names=mesh_dim_names, backend_override=backend_override_tuple, ) From 0d4c2b71e85d1a755bf4293d315726e9326cf30f Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Fri, 17 Oct 2025 13:39:46 +0000 Subject: [PATCH 058/123] [DeviceMesh] Simplify unflatten method (#165556) By adding a few small helpers (e.g., a `splice` method to `_MeshLayout`, and making `_init_process_groups` static and thus stateless) we can substantially shorten the definition of the unflatten method, and help readability. 
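As a rough illustration of what `splice` plus the existing `composition` buy us, here is the example from the old `unflatten` docstring (removed by this patch) redone with the new helpers; treat it as a sketch, not a test:

    from torch.distributed._mesh_layout import _MeshLayout
    from torch.distributed._pycute import suffix_product

    layout = _MeshLayout((8,), (1,))                           # 8 ranks in 1-D
    inner = _MeshLayout((2, 2, 2), suffix_product((2, 2, 2)))  # requested sizes
    partial = layout[0].composition(inner)                     # split dim 0 into 2x2x2
    unflattened = layout.splice(0, 1, partial)                 # sizes (2, 2, 2), strides (4, 2, 1)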
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165556 Approved by: https://github.com/fduwjj ghstack dependencies: #165554, #165555 --- torch/distributed/_mesh_layout.py | 56 ++++-------------- torch/distributed/_pycute/__init__.py | 1 + torch/distributed/_pycute/int_tuple.py | 6 ++ torch/distributed/device_mesh.py | 78 +++++++++++++------------- 4 files changed, 57 insertions(+), 84 deletions(-) diff --git a/torch/distributed/_mesh_layout.py b/torch/distributed/_mesh_layout.py index 3a76d0079ca0..2a8355fb26cc 100644 --- a/torch/distributed/_mesh_layout.py +++ b/torch/distributed/_mesh_layout.py @@ -9,6 +9,7 @@ from itertools import product import torch from torch.distributed._pycute import ( + as_tuple, coalesce, complement, composition, @@ -18,7 +19,6 @@ from torch.distributed._pycute import ( is_tuple, Layout, match_structure, - suffix_product, ) @@ -75,6 +75,11 @@ class _MeshLayout(Layout): # # operator [] (get-i like tuples) def __getitem__(self, i: int) -> "_MeshLayout": + if i < -len(self) or i >= len(self): + raise IndexError( + f"Dim {i} is out of range for layout with {len(self)} dimensions. " + f"Expected dim to be in range [{-len(self)}, {len(self) - 1}]." + ) layout = super().__getitem__(i) return _MeshLayout(layout.shape, layout.stride) @@ -152,50 +157,11 @@ class _MeshLayout(Layout): layout = complement(self, world_size) return _MeshLayout(layout.shape, layout.stride) - def unflatten(self, dim: int, unflatten_sizes: tuple[int, ...]) -> "_MeshLayout": - """ - Unflatten a single dimension in the layout by splitting it into multiple dimensions. - It takes a dimension at position `dim` and splits it into multiple new dimensions - with the specified sizes. - - Args: - dim (int): The index of the dimension to unflatten. Must be a valid dimension index. - unflatten_sizes (tuple[int, ...]): The new sizes for the dimensions that will replace - the original dimension at `dim`. The product of these sizes must equal the size - of the original dimension at `dim`. - - Returns: - _MeshLayout: A new layout with the specified dimension unflattened. - - Example: - Original: sizes=(8,), strides=(1,) # 8 ranks in 1D - Call: unflatten(0, (2, 2, 2)) # Create 3D topology - Result: sizes=(2, 2, 2), strides=(4, 2, 1) # 2*2*2 unflattened topology - """ - # Check that dim is within valid range - if dim < 0 or dim >= len(self): - raise ValueError( - f"dim {dim} is out of range for layout with {len(self)} dimensions. " - f"Expected dim to be in range [0, {len(self) - 1}]." - ) - - # Check that the product of unflatten_sizes equals the original dimension size - original_size = self[dim].numel() - unflatten_product = math.prod(unflatten_sizes) - if unflatten_product != original_size: - raise ValueError( - f"The product of unflatten_sizes {unflatten_sizes} is {unflatten_product}, " - f"but the original dimension at dim={dim} has size {original_size}. " - f"These must be equal for unflatten to work correctly." 
- ) - - sizes = list(self.sizes) # type: ignore[arg-type] - strides = list(self.strides) # type: ignore[arg-type] - unflatten_layout = self[dim].composition( - _MeshLayout(tuple(unflatten_sizes), suffix_product(unflatten_sizes)) - ) - sizes[dim : dim + 1] = list(unflatten_layout.sizes) # type: ignore[arg-type] - strides[dim : dim + 1] = list(unflatten_layout.strides) # type: ignore[arg-type] + def splice(self, start: int, end: int, layout: "_MeshLayout") -> "_MeshLayout": + sizes = list(as_tuple(self.sizes)) + strides = list(as_tuple(self.strides)) + sizes[start:end] = list(as_tuple(layout.sizes)) + strides[start:end] = list(as_tuple(layout.strides)) return _MeshLayout(tuple(sizes), tuple(strides)) def all_ranks_from_zero(self) -> list[int]: diff --git a/torch/distributed/_pycute/__init__.py b/torch/distributed/_pycute/__init__.py index a6d28d8f2712..e13bcc86e509 100644 --- a/torch/distributed/_pycute/__init__.py +++ b/torch/distributed/_pycute/__init__.py @@ -31,6 +31,7 @@ ################################################################################################# from .int_tuple import ( + as_tuple, crd2crd, crd2idx, elem_scale, diff --git a/torch/distributed/_pycute/int_tuple.py b/torch/distributed/_pycute/int_tuple.py index 72e898b16e15..b060edde2281 100644 --- a/torch/distributed/_pycute/int_tuple.py +++ b/torch/distributed/_pycute/int_tuple.py @@ -54,6 +54,12 @@ def is_tuple(x: object) -> TypeIs[tuple]: return isinstance(x, tuple) +def as_tuple(x: IntTuple) -> tuple[IntTuple, ...]: + if is_int(x): + return (x,) + return x + + def match_structure(a: IntTuple, b: IntTuple) -> bool: if is_int(a) and is_int(b): return True diff --git a/torch/distributed/device_mesh.py b/torch/distributed/device_mesh.py index b19e297b1bb0..5c8969091d69 100644 --- a/torch/distributed/device_mesh.py +++ b/torch/distributed/device_mesh.py @@ -245,7 +245,12 @@ else: # process (we need to know if the current global rank is in the mesh or not). if _init_backend: self._setup_world_group_and_device() - self._init_process_groups(backend_override) + self._dim_group_names = self._init_process_groups( + self._layout, + self._rank_map, + self._mesh_dim_names, + backend_override, + ) if is_initialized() and get_backend() == "threaded": # pyrefly: ignore # bad-assignment @@ -341,10 +346,13 @@ else: return _get_default_group() + @staticmethod def _init_process_groups( - self, + layout: _MeshLayout, + rank_map: torch.Tensor, + mesh_dim_names: Optional[tuple[str, ...]], backend_override: tuple[BackendConfig, ...], - ): + ) -> list[str]: # group_name associated with each mesh dimension, each # mesh dimension should have one sub-group per rank # @@ -352,8 +360,8 @@ else: default_group = _get_default_group() if ( - len(self._layout) == 1 - and self._layout.numel() == get_world_size() + len(layout) == 1 + and layout.numel() == get_world_size() and backend_override[0] == (None, None) ): # Append the default pg to the first dim groups only if the default pg is compatible with `self._device_type`. 
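Because the method is now a @staticmethod it depends only on its arguments, so callers (including the simplified unflatten further down) can initialize groups for an arbitrary layout/rank-map pair; a sketch with placeholder values:

    group_names = DeviceMesh._init_process_groups(
        layout,                          # _MeshLayout for the dims to initialize
        rank_map,                        # shared 1-D tensor of global ranks
        ("dp", "tp"),                    # optional mesh dim names (may be None)
        ((None, None), (None, None)),    # per-dim (backend, options) overrides
    )
    # one process group name per mesh dimension, for the calling rank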
@@ -372,12 +380,10 @@ else: dim_group_names.append(dim_group.group_name) else: # create sub pgs base on the mesh argument specified - for dim in range(len(self._layout)): + for dim in range(len(layout)): # swap the current dim to the last dim # then reshape to flatten out other dims - pg_ranks_by_dim = ( - self._layout[dim].nest().remap_to_tensor(self._rank_map) - ) + pg_ranks_by_dim = layout[dim].nest().remap_to_tensor(rank_map) backend, pg_options = backend_override[dim] # We need to explicitly pass in timeout when specified in option, otherwise # the default timeout will be used to override the timeout set in option. @@ -389,8 +395,8 @@ else: # If the mesh doesn't not have a mesh_dim_names, then the group description of the # subgroup would be `mesh_dim_0` and `mesh_dim_1`. group_desc = ( - f"mesh_{self._mesh_dim_names[dim]}" - if self._mesh_dim_names + f"mesh_{mesh_dim_names[dim]}" + if mesh_dim_names else f"mesh_dim_{dim}" ) @@ -448,14 +454,14 @@ else: ) # only add to dim_groups if the current rank in the subgroup - if self.get_rank() in subgroup_ranks: + if get_rank() in subgroup_ranks: if len(dim_group_names) > dim: raise RuntimeError( - f"Each device mesh dimension should get only one process group, but got {self.get_rank()} " + f"Each device mesh dimension should get only one process group, but got {get_rank()} " f"in {subgroup_ranks}!" ) dim_group_names.append(dim_group.group_name) # type: ignore[union-attr] - self._dim_group_names = dim_group_names + return dim_group_names def _get_root_mesh(self) -> "DeviceMesh": return self._root_mesh if self._root_mesh else self @@ -1068,10 +1074,21 @@ else: tuple[Optional[str], Optional[C10dBackend.Options]], ... ] = ((None, None),), ) -> "DeviceMesh": - root_mesh = self._get_root_mesh() - unflattened_layout = self._layout.unflatten(dim, mesh_sizes) + inner_layout = _MeshLayout(tuple(mesh_sizes), suffix_product(mesh_sizes)) + + if inner_layout.numel() != self._layout[dim].numel(): + raise ValueError( + f"The product of {mesh_sizes=} is {inner_layout.numel()}, " + f"but the original dimension at dim={dim} has size {self._layout[dim].numel()}. " + f"These must be equal for unflatten to work correctly." + ) + + partial_layout = self._layout[dim].composition(inner_layout) + unflattened_layout = self._layout.splice(dim, dim + 1, partial_layout) unflattened_mesh_dim_names = list(not_none(self.mesh_dim_names)) unflattened_mesh_dim_names[dim : dim + 1] = list(mesh_dim_names) + + root_mesh = self._get_root_mesh() res_mesh = DeviceMesh( self.device_type, _layout=unflattened_layout, @@ -1086,30 +1103,13 @@ else: # TODO: To make backend init more efficient with cute layout representation and support # per dim backend init. 
if hasattr(self, "_dim_group_names"): - unflatten_length = len(mesh_sizes) - unflatten_layout = _MeshLayout( - tuple(unflattened_layout.sizes[dim : dim + unflatten_length]), # type: ignore[index] - tuple(unflattened_layout.strides[dim : dim + unflatten_length]), # type: ignore[index] + dim_group_names = self._dim_group_names.copy() + dim_group_names[dim : dim + 1] = self._init_process_groups( + partial_layout, + root_mesh._rank_map, + mesh_dim_names, + backend_override, ) - unflatten_submesh = DeviceMesh( - self.device_type, - _layout=unflatten_layout, - _rank_map=root_mesh._rank_map, - mesh_dim_names=mesh_dim_names, - backend_override=backend_override, - ) - dim_group_names = [] - for idx in range(0, res_mesh.ndim): - if idx < dim: - dim_group_names.append(self._dim_group_names[idx]) - elif idx >= dim + unflatten_length: - dim_group_names.append( - self._dim_group_names[idx - unflatten_length + 1] - ) - else: - dim_group_names.append( - unflatten_submesh._dim_group_names[idx - dim] - ) res_mesh._dim_group_names = dim_group_names return res_mesh From 9a71d96256d247109bfb23cdbfce90d8a076115c Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 17 Oct 2025 18:08:59 +0000 Subject: [PATCH 059/123] Revert "[DebugMode][1/N] refactor logs into _DebugCalls (#165376)" This reverts commit 556fc09a9f67f24ca5591ec049c5d0c347c5f62a. Reverted https://github.com/pytorch/pytorch/pull/165376 on behalf of https://github.com/seemethere due to This is failing for internal tests, see D84877379 for more context ([comment](https://github.com/pytorch/pytorch/pull/165376#issuecomment-3416570407)) --- torch/utils/_debug_mode.py | 113 ++++++++++++++----------------------- 1 file changed, 42 insertions(+), 71 deletions(-) diff --git a/torch/utils/_debug_mode.py b/torch/utils/_debug_mode.py index 2c87aa8f1c4d..1986828c519b 100644 --- a/torch/utils/_debug_mode.py +++ b/torch/utils/_debug_mode.py @@ -81,66 +81,33 @@ def _arg_to_str(arg, attributes) -> str: return str(arg) -class _DebugCall: - """Base class for tracking operator calls in DebugMode""" - - def __init__(self, call_depth: int): - self.call_depth = call_depth - - def render(self, attributes: list[str]) -> str: - raise NotImplementedError("Subclasses must implement string render()") - - -class _OpCall(_DebugCall): - """Normal operator call""" - - def __init__(self, op, args: tuple, kwargs: dict, call_depth: int): - super().__init__(call_depth) - self.op = op - self.args = args - self.kwargs = kwargs - - def render(self, attributes: list[str]) -> str: - args_str = ", ".join(_arg_to_str(arg, attributes) for arg in self.args) - - if self.kwargs: - kwargs_str = ", " + ", ".join( - f"{k}={_arg_to_str(v, attributes)}" for k, v in self.kwargs.items() - ) +def _op_to_str(op, attributes, *args, **kwargs) -> str: + if op == REDISTRIBUTE_FUNC: + if len(args) == 2: + args_str = f"{_arg_to_str(args[0], attributes)}, trace: {args[1]}" + elif len(args) == 3: + _args = [_arg_to_str(arg, attributes) for arg in args] + args_str = f"{_args[0]}, {_args[1]} -> {_args[2]}" else: - kwargs_str = "" + raise RuntimeError(f"Unsupported args for {REDISTRIBUTE_FUNC}: {args}") + else: + args_str = ", ".join(_arg_to_str(arg, attributes) for arg in args) - if isinstance(self.op, torch._ops.OpOverload): - op_name = self.op.__qualname__ - elif hasattr(self.op, "__module__") and hasattr(self.op, "__name__"): - op_name = f"{self.op.__module__}.{self.op.__name__}" - else: - op_name = str(self.op) + if kwargs: + kwargs_str = ", " + ", ".join( + f"{k}={_arg_to_str(v, attributes)}" for k, v in 
kwargs.items() + ) + else: + kwargs_str = "" - return f"{op_name}({args_str}{kwargs_str})" + if isinstance(op, torch._ops.OpOverload): + op_name = op.__qualname__ + elif hasattr(op, "__module__") and hasattr(op, "__name__"): + op_name = f"{op.__module__}.{op.__name__}" + else: + op_name = str(op) - -class _RedistributeCall(_DebugCall): - """Redistribute call from DTensor dispatch""" - - def __init__( - self, arg, src_placement, dst_placement, transform_info_str, call_depth - ): - super().__init__(call_depth) - self.arg = arg - self.src_placement = src_placement - self.dst_placement = dst_placement - self.transform_info_str = transform_info_str - - def render(self, attributes: list[str]) -> str: - arg_str = f"{_arg_to_str(self.arg, attributes)}" - if self.transform_info_str is not None: # prioritize over src/dst placements - placement_str = f"trace: {self.transform_info_str}" - else: - src_placement_str = _arg_to_str(self.src_placement, attributes) - dst_placement_str = _arg_to_str(self.dst_placement, attributes) - placement_str = f"{src_placement_str} -> {dst_placement_str}" - return f"{REDISTRIBUTE_FUNC}({arg_str}, {placement_str})" + return f"{op_name}({args_str}{kwargs_str})" class _NNModuleCall(_DebugCall): @@ -193,7 +160,7 @@ class DebugMode(TorchDispatchMode): if kwargs is None: kwargs = {} - self.operators.append(_OpCall(func, args, kwargs, self.call_depth)) + self.operators.append((func, args, kwargs, self.call_depth)) try: self.call_depth += 1 @@ -207,19 +174,17 @@ class DebugMode(TorchDispatchMode): # Record the operation with its call depth if torch.distributed.tensor.DTensor in types: - self.operators.append(_OpCall(func, args, kwargs, self.call_depth)) + self.operators.append((func, args, kwargs, self.call_depth)) return NotImplemented elif FakeTensor in types or isinstance( _get_current_dispatch_mode(), FakeTensorMode ): if self.record_faketensor: if func != torch.ops.prim.device.default: - self.operators.append( - _OpCall(func, args, kwargs, self.call_depth + 1) - ) + self.operators.append((func, args, kwargs, self.call_depth + 1)) elif len(types) == 0: if self.record_realtensor: - self.operators.append(_OpCall(func, args, kwargs, self.call_depth + 1)) + self.operators.append((func, args, kwargs, self.call_depth + 1)) result = func(*args, **kwargs) @@ -265,19 +230,23 @@ class DebugMode(TorchDispatchMode): @contextlib.contextmanager def record_redistribute_calls( self, - arg, + arg_idx, src_placement, dst_placement, transform_info_str: Optional[str] = None, ): try: + arg_list = ( + [arg_idx, transform_info_str] + if transform_info_str + else [arg_idx, src_placement, dst_placement] + ) self.operators.append( - _RedistributeCall( - arg, - src_placement=src_placement, - dst_placement=dst_placement, - transform_info_str=transform_info_str, - call_depth=self.call_depth + 1, + ( + REDISTRIBUTE_FUNC, + arg_list, + {}, + self.call_depth + 1, ) ) self.call_depth += 1 @@ -289,8 +258,10 @@ class DebugMode(TorchDispatchMode): with torch._C.DisableTorchFunction(): result = "" result += "\n".join( - " " + " " * op.call_depth + op.render(self.record_tensor_attributes) - for op in self.operators + " " + + " " * depth + + _op_to_str(op, self.record_tensor_attributes, *args, **kwargs) + for op, args, kwargs, depth in self.operators ) return result From ca5b7f8ded834970c092864647b5914b0e64cd94 Mon Sep 17 00:00:00 2001 From: Colin L Reliability Rice Date: Fri, 17 Oct 2025 18:21:18 +0000 Subject: [PATCH 060/123] torch.compile: populate compiler_config (#165581) Summary: This starts writing the 
compiler_config metadata into logger Test Plan: Modified existing test case to make sure this is not null. (Also eyeballed what we're logging tomake sure it's reasonable Reviewed By: masnesral Differential Revision: D84014636 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165581 Approved by: https://github.com/masnesral --- test/dynamo/test_utils.py | 25 +++++++++++++++++++++++++ torch/_dynamo/utils.py | 26 ++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/test/dynamo/test_utils.py b/test/dynamo/test_utils.py index 8dec23534eff..a01c4e2e2195 100644 --- a/test/dynamo/test_utils.py +++ b/test/dynamo/test_utils.py @@ -8,6 +8,7 @@ from unittest import mock import torch import torch._dynamo.config as dynamo_config import torch._inductor.config as inductor_config +import torch.compiler.config as compiler_config from torch._dynamo import utils from torch._inductor.test_case import TestCase @@ -497,6 +498,7 @@ class TestDynamoTimed(TestCase): e.co_filename = None e.co_firstlineno = None e.inductor_config = None + e.compiler_config = None e.cuda_version = None e.triton_version = None e.python_version = None @@ -530,6 +532,7 @@ class TestDynamoTimed(TestCase): 'code_gen_time_s': 0.0, 'compile_id': '1/0', 'compile_time_autotune_time_us': None, + 'compiler_config': None, 'compliant_custom_ops': set(), 'config_inline_inbuilt_nn_modules': False, 'config_suppress_errors': False, @@ -616,6 +619,7 @@ class TestDynamoTimed(TestCase): 'code_gen_time_s': 0.0, 'compile_id': '1/0', 'compile_time_autotune_time_us': None, + 'compiler_config': None, 'compliant_custom_ops': set(), 'config_inline_inbuilt_nn_modules': False, 'config_suppress_errors': False, @@ -714,6 +718,7 @@ class TestDynamoTimed(TestCase): 'code_gen_time_s': 0.0, 'compile_id': '1/0', 'compile_time_autotune_time_us': None, + 'compiler_config': None, 'compliant_custom_ops': None, 'config_inline_inbuilt_nn_modules': False, 'config_suppress_errors': False, @@ -800,6 +805,7 @@ class TestDynamoTimed(TestCase): 'code_gen_time_s': 0.0, 'compile_id': '1/0', 'compile_time_autotune_time_us': None, + 'compiler_config': None, 'compliant_custom_ops': None, 'config_inline_inbuilt_nn_modules': False, 'config_suppress_errors': False, @@ -875,6 +881,25 @@ class TestDynamoTimed(TestCase): 'triton_version': None}""", # noqa: B950 ) + @dynamo_config.patch( + { + "log_compilation_metrics": True, + } + ) + @compiler_config.patch({"job_id": "test_job_id"}) + def test_compiler_config(self): + def test1(x): + return x * x + + compilation_events = [] + with mock.patch("torch._dynamo.utils.log_compilation_event") as log_event: + torch.compile(test1)(torch.randn(1)) + compilation_events = [arg[0][0] for arg in log_event.call_args_list] + self.assertIn( + '"job_id": "test_job_id"', + compilation_events[0].compiler_config, + ) + @dynamo_config.patch( { "log_compilation_metrics": True, diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py index 08bfe58aacba..d83fd95a49d2 100644 --- a/torch/_dynamo/utils.py +++ b/torch/_dynamo/utils.py @@ -1315,6 +1315,7 @@ class CompilationMetrics: config_inline_inbuilt_nn_modules: Optional[bool] = None specialize_float: Optional[bool] = None dynamo_config: Optional[str] = None + compiler_config: Optional[str] = None is_forward: Optional[bool] = None num_triton_bundles: Optional[int] = None remote_fx_graph_cache_get_time_ms: Optional[int] = None @@ -1555,6 +1556,30 @@ def _get_dynamo_config_for_logging() -> Optional[str]: return json.dumps(config_dict, sort_keys=True) +def 
_compiler_config_for_logging() -> Optional[str]: + def clean_for_json(d: dict[str, Any]) -> dict[str, Any]: + blocklist = { + "TYPE_CHECKING", + } + + return { + key: sorted(value) if isinstance(value, set) else value + for key, value in d.items() + if key not in blocklist + } + + if not torch.compiler.config: + return None + + try: + compiler_config_copy = torch.compiler.config.get_config_copy() # type: ignore[attr-defined] + except (TypeError, AttributeError): + return "Compiler Config cannot be pickled" + + config_dict = clean_for_json(compiler_config_copy) + return json.dumps(config_dict, sort_keys=True) + + def _scrubbed_inductor_config_for_logging() -> Optional[str]: """ Method to parse and scrub uninteresting configs from inductor config @@ -1642,6 +1667,7 @@ def record_compilation_metrics( "config_suppress_errors": config.suppress_errors, "config_inline_inbuilt_nn_modules": config.inline_inbuilt_nn_modules, "inductor_config": _scrubbed_inductor_config_for_logging(), + "compiler_config": _compiler_config_for_logging(), "cuda_version": torch.version.cuda, "triton_version": triton.__version__ if has_triton() else "", "remote_cache_version": remote_cache_version, From b08d8c2e506532ed00c4be5c4a7bfa58c131156d Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 17 Oct 2025 18:22:46 +0000 Subject: [PATCH 061/123] Revert "[DebugMode][2/N] add nn.Module tracking (#165498)" This reverts commit 45afaf08a14ab760d86ea80dea6d50cec8626513. Reverted https://github.com/pytorch/pytorch/pull/165498 on behalf of https://github.com/seemethere due to First part of the stack was reverted so will need to revert this too ([comment](https://github.com/pytorch/pytorch/pull/165498#issuecomment-3416618198)) --- .../tensor/debug/test_debug_mode.py | 40 ----------------- torch/utils/_debug_mode.py | 45 +------------------ 2 files changed, 1 insertion(+), 84 deletions(-) diff --git a/test/distributed/tensor/debug/test_debug_mode.py b/test/distributed/tensor/debug/test_debug_mode.py index 20da99f52eb0..aab91ddebe94 100644 --- a/test/distributed/tensor/debug/test_debug_mode.py +++ b/test/distributed/tensor/debug/test_debug_mode.py @@ -330,46 +330,6 @@ class TestDTensorDebugMode(TestCase): f(x) self.assertEqual(len(debug_mode.debug_string()), 0) - def test_nn_module(self): - class Foo(torch.nn.Module): - def __init__(self): - super().__init__() - self.l1 = torch.nn.Linear(4, 4) - self.l2 = torch.nn.Linear(4, 4) - - def forward(self, x): - return self.l2(self.l1(x)) - - class Bar(torch.nn.Module): - def __init__(self): - super().__init__() - self.abc = Foo() - self.xyz = torch.nn.Linear(4, 4) - - def forward(self, x): - return self.xyz(self.abc(x)) - - mod = Bar() - inp = torch.randn(4, 4) - with DebugMode(record_nn_module=True) as debug_mode: - _ = mod(inp) - - self.assertExpectedInline( - debug_mode.debug_string(), - """\ - [nn.Mod] Bar - [nn.Mod] Bar.abc - [nn.Mod] Bar.abc.l1 - aten::t(t: f32[4, 4]) - aten::addmm(t: f32[4], t: f32[4, 4], t: f32[4, 4]) - [nn.Mod] Bar.abc.l2 - aten::t(t: f32[4, 4]) - aten::addmm(t: f32[4], t: f32[4, 4], t: f32[4, 4]) - [nn.Mod] Bar.xyz - aten::t(t: f32[4, 4]) - aten::addmm(t: f32[4], t: f32[4, 4], t: f32[4, 4])""", - ) - instantiate_parametrized_tests(TestDTensorDebugMode) diff --git a/torch/utils/_debug_mode.py b/torch/utils/_debug_mode.py index 1986828c519b..7f7de2b7334f 100644 --- a/torch/utils/_debug_mode.py +++ b/torch/utils/_debug_mode.py @@ -1,6 +1,6 @@ # mypy: allow-untyped-defs import contextlib -from typing import Optional, TYPE_CHECKING +from typing import Optional 
import torch from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode @@ -13,10 +13,6 @@ from torch.utils._python_dispatch import ( from torch.utils._pytree import tree_map -if TYPE_CHECKING: - from torch.distributed._tools.mod_tracker import ModTracker - - __all__ = ["DebugMode", "get_active_debug_mode"] REDISTRIBUTE_FUNC = "redistribute_input" @@ -110,17 +106,6 @@ def _op_to_str(op, attributes, *args, **kwargs) -> str: return f"{op_name}({args_str}{kwargs_str})" -class _NNModuleCall(_DebugCall): - """Designates entering an nn.Module's forward method""" - - def __init__(self, module_name: str, call_depth: int): - super().__init__(call_depth) - self.module_name = module_name - - def render(self, attributes: list[str]) -> str: - return f"[nn.Mod] {self.module_name}" - - class DebugMode(TorchDispatchMode): def __init__( self, @@ -129,7 +114,6 @@ class DebugMode(TorchDispatchMode): record_faketensor=False, record_realtensor=True, record_tensor_attributes=None, - record_nn_module=False, ): super().__init__() import torch.distributed.tensor # noqa: F401 @@ -140,12 +124,6 @@ class DebugMode(TorchDispatchMode): self.record_realtensor = record_realtensor self.record_tensor_attributes = record_tensor_attributes or [] - self.record_nn_module = record_nn_module - - self.module_tracker: Optional[ModTracker] = None - if self.record_nn_module: - self.module_tracker_setup() - self.operators = [] self.call_depth = 0 @@ -198,35 +176,14 @@ class DebugMode(TorchDispatchMode): torch._C._push_on_torch_function_stack(self) super().__enter__() - if self.record_nn_module: - self.module_tracker.__enter__() # type: ignore[attribute, union-attr] return self # pyrefly: ignore # bad-override def __exit__(self, *args): super().__exit__(*args) - if self.record_nn_module: - self.module_tracker.__exit__() # type: ignore[attribute, union-attr] if self.record_torchfunction: torch._C._pop_torch_function_stack() - def module_tracker_setup(self): - from torch.distributed._tools.mod_tracker import ModTracker - - self.module_tracker = ModTracker() - - # module pre-fw hook: record module call - def pre_fw_hook(module, input): - fqn = self.module_tracker._get_mod_name(module) # type: ignore[attribute, union-attr] - self.operators.append(_NNModuleCall(fqn, self.call_depth + 1)) - self.call_depth += 1 - - # module post-fw hook: decrement call depth - def post_fw_hook(module, input, output): - self.call_depth -= 1 - - self.module_tracker.register_user_hooks(pre_fw_hook, post_fw_hook) - @contextlib.contextmanager def record_redistribute_calls( self, From 3806e9767b03d06edc317cb90a3a996abdf192a0 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Thu, 16 Oct 2025 13:12:44 -0700 Subject: [PATCH 062/123] Refactor out headeronly ArrayRef (#164991) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164991 Approved by: https://github.com/swolchok --- c10/util/ArrayRef.h | 204 ++++----------- test/cpp/aoti_abi_check/CMakeLists.txt | 1 + .../test_headeronlyarrayref.cpp | 52 ++++ torch/header_only_apis.txt | 3 + torch/headeronly/util/HeaderOnlyArrayRef.h | 247 ++++++++++++++++++ 5 files changed, 355 insertions(+), 152 deletions(-) create mode 100644 test/cpp/aoti_abi_check/test_headeronlyarrayref.cpp create mode 100644 torch/headeronly/util/HeaderOnlyArrayRef.h diff --git a/c10/util/ArrayRef.h b/c10/util/ArrayRef.h index 64605f515359..1732d15c36a9 100644 --- a/c10/util/ArrayRef.h +++ b/c10/util/ArrayRef.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -40,200 +41,106 @@ namespace c10 { /// /// This 
is intended to be trivially copyable, so it should be passed by /// value. +/// +/// NOTE: We have refactored out the headeronly parts of the ArrayRef struct +/// into HeaderOnlyArrayRef. As adding `virtual` would change the performance of +/// the underlying constexpr calls, we rely on apparent-type dispatch for +/// inheritance. This should be fine because their memory format is the same, +/// and it is never incorrect for ArrayRef to call HeaderOnlyArrayRef methods. +/// However, you should prefer to use ArrayRef when possible, because its use +/// of TORCH_CHECK will lead to better user-facing error messages. template -class ArrayRef final { +class ArrayRef final : public HeaderOnlyArrayRef { public: - using iterator = const T*; - using const_iterator = const T*; - using size_type = size_t; - using value_type = T; - - using reverse_iterator = std::reverse_iterator; - - private: - /// The start of the array, in an external buffer. - const T* Data; - - /// The number of elements. - size_type Length; - - void debugCheckNullptrInvariant() { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - Data != nullptr || Length == 0, - "created ArrayRef with nullptr and non-zero length! std::optional relies on this being illegal"); - } - - public: - /// @name Constructors + /// @name Constructors, all inherited from HeaderOnlyArrayRef except for + /// SmallVector. /// @{ - /// Construct an empty ArrayRef. - /* implicit */ constexpr ArrayRef() : Data(nullptr), Length(0) {} + using HeaderOnlyArrayRef::HeaderOnlyArrayRef; - /// Construct an ArrayRef from a single element. - // TODO Make this explicit - constexpr ArrayRef(const T& OneElt) : Data(&OneElt), Length(1) {} - - /// Construct an ArrayRef from a pointer and length. - constexpr ArrayRef(const T* data, size_t length) - : Data(data), Length(length) { - debugCheckNullptrInvariant(); - } - - /// Construct an ArrayRef from a range. - constexpr ArrayRef(const T* begin, const T* end) - : Data(begin), Length(end - begin) { - debugCheckNullptrInvariant(); - } + /// Construct an ArrayRef from a std::vector. + /// This constructor is identical to the one in HeaderOnlyArrayRef, but we + /// include it to help with Class Template Argument Deduction (CTAD). + /// Without it, CTAD can fail sometimes due to the indirect constructor + /// inheritance. So we explicitly include this constructor. + template + /* implicit */ ArrayRef(const std::vector& Vec) + : HeaderOnlyArrayRef(Vec.data(), Vec.size()) {} /// Construct an ArrayRef from a SmallVector. This is templated in order to /// avoid instantiating SmallVectorTemplateCommon whenever we /// copy-construct an ArrayRef. + /// NOTE: this is the only constructor that is not inherited from + /// HeaderOnlyArrayRef. template /* implicit */ ArrayRef(const SmallVectorTemplateCommon& Vec) - : Data(Vec.data()), Length(Vec.size()) { - debugCheckNullptrInvariant(); - } - - template < - typename Container, - typename U = decltype(std::declval().data()), - typename = std::enable_if_t< - (std::is_same_v || std::is_same_v)>> - /* implicit */ ArrayRef(const Container& container) - : Data(container.data()), Length(container.size()) { - debugCheckNullptrInvariant(); - } - - /// Construct an ArrayRef from a std::vector. - // The enable_if stuff here makes sure that this isn't used for - // std::vector, because ArrayRef can't work on a std::vector - // bitfield. 
- template - /* implicit */ ArrayRef(const std::vector& Vec) - : Data(Vec.data()), Length(Vec.size()) { - static_assert( - !std::is_same_v, - "ArrayRef cannot be constructed from a std::vector bitfield."); - } - - /// Construct an ArrayRef from a std::array - template - /* implicit */ constexpr ArrayRef(const std::array& Arr) - : Data(Arr.data()), Length(N) {} - - /// Construct an ArrayRef from a C array. - template - // NOLINTNEXTLINE(*c-arrays*) - /* implicit */ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {} - - /// Construct an ArrayRef from a std::initializer_list. - /* implicit */ constexpr ArrayRef(const std::initializer_list& Vec) - : Data( - std::begin(Vec) == std::end(Vec) ? static_cast(nullptr) - : std::begin(Vec)), - Length(Vec.size()) {} + : HeaderOnlyArrayRef(Vec.data(), Vec.size()) {} /// @} - /// @name Simple Operations + /// @name Simple Operations, mostly inherited from HeaderOnlyArrayRef /// @{ - constexpr iterator begin() const { - return Data; - } - constexpr iterator end() const { - return Data + Length; - } - - // These are actually the same as iterator, since ArrayRef only - // gives you const iterators. - constexpr const_iterator cbegin() const { - return Data; - } - constexpr const_iterator cend() const { - return Data + Length; - } - - constexpr reverse_iterator rbegin() const { - return reverse_iterator(end()); - } - constexpr reverse_iterator rend() const { - return reverse_iterator(begin()); - } - - /// Check if all elements in the array satisfy the given expression - constexpr bool allMatch(const std::function& pred) const { - return std::all_of(cbegin(), cend(), pred); - } - - /// empty - Check if the array is empty. - constexpr bool empty() const { - return Length == 0; - } - - constexpr const T* data() const { - return Data; - } - - /// size - Get the array size. - constexpr size_t size() const { - return Length; - } - /// front - Get the first element. + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr const T& front() const { TORCH_CHECK( - !empty(), "ArrayRef: attempted to access front() of empty list"); - return Data[0]; + !this->empty(), "ArrayRef: attempted to access front() of empty list"); + return this->Data[0]; } /// back - Get the last element. + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr const T& back() const { - TORCH_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); - return Data[Length - 1]; - } - - /// equals - Check for element-wise equality. - constexpr bool equals(ArrayRef RHS) const { - return Length == RHS.Length && std::equal(begin(), end(), RHS.begin()); + TORCH_CHECK( + !this->empty(), "ArrayRef: attempted to access back() of empty list"); + return this->Data[this->Length - 1]; } /// slice(n, m) - Take M elements of the array starting at element N + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr ArrayRef slice(size_t N, size_t M) const { TORCH_CHECK( - N + M <= size(), + N + M <= this->size(), "ArrayRef: invalid slice, N = ", N, "; M = ", M, "; size = ", - size()); - return ArrayRef(data() + N, M); + this->size()); + return ArrayRef(this->data() + N, M); } /// slice(n) - Chop off the first N elements of the array. 
+ /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr ArrayRef slice(size_t N) const { TORCH_CHECK( - N <= size(), "ArrayRef: invalid slice, N = ", N, "; size = ", size()); - return slice(N, size() - N); + N <= this->size(), + "ArrayRef: invalid slice, N = ", + N, + "; size = ", + this->size()); + return slice(N, this->size() - N); // should this slice be this->slice? } /// @} /// @name Operator Overloads /// @{ - constexpr const T& operator[](size_t Index) const { - return Data[Index]; - } /// Vector compatibility + /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of + /// STD_TORCH_CHECK constexpr const T& at(size_t Index) const { TORCH_CHECK( - Index < Length, + Index < this->Length, "ArrayRef: invalid index Index = ", Index, "; Length = ", - Length); - return Data[Index]; + this->Length); + return this->Data[Index]; } /// Disallow accidental assignment from a temporary. @@ -253,13 +160,6 @@ class ArrayRef final { std::enable_if_t, ArrayRef>& operator=( std::initializer_list) = delete; - /// @} - /// @name Expensive Operations - /// @{ - std::vector vec() const { - return std::vector(Data, Data + Length); - } - /// @} }; diff --git a/test/cpp/aoti_abi_check/CMakeLists.txt b/test/cpp/aoti_abi_check/CMakeLists.txt index da67eb74f28b..6c161a83cb58 100644 --- a/test/cpp/aoti_abi_check/CMakeLists.txt +++ b/test/cpp/aoti_abi_check/CMakeLists.txt @@ -7,6 +7,7 @@ set(AOTI_ABI_CHECK_TEST_SRCS ${AOTI_ABI_CHECK_TEST_ROOT}/test_devicetype.cpp ${AOTI_ABI_CHECK_TEST_ROOT}/test_dtype.cpp ${AOTI_ABI_CHECK_TEST_ROOT}/test_exception.cpp + ${AOTI_ABI_CHECK_TEST_ROOT}/test_headeronlyarrayref.cpp ${AOTI_ABI_CHECK_TEST_ROOT}/test_macros.cpp ${AOTI_ABI_CHECK_TEST_ROOT}/test_math.cpp ${AOTI_ABI_CHECK_TEST_ROOT}/test_rand.cpp diff --git a/test/cpp/aoti_abi_check/test_headeronlyarrayref.cpp b/test/cpp/aoti_abi_check/test_headeronlyarrayref.cpp new file mode 100644 index 000000000000..184c0ade8360 --- /dev/null +++ b/test/cpp/aoti_abi_check/test_headeronlyarrayref.cpp @@ -0,0 +1,52 @@ +#include + +#include + +#include + +using torch::headeronly::HeaderOnlyArrayRef; + +TEST(TestHeaderOnlyArrayRef, TestEmpty) { + HeaderOnlyArrayRef arr; + ASSERT_TRUE(arr.empty()); +} + +TEST(TestHeaderOnlyArrayRef, TestSingleton) { + float val = 5.0f; + HeaderOnlyArrayRef arr(val); + ASSERT_FALSE(arr.empty()); + EXPECT_EQ(arr.size(), 1); + EXPECT_EQ(arr[0], val); +} + +TEST(TestHeaderOnlyArrayRef, TestAPIs) { + std::vector vec = {1, 2, 3, 4, 5, 6, 7}; + HeaderOnlyArrayRef arr(vec); + ASSERT_FALSE(arr.empty()); + EXPECT_EQ(arr.size(), 7); + for (size_t i = 0; i < arr.size(); i++) { + EXPECT_EQ(arr[i], i + 1); + EXPECT_EQ(arr.at(i), i + 1); + } + EXPECT_EQ(arr.front(), 1); + EXPECT_EQ(arr.back(), 7); + ASSERT_TRUE(arr.slice(3, 4).equals(arr.slice(3))); +} + +TEST(TestHeaderOnlyArrayRef, TestFromInitializerList) { + std::vector vec = {1, 2, 3, 4, 5, 6, 7}; + HeaderOnlyArrayRef arr({1, 2, 3, 4, 5, 6, 7}); + auto res_vec = arr.vec(); + for (size_t i = 0; i < vec.size(); i++) { + EXPECT_EQ(vec[i], res_vec[i]); + } +} + +TEST(TestHeaderOnlyArrayRef, TestFromRange) { + std::vector vec = {1, 2, 3, 4, 5, 6, 7}; + HeaderOnlyArrayRef arr(vec.data() + 3, vec.data() + 7); + auto res_vec = arr.vec(); + for (size_t i = 0; i < res_vec.size(); i++) { + EXPECT_EQ(vec[i + 3], res_vec[i]); + } +} diff --git a/torch/header_only_apis.txt b/torch/header_only_apis.txt index 8fe36f78063b..3cb3fff3081a 100644 --- a/torch/header_only_apis.txt +++ b/torch/header_only_apis.txt @@ -42,6 +42,9 
@@ fp16_ieee_to_fp32_value # fp32_from_bits called from fp16_ieee_to_fp32_value # fp32_to_bits called from fp16_ieee_from_fp32_value +# torch/headeronly/util/HeaderOnlyArrayRef.h +HeaderOnlyArrayRef + # c10/util/complex.h, torch/headeronly/util/complex.h complex diff --git a/torch/headeronly/util/HeaderOnlyArrayRef.h b/torch/headeronly/util/HeaderOnlyArrayRef.h new file mode 100644 index 000000000000..2387578ab8f5 --- /dev/null +++ b/torch/headeronly/util/HeaderOnlyArrayRef.h @@ -0,0 +1,247 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +/// HeaderOnlyArrayRef - A subset of ArrayRef that is implemented only +/// in headers. This will be a base class from which ArrayRef inherits, so that +/// we can keep much of the implementation shared. +/// +/// [HeaderOnlyArrayRef vs ArrayRef note] +/// As HeaderOnlyArrayRef is a subset of ArrayRef, it has slightly less +/// functionality than ArrayRef. We document the minor differences below: +/// 1. ArrayRef has an extra convenience constructor for SmallVector. +/// 2. ArrayRef uses TORCH_CHECK. HeaderOnlyArrayRef uses header-only +/// STD_TORCH_CHECK, which will output a std::runtime_error vs a +/// c10::Error. Consequently, you should use ArrayRef when possible +/// and HeaderOnlyArrayRef only when necessary to support headeronly code. +/// In all other aspects, HeaderOnlyArrayRef is identical to ArrayRef, with the +/// positive benefit of being header-only and thus independent of libtorch.so. +template +class HeaderOnlyArrayRef { + public: + using iterator = const T*; + using const_iterator = const T*; + using size_type = size_t; + using value_type = T; + + using reverse_iterator = std::reverse_iterator; + + protected: + /// The start of the array, in an external buffer. + const T* Data; + + /// The number of elements. + size_type Length; + + public: + /// @name Constructors + /// @{ + + /// Construct an empty HeaderOnlyArrayRef. + /* implicit */ constexpr HeaderOnlyArrayRef() : Data(nullptr), Length(0) {} + + /// Construct a HeaderOnlyArrayRef from a single element. + // TODO Make this explicit + constexpr HeaderOnlyArrayRef(const T& OneElt) : Data(&OneElt), Length(1) {} + + /// Construct a HeaderOnlyArrayRef from a pointer and length. + constexpr HeaderOnlyArrayRef(const T* data, size_t length) + : Data(data), Length(length) {} + + /// Construct a HeaderOnlyArrayRef from a range. + constexpr HeaderOnlyArrayRef(const T* begin, const T* end) + : Data(begin), Length(end - begin) {} + + template < + typename Container, + typename U = decltype(std::declval().data()), + typename = std::enable_if_t< + (std::is_same_v || std::is_same_v)>> + /* implicit */ HeaderOnlyArrayRef(const Container& container) + : Data(container.data()), Length(container.size()) {} + + /// Construct a HeaderOnlyArrayRef from a std::vector. + // The enable_if stuff here makes sure that this isn't used for + // std::vector, because ArrayRef can't work on a std::vector + // bitfield. + template + /* implicit */ HeaderOnlyArrayRef(const std::vector& Vec) + : Data(Vec.data()), Length(Vec.size()) { + static_assert( + !std::is_same_v, + "HeaderOnlyArrayRef cannot be constructed from a std::vector bitfield."); + } + + /// Construct a HeaderOnlyArrayRef from a std::array + template + /* implicit */ constexpr HeaderOnlyArrayRef(const std::array& Arr) + : Data(Arr.data()), Length(N) {} + + /// Construct a HeaderOnlyArrayRef from a C array. 
+ template + // NOLINTNEXTLINE(*c-arrays*) + /* implicit */ constexpr HeaderOnlyArrayRef(const T (&Arr)[N]) + : Data(Arr), Length(N) {} + + /// Construct a HeaderOnlyArrayRef from a std::initializer_list. + /* implicit */ constexpr HeaderOnlyArrayRef( + const std::initializer_list& Vec) + : Data( + std::begin(Vec) == std::end(Vec) ? static_cast(nullptr) + : std::begin(Vec)), + Length(Vec.size()) {} + + /// @} + /// @name Simple Operations + /// @{ + + constexpr iterator begin() const { + return this->Data; + } + constexpr iterator end() const { + return this->Data + this->Length; + } + + // These are actually the same as iterator, since ArrayRef only + // gives you const iterators. + constexpr const_iterator cbegin() const { + return this->Data; + } + constexpr const_iterator cend() const { + return this->Data + this->Length; + } + + constexpr reverse_iterator rbegin() const { + return reverse_iterator(end()); + } + constexpr reverse_iterator rend() const { + return reverse_iterator(begin()); + } + + /// Check if all elements in the array satisfy the given expression + constexpr bool allMatch(const std::function& pred) const { + return std::all_of(cbegin(), cend(), pred); + } + + /// empty - Check if the array is empty. + constexpr bool empty() const { + return this->Length == 0; + } + + constexpr const T* data() const { + return this->Data; + } + + /// size - Get the array size. + constexpr size_t size() const { + return this->Length; + } + + /// front - Get the first element. + constexpr const T& front() const { + STD_TORCH_CHECK( + !this->empty(), + "HeaderOnlyArrayRef: attempted to access front() of empty list"); + return this->Data[0]; + } + + /// back - Get the last element. + constexpr const T& back() const { + STD_TORCH_CHECK( + !this->empty(), + "HeaderOnlyArrayRef: attempted to access back() of empty list"); + return this->Data[this->Length - 1]; + } + + /// equals - Check for element-wise equality. + constexpr bool equals(HeaderOnlyArrayRef RHS) const { + return this->Length == RHS.Length && + std::equal(begin(), end(), RHS.begin()); + } + + /// slice(n, m) - Take M elements of the array starting at element N + constexpr HeaderOnlyArrayRef slice(size_t N, size_t M) const { + STD_TORCH_CHECK( + N + M <= this->size(), + "HeaderOnlyArrayRef: invalid slice, N = ", + N, + "; M = ", + M, + "; size = ", + this->size()); + return HeaderOnlyArrayRef(this->data() + N, M); + } + + /// slice(n) - Chop off the first N elements of the array. + constexpr HeaderOnlyArrayRef slice(size_t N) const { + STD_TORCH_CHECK( + N <= this->size(), + "HeaderOnlyArrayRef: invalid slice, N = ", + N, + "; size = ", + this->size()); + return slice(N, this->size() - N); + } + + /// @} + /// @name Operator Overloads + /// @{ + constexpr const T& operator[](size_t Index) const { + return this->Data[Index]; + } + + /// Vector compatibility + constexpr const T& at(size_t Index) const { + STD_TORCH_CHECK( + Index < this->Length, + "HeaderOnlyArrayRef: invalid index Index = ", + Index, + "; Length = ", + this->Length); + return this->Data[Index]; + } + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + std::enable_if_t, HeaderOnlyArrayRef>& operator=( + // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) + U&& Temporary) = delete; + + /// Disallow accidental assignment from a temporary. 
+ /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + std::enable_if_t, HeaderOnlyArrayRef>& operator=( + std::initializer_list) = delete; + + /// @} + /// @name Expensive Operations + /// @{ + std::vector vec() const { + return std::vector(this->Data, this->Data + this->Length); + } + + /// @} +}; + +} // namespace c10 + +namespace torch::headeronly { +using c10::HeaderOnlyArrayRef; +using IntHeaderOnlyArrayRef = HeaderOnlyArrayRef; +} // namespace torch::headeronly From e4454947e2c692db1a249591121f8583fefe7df1 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Thu, 16 Oct 2025 13:12:44 -0700 Subject: [PATCH 063/123] Widen ops support to take in IntHOArrayRef vs only std::vec (#165152) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165152 Approved by: https://github.com/mikaylagawarecki ghstack dependencies: #164991 --- .../libtorch_agnostic/csrc/kernel.cpp | 12 ++++++------ torch/csrc/stable/ops.h | 17 +++++++---------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/csrc/kernel.cpp b/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/csrc/kernel.cpp index 58c812b08ccc..87aaa46e64c9 100644 --- a/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/csrc/kernel.cpp +++ b/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/csrc/kernel.cpp @@ -311,10 +311,9 @@ void boxed_fill_infinity( } Tensor my_pad(Tensor t) { - std::vector padding = {1, 2, 2, 1}; std::string mode = "constant"; double value = 0.0; - return pad(t, padding, mode, value); + return pad(t, {1, 2, 2, 1}, mode, value); } void boxed_my_pad( @@ -342,6 +341,9 @@ void boxed_my_narrow( } Tensor my_new_empty_dtype_variant(Tensor t) { + // Still using a std::vector below even though people can just pass in an + // initializer list (which will be implicitly converted to an HeaderOnlyArrayRef) + // directly. std::vector sizes = {2, 5}; auto dtype = std::make_optional(torch::headeronly::ScalarType::BFloat16); return new_empty(t, sizes, dtype); @@ -353,9 +355,8 @@ void boxed_my_new_empty_dtype_variant(StableIValue* stack, uint64_t num_args, ui } Tensor my_new_zeros_dtype_variant(Tensor t) { - std::vector sizes = {2, 5}; auto dtype = std::make_optional(at::ScalarType::Float); - return new_zeros(t, sizes, dtype); + return new_zeros(t, {2, 5}, dtype); } void boxed_my_new_zeros_dtype_variant(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) { @@ -429,8 +430,7 @@ void boxed_my_amax(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) } Tensor my_amax_vec(Tensor t) { - std::vector v = {0,1}; - return amax(t, v, false); + return amax(t, {0,1}, false); } void boxed_my_amax_vec(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) { diff --git a/torch/csrc/stable/ops.h b/torch/csrc/stable/ops.h index 549b2b95ec41..be230c5577a3 100644 --- a/torch/csrc/stable/ops.h +++ b/torch/csrc/stable/ops.h @@ -5,10 +5,10 @@ #include #include #include -#include #include #include +#include namespace torch::stable { @@ -60,7 +60,7 @@ inline torch::stable::Tensor narrow( // only dtype information. 
inline torch::stable::Tensor new_empty( const torch::stable::Tensor& self, - std::vector size, + torch::headeronly::IntHeaderOnlyArrayRef size, std::optional dtype = std::nullopt) { int32_t device_type; TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(self.get(), &device_type)); @@ -98,7 +98,7 @@ inline torch::stable::Tensor new_empty( // only dtype information. inline torch::stable::Tensor new_zeros( const torch::stable::Tensor& self, - std::vector size, + torch::headeronly::IntHeaderOnlyArrayRef size, std::optional dtype = std::nullopt) { int32_t device_type; TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(self.get(), &device_type)); @@ -134,12 +134,10 @@ inline torch::stable::Tensor new_zeros( // We expect this to be the stable version of the pad.default op. // pad.default takes in a SymInt[] as the pad argument however pad is typed as -// use std::vector because -// (1) IntArrayRef is not yet header-only -// (2) SymInt is not yet header-only +// torch::headeronly::IntHeaderOnlyArrayRef as SymInt is not yet header-only. inline torch::stable::Tensor pad( const torch::stable::Tensor& self, - std::vector pad, + torch::headeronly::IntHeaderOnlyArrayRef pad, const std::string& mode = "constant", double value = 0.0) { AtenTensorHandle ret0 = nullptr; @@ -171,11 +169,10 @@ inline torch::stable::Tensor amax( // This function is an overload to compute the maximum value along each slice of // `self` reducing over all the dimensions in the vector `dims`. The // amax.default op takes in a SymInt[] as the dims argument, however dims is -// typed as use std::vector here because (1) IntArrayRef is not yet -// header-only (2) SymInt is not yet header-only +// typed as use IntHeaderOnlyArrayRef here because SymInt is not yet header-only inline torch::stable::Tensor amax( const torch::stable::Tensor& self, - std::vector dims, + torch::headeronly::IntHeaderOnlyArrayRef dims, bool keepdim = false) { AtenTensorHandle ret = nullptr; TORCH_ERROR_CODE_CHECK(aoti_torch_aten_amax( From 7a657700131f31577544e93587eb339618677e97 Mon Sep 17 00:00:00 2001 From: Sherlock Huang Date: Thu, 16 Oct 2025 20:37:07 -0700 Subject: [PATCH 064/123] Update gm.print_readable to include Annotation (#165397) Sample output ``` [rank0]: # Annotation: {'compile_with_inductor': 'flex_attention'} File: /data/users/bahuang/pytorch/torch/nn/attention/flex_attention.py:1490 in flex_attention, code: out, lse, max_scores = flex_attention_hop( [rank0]: score_mod_2 = self.score_mod_2 [rank0]: mask_fn_2 = self.mask_fn_2 [rank0]: flex_attention_1 = torch.ops.higher_order.flex_attention(xq_5, xk_5, xv_3, score_mod_2, (2048, 2048, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___kv_num_blocks, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___kv_indices, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___full_kv_num_blocks, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___full_kv_indices, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___q_num_blocks, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___q_indices, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___full_q_num_blocks, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___full_q_indices, 
128, 128, mask_fn_2), 0.25, {'PRESCALE_QK': False, 'ROWS_GUARANTEED_SAFE': False, 'BLOCKS_ARE_CONTIGUOUS': False, 'WRITE_DQ': True, 'OUTPUT_LOGSUMEXP': True, 'OUTPUT_MAX': False}, (), (g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___mask_mod___closure___0_cell_contents,)); xq_5 = xk_5 = xv_3 = score_mod_2 = mask_fn_2 = None [rank0]: out_2: "bf16[8, 4, 2048, 16]" = flex_attention_1[0]; flex_attention_1 = None ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165397 Approved by: https://github.com/yushangdi, https://github.com/anijain2305 --- test/dynamo/test_higher_order_ops.py | 30 ----------------- test/dynamo/test_subclasses.py | 1 - test/export/test_export.py | 2 ++ test/functorch/test_control_flow.py | 5 --- test/higher_order_ops/test_invoke_subgraph.py | 22 ++++++------- test/inductor/test_compiled_autograd.py | 1 - torch/fx/graph.py | 32 ++++++++++--------- 7 files changed, 30 insertions(+), 63 deletions(-) diff --git a/test/dynamo/test_higher_order_ops.py b/test/dynamo/test_higher_order_ops.py index 8b71fe398263..693c90a10b3a 100644 --- a/test/dynamo/test_higher_order_ops.py +++ b/test/dynamo/test_higher_order_ops.py @@ -3802,7 +3802,6 @@ class GraphModule(torch.nn.Module): dual: "f32[4, 3, 4, 3]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[4, 3, 4, 3]" = torch._C._functorch._unwrap_for_grad(primal, 2); primal = primals_out_unflatten = None - tangents_out_unflatten: "f32[4, 3, 4, 3]" = torch._C._functorch._unwrap_for_grad(dual, 2); dual = None _exit_dual_level = torch._C._exit_dual_level(0); _exit_dual_level = None @@ -3933,7 +3932,6 @@ class GraphModule(torch.nn.Module): tangent: "f32[4, 3, 3, 4]" = torch.zeros_like(primal) child_8: "f32[4, 3, 3, 4]" = torch._C._functorch._unwrap_for_grad(primal, 2); primal = child_8 = None - child_9: "f32[4, 3, 3, 4]" = torch._C._functorch._unwrap_for_grad(tangent, 2); tangent = None _exit_dual_level = torch._C._exit_dual_level(0); _exit_dual_level = None @@ -4146,7 +4144,6 @@ class GraphModule(torch.nn.Module): primals_out: "f32[3, 4]" = diff_primals.sin() aux_1: "f32[4, 3]" = torch._C._functorch._unwrap_for_grad(aux, 1); aux = None - results: "f32[3, 4]" = torch._C._functorch._unwrap_for_grad(primals_out, 1) _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -4381,7 +4378,6 @@ class GraphModule(torch.nn.Module): primals_out: "f32[]" = sin.sum(); sin = None aux: "f32[5]" = torch._C._functorch._unwrap_for_grad(child, 1); child = aux = None - results: "f32[]" = torch._C._functorch._unwrap_for_grad(primals_out, 1); primals_out = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -4571,7 +4567,6 @@ class GraphModule(torch.nn.Module): grad_input: "f32[3, 3, 3]" = _autograd_grad[0]; _autograd_grad = None grad_input_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(grad_input, 1); grad_input = None - output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 1); output = output_1 = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -4639,7 +4634,6 @@ class GraphModule(torch.nn.Module): grad_input: "f32[3, 3, 3]" = _autograd_grad[0]; _autograd_grad = None grad_input_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(grad_input, 1); grad_input = None - output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 1); output = output_1 = None _grad_decrement_nesting = 
torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -4696,7 +4690,6 @@ class GraphModule(torch.nn.Module): grad_input: "f32[3, 3, 3]" = _autograd_grad[0]; _autograd_grad = None grad_input_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(grad_input, 1); grad_input = None - output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 1); output = output_1 = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -4753,7 +4746,6 @@ class GraphModule(torch.nn.Module): grad_input: "f32[3, 3, 3]" = _autograd_grad[0]; _autograd_grad = None grad_input_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(grad_input, 1); grad_input = None - output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 1); output = output_1 = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -4808,9 +4800,7 @@ class GraphModule(torch.nn.Module): grad_input: "f32[3, 3, 3]" = _autograd_grad[0]; _autograd_grad = None grad_input_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(grad_input, 1); grad_input = None - output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 1); output = output_1 = None - aux_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(aux, 1); aux = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -4866,9 +4856,7 @@ class GraphModule(torch.nn.Module): grad_input: "f32[3, 3, 3]" = _autograd_grad[0]; _autograd_grad = None grad_input_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(grad_input, 1); grad_input = None - output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 1); output = output_1 = None - aux_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(aux, 1); aux = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -4942,9 +4930,7 @@ class GraphModule(torch.nn.Module): _unwrap_for_grad: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(child_2, 1); child_2 = None _unwrap_for_grad_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(child_3, 1); child_3 = None - output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 1); output = output_1 = None - aux_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(aux, 1); aux = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -4988,9 +4974,7 @@ class GraphModule(torch.nn.Module): _unwrap_for_grad: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(child_2, 1); child_2 = None _unwrap_for_grad_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(child_3, 1); child_3 = None - output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 1); output = output_1 = None - aux_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(aux, 1); aux = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -5050,7 +5034,6 @@ class GraphModule(torch.nn.Module): grad_input: "f32[]" = _autograd_grad[0]; _autograd_grad = None grad_input_1: "f32[]" = torch._C._functorch._unwrap_for_grad(grad_input, 2); grad_input = None - output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 2); output = output_1 = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -5060,7 +5043,6 @@ class GraphModule(torch.nn.Module): grad_input_2: "f32[]" = 
_autograd_grad_1[0]; _autograd_grad_1 = None grad_input_3: "f32[]" = torch._C._functorch._unwrap_for_grad(grad_input_2, 1); grad_input_2 = None - output_2: "f32[]" = torch._C._functorch._unwrap_for_grad(grad_input_1, 1); grad_input_1 = output_2 = None _grad_decrement_nesting_1 = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting_1 = None @@ -5166,7 +5148,6 @@ class GraphModule(torch.nn.Module): grad_input: "f32[3, 3, 3]" = _autograd_grad[0]; _autograd_grad = None grad_input_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(grad_input, 1); grad_input = None - output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 1); output = output_1 = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -5245,7 +5226,6 @@ class GraphModule(torch.nn.Module): dual: "f32[4, 3]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[4, 3]" = torch._C._functorch._unwrap_for_grad(primal, 2); primal = primals_out_unflatten = None - tangents_out_unflatten: "f32[4, 3]" = torch._C._functorch._unwrap_for_grad(dual, 2); dual = None _exit_dual_level = torch._C._exit_dual_level(0); _exit_dual_level = None @@ -5327,7 +5307,6 @@ class GraphModule(torch.nn.Module): dual: "f32[3, 4]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[3, 4]" = torch._C._functorch._unwrap_for_grad(primal, 2); primal = primals_out_unflatten = None - tangents_out_unflatten: "f32[3, 4]" = torch._C._functorch._unwrap_for_grad(dual, 2); dual = None _exit_dual_level = torch._C._exit_dual_level(0); _exit_dual_level = None @@ -5411,7 +5390,6 @@ class GraphModule(torch.nn.Module): dual: "f32[3, 4]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[3, 4]" = torch._C._functorch._unwrap_for_grad(primal, 2); primal = primals_out_unflatten = None - tangents_out_unflatten: "f32[3, 4]" = torch._C._functorch._unwrap_for_grad(dual, 2); dual = None _exit_dual_level = torch._C._exit_dual_level(0); _exit_dual_level = None @@ -5502,7 +5480,6 @@ class GraphModule(torch.nn.Module): child_4: "f32[3, 4]" = torch._C._functorch._unwrap_for_grad(primal, 2); primal = child_4 = None child_5: "f32[4, 3]" = torch._C._functorch._unwrap_for_grad(primal_1, 2); primal_1 = child_5 = None - child_6: "f32[3, 4]" = torch._C._functorch._unwrap_for_grad(tangent, 2); tangent = None child_7: "f32[4, 3]" = torch._C._functorch._unwrap_for_grad(dual, 2); dual = None @@ -5572,7 +5549,6 @@ class GraphModule(torch.nn.Module): dual: "f32[]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[]" = torch._C._functorch._unwrap_for_grad(primal, 1); primal = None - tangents_out_unflatten: "f32[]" = torch._C._functorch._unwrap_for_grad(dual, 1); dual = None _exit_dual_level = torch._C._exit_dual_level(0); _exit_dual_level = None @@ -5626,7 +5602,6 @@ class GraphModule(torch.nn.Module): dual: "f32[]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[]" = torch._C._functorch._unwrap_for_grad(primal, 1); primal = None - tangents_out_unflatten: "f32[]" = torch._C._functorch._unwrap_for_grad(dual, 1); dual = None _exit_dual_level = torch._C._exit_dual_level(0); _exit_dual_level = None @@ -5688,7 +5663,6 @@ class GraphModule(torch.nn.Module): dual: "f32[3, 3]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[3, 3]" = torch._C._functorch._unwrap_for_grad(primal, 1); primal = None - tangents_out_unflatten: "f32[3, 3]" = torch._C._functorch._unwrap_for_grad(dual, 1); dual = None _exit_dual_level = 
torch._C._exit_dual_level(0); _exit_dual_level = None @@ -5742,7 +5716,6 @@ class GraphModule(torch.nn.Module): dual: "f32[]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[]" = torch._C._functorch._unwrap_for_grad(primal, 1); primal = None - tangents_out_unflatten: "f32[]" = torch._C._functorch._unwrap_for_grad(dual, 1); dual = None _exit_dual_level = torch._C._exit_dual_level(0); _exit_dual_level = None @@ -5810,7 +5783,6 @@ class GraphModule(torch.nn.Module): dual: "f32[]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[]" = torch._C._functorch._unwrap_for_grad(primal, 1); primal = None - tangents_out_unflatten: "f32[]" = torch._C._functorch._unwrap_for_grad(dual, 1); dual = None _exit_dual_level = torch._C._exit_dual_level(0); _exit_dual_level = None @@ -5887,7 +5859,6 @@ class GraphModule(torch.nn.Module): dual: "f32[3, 3, 3]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(primal, 2); primal = None - tangents_out_unflatten: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(dual, 2); dual = None _set_fwd_grad_enabled_2 = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled_2 = None @@ -5902,7 +5873,6 @@ class GraphModule(torch.nn.Module): _unwrap_for_grad_2: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(primal_1, 1); primal_1 = None _unwrap_for_grad_3: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(primal_2, 1); primal_2 = None - _unwrap_for_grad_4: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(dual_1, 1); dual_1 = None _unwrap_for_grad_5: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(dual_2, 1); dual_2 = None diff --git a/test/dynamo/test_subclasses.py b/test/dynamo/test_subclasses.py index c590abe63788..39a0dc628bae 100644 --- a/test/dynamo/test_subclasses.py +++ b/test/dynamo/test_subclasses.py @@ -3166,7 +3166,6 @@ class GraphModule(torch.nn.Module): ): slice_1: "f64[s64, s55]" = torch.ops.aten.slice.Tensor(tangents_1, 1, 0, primals_10) slice_2: "f64[s64, s55]" = torch.ops.aten.slice.Tensor(tangents_1, 1, primals_10, add_2); tangents_1 = add_2 = None - add_4: "f64[s64, s55]" = torch.ops.aten.add.Tensor(slice_1, slice_2); slice_1 = slice_2 = None return ( None, # None diff --git a/test/export/test_export.py b/test/export/test_export.py index 23a7ad9bff1e..2842723ea25b 100755 --- a/test/export/test_export.py +++ b/test/export/test_export.py @@ -16061,6 +16061,7 @@ class GraphModule(torch.nn.Module): add: "f32[2, 4]" = torch.ops.aten.add.Tensor(relu, arg1_1); relu = arg1_1 = None return (add,) """, + ignore_empty_lines=True, ) ep = export(M(), (x, y), strict=strict).run_decompositions({}) @@ -16093,6 +16094,7 @@ class GraphModule(torch.nn.Module): add: "f32[2, 4]" = torch.ops.aten.add.Tensor(relu, arg1_1); relu = arg1_1 = None return (add,) """, + ignore_empty_lines=True, ) @testing.expectedFailureStrict # test_hop doesn't have a dynamo implementation diff --git a/test/functorch/test_control_flow.py b/test/functorch/test_control_flow.py index e47aaa9e9e2b..cac6ae1ba36a 100644 --- a/test/functorch/test_control_flow.py +++ b/test/functorch/test_control_flow.py @@ -8104,7 +8104,6 @@ class GraphModule(torch.nn.Module): x, = fx_pytree.tree_flatten_spec(([x], {}), self._in_spec) _guards_fn = self._guards_fn(x); _guards_fn = None - sym_size_int_1: "Sym(s77)" = torch.ops.aten.sym_size.int(x, 0) while_loop_cond_graph_0 = self.while_loop_cond_graph_0 @@ -8404,7 +8403,6 @@ class GraphModule(torch.nn.Module): x, = fx_pytree.tree_flatten_spec(([x], 
{}), self._in_spec) _guards_fn = self._guards_fn(x); _guards_fn = None - sym_size_int_1: "Sym(s6)" = torch.ops.aten.sym_size.int(x, 0) sin: "f32[s6, 3]" = torch.ops.aten.sin.default(x); x = None @@ -8691,10 +8689,8 @@ class GraphModule(torch.nn.Module): t_4: "f32[3, 3]" = torch.ops.aten.t.default(t_3); t_3 = None mul_4: "f32[3, 3]" = torch.ops.aten.mul.Tensor(arg1_1, select) mul_5: "f32[3, 3]" = torch.ops.aten.mul.Tensor(arg1_1, select); arg1_1 = select = None - add_7: "f32[3, 3]" = torch.ops.aten.add.Tensor(mm, mul_5); mm = mul_5 = None add_8: "f32[3, 3]" = torch.ops.aten.add.Tensor(add_7, mul_4); add_7 = mul_4 = None - add_9: "i64[]" = torch.ops.aten.add.Tensor(arg0_1, 1); arg0_1 = None add_10: "f32[3]" = torch.ops.aten.add.Tensor(view, arg2_1); view = arg2_1 = None add_11: "f32[3, 3]" = torch.ops.aten.add.Tensor(t_4, arg3_1); t_4 = arg3_1 = None @@ -8909,7 +8905,6 @@ class GraphModule(torch.nn.Module): x, y, z, = fx_pytree.tree_flatten_spec(([x, y, z], {}), self._in_spec) _guards_fn = self._guards_fn(x, y, z); _guards_fn = None - sym_size_int_4: "Sym(s17)" = torch.ops.aten.sym_size.int(y, 0); y = None sym_size_int_5: "Sym(s68)" = torch.ops.aten.sym_size.int(z, 0) diff --git a/test/higher_order_ops/test_invoke_subgraph.py b/test/higher_order_ops/test_invoke_subgraph.py index ffbefe5cd9b4..700751942ba1 100644 --- a/test/higher_order_ops/test_invoke_subgraph.py +++ b/test/higher_order_ops/test_invoke_subgraph.py @@ -17,6 +17,7 @@ from functorch.compile import aot_function, nop from torch._dynamo.testing import ( AotEagerAndRecordGraphs, EagerAndRecordGraphs, + empty_line_normalizer, InductorAndRecordGraphs, normalize_gm, ) @@ -351,10 +352,8 @@ class GraphModule(torch.nn.Module): getitem_14: "f32[8]" = invoke_subgraph_6[2] getitem_13: "f32[8]" = invoke_subgraph_6[1] getitem_1: "f32[8]" = invoke_subgraph_6[0]; invoke_subgraph_6 = None - add: "f32[8]" = torch.ops.aten.add.Tensor(getitem, getitem_1); getitem = getitem_1 = None return (add, getitem_12, getitem_11, getitem_10, getitem_15, getitem_14, getitem_13) - class partitioned_fw_subgraph_0_0(torch.nn.Module): def forward(self, primals_0: "f32[8]", primals_1: "f32[8]", primals_2: "f32[8]"): mul: "f32[8]" = torch.ops.aten.mul.Tensor(primals_0, primals_1) @@ -363,6 +362,7 @@ class GraphModule(torch.nn.Module): mul_2: "f32[8]" = torch.ops.aten.mul.Tensor(mul_1, primals_2); mul_1 = None return (mul_2, primals_0, primals_1, primals_2) """, + ignore_empty_lines=True, ) self.assertExpectedInline( normalize_gm(backend.bw_graphs[0].print_readable(print_output=False)), @@ -377,7 +377,6 @@ class GraphModule(torch.nn.Module): invoke_subgraph_5 = torch.ops.higher_order.invoke_subgraph(partitioned_bw_subgraph_0_0, 'partitioned_bw_subgraph_0_0', getitem_10, getitem_11, getitem_12, tangents_1); partitioned_bw_subgraph_0_0 = getitem_10 = getitem_11 = getitem_12 = tangents_1 = None getitem_6: "f32[8]" = invoke_subgraph_5[0] getitem_7: "f32[8]" = invoke_subgraph_5[1]; invoke_subgraph_5 = None - add_1: "f32[8]" = torch.ops.aten.add.Tensor(getitem_2, getitem_6); getitem_2 = getitem_6 = None add_2: "f32[8]" = torch.ops.aten.add.Tensor(getitem_3, getitem_7); getitem_3 = getitem_7 = None return (add_1, add_2, None) @@ -393,6 +392,7 @@ class GraphModule(torch.nn.Module): mul_7: "f32[8]" = torch.ops.aten.mul.Tensor(mul_5, primals_1); mul_5 = primals_1 = None return (mul_7, mul_6, None) """, + ignore_empty_lines=True, ) def test_buffer_mutation_works_under_no_grad(self): @@ -681,6 +681,7 @@ class GraphModule(torch.nn.Module): sin: "f32[8]" = 
torch.ops.aten.sin.default(primals_0) return (sin, primals_0) """, + ignore_empty_lines=True, ) @inductor_config.patch("fx_graph_cache", False) @@ -722,6 +723,7 @@ class (torch.nn.Module): mul_1: "f32[8]" = torch.ops.aten.mul.Tensor(mul, 2.0); mul = None return (mul_1,) """, + ignore_empty_lines=True, ) def test_dedupe(self): @@ -770,7 +772,6 @@ class GraphModule(torch.nn.Module): subgraph_0 = self.subgraph_0 invoke_subgraph = torch.ops.higher_order.invoke_subgraph(subgraph_0, 'subgraph_0', l_x_, l_y_); subgraph_0 = l_x_ = None a: "f32[8]" = invoke_subgraph[0]; invoke_subgraph = None - subgraph_1 = self.subgraph_0 invoke_subgraph_1 = torch.ops.higher_order.invoke_subgraph(subgraph_1, 'subgraph_0', a, l_y_); subgraph_1 = a = l_y_ = None getitem_1: "f32[8]" = invoke_subgraph_1[0]; invoke_subgraph_1 = None @@ -806,6 +807,7 @@ class GraphModule(torch.nn.Module): mul: "f32[8]" = torch.ops.aten.mul.Tensor(primals_0, primals_1) return (mul, primals_0, primals_1) """, + ignore_empty_lines=True, ) def test_dce(self): @@ -889,7 +891,6 @@ class GraphModule(torch.nn.Module): subgraph_0 = self.subgraph_0 invoke_subgraph = torch.ops.higher_order.invoke_subgraph(subgraph_0, 'subgraph_0', l_x_, l_y_); subgraph_0 = l_x_ = None a: "f32[8]" = invoke_subgraph[0]; invoke_subgraph = None - subgraph_1 = self.subgraph_1 invoke_subgraph_1 = torch.ops.higher_order.invoke_subgraph(subgraph_1, 'subgraph_1', a, l_y_); subgraph_1 = a = l_y_ = None getitem_1: "f32[8]" = invoke_subgraph_1[0]; invoke_subgraph_1 = None @@ -1535,7 +1536,6 @@ class GraphModule(torch.nn.Module): def forward(self, tangents_0: "f32[8, 8]", tangents_1: "f32[8, 8]"): mul_2: "f32[8, 8]" = torch.ops.aten.mul.Tensor(tangents_1, 3) mul_3: "f32[8, 8]" = torch.ops.aten.mul.Tensor(tangents_1, 2); tangents_1 = None - add: "f32[8, 8]" = torch.ops.aten.add.Tensor(mul_2, mul_3); mul_2 = mul_3 = None return (add,) """, @@ -2145,7 +2145,6 @@ class GraphModule(torch.nn.Module): subgraph_0 = self.subgraph_0 invoke_subgraph = torch.ops.higher_order.invoke_subgraph(subgraph_0, 'subgraph_0', x, y); subgraph_0 = x = None z: "f32[5]" = invoke_subgraph[0]; invoke_subgraph = None - subgraph_1 = self.subgraph_1 invoke_subgraph_1 = torch.ops.higher_order.invoke_subgraph(subgraph_1, 'subgraph_1', z, y); subgraph_1 = z = y = None getitem_1: "f32[5]" = invoke_subgraph_1[0]; invoke_subgraph_1 = None @@ -2283,6 +2282,7 @@ class GraphModule(torch.nn.Module): cos: "f32[s77, 16]" = torch.ops.aten.cos.default(primals_1) return (cos, primals_1, primals_0) """, + ignore_empty_lines=True, ) self.assertExpectedInline( normalize_gm(backend.bw_graphs[0].print_readable(print_output=False)), @@ -2294,7 +2294,6 @@ class GraphModule(torch.nn.Module): partitioned_bw_subgraph_0_0 = self.partitioned_bw_subgraph_0_0 invoke_subgraph_15 = torch.ops.higher_order.invoke_subgraph(partitioned_bw_subgraph_0_0, 'partitioned_bw_subgraph_0_0', getitem_23, getitem_22, expand); partitioned_bw_subgraph_0_0 = getitem_23 = getitem_22 = None getitem_5: "f32[s77, 16]" = invoke_subgraph_15[1]; invoke_subgraph_15 = None - add_16: "f32[s77, 16]" = torch.ops.aten.add.Tensor(expand, getitem_5); expand = getitem_5 = None partitioned_bw_subgraph_0_3 = self.partitioned_bw_subgraph_0_1 @@ -2326,6 +2325,7 @@ class GraphModule(torch.nn.Module): mul_10: "f32[s77, 16]" = torch.ops.aten.mul.Tensor(tangents_0, neg); tangents_0 = neg = None return (None, mul_10) """, + ignore_empty_lines=True, ) def test_div(self): @@ -2535,19 +2535,19 @@ class TestInvokeSubgraphExport(TestCase): 
self.assertEqual(len(list(ep.graph_module.named_modules())), 2) self.assertExpectedInline( - normalize_gm(ep.graph_module.print_readable(print_output=False)), + empty_line_normalizer( + normalize_gm(ep.graph_module.print_readable(print_output=False)) + ), """\ class GraphModule(torch.nn.Module): def forward(self, x: "f32[8]", y: "f32[8]"): repeated_subgraph0 = self.repeated_subgraph0 invoke_subgraph = torch.ops.higher_order.invoke_subgraph(repeated_subgraph0, 'subgraph_0', x, y); repeated_subgraph0 = x = None getitem: "f32[8]" = invoke_subgraph[0]; invoke_subgraph = None - repeated_subgraph0_1 = self.repeated_subgraph0 invoke_subgraph_1 = torch.ops.higher_order.invoke_subgraph(repeated_subgraph0_1, 'subgraph_0', getitem, y); repeated_subgraph0_1 = getitem = y = None getitem_1: "f32[8]" = invoke_subgraph_1[0]; invoke_subgraph_1 = None return (getitem_1,) - class repeated_subgraph0(torch.nn.Module): def forward(self, arg0_1: "f32[8]", arg1_1: "f32[8]"): mul: "f32[8]" = torch.ops.aten.mul.Tensor(arg0_1, arg1_1); arg0_1 = arg1_1 = None diff --git a/test/inductor/test_compiled_autograd.py b/test/inductor/test_compiled_autograd.py index 2612af01f6ff..fee2b289db90 100644 --- a/test/inductor/test_compiled_autograd.py +++ b/test/inductor/test_compiled_autograd.py @@ -3621,7 +3621,6 @@ class CompiledAutograd0(torch.nn.Module): aot0_mul_2 = torch.ops.aten.mul.Tensor(aot0_tangents_1, aot0_primals_1); aot0_tangents_1 = aot0_primals_1 = None aot0_mul_3 = torch.ops.aten.mul.Tensor(aot0_tangents_2, aot0_primals_2); aot0_tangents_2 = aot0_primals_2 = None - aot0_add_2 = torch.ops.aten.add.Tensor(aot0_mul_2, aot0_mul_2); aot0_mul_2 = None aot0_add_3 = torch.ops.aten.add.Tensor(aot0_mul_3, aot0_mul_3); aot0_mul_3 = None diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 940737e7e3a6..7577b6bc6148 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -606,29 +606,31 @@ class CodeGen: else: body.append("\n") - prev_stacktrace = None + prev_summary_str = None def append_stacktrace_summary(node: Node): """ Append a summary of the stacktrace to the generated code. This is useful for debugging. 
""" - nonlocal prev_stacktrace + nonlocal prev_summary_str if node.op not in {"placeholder", "output"}: - stack_trace = node.stack_trace - if stack_trace: - if stack_trace != prev_stacktrace: - prev_stacktrace = stack_trace - if parsed_stack_trace := _parse_stack_trace(stack_trace): - summary_str = parsed_stack_trace.get_summary_str() - else: - summary_str = "" - body.append(f"\n {dim(f'# {summary_str}')}\n") - elif prev_stacktrace != "": - prev_stacktrace = "" - no_stacktrace_msg = "# No stacktrace found for following nodes" - body.append(f"\n{dim(no_stacktrace_msg)}\n") + annotation_str = "" + annotation = node.meta.get("custom", {}) + if annotation: + annotation_str = f" Annotation: {annotation}" + + stack_trace_str = "No stacktrace found for following nodes" + if stack_trace := node.stack_trace: + if parsed_stack_trace := _parse_stack_trace(stack_trace): + stack_trace_str = parsed_stack_trace.get_summary_str() + + summary_str = f"\n{dim(f'#{annotation_str} {stack_trace_str}')}\n" + + if summary_str != prev_summary_str: + prev_summary_str = summary_str + body.append(summary_str) def stringify_shape(shape: Iterable) -> str: return f"[{', '.join([str(x) for x in shape])}]" From fae74cd52f3449ec92fdb519c577c8cd142ab7b1 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 17 Oct 2025 18:55:53 +0000 Subject: [PATCH 065/123] Revert "shrink_group implementation to expose ncclCommShrink API (#164518)" This reverts commit a032510db38e8331afa08f7635d146f9cefdd0ab. Reverted https://github.com/pytorch/pytorch/pull/164518 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/164518#issuecomment-3416718767)) --- docs/source/distributed.md | 4 - test/distributed/logging_utils.py | 43 -- test/distributed/test_c10d_nccl.py | 640 +----------------- torch/csrc/distributed/c10d/Backend.hpp | 17 - torch/csrc/distributed/c10d/NCCLUtils.cpp | 59 -- torch/csrc/distributed/c10d/NCCLUtils.hpp | 12 - .../distributed/c10d/ProcessGroupNCCL.cpp | 135 +--- .../distributed/c10d/ProcessGroupNCCL.hpp | 21 - torch/csrc/distributed/c10d/init.cpp | 11 - torch/distributed/distributed_c10d.py | 515 -------------- torch/testing/_internal/common_distributed.py | 48 -- 11 files changed, 2 insertions(+), 1503 deletions(-) delete mode 100644 test/distributed/logging_utils.py diff --git a/docs/source/distributed.md b/docs/source/distributed.md index 69df7be1fa80..5da02bb8a194 100644 --- a/docs/source/distributed.md +++ b/docs/source/distributed.md @@ -394,10 +394,6 @@ an opaque group handle that can be given as a `group` argument to all collective .. autofunction:: new_group ``` -```{eval-rst} -.. autofunction:: torch.distributed.distributed_c10d.shrink_group -``` - ```{eval-rst} .. 
autofunction:: get_group_rank ``` diff --git a/test/distributed/logging_utils.py b/test/distributed/logging_utils.py deleted file mode 100644 index 09a0adccfd80..000000000000 --- a/test/distributed/logging_utils.py +++ /dev/null @@ -1,43 +0,0 @@ -import logging -import time - - -_start_time = time.time() -_logger = logging.getLogger(__name__) - - -def _ts(): - return time.time() - _start_time - - -def configure(level=logging.INFO, force=False): - try: - logging.basicConfig( - level=level, - format="%(asctime)s %(name)s %(levelname)s: %(message)s", - force=force, - ) - except TypeError: - logging.basicConfig( - level=level, format="%(asctime)s %(name)s %(levelname)s: %(message)s" - ) - - -def log_test_info(rank, message): - _logger.info("[%7.3fs][Rank %s] %s", _ts(), rank, message) - - -def log_test_success(rank, message): - _logger.info("[%7.3fs][Rank %s] ✅ %s", _ts(), rank, message) - - -def log_test_validation(rank, message): - _logger.info("[%7.3fs][Rank %s] ✓ %s", _ts(), rank, message) - - -def log_test_warning(rank, message): - _logger.warning("[%7.3fs][Rank %s] ⚠️ %s", _ts(), rank, message) - - -def log_test_error(rank, message): - _logger.error("[%7.3fs][Rank %s] ✗ %s", _ts(), rank, message) diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 0f518fab62cf..7410255d27a8 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -2,7 +2,6 @@ import copy import json -import logging import os import pickle import random @@ -22,7 +21,6 @@ from unittest import mock, SkipTest import torch import torch.distributed as c10d import torch.distributed._functional_collectives as _functional_collectives -from torch.distributed.distributed_c10d import SHRINK_ABORT as NCCL_SHRINK_ABORT if not c10d.is_available() or not c10d.is_nccl_available(): @@ -49,15 +47,12 @@ from torch._C._distributed_c10d import ErrorType, OpType, WorkResult from torch.nn.parallel import DistributedDataParallel from torch.testing._internal.common_cuda import _get_torch_rocm_version, TEST_MULTIGPU from torch.testing._internal.common_distributed import ( - get_required_world_size, get_timeout, init_multigpu_helper, MultiProcessTestCase, requires_multicast_support, requires_nccl, - requires_nccl_shrink, requires_nccl_version, - requires_world_size, skip_if_lt_x_gpu, skip_if_rocm_multiprocess, sm_is_or_higher_than, @@ -92,17 +87,6 @@ BFLOAT16_AVAILABLE = torch.cuda.is_available() and ( torch.version.cuda is not None or torch.version.hip is not None ) -from logging_utils import ( - configure as _log_configure, - log_test_info, - log_test_success, - log_test_validation, - log_test_warning, -) - - -_log_configure(level=logging.INFO, force=True) - class RendezvousEnvTest(TestCase): @retry_on_connect_failures @@ -333,7 +317,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase): @property def world_size(self): - return get_required_world_size(self, 2) + return 2 @property def rank_to_GPU(self): @@ -1271,628 +1255,6 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase): pg_2 = c10d.new_group([0, 1]) self.assertEqual(pg_2.group_desc, "undefined") - @requires_nccl_shrink() - @requires_world_size(2) - def test_shrink_group_basic(self): - """Test basic shrink_group functionality.""" - self._perform_shrink_test([1], "Basic shrink test") - - @requires_nccl_shrink() - @requires_world_size(2) - def test_shrink_group_validation(self): - """Test input validation in shrink_group.""" - device, pg = self._setup_shrink_test("validation") - - def _test_invalid_input(ranks, 
description, expected_exception): - """Helper to test invalid inputs.""" - try: - c10d.shrink_group(ranks) - self.fail(f"Expected {expected_exception.__name__} for {description}") - except expected_exception: - log_test_validation(self.rank, f"✓ {description}") - except Exception: - if expected_exception == Exception: # Accept any exception - log_test_validation(self.rank, f"✓ {description}") - else: - raise - - # Test cases - _test_invalid_input([], "Empty exclusion list", ValueError) - if self.world_size > 1: - _test_invalid_input([0, 0, 1], "Duplicate ranks", Exception) - _test_invalid_input([self.world_size + 1], "Out of bounds rank", Exception) - - log_test_success(self.rank, "All validation tests passed") - dist.destroy_process_group() - - @requires_nccl_shrink() - @requires_world_size(2) - def test_shrink_group_backend_properties(self): - """Test that backend properties are preserved after shrinking.""" - - test_name = "Backend Properties Test" - ranks_to_exclude = [0] - - # Reuse _setup_shrink_test for complete setup (device, environment, and process group) - device, pg = self._setup_shrink_test("backend_properties") - - # Follow _perform_shrink_test pattern from here - log_test_info(self.rank, f"{test_name} (world_size={self.world_size})") - - is_excluded = self.rank in ranks_to_exclude - log_test_info( - self.rank, - f"Excluding ranks: {ranks_to_exclude}, am_excluded: {is_excluded}", - ) - - # Store original backend property values (not references) before shrinking - original_timeout = None - original_high_priority = None - if not is_excluded: - original_backend = pg._get_backend(device) - original_timeout = original_backend.options._timeout - original_high_priority = original_backend.options.is_high_priority_stream - log_test_info( - self.rank, - f"Storing original backend properties: timeout={original_timeout}, high_priority={original_high_priority}", - ) - - if is_excluded: - log_test_info( - self.rank, - f"Excluded rank {self.rank} - setup complete, skipping shrink operation", - ) - dist.destroy_process_group() # hang without it - return - - # Only non-excluded ranks proceed with shrink (same as _perform_shrink_test) - log_test_info(self.rank, "Non-excluded rank calling shrink_group") - shrunk_pg = c10d.shrink_group(ranks_to_exclude) - - # Reuse _validate_shrunk_group helper (same as _perform_shrink_test) - expected_size = self.world_size - len(ranks_to_exclude) - _ = self._validate_shrunk_group(shrunk_pg, expected_size, test_name) - - # Add custom backend properties validation - new_backend = shrunk_pg._get_backend(device) - log_test_info(self.rank, "Validating backend properties are preserved") - - new_timeout = new_backend.options._timeout - new_high_priority = new_backend.options.is_high_priority_stream - - log_test_info( - self.rank, - f"Timeout comparison - original: {original_timeout}, new: {new_timeout}", - ) - self.assertEqual( - original_timeout, new_timeout, f"{test_name}: timeout not preserved" - ) - - log_test_info( - self.rank, - f"High priority stream comparison - original: {original_high_priority}, new: {new_high_priority}", - ) - self.assertEqual( - original_high_priority, - new_high_priority, - f"{test_name}: high_priority_stream not preserved", - ) - - log_test_validation( - self.rank, f"{test_name}: Backend properties preserved successfully" - ) - log_test_success( - self.rank, f"{test_name} successful (shrink + backend validation)" - ) - - # Cleanup (same as _perform_shrink_test) - dist.destroy_process_group() - - @requires_nccl_shrink() - 
@requires_world_size(2) - def test_shrink_group_multiple_comms(self): - """Test shrink_group with multiple communicators and subgroup invalidation.""" - - device, pg = self._setup_shrink_test("multiple_comms") - - # Create subgroup [0, 1] and test shrinking it - subgroup = c10d.new_group([0, 1]) - if self.rank <= 1: - # Shrink subgroup: exclude rank 1 - if self.rank == 0: # Only rank 0 remains - shrunk_subgroup = c10d.shrink_group([1], group=subgroup) - self.assertEqual(shrunk_subgroup.size(), 1) - # Test communication on shrunk subgroup - tensor = torch.full((1,), self.rank).cuda(device) - c10d.all_reduce(tensor, group=shrunk_subgroup) - self.assertEqual(tensor.item(), 0) # Only rank 0 - log_test_success(self.rank, "Subgroup shrinking successful") - - dist.barrier() # Sync before default group test - - # Shrink default group: exclude last rank - ranks_to_exclude = [self.world_size - 1] - if self.rank not in ranks_to_exclude: - shrunk_default = c10d.shrink_group(ranks_to_exclude) - expected_size = self.world_size - 1 - self.assertEqual(shrunk_default.size(), expected_size) - - # Test collective on shrunk default group - tensor = torch.full((1,), self.rank).cuda(device) - c10d.all_reduce(tensor, group=shrunk_default) - expected_sum = sum( - range(self.world_size - 1) - ) # 0 + 1 + ... + (world_size-2) - self.assertEqual(tensor.item(), expected_sum) - log_test_success(self.rank, "Default group shrinking successful") - - # Note: After shrinking default group, the old subgroup is invalid - # due to global rank reassignment - - dist.destroy_process_group() - - def _test_shrink_group_with_flag(self, shrink_flag, flag_name, rank_to_exclude): - """Helper method to test shrink_group with a specific flag.""" - if self.world_size < 2: - log_test_info(self.rank, f"Skipping (needs ≥2 GPUs, got {self.world_size})") - return - ranks_to_exclude = [rank_to_exclude] - log_test_info(self.rank, f"Using {flag_name} flag (value: {shrink_flag})") - if flag_name == "NCCL_SHRINK_ABORT": - log_test_info( - self.rank, - "ABORT flag will terminate ongoing operations before shrinking", - ) - - self._perform_shrink_test( - ranks_to_exclude, f"{flag_name} flag test", shrink_flags=shrink_flag - ) - - @requires_nccl_shrink() - @requires_world_size(2) - def test_shrink_group_flags(self): - """Test shrink_group with different shrink flags.""" - # Test ABORT flags - log_test_info(self.rank, "Testing NCCL_SHRINK_ABORT flag") - self._test_shrink_group_with_flag(NCCL_SHRINK_ABORT, "NCCL_SHRINK_ABORT", 1) - - @requires_nccl_shrink() - @requires_world_size(2) - def test_shrink_group_nccl_config(self): - """Verify that passing NCCL config via pg_options influences the shrunk group's backend options.""" - device, pg = self._setup_shrink_test("config") - if self.rank == self.world_size - 1: - # excluded rank should not call shrink_group - dist.destroy_process_group() - return - - # Prepare pg_options with NCCL config overrides - # Capture parent's current backend options to ensure we can prove override vs inherit - parent_backend = pg._get_backend(torch.device("cuda")) - parent_hp = parent_backend.options.is_high_priority_stream - parent_blocking = parent_backend.options.config.blocking - - # Choose overrides that differ from the parent (flip where possible) - override_hp = not parent_hp - if parent_blocking in (0, 1): - override_blocking = 1 - parent_blocking - else: - # If undefined or unexpected, set to 1 which is a concrete value - override_blocking = 1 - - opts = c10d.ProcessGroupNCCL.Options() - opts.is_high_priority_stream = 
override_hp - opts.config.blocking = override_blocking - - shrunk_pg = c10d.shrink_group([self.world_size - 1], pg_options=opts) - - # Validate backend options propagated - backend = shrunk_pg._get_backend(torch.device("cuda")) - # is_high_priority_stream should exactly match our override and differ from parent - self.assertEqual(backend.options.is_high_priority_stream, override_hp) - self.assertNotEqual(backend.options.is_high_priority_stream, parent_hp) - # config is a struct; check representative field and difference from parent when meaningful - self.assertEqual(backend.options.config.blocking, override_blocking) - if parent_blocking in (0, 1): - self.assertNotEqual(backend.options.config.blocking, parent_blocking) - - dist.destroy_process_group() - - @requires_nccl_shrink() - @requires_world_size(2) - def test_shrink_group_performance(self): - """Test shrink_group performance and regression detection.""" - import time - - ranks_to_exclude = self._get_default_ranks_to_exclude() - is_excluded = self.rank in ranks_to_exclude - - if not ranks_to_exclude: - log_test_info(self.rank, "Skipping performance test (world_size=1)") - return - - log_test_info(self.rank, f"Performance test with {self.world_size} processes") - device, pg = self._setup_shrink_test("performance") - - if not is_excluded: - log_test_info(self.rank, "Measuring shrink_group performance") - start_time = time.time() - shrunk_pg = c10d.shrink_group(ranks_to_exclude) - end_time = time.time() - - elapsed_time = end_time - start_time - log_test_info(self.rank, f"shrink_group: {elapsed_time:.3f}s") - - # Regression check: should complete within reasonable time - self.assertLess( - elapsed_time, - 30.0, - f"shrink_group took {elapsed_time:.3f}s, possible regression", - ) - - # Test collective performance - expected_size = self.world_size - len(ranks_to_exclude) - self._validate_shrunk_group(shrunk_pg, expected_size, "performance") - - collective_start = time.time() - _ = self._test_collective_on_shrunk_group( - shrunk_pg, device, ranks_to_exclude, "performance" - ) - collective_time = time.time() - collective_start - - log_test_info(self.rank, f"all_reduce: {collective_time:.3f}s") - log_test_success(self.rank, "Performance test passed") - else: - log_test_info(self.rank, "Excluded rank - waiting") - - dist.destroy_process_group() - - @requires_nccl_shrink() - @requires_world_size(4) - def test_shrink_group_multiple_exclusions(self): - """Test shrink_group with multiple ranks excluded at once.""" - # Scale exclusions with world size - ranks_to_exclude = list(range(2, self.world_size, 2)) # Every other rank from 2 - - self._perform_shrink_test(ranks_to_exclude, "Multiple exclusions test") - - @requires_nccl_shrink() - @requires_world_size(3) - def test_shrink_group_multiple_iterations(self): - """Test multiple shrink operations in sequence.""" - log_test_info( - self.rank, - f"Starting test_shrink_group_multiple_iterations with world_size={self.world_size}", - ) - - store = c10d.FileStore(self.file_name, self.world_size) - device = torch.device(f"cuda:{self.rank}") - _ = self._create_process_group_nccl(store, self.opts(), device_id=device) - - # Track current effective world size throughout shrinking operations - current_world_size = self.world_size - log_test_info(self.rank, f"Initial world_size: {current_world_size}") - - # First shrinking: exclude the last rank(s) - first_exclusion = [self.world_size - 1] - if self.world_size >= 6: - first_exclusion.append( - self.world_size - 2 - ) # Exclude last two ranks for larger sizes - - 
log_test_info(self.rank, f"First shrinking: excluding ranks {first_exclusion}") - - if self.rank not in first_exclusion: - # Only non-excluded ranks should call shrink_group - first_pg = c10d.shrink_group(first_exclusion) - self.assertIsNotNone(first_pg) - # IMPORTANT: Update world size after first shrinking - current_world_size = first_pg.size() - expected_first_size = self.world_size - len(first_exclusion) - log_test_info( - self.rank, - f"After first shrinking: world_size {self.world_size} -> {current_world_size}", - ) - self.assertEqual(first_pg.size(), expected_first_size) - - # Second shrinking: exclude another rank from the remaining group - # Choose a rank that's in the middle range - if current_world_size >= 3: - second_exclusion = [ - current_world_size - 1 - ] # Exclude the new "last" rank - log_test_info( - self.rank, - f"Second shrinking from group of size {current_world_size}: excluding ranks {second_exclusion}", - ) - - if self.rank not in second_exclusion: - # Only non-excluded ranks should call shrink_group for second iteration - second_pg = c10d.shrink_group(second_exclusion, group=first_pg) - self.assertIsNotNone(second_pg) - # IMPORTANT: Update world size after second shrinking - final_world_size = second_pg.size() - expected_final_size = current_world_size - len(second_exclusion) - log_test_info( - self.rank, - f"After second shrinking: world_size {current_world_size} -> {final_world_size}", - ) - self.assertEqual(second_pg.size(), expected_final_size) - - # Test collective on final group - tensor = torch.full((1,), self.rank).cuda(device) - log_test_info( - self.rank, - f"Performing all_reduce on final group (size {final_world_size}) with tensor: {tensor.item()}", - ) - c10d.all_reduce(tensor, group=second_pg) - log_test_info( - self.rank, - f"Final all_reduce completed, result: {tensor.item()}", - ) - - # Calculate expected sum of remaining ranks - all_excluded = set(first_exclusion + second_exclusion) - remaining_ranks = [ - r for r in range(self.world_size) if r not in all_excluded - ] - expected_sum = sum(remaining_ranks) - log_test_info( - self.rank, - f"Remaining ranks: {remaining_ranks}, expected sum: {expected_sum}, actual: {tensor.item()}", - ) - self.assertEqual(tensor.item(), expected_sum) - log_test_info(self.rank, "Final verification passed") - else: - log_test_info( - self.rank, - "This rank excluded in second shrinking, not calling shrink_group", - ) - else: - log_test_info( - self.rank, "Skipping second shrinking (remaining group too small)" - ) - else: - log_test_info( - self.rank, - "This rank excluded in first shrinking, not calling shrink_group", - ) - - log_test_info(self.rank, "Destroying process group") - dist.destroy_process_group() - log_test_info(self.rank, "test_shrink_group_multiple_iterations completed") - - # Helper methods for optimized shrink group tests - def _setup_shrink_test(self, test_suffix, world_size=None, warmup=True): - """Common setup for shrink group tests.""" - os.environ["TORCH_NCCL_USE_COMM_NONBLOCKING"] = "1" - world_size = world_size or self.world_size - store = c10d.FileStore(self.file_name + f"_{test_suffix}", world_size) - device = torch.device(f"cuda:{self.rank}") - c10d.init_process_group( - "nccl", - world_size=world_size, - rank=self.rank, - store=store, - pg_options=self.opts(), - device_id=device, - ) - pg = c10d.distributed_c10d._get_default_group() - - if warmup: - c10d.all_reduce(torch.ones(1).cuda(device), group=pg) - - return device, pg - - def _validate_shrunk_group(self, shrunk_pg, expected_size, 
test_name=""): - """Validate properties of a shrunk process group.""" - self.assertIsNotNone(shrunk_pg, f"{test_name}: shrunk_pg should not be None") - actual_size = shrunk_pg.size() - self.assertEqual( - actual_size, expected_size, f"{test_name}: group size mismatch" - ) - - new_rank = shrunk_pg.rank() - self.assertTrue( - 0 <= new_rank < expected_size, f"{test_name}: invalid new rank {new_rank}" - ) - - log_test_info( - self.rank, - f"{test_name}: world_size {self.world_size} -> {actual_size}, rank {self.rank} -> {new_rank}", - ) - return new_rank - - def _test_collective_on_shrunk_group( - self, shrunk_pg, device, ranks_to_exclude, test_name="" - ): - """Test collective communication on shrunk group and verify correctness.""" - test_tensor = torch.full((1,), self.rank, device=device, dtype=torch.float32) - c10d.all_reduce(test_tensor, group=shrunk_pg) - - result = test_tensor.item() - expected_sum = sum( - r for r in range(self.world_size) if r not in ranks_to_exclude - ) - - self.assertEqual( - result, expected_sum, f"{test_name}: collective result mismatch" - ) - log_test_info( - self.rank, f"{test_name}: collective passed ({result} == {expected_sum})" - ) - return result - - def _perform_shrink_test( - self, ranks_to_exclude, test_name, shrink_flags=0, with_collective=True - ): - """Complete shrink test flow: setup, shrink, validate, test collective, cleanup. - - Consistent API: All ranks perform setup to initialize distributed environment. - ONLY non-excluded ranks call shrink_group() for both default and non-default groups. - Excluded ranks perform setup, then exit without calling shrink_group() or waiting. - """ - log_test_info(self.rank, f"{test_name} (world_size={self.world_size})") - - is_excluded = self.rank in ranks_to_exclude - log_test_info( - self.rank, - f"Excluding ranks: {ranks_to_exclude}, am_excluded: {is_excluded}", - ) - - # All ranks (including excluded ones) perform setup to initialize distributed environment - device, pg = self._setup_shrink_test(test_name.lower().replace(" ", "_")) - is_default_group = pg == c10d.distributed_c10d._get_default_group() - - if is_excluded: - log_test_info( - self.rank, - f"Excluded rank {self.rank} - setup complete, skipping shrink operation", - ) - if shrink_flags & NCCL_SHRINK_ABORT: - log_test_info(self.rank, f"Using abort for excluded rank {self.rank}") - pg._get_backend(torch.device(device)).abort() - log_test_info( - self.rank, f"cleanup resources for excluded rank {self.rank}" - ) - dist.destroy_process_group() - log_test_info(self.rank, f"Excluded rank {self.rank} - exit") - else: - log_test_info( - self.rank, f"Using regular destroy for excluded rank {self.rank}" - ) - dist.destroy_process_group() - return None - - # Only non-excluded ranks proceed with shrink - log_test_info( - self.rank, - f"Non-excluded rank calling shrink_group (default_group={is_default_group})", - ) - shrunk_pg = c10d.shrink_group(ranks_to_exclude, shrink_flags=shrink_flags) - log_test_info( - self.rank, - f"Non-excluded rank calling shrink_group (default_group={is_default_group}) done", - ) - - # Non-excluded ranks: validate and test the new group - expected_size = self.world_size - len(ranks_to_exclude) - _ = self._validate_shrunk_group(shrunk_pg, expected_size, test_name) - - if with_collective: - _ = self._test_collective_on_shrunk_group( - shrunk_pg, device, ranks_to_exclude, test_name - ) - log_test_success(self.rank, f"{test_name} successful (shrink + collective)") - else: - log_test_success(self.rank, f"{test_name} successful (shrink only)") 
- - dist.destroy_process_group() - return shrunk_pg - - def _get_default_ranks_to_exclude(self): - """Get default ranks to exclude based on world size.""" - if self.world_size <= 1: - return [] - return [self.world_size - 1] # Exclude last rank by default - - @requires_nccl_shrink() - @requires_world_size(3) - def test_shrink_group_vs_abort_reinit_performance(self): - """Compare performance of shrink_group vs traditional abort+reinit (simplified for reliability).""" - log_test_info(self.rank, "=== TEST 1: abort+reinit ===") - - device, pg1 = self._setup_shrink_test("_perf_reinit") - torch.cuda.synchronize(device) - - # Test 1: Traditional abort + reinit - start_time = time.perf_counter() - dist.destroy_process_group() - - device, new_pg = self._setup_shrink_test("perf_shrink_test1") - reinit_time = time.perf_counter() - start_time - - # Test collective with original rank values for fair comparison (non-blocking mode) - test_tensor = torch.full((1,), self.rank, device=device, dtype=torch.float32) - work = c10d.all_reduce(test_tensor, group=new_pg, async_op=True) - work.wait() - - torch.cuda.synchronize(device) - - # Verify correctness - expected_sum = sum(r for r in range(self.world_size)) - self.assertEqual(test_tensor.item(), expected_sum, "Reinit collective failed") - - log_test_info(self.rank, f"abort+reinit: {reinit_time:.4f}s") - dist.destroy_process_group(new_pg) - - # Test 2: shrink_group with NCCL_SHRINK_ABORT - log_test_info(self.rank, "=== TEST 2: shrink_group ===") - - ranks_to_exclude = [self.world_size - 1] - is_excluded = self.rank in ranks_to_exclude - log_test_info( - self.rank, - f"Excluding ranks: {ranks_to_exclude}, am_excluded: {is_excluded}", - ) - - device, pg1 = self._setup_shrink_test("perf_shrink_test2") # Unique suffix - - shrink_time = 0 - if not is_excluded: - torch.cuda.synchronize(device) # Ensure accurate timing - start_time = time.perf_counter() - shrunk_pg = c10d.shrink_group( - ranks_to_exclude, shrink_flags=NCCL_SHRINK_ABORT - ) - c10d.all_reduce(torch.ones(1).cuda(device), group=shrunk_pg) - shrink_time = time.perf_counter() - start_time - - # Test collective communication on shrunk group (non-blocking mode) - test_tensor = torch.full( - (1,), self.rank, device=device, dtype=torch.float32 - ) - work = c10d.all_reduce(test_tensor, group=shrunk_pg, async_op=True) - work.wait() - - # Verify correctness - expected_sum = sum( - r for r in range(self.world_size) if r not in ranks_to_exclude - ) - self.assertEqual( - test_tensor.item(), - expected_sum, - "shrink_test: collective result mismatch", - ) - - torch.cuda.synchronize(device) # Ensure operations complete - log_test_info(self.rank, f"shrink_group: {shrink_time:.4f}s") - dist.destroy_process_group() - else: - log_test_info(self.rank, "Excluded from shrink test - exiting immediately") - dist.destroy_process_group() - return - - # Performance analysis (only for participating ranks) - if shrink_time > 0 and reinit_time > 0: - speedup = reinit_time / shrink_time - time_saved = reinit_time - shrink_time - - log_test_info(self.rank, "=== PERFORMANCE RESULTS ===") - log_test_info(self.rank, f"shrink_group: {shrink_time:.4f}s") - log_test_info(self.rank, f"abort+reinit: {reinit_time:.4f}s") - log_test_info(self.rank, f"time_saved: {time_saved:+.4f}s") - log_test_info(self.rank, f"speedup: {speedup:.2f}x") - - if speedup > 1.1: - log_test_success(self.rank, "shrink_group significantly faster") - elif speedup > 0.9: - log_test_info(self.rank, "≈ comparable performance") - else: - log_test_warning(self.rank, 
"abort+reinit faster") - - log_test_info(self.rank, "Performance test completed") - @requires_nccl() @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") def test_deterministic_mode_no_break(self): diff --git a/torch/csrc/distributed/c10d/Backend.hpp b/torch/csrc/distributed/c10d/Backend.hpp index 1ebf9394e064..655e0a5578c2 100644 --- a/torch/csrc/distributed/c10d/Backend.hpp +++ b/torch/csrc/distributed/c10d/Backend.hpp @@ -79,23 +79,6 @@ class TORCH_API Backend : public torch::CustomClassHolder { return false; } - virtual bool supportsShrinking() const { - return false; - } - - // Shrink the backend by excluding specified ranks. Backends that support - // communicator shrinking should override this and return a new backend - // instance representing the shrunken group. Backends may use opts_override - // to supply backend-specific options for the new group. - virtual c10::intrusive_ptr shrink( - const std::vector& /*ranks_to_exclude*/, - int /*shrink_flags*/ = 0, - const c10::intrusive_ptr& /*opts_override*/ = nullptr) { - TORCH_CHECK( - false, - c10::str("Backend ", getBackendName(), " does not support shrink")); - } - virtual void setTimeout(std::chrono::milliseconds timeout) { TORCH_CHECK( false, diff --git a/torch/csrc/distributed/c10d/NCCLUtils.cpp b/torch/csrc/distributed/c10d/NCCLUtils.cpp index a41f654b9ae2..8074cc98a04f 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.cpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.cpp @@ -259,65 +259,6 @@ std::shared_ptr NCCLComm::split( } #endif -#ifdef NCCL_HAS_COMM_SHRINK -std::shared_ptr NCCLComm::shrink( - NCCLComm* source, - std::vector& ranks_to_exclude, - ncclConfig_t* config, - int shrinkFlags) { - // Preconditions are validated in ProcessGroupNCCL::shrink - - LOG(INFO) << "Rank " << source->rank_ << ": shrinking comm " << source->repr() - << " excluding " << ranks_to_exclude.size() << " ranks"; - - at::cuda::OptionalCUDAGuard gpuGuard(source->deviceIndex_); - auto comm = std::make_shared(); - - // This call will block until the source communicator is initialized - auto sourceComm = source->getNcclComm(); - - C10D_NCCL_CHECK_NONBLOCKING( - ncclCommShrink( - sourceComm, - ranks_to_exclude.data(), - ranks_to_exclude.size(), - reinterpret_cast(&(comm->ncclComm_)), - config, - shrinkFlags), - source->getNcclCommFailureReason()); - - // Wait for the child communicator to be ready - source->waitReady(true); - comm->initialized_ = true; - - // NCCL automatically assigns rank during shrink - query it efficiently - int assigned_rank; - try { - C10D_NCCL_CHECK( - ncclCommUserRank(comm->ncclComm_, &assigned_rank), std::nullopt); - comm->rank_ = assigned_rank; - } catch (const std::exception& e) { - // Fallback: if ncclCommUserRank fails, we can't determine the rank - LOG(ERROR) << "Failed to query NCCL-assigned rank: " << e.what(); - throw; - } - - // Child comm should be on the same device as parent comm - comm->deviceIndex_ = source->deviceIndex_; - if (config != nullptr) { - comm->nonBlocking_ = config->blocking == 0; - } else { - // Inherit parent behavior if no config provided - comm->nonBlocking_ = source->nonBlocking_; - } - - LOG(INFO) << "Rank " << source->rank_ << ": created shrunken comm " - << comm->repr() << " with NCCL-assigned rank " << assigned_rank; - - return comm; -} -#endif - void NCCLComm::finalize() { LockType lock(mutex_); if (aborted_) { diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp index 142633b82374..fdd50f69ef3d 100644 --- 
a/torch/csrc/distributed/c10d/NCCLUtils.hpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp @@ -90,10 +90,6 @@ static_assert( #define NCCL_HAS_NVLS_CTAS #endif -#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 27, 0) -#define NCCL_HAS_COMM_SHRINK -#endif - // Macro to throw on a non-successful NCCL return value. #define C10D_NCCL_CHECK(cmd, failureReason) \ do { \ @@ -298,14 +294,6 @@ class NCCLComm { ncclConfig_t& config); #endif // NCCL_HAS_COMM_SPLIT -#ifdef NCCL_HAS_COMM_SHRINK - static std::shared_ptr shrink( - NCCLComm* source, - std::vector& ranks_to_exclude, - ncclConfig_t* config, - int shrinkFlags = 0); -#endif // NCCL_HAS_COMM_SHRINK - #if (defined(IS_NCCLX) || defined(USE_ROCM)) && defined(NCCL_COMM_DUMP) std::unordered_map ncclCommDump(); #endif diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index 1a63128f8ddf..9b615b9f16b0 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -165,7 +165,7 @@ ncclRedOpRAII getNcclReduceOp( } // Get a key string from device -inline std::string getKeyFromDevice(const at::Device& device) { +inline std::string getKeyFromDevice(at::Device& device) { return std::to_string(device.index()); } @@ -5838,139 +5838,6 @@ at::Tensor ProcessGroupNCCL::allocateTensor( return tensor; } -#ifdef NCCL_HAS_COMM_SHRINK -c10::intrusive_ptr ProcessGroupNCCL::shrink( - const std::vector& ranks_to_exclude, - int shrink_flags, - const c10::intrusive_ptr& opts_override) { - // Runtime version check with better error message - auto runtime_version = torch::cuda::nccl::version(); - TORCH_CHECK( - runtime_version >= NCCL_VERSION(2, 27, 0), - "ProcessGroupNCCL::shrink requires NCCL version 2.27.0 or later. " - "Found version: ", - runtime_version); - - // Early validation with detailed error messages - TORCH_CHECK_VALUE( - !ranks_to_exclude.empty(), "ranks_to_exclude cannot be empty"); - TORCH_CHECK_VALUE( - static_cast(ranks_to_exclude.size()) < size_, - "Cannot exclude all ranks (", - ranks_to_exclude.size(), - " >= ", - size_, - ")"); - - // Validate ranks and convert to int efficiently - std::vector int_ranks_to_exclude; - int_ranks_to_exclude.reserve(ranks_to_exclude.size()); - for (int64_t rank : ranks_to_exclude) { - TORCH_CHECK_VALUE( - rank >= 0 && rank < size_, - "Invalid rank ", - rank, - " for group size ", - size_); - int_ranks_to_exclude.push_back(static_cast(rank)); - } - - // Get primary communicator with better error context - auto primary_device_index = guessDeviceId(); - auto primary_device = at::Device(at::kCUDA, primary_device_index); - const auto primary_key = getKeyFromDevice(primary_device); - - std::shared_ptr primary_comm = getNCCLComm(primary_key); - TORCH_CHECK( - primary_comm, - "Primary NCCL communicator for device ", - primary_device, - " (key: ", - primary_key, - ") is not initialized"); - - // Cache device index before shrink operation - at::DeviceIndex parent_device_index = primary_comm->getDeviceIndex(); - - ncclConfig_t* config = nullptr; - // Default to inheriting from parent options - bool high_priority_stream = options_->is_high_priority_stream; - if (opts_override) { - auto nccl_opts = - c10::static_intrusive_pointer_cast( - opts_override); - config = &nccl_opts->config; - // If user provided override options, honor is_high_priority_stream as well - high_priority_stream = nccl_opts->is_high_priority_stream; - } - - std::shared_ptr shrunk_comm = NCCLComm::shrink( - primary_comm.get(), - int_ranks_to_exclude, - 
(config != nullptr ? config : &options_->config), - shrink_flags); - - // Calculate new size and get NCCL-assigned rank - int new_size = size_ - static_cast(ranks_to_exclude.size()); - int new_rank = shrunk_comm->rank_; - - // Create new ProcessGroupNCCL with optimized options cloning - auto new_store = store_->clone(); - auto new_opts = ProcessGroupNCCL::Options::create(high_priority_stream); - new_opts->timeout = options_->timeout; - if (config != nullptr) { - new_opts->config = *config; - } else { - new_opts->config = options_->config; - } - - auto new_pg = c10::make_intrusive( - new_store, new_rank, new_size, new_opts); - - // Set up the new process group with optimized device setup - new_pg->initializeDeviceStateForComm( - at::Device(at::kCUDA, parent_device_index), shrunk_comm); - - return c10::static_intrusive_pointer_cast(new_pg); -} - -#else // !NCCL_HAS_COMM_SHRINK -// Backend interface override: raise consistent error when shrink is -// unsupported. -c10::intrusive_ptr ProcessGroupNCCL::shrink( - const std::vector& /*ranks_to_exclude*/, - int /*shrink_flags*/, - const c10::intrusive_ptr& /*opts_override*/) { - TORCH_CHECK( - false, - "ProcessGroupNCCL::shrink requires NCCL version 2.27.0 or later, " - "but PyTorch was built with an older version or without NCCL shrink support."); -} - -#endif // NCCL_HAS_COMM_SHRINK - -void ProcessGroupNCCL::initializeDeviceStateForComm( - const at::Device& device, - std::shared_ptr comm) { - const auto key = getKeyFromDevice(device); - std::unique_lock lock(mutex_); - at::cuda::OptionalCUDAGuard gpuGuard(device); - - bool force_high = getCvarBool(TORCH_NCCL_HIGH_PRIORITY, false); - auto stream = at::cuda::getStreamFromPool( - options_->is_high_priority_stream || force_high); - - devNCCLCommMap_[key] = comm; - ncclStreams_.emplace(key, stream); - ncclEvents_.emplace(key, at::cuda::CUDAEvent(cudaEventDisableTiming)); - usedDeviceIdxs_.insert(device.index()); - - if (shouldAllCommunicatorsRegisterAllTensors()) { - std::lock_guard map_lock(ncclCommMemPoolMapMutex); - ncclCommMemPoolMap.emplace(std::move(comm), MemPoolSet{}); - } -} - } // namespace c10d #endif // USE_C10D_NCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index 2ead1a107394..286eab14d1a8 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -997,21 +997,6 @@ class TORCH_API ProcessGroupNCCL : public Backend { ErrorType getError() override; - bool supportsShrinking() const override { -#ifdef NCCL_HAS_COMM_SHRINK - return true; -#else - return false; -#endif - } - - // Backend-style shrink override that returns a Backend instance. - c10::intrusive_ptr shrink( - const std::vector& ranks_to_exclude, - int shrink_flags = 0, - const c10::intrusive_ptr& opts_override = - nullptr) override; - std::shared_ptr getMemAllocator() override; // Allocate tensor from communication-optimized memory pool @@ -1080,12 +1065,6 @@ class TORCH_API ProcessGroupNCCL : public Backend { int p2pRank = 0, bool isSendRecvSelf = false); - // Initialize device-specific state (comm, stream, event, bookkeeping) for a - // given communicator on this process group instance. - void initializeDeviceStateForComm( - const at::Device& device, - std::shared_ptr comm); - // Wrapper method which can be overridden for tests. 
virtual std::exception_ptr checkForNCCLErrors( std::shared_ptr& ncclComm); diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index f7d60e0cb62d..bdf2576efbe7 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -2730,23 +2730,12 @@ Arguments: "supports_time_estimate", &::c10d::Backend::supportsTimeEstimation, "(test whether the backend supports collective time estimation)") - .def_property_readonly( - "supports_shrinking", - &::c10d::Backend::supportsShrinking, - "(test whether the backend supports communicator shrinking)") .def( "set_timeout", &::c10d::Backend::setTimeout, py::arg("timeout"), py::call_guard(), R"(Sets the default timeout for all future operations.)") - .def( - "shrink", - &::c10d::Backend::shrink, - py::arg("ranks_to_exclude"), - py::arg("shrink_flags") = 0, - py::arg("opts_override") = nullptr, - py::call_guard()) .def( "broadcast", &::c10d::Backend::broadcast, diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 0652024365de..ea194a6ebe9a 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -130,7 +130,6 @@ __all__ = [ "reduce_scatter_tensor", "get_node_local_rank", "split_group", - "shrink_group", ] _MPI_AVAILABLE = True @@ -5697,517 +5696,3 @@ def _get_process_group_name(pg: ProcessGroup) -> str: def _get_process_group_store(pg: ProcessGroup) -> Store: return _world.pg_map[pg][1] - - -# Shrink flags for process group backends -SHRINK_DEFAULT = 0x00 -SHRINK_ABORT = 0x01 - - -@_time_logger -def shrink_group( - ranks_to_exclude: list[int], - group: Optional[ProcessGroup] = None, - shrink_flags: int = SHRINK_DEFAULT, - pg_options: Optional[Any] = None, -) -> ProcessGroup: - """ - Shrinks a process group by excluding specified ranks. - - Creates and returns a new, smaller process group comprising only the ranks - from the original group that were not in the ``ranks_to_exclude`` list. - - Args: - ranks_to_exclude (List[int]): A list of ranks from the original - ``group`` to exclude from the new group. - group (ProcessGroup, optional): The process group to shrink. If ``None``, - the default process group is used. Defaults to ``None``. - shrink_flags (int, optional): Flags to control the shrinking behavior. - Can be ``SHRINK_DEFAULT`` (default) or ``SHRINK_ABORT``. - ``SHRINK_ABORT`` will attempt to terminate ongoing operations - in the parent communicator before shrinking. - Defaults to ``SHRINK_DEFAULT``. - pg_options (ProcessGroupOptions, optional): Backend-specific options to apply - to the shrunken process group. If provided, the backend will use - these options when creating the new group. If omitted, the new group - inherits defaults from the parent. - - Returns: - ProcessGroup: a new group comprised of the remaining ranks. If the - default group was shrunk, the returned group becomes the new default group. - - Raises: - TypeError: if the group’s backend does not support shrinking. - ValueError: if ``ranks_to_exclude`` is invalid (empty, out of bounds, - duplicates, or excludes all ranks). - RuntimeError: if an excluded rank calls this function or the backend - fails the operation. - - Notes: - - Only non-excluded ranks should call this function; excluded ranks - must not participate in the shrink operation. - - Shrinking the default group destroys all other process groups since - rank reassignment makes them inconsistent. 
- """ - # Step 1: Validate input parameters with comprehensive error checking - _validate_shrink_inputs(ranks_to_exclude, shrink_flags) - - # Step 2: Get target group and essential properties - target_group_info = _prepare_shrink_target_group(group) - - # Step 3: Validate backend requirements and availability - backend_impl = _validate_shrink_backend_requirements(target_group_info) - - # Step 4: Validate ranks against group and check for duplicates - excluded_ranks_set = _validate_and_process_excluded_ranks( - ranks_to_exclude, target_group_info - ) - - # Step 5: Execute the actual shrink operation (backend-specific) - new_backend = backend_impl.shrink( - sorted(excluded_ranks_set), - shrink_flags, - pg_options if pg_options is not None else None, - ) - - # Step 6: Handle cleanup and creation of new process group - target_group_info["pg_options_override"] = pg_options - return _finalize_shrunk_group(target_group_info, excluded_ranks_set, new_backend) - - -def _validate_shrink_inputs(ranks_to_exclude: list[int], shrink_flags: int) -> None: - """Validate input parameters for shrink_group.""" - if not isinstance(ranks_to_exclude, list): - raise TypeError( - f"ranks_to_exclude must be a list, but got {type(ranks_to_exclude).__name__}. " - f"Example: [1, 3, 5] to exclude ranks 1, 3, and 5." - ) - - if not ranks_to_exclude: - raise ValueError( - "ranks_to_exclude cannot be empty. To shrink a group, you must specify at least " - "one rank to exclude. Example: [failed_rank_id]" - ) - - # Validate shrink_flags with clear explanation of valid values - valid_flags = [SHRINK_DEFAULT, SHRINK_ABORT] - if not isinstance(shrink_flags, int) or shrink_flags not in valid_flags: - raise ValueError( - f"Invalid shrink_flags value: {shrink_flags}. Must be one of: " - f"SHRINK_DEFAULT ({SHRINK_DEFAULT}) or SHRINK_ABORT ({SHRINK_ABORT}). " - f"Use SHRINK_ABORT to abort ongoing operations before shrinking." - ) - - -def _prepare_shrink_target_group(group: Optional[ProcessGroup]) -> dict: - """Prepare and validate the target group for shrinking.""" - target_pg = group if group is not None else _get_default_group() - - # Cache frequently accessed properties to avoid repeated calls - group_size = int(target_pg.size()) - group_info = { - "process_group": target_pg, - "is_default_group": (target_pg == _get_default_group()), - "group_size": group_size, - "current_rank": target_pg.rank(), - "group_name": _get_process_group_name(target_pg), - } - - # Validate that we have a valid process group - if group_size <= 1: - raise ValueError( - f"Cannot shrink a process group with size {group_size}. " - f"Group must have at least 2 ranks to support shrinking." - ) - - return group_info - - -def _validate_shrink_backend_requirements(group_info: dict) -> Any: - """Return the backend implementation for the target group or raise if unsupported.""" - target_pg = group_info["process_group"] - group_name = group_info["group_name"] - - # Get the group's backend directly via ProcessGroup API. Prefer a bound device if present, - # otherwise try CUDA then fall back to CPU. 
- try: - preferred_device = getattr(target_pg, "bound_device_id", None) - if preferred_device is not None: - backend_impl = target_pg._get_backend(preferred_device) - else: - # Try CUDA first if available, else CPU - try: - backend_impl = target_pg._get_backend(torch.device("cuda")) - except Exception: - backend_impl = target_pg._get_backend(torch.device("cpu")) - except RuntimeError as e: - raise RuntimeError( - f"Cannot access device backend for process group '{group_name}'. " - f"Ensure the process group was initialized with a compatible device backend and devices are available." - ) from e - - try: - supports = bool(backend_impl.supports_shrinking) - except Exception: - supports = False - if not supports: - raise TypeError( - f"Process group backend for '{group_name}' does not support shrinking operations." - ) - - return backend_impl - - -def _validate_and_process_excluded_ranks( - ranks_to_exclude: list[int], group_info: dict -) -> set: - """Validate excluded ranks and convert to set for efficient operations.""" - group_size = group_info["group_size"] - current_rank = group_info["current_rank"] - - # Use set for O(1) duplicate detection and membership testing - excluded_ranks_set = set() - - # Validate each rank with detailed error messages - for i, rank in enumerate(ranks_to_exclude): - if not isinstance(rank, int): - raise TypeError( - f"All elements in ranks_to_exclude must be integers. " - f"Element at index {i} is {type(rank).__name__}: {rank}" - ) - - if not (0 <= rank < group_size): - raise ValueError( - f"Rank {rank} at index {i} is out of bounds for group size {group_size}. " - f"Valid ranks are in range [0, {group_size - 1}]." - ) - - if rank in excluded_ranks_set: - raise ValueError( - f"Duplicate rank {rank} found in ranks_to_exclude at index {i}. " - f"Each rank can only be excluded once." - ) - - excluded_ranks_set.add(rank) - - # Ensure we don't exclude all ranks - if len(excluded_ranks_set) >= group_size: - raise ValueError( - f"Cannot exclude all {group_size} ranks from process group. " - f"At least one rank must remain. Excluding {len(excluded_ranks_set)} ranks." - ) - - # Critical check: current rank should not be in excluded list - if current_rank in excluded_ranks_set: - raise RuntimeError( - f"Current rank {current_rank} is in the exclusion list and should not call shrink_group(). " - f"Only non-excluded ranks should participate in the shrinking operation. " - f"Excluded ranks should terminate their processes instead." 
- ) - - return excluded_ranks_set - - -def _finalize_shrunk_group( - group_info: dict, excluded_ranks_set: set, new_backend -) -> ProcessGroup: - """Clean up old group and create new shrunk process group.""" - target_pg = group_info["process_group"] - is_default_group = group_info["is_default_group"] - - # Handle default group dependencies - destroy other groups first - if is_default_group: - _destroy_all_other_groups(exclude_group=target_pg) - - # Gather original group metadata before cleanup - original_group_metadata = _extract_group_metadata(target_pg) - - # Calculate remaining ranks efficiently - original_ranks = get_process_group_ranks(target_pg) - remaining_ranks = [ - rank for rank in original_ranks if rank not in excluded_ranks_set - ] - - # Clean up the original group - _cleanup_original_group(target_pg, is_default_group) - - # Create and configure the new process group - new_pg = _create_shrunk_process_group( - new_backend, remaining_ranks, original_group_metadata, is_default_group - ) - - # Register the new group in global state - if is_default_group: - _update_default_pg(new_pg) - - # Update global state with new group information - rank_mapping = { - global_rank: group_rank - for group_rank, global_rank in enumerate(remaining_ranks) - } - _update_process_group_global_state( - pg=new_pg, - backend_name=original_group_metadata["backend_name"], - store=original_group_metadata["store"], - group_name=original_group_metadata["new_group_name"], - backend_config=original_group_metadata["backend_config"], - rank_mapping=rank_mapping, - ) - - return new_pg - - -def _extract_group_metadata(target_pg: ProcessGroup) -> dict: - """Extract metadata from the original group before cleanup.""" - original_backend_name, original_store = _world.pg_map[target_pg] - original_backend_config = _world.pg_backend_config.get(target_pg, "") - original_group_name = _get_process_group_name(target_pg) - - # Extract device binding information before cleanup to avoid accessing destroyed group - bound_device_id = None - if hasattr(target_pg, "bound_device_id"): - bound_device_id = target_pg.bound_device_id - - # Generate new group name for the shrunk group; hash for uniqueness across backends - remaining_ranks = list(get_process_group_ranks(target_pg)) - new_group_name = _process_group_name(remaining_ranks, use_hashed_name=True) - - return { - "backend_name": original_backend_name, - "store": original_store, - "backend_config": original_backend_config, - "original_group_name": original_group_name, - "new_group_name": new_group_name, - "bound_device_id": bound_device_id, # Safe to access after cleanup - } - - -def _cleanup_original_group(target_pg: ProcessGroup, is_default_group: bool) -> None: - """Clean up the original process group safely.""" - try: - destroy_process_group(target_pg) - except Exception as e: - group_type = "default" if is_default_group else "non-default" - logger.warning("Failed to destroy %s group during shrinking: %s", group_type, e) - - # Ensure global state cleanup even if destroy_process_group fails - _cleanup_process_group_global_state(target_pg) - - -def _create_shrunk_process_group( - new_backend, remaining_ranks: list[int], metadata: dict, is_default_group: bool -) -> ProcessGroup: - """Create and configure the new shrunk process group.""" - # Create new group properties - new_group_rank = new_backend.rank() - new_group_size = new_backend.size() - group_name = metadata["new_group_name"] - - # Generate descriptive group description - if is_default_group: - group_desc = 
"default:shrunken" - else: - group_desc = f"{metadata['original_group_name']}:shrunk" - - # Create process group with new communicator (clone the parent store like split does) - prefix_store = PrefixStore(f"{group_name}/", metadata["store"].clone()) - new_pg = ProcessGroup(prefix_store, new_group_rank, new_group_size) - - # Configure backend using the device type of the new backend's bound device if available, - # otherwise derive from the original group's bound device or fall back to CPU. - backend_device = metadata.get("bound_device_id") - if backend_device is None: - # Default to CPU if no bound device is present - backend_device = torch.device("cpu") - - # Choose backend enum based on device type - if backend_device.type == "cuda": - backend_type = ProcessGroup.BackendType.NCCL - else: - backend_type = ProcessGroup.BackendType.GLOO - - new_pg._register_backend(backend_device, backend_type, new_backend) - new_pg._set_default_backend(backend_type) - - # Inherit device binding from original group if it was bound - bound_device_id = metadata.get("bound_device_id") - if bound_device_id is not None: - new_pg.bound_device_id = bound_device_id - - # Set group metadata - new_pg._set_group_name(group_name) - new_pg._set_group_desc(group_desc) - - # Persist backend configuration overrides (if provided via shrink_group) - backend_config_override = metadata.get("backend_config") - if backend_config_override is not None: - # Store for introspection/debugging and potential backend hooks - _world.pg_backend_config[new_pg] = backend_config_override - - return new_pg - - -def _destroy_all_other_groups(exclude_group: Optional[ProcessGroup] = None) -> None: - """ - Destroy all process groups except the excluded group and clean up all global state. - - This is necessary when shrinking the default group because global ranks - are reassigned by NCCL, making all existing process groups inconsistent. - - Note: Uses abort for non-collective cleanup since excluded ranks may not - participate in collective operations. Backend cleanup is handled independently per group. - - Args: - exclude_group (ProcessGroup, optional): Process group to exclude from destruction. - If None, destroys all process groups. - """ - # Get list of groups to destroy (avoid modifying dict while iterating) - groups_to_destroy = [] - for pg in list(_world.pg_group_ranks.keys()): - if exclude_group is not None and pg == exclude_group: - continue - groups_to_destroy.append(pg) - - # Warn user about automatic destruction - if groups_to_destroy: - group_names = [_get_process_group_name(pg) for pg in groups_to_destroy] - logger.warning( - "Shrinking default group will destroy %d other process groups: %s. 
" - "This is necessary because shrinking the default group reassigns global ranks, " - "making existing groups inconsistent.", - len(groups_to_destroy), - ", ".join(group_names), - ) - - # Destroy each group and clean up global state - for pg in groups_to_destroy: - try: - # First call abort_process_group which handles the C++ cleanup non-collectively - _abort_process_group(pg) - except Exception as e: - # Log but don't fail - some groups might already be destroyed - logger.warning( - "Failed to abort process group %s: %s", - _get_process_group_name(pg), - e, - ) - - # Ensure all global state is cleaned up even if _abort_process_group fails - # or doesn't clean up everything - _cleanup_process_group_global_state(pg) - - -def _cleanup_process_group_global_state(pg: ProcessGroup) -> None: - """ - Clean up all global state associated with a process group. - - This function ensures complete cleanup of process group state from all - global dictionaries and registries, even if destroy_process_group fails - or doesn't clean up everything. This is critical when destroying multiple - groups to prevent inconsistent state. - - The cleanup removes the process group from: - - _world.pg_map (backend and store mapping) - - _world.pg_names (group name mapping) - - _world.pg_group_ranks (rank mappings) - - _world.pg_backend_config (backend configuration) - - _world.tags_to_pg and _world.pg_to_tag (tag mappings) - - _world.pg_coalesce_state (coalescing state) - - C++ internal registries via _unregister_process_group - - Args: - pg (ProcessGroup): The process group to clean up. - """ - try: - # Clean up main process group mappings - _world.pg_map.pop(pg, None) - _world.pg_group_ranks.pop(pg, None) - _world.pg_backend_config.pop(pg, None) - - # Clean up process group name mapping - group_name = _world.pg_names.pop(pg, None) - - # Clean up tag mappings - pg_tag = _world.pg_to_tag.pop(pg, None) - if pg_tag is not None and pg_tag in _world.tags_to_pg: - try: - _world.tags_to_pg[pg_tag].remove(pg) - # Remove the tag entry if list is empty - if not _world.tags_to_pg[pg_tag]: - _world.tags_to_pg.pop(pg_tag, None) - except (ValueError, KeyError): - # Process group was already removed from the list - pass - - # Clean up any registered process group names using C++ unregister function - if group_name is not None: - try: - _unregister_process_group(group_name) - except Exception: - # Process group name might not be registered or already unregistered - pass - - # Clean up coalesce state if present - _world.pg_coalesce_state.pop(pg, None) - - except Exception as e: - # Log cleanup failures but don't propagate - we want to continue with other cleanups - logger.warning("Failed to fully clean up global state for process group: %s", e) - - -def _update_process_group_global_state( - pg: ProcessGroup, - backend_name: str, - store: Store, - group_name: str, - backend_config: str, - rank_mapping: Optional[dict[int, int]] = None, - pg_tag: Optional[str] = None, - user_tag: Optional[str] = None, -) -> None: - """ - Update all global state dictionaries for a process group. - - This helper function consolidates the common pattern of updating multiple - global state dictionaries when creating or modifying process groups. - - Args: - pg (ProcessGroup): The process group to update state for. - backend_name (str): Backend name for pg_map. - store (Store): Store instance for pg_map. - group_name (str): Group name for pg_names and registration. - backend_config (str): Backend configuration string. 
- rank_mapping (Dict[int, int], optional): Global rank to group rank mapping. - If None, skips updating pg_group_ranks. - pg_tag (str, optional): Process group tag. If None, defaults to f"ptd:{group_name}". - user_tag (str, optional): User-provided tag for special tag handling. - If provided, creates "user:{user_tag}" tag and also adds to default "". - """ - # Update main process group mappings - _world.pg_map[pg] = (backend_name, store) - _world.pg_names[pg] = group_name - _world.pg_backend_config[pg] = backend_config - - # Register the process group name - _register_process_group(group_name, pg) - - # Update rank mapping if provided - if rank_mapping is not None: - _world.pg_group_ranks[pg] = rank_mapping - - # Handle tag management - if pg_tag is None: - pg_tag = f"ptd:{group_name}" - - if user_tag is not None: - # Special handling for user-provided tags - # Add to default "" tag first - _world.tags_to_pg.setdefault("", []).append(pg) - # Then create user-specific tag - user_pg_tag = f"user:{user_tag}" - _world.tags_to_pg.setdefault(user_pg_tag, []).append(pg) - _world.pg_to_tag[pg] = user_pg_tag - else: - # Standard process group tag - _world.tags_to_pg.setdefault(pg_tag, []).append(pg) - _world.pg_to_tag[pg] = pg_tag diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index 8ce17367b86b..17a317463cb5 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -228,47 +228,6 @@ def skip_if_lt_x_gpu(x): return decorator -def requires_world_size(n: int): - """ - Decorator to request a specific world size for a test. The test harness can - read this attribute to set the number of ranks to spawn. If there are fewer - than `n` CUDA devices available, the test should be skipped by the harness. - - Usage: - @require_world_size(3) - def test_something(self): - ... - """ - - def decorator(func): - func._required_world_size = n - available = torch.cuda.device_count() - return unittest.skipUnless( - available >= n, f"requires {n} GPUs, found {available}" - )(func) - - return decorator - - -def get_required_world_size(obj: Any, default: int) -> int: - """ - Returns the requested world size for the currently running unittest method on `obj` - if annotated via `@require_world_size(n)`, else returns `default`. - """ - try: - # Try MultiProcessTestCase helper first, then unittest fallback - test_name = ( - obj._current_test_name() # type: ignore[attr-defined] - if hasattr(obj, "_current_test_name") and callable(obj._current_test_name) - else obj._testMethodName - ) - fn = getattr(obj, test_name) - value = fn._required_world_size - return int(value) - except Exception: - return default - - # This decorator helps avoiding initializing cuda while testing other backends def nccl_skip_if_lt_x_gpu(backend, x): def decorator(func): @@ -396,13 +355,6 @@ def requires_nccl_version(version, msg): ) -def requires_nccl_shrink(): - """ - Require NCCL shrink support (NCCL available and version >= 2.27). 
- """ - return requires_nccl_version((2, 27), "Need NCCL 2.27+ for shrink_group") - - def requires_nccl(): return skip_but_pass_in_sandcastle_if( not c10d.is_nccl_available(), From 08c97b4a1f22cbd652c35c08b0896c930e9fa2f3 Mon Sep 17 00:00:00 2001 From: Tugsbayasgalan Manlaibaatar Date: Thu, 16 Oct 2025 22:36:18 -0700 Subject: [PATCH 066/123] Don't run compile inside kernel invocation (#165687) When we call torch.compile during fake tensor prop, we shouldn't actually compile because we can't guarantee that the compiled artifact can be fake tensor prop-d. (for example, inductor backend). Instead we should just skip compiling. However, the inner compile will be triggered when being executed in runtime. Fixes: https://github.com/pytorch/pytorch/issues/151328 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165687 Approved by: https://github.com/zou3519 --- test/dynamo/test_misc.py | 51 +++++++++++++++++++++++++++++++++++++ torch/_dynamo/eval_frame.py | 8 ++++++ 2 files changed, 59 insertions(+) diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py index 9e728cd80962..60883b69a4d5 100644 --- a/test/dynamo/test_misc.py +++ b/test/dynamo/test_misc.py @@ -242,6 +242,57 @@ class MiscTests(torch._inductor.test_case.TestCase): self.assertTrue(same(val4, correct1)) self.assertEqual(counter.frame_count, 3) + def test_dynamo_inside_custom_op(self): + cnt = torch._dynamo.testing.InductorAndRecordGraphs() + cnt1 = torch._dynamo.testing.InductorAndRecordGraphs() + + with torch.library._scoped_library("mylib", "FRAGMENT") as m: + m.define("foo(Tensor x) -> Tensor") + + def inner(x): + return x.sin().cos() + + def foo_impl(x): + return torch.compile(inner, fullgraph=True, dynamic=True, backend=cnt)( + x + ) + + m.impl("foo", foo_impl, "CompositeExplicitAutograd") + + @torch.compile(fullgraph=True, dynamic=True, backend=cnt1) + def f(x): + return torch.ops.mylib.foo.default(x) + + x = torch.randn(3) + res = f(x) + res1 = f(x) + res2 = f(x) + expected = x.sin().cos() + self.assertEqual(res, expected) + self.assertEqual(res1, expected) + self.assertEqual(res2, expected) + self.assertTrue(len(cnt.inductor_graphs), 1) + self.assertTrue(len(cnt1.inductor_graphs), 1) + self.assertExpectedInline( + str(cnt.inductor_graphs[0].graph).strip(), + """\ +graph(): + %arg0_1 : [num_users=0] = placeholder[target=arg0_1] + %arg1_1 : [num_users=1] = placeholder[target=arg1_1] + %sin : [num_users=1] = call_function[target=torch.ops.aten.sin.default](args = (%arg1_1,), kwargs = {}) + %cos : [num_users=1] = call_function[target=torch.ops.aten.cos.default](args = (%sin,), kwargs = {}) + return (cos,)""", + ) + self.assertExpectedInline( + str(cnt1.inductor_graphs[0].graph).strip(), + """\ +graph(): + %arg0_1 : [num_users=0] = placeholder[target=arg0_1] + %arg1_1 : [num_users=1] = placeholder[target=arg1_1] + %foo : [num_users=1] = call_function[target=torch.ops.mylib.foo.default](args = (%arg1_1,), kwargs = {}) + return (foo,)""", + ) + @torch._dynamo.config.patch(accumulated_recompile_limit=1) def test_dynamo_disabled_in_custom_op_kernels(self): counters.clear() diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py index 472905eca6c1..036f1ba7d01a 100644 --- a/torch/_dynamo/eval_frame.py +++ b/torch/_dynamo/eval_frame.py @@ -847,6 +847,14 @@ class _TorchDynamoContext: def compile_wrapper(*args: Any, **kwargs: Any) -> Any: prior = set_eval_frame(None) try: + # We shouldn't compile inside kernel invocation. 
+ if tracing_context := torch._guards.TracingContext.try_get(): + if ( + tracing_context.fake_mode is not None + and tracing_context.fake_mode.in_kernel_invocation + ): + return fn(*args, **kwargs) + # Skip nested compile - just inline the function if is_fx_symbolic_tracing(): if config.error_on_nested_fx_trace: raise RuntimeError( From 9c12651417bd8a10870702fb368b4d92d70ca667 Mon Sep 17 00:00:00 2001 From: vishalgoyal316 Date: Fri, 17 Oct 2025 19:06:00 +0000 Subject: [PATCH 067/123] Improve error message for non-positive groups in convolution (#165669) Prevents from segmentation fault for invalid groups value in convolution. Fixes #142835 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165669 Approved by: https://github.com/mikaylagawarecki --- aten/src/ATen/native/Convolution.cpp | 1 + test/nn/test_convolution.py | 49 ++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 78a0af03e198..1158359be239 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -658,6 +658,7 @@ static void check_shape_forward(const at::Tensor& input, TORCH_CHECK(!params.is_output_padding_neg(), "negative output_padding is not supported"); TORCH_CHECK(!params.is_stride_nonpos(), "non-positive stride is not supported"); TORCH_CHECK(!params.is_dilation_neg(), "dilation should be greater than zero"); + TORCH_CHECK(groups > 0, "expected groups to be greater than 0, but got groups=", groups); TORCH_CHECK(weight_dim == k, "Expected ", weight_dim, "-dimensional input for ", weight_dim, diff --git a/test/nn/test_convolution.py b/test/nn/test_convolution.py index fe93775f0830..4cdcac707644 100644 --- a/test/nn/test_convolution.py +++ b/test/nn/test_convolution.py @@ -230,6 +230,55 @@ class TestConvolutionNN(NNTestCase): with self.assertRaisesRegex(ValueError, "groups must be a positive integer"): torch.nn.Conv3d(1, 1, kernel_size=3, dilation=2, stride=2, groups=-2) + def test_conv_aten_invalid_groups(self): + # test low-level aten ops with invalid groups parameter + grad_output = torch.randn(2, 4, 8, dtype=torch.double) + input = torch.randn(2, 5, 8, dtype=torch.double) + weight = torch.randn(5, 4, 3, dtype=torch.double) + bias_sizes = [4] + stride = [1] + padding = [1] + dilation = [1] + transposed = True + output_padding = [0] + output_mask = [True, True, True] + + # test groups=0 + with self.assertRaisesRegex( + RuntimeError, "expected groups to be greater than 0, but got groups=0" + ): + torch.ops.aten.convolution_backward( + grad_output, + input, + weight, + bias_sizes, + stride, + padding, + dilation, + transposed, + output_padding, + 0, + output_mask, + ) + + # test groups=-1 + with self.assertRaisesRegex( + RuntimeError, "expected groups to be greater than 0, but got groups=-1" + ): + torch.ops.aten.convolution_backward( + grad_output, + input, + weight, + bias_sizes, + stride, + padding, + dilation, + transposed, + output_padding, + -1, + output_mask, + ) + def test_conv3d_overflow_values(self): input = torch.full( ( From a664b299ac2840b3399835097813e0d3986bb984 Mon Sep 17 00:00:00 2001 From: Kasparas Karlauskas <121799419+kasparas-k@users.noreply.github.com> Date: Fri, 17 Oct 2025 19:06:29 +0000 Subject: [PATCH 068/123] Update docs for torch.mode (#165614) Currently the docs for `torch.mode` include a note: `This function is not defined for torch.cuda.Tensor yet.` However with `torch==2.7.1+cu126` when I try to get the mode of a Tensor that is in cuda 
memory, I do not face any issues: ``` >>> a = torch.tensor([0, 2, 1, 1, 1, 3, 3]) >>> a.mode() torch.return_types.mode( values=tensor(1), indices=tensor(4)) >>> a.cuda().mode() torch.return_types.mode( values=tensor(1, device='cuda:0'), indices=tensor(4, device='cuda:0')) ``` Am I misunderstanding the note? If not, I suggest removing it. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165614 Approved by: https://github.com/mikaylagawarecki --- torch/_torch_docs.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 9a0e4ff30721..681025f5d283 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -7673,8 +7673,6 @@ If :attr:`keepdim` is ``True``, the output tensors are of the same size as Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the output tensors having 1 fewer dimension than :attr:`input`. -.. note:: This function is not defined for ``torch.cuda.Tensor`` yet. - Args: {input} {opt_dim} From 382b0150de1247bf392b424edea71b541cae7d52 Mon Sep 17 00:00:00 2001 From: vishalgoyal316 Date: Fri, 17 Oct 2025 19:11:52 +0000 Subject: [PATCH 069/123] [docs] Add usage examples to ConvTranspose1d docstring (#165618) Fixes #165615 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165618 Approved by: https://github.com/mikaylagawarecki --- torch/nn/modules/conv.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index 1fc2d63eb4f3..35ae57bcbcd2 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -901,6 +901,23 @@ class ConvTranspose1d(_ConvTransposeNd): sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where :math:`k = \frac{groups}{C_\text{out} * \text{kernel\_size}}` + Examples:: + + >>> # With square kernels and equal stride + >>> m = nn.ConvTranspose1d(16, 33, 3, stride=2) + >>> input = torch.randn(20, 16, 50) + >>> output = m(input) + >>> # exact output size can be also specified as an argument + >>> input = torch.randn(1, 16, 12) + >>> downsample = nn.Conv1d(16, 16, 3, stride=2, padding=1) + >>> upsample = nn.ConvTranspose1d(16, 16, 3, stride=2, padding=1) + >>> h = downsample(input) + >>> h.size() + torch.Size([1, 16, 6]) + >>> output = upsample(h, output_size=input.size()) + >>> output.size() + torch.Size([1, 16, 12]) + .. _`here`: https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md From a16fd6b4885206fc2a29ac94124107f05e23a9c6 Mon Sep 17 00:00:00 2001 From: Eddie Yan Date: Fri, 17 Oct 2025 19:33:26 +0000 Subject: [PATCH 070/123] [NVSHMEM][Triton] Fix NVSHMEM triton test for wacky world sizes (#165704) Currently assumes divisible by 4? 
world size Not as slick as the old setup code but more general Pull Request resolved: https://github.com/pytorch/pytorch/pull/165704 Approved by: https://github.com/Skylion007, https://github.com/kwen2501 --- test/distributed/test_nvshmem_triton.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/distributed/test_nvshmem_triton.py b/test/distributed/test_nvshmem_triton.py index ddbaa089d1b9..3fec9a01f049 100644 --- a/test/distributed/test_nvshmem_triton.py +++ b/test/distributed/test_nvshmem_triton.py @@ -1141,9 +1141,8 @@ class NVSHMEMTritonTest(MultiProcContinuousTest): vals[0, ::2] = 1 vals[0, 1::2] = 2 vals[1] = 1 - vals2 = vals[2].view(-1, 2, 2) - vals2[:, 0] = 1 - vals2[:, 1] = 2 + for rank in range(world_size): + vals[2, rank] = 1 if (rank // 2) % 2 == 0 else 2 expected = vals.prod(-1).tolist() # Synchronize before reduction From 75e2a9fae37f9d07229a6d4e8e4b2e1d910e3dad Mon Sep 17 00:00:00 2001 From: Shangdi Yu Date: Fri, 17 Oct 2025 20:10:49 +0000 Subject: [PATCH 071/123] [annotate] add annotate_fn function decorator (#165703) Example usage: ``` @fx_traceback.annotate_fn({"pp_stage": 1}) def example_function(x): return x * x class SimpleLinear(nn.Module): def __init__(self): super().__init__() self.linear = nn.Linear(3, 2) def forward(self, x): with fx_traceback.annotate({"pp_stage": 0}): y = self.linear(x) y = example_function(y) return y - 1 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165703 Approved by: https://github.com/SherlockNoMad --- .../test_aot_joint_with_descriptors.py | 40 +++++++++++++++++ torch/fx/traceback.py | 43 ++++++++++++++++++- 2 files changed, 81 insertions(+), 2 deletions(-) diff --git a/test/functorch/test_aot_joint_with_descriptors.py b/test/functorch/test_aot_joint_with_descriptors.py index 167215bb8be1..d797b36748d0 100644 --- a/test/functorch/test_aot_joint_with_descriptors.py +++ b/test/functorch/test_aot_joint_with_descriptors.py @@ -922,6 +922,46 @@ class inner_f(torch.nn.Module): in custom_metadata ) + def test_preserve_annotate_function(self): + """Test basic annotate_fn usage""" + + @fx_traceback.annotate_fn({"pp_stage": 1}) + def example_function(x): + return x * x + + class SimpleLinear(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(3, 2) + + def forward(self, x): + with fx_traceback.annotate({"pp_stage": 0}): + y = self.linear(x) + y = example_function(y) + return y - 1 + + inputs = (torch.randn(4, 3),) + model = SimpleLinear() + + for with_export in [True, False]: + graph_module = graph_capture(model, inputs, with_export) + custom_metadata = fx_traceback._get_custom_metadata(graph_module) + self.assertExpectedInline( + str(custom_metadata), + """\ +('call_function', 't', {'pp_stage': 0}) +('call_function', 'addmm', {'pp_stage': 0}) +('call_function', 'mul', {'pp_stage': 1}) +('call_function', 'mul_1', {'pp_stage': 1}) +('call_function', 'mul_2', {'pp_stage': 1}) +('call_function', 't_1', {'pp_stage': 0}) +('call_function', 'mm', {'pp_stage': 0}) +('call_function', 't_2', {'pp_stage': 0}) +('call_function', 'sum_1', {'pp_stage': 0}) +('call_function', 'view', {'pp_stage': 0}) +('call_function', 't_3', {'pp_stage': 0})""", + ) + if __name__ == "__main__": run_tests() diff --git a/torch/fx/traceback.py b/torch/fx/traceback.py index 3d1e3b7c5d53..2774c76850aa 100644 --- a/torch/fx/traceback.py +++ b/torch/fx/traceback.py @@ -18,6 +18,7 @@ log = logging.getLogger(__name__) __all__ = [ "annotate", + "annotate_fn", "preserve_node_meta", "has_preserved_node_meta", 
"set_stack_trace", @@ -266,9 +267,10 @@ def annotate(annotation_dict: dict): into the FX trace metadata. Example: + After exiting the context, custom annotations are removed. + >>> with annotate({"source": "custom_pass", "tag": 42}): - ... # compute here - # After exiting the context, custom annotations are removed. + ... pass # Your computation here """ global current_meta @@ -291,6 +293,43 @@ def annotate(annotation_dict: dict): del current_meta["custom"] +@compatibility(is_backward_compatible=False) +def annotate_fn(annotation_dict: dict): + """ + A decorator that wraps a function with the annotate context manager. + Use this when you want to annotate an entire function instead of a specific code block. + + Note: + This API is **not backward compatible** and may evolve in future releases. + + Note: + This API is not compatible with fx.symbolic_trace or jit.trace. It's intended + to be used with PT2 family of tracers, e.g. torch.export and dynamo. + + Args: + annotation_dict (dict): A dictionary of custom key-value pairs to inject + into the FX trace metadata for all operations in the function. + + Example: + All operations in my_function will have {"pp_stage": 1} in their metadata. + + >>> @annotate_fn({"pp_stage": 1}) + ... def my_function(x): + ... return x + 1 + """ + from functools import wraps + + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + with annotate(annotation_dict): + return func(*args, **kwargs) + + return wrapper + + return decorator + + @compatibility(is_backward_compatible=False) def set_grad_fn_seq_nr(seq_nr): global current_meta From 2bcd892c86349ad6e91d66760fb3d2257526625d Mon Sep 17 00:00:00 2001 From: Rohit Singh Rathaur Date: Fri, 17 Oct 2025 20:14:32 +0000 Subject: [PATCH 072/123] [distributed] Replace assert statements in distributed checkpoint with explicit checks (#165256) Fixes partially #164878 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165256 Approved by: https://github.com/albanD --- .../checkpoint/_async_process_executor.py | 21 +++++++-- torch/distributed/checkpoint/_checkpointer.py | 3 +- .../checkpoint/_dedup_save_plans.py | 3 +- .../checkpoint/_experimental/barriers.py | 5 +- .../_experimental/checkpoint_process.py | 16 +++---- .../_experimental/checkpoint_reader.py | 11 +++-- .../checkpoint/_experimental/staging.py | 21 +++++---- .../checkpoint/_fsspec_filesystem.py | 3 +- torch/distributed/checkpoint/_pg_transport.py | 13 +++--- .../checkpoint/_state_dict_stager.py | 13 +++--- .../distributed/checkpoint/default_planner.py | 20 +++++--- .../examples/async_checkpointing_example.py | 6 ++- torch/distributed/checkpoint/filesystem.py | 30 ++++++++---- torch/distributed/checkpoint/format_utils.py | 23 +++++++--- torch/distributed/checkpoint/hf_storage.py | 14 +++--- torch/distributed/checkpoint/optimizer.py | 16 +++---- .../checkpoint/quantized_hf_storage.py | 7 +-- torch/distributed/checkpoint/staging.py | 19 +++++--- torch/distributed/checkpoint/state_dict.py | 46 ++++++++++++++----- .../checkpoint/state_dict_loader.py | 15 ++++-- .../checkpoint/state_dict_saver.py | 27 ++++++----- torch/distributed/checkpoint/utils.py | 15 ++++-- 22 files changed, 218 insertions(+), 129 deletions(-) diff --git a/torch/distributed/checkpoint/_async_process_executor.py b/torch/distributed/checkpoint/_async_process_executor.py index 03d368506828..7c8aa6b63984 100644 --- a/torch/distributed/checkpoint/_async_process_executor.py +++ b/torch/distributed/checkpoint/_async_process_executor.py @@ -109,7 +109,8 @@ class 
_AsyncCheckpointProcess: # Wait for the checkpoint background process to initialize. # Using default GLOO init timeout. response = self._wait_for_response(timeout=1800) - assert response == _CheckpointSaveProcessControlOpts.INIT_COMPLETE + if not response == _CheckpointSaveProcessControlOpts.INIT_COMPLETE: + raise AssertionError(f"Expected INIT_COMPLETE response, got {response}") def __del__(self) -> None: if self._save_process.is_alive(): @@ -175,7 +176,8 @@ class _AsyncCheckpointProcess: ) self._send(async_cp_request) result = self._wait_for_response() - assert isinstance(result, Metadata) + if not isinstance(result, Metadata): + raise AssertionError(f"Expected Metadata response, got {type(result)}") return result @staticmethod @@ -245,7 +247,10 @@ class _AsyncCheckpointProcess: ): logger.info("Terminating the checkpoint background process.") return - assert isinstance(obj, _AsyncCheckpointRequest) + if not isinstance(obj, _AsyncCheckpointRequest): + raise AssertionError( + f"Expected _AsyncCheckpointRequest, got {type(obj)}" + ) logger.info( f"Received async checkpoint request with id={obj.checkpoint_request_id.checkpoint_id}" # noqa: G004 ) @@ -296,7 +301,10 @@ class _ProcessBasedAsyncCheckpointExecutor(_AsyncCheckpointExecutor): ) -> Metadata: global _CHECKPOINT_PROCESS if _CHECKPOINT_PROCESS is None: - assert pg_init_info is not None + if pg_init_info is None: + raise AssertionError( + "pg_init_info must not be None when _CHECKPOINT_PROCESS is None" + ) ckpt_kwargs = {} if (ckpt_id := getattr(storage_writer, "checkpoint_id", None)) is not None: ckpt_kwargs["checkpoint_id"] = ckpt_id @@ -310,7 +318,10 @@ class _ProcessBasedAsyncCheckpointExecutor(_AsyncCheckpointExecutor): create_checkpoint_daemon_process() - assert _CHECKPOINT_PROCESS is not None + if _CHECKPOINT_PROCESS is None: + raise AssertionError( + "_CHECKPOINT_PROCESS must not be None after initialization" + ) staged_state_dict = ( staging_future_or_state_dict.result() if isinstance(staging_future_or_state_dict, Future) diff --git a/torch/distributed/checkpoint/_checkpointer.py b/torch/distributed/checkpoint/_checkpointer.py index d21d8248d204..d54de9092a93 100644 --- a/torch/distributed/checkpoint/_checkpointer.py +++ b/torch/distributed/checkpoint/_checkpointer.py @@ -89,7 +89,8 @@ class _Checkpointer: process_group=self.process_group, planner=self.save_planner, ) - assert isinstance(response, Future) + if not isinstance(response, Future): + raise AssertionError("response should be a Future instance") return response def load(self, state_dict: dict[str, Any]) -> None: diff --git a/torch/distributed/checkpoint/_dedup_save_plans.py b/torch/distributed/checkpoint/_dedup_save_plans.py index 3e2cf954c409..acb81c418628 100644 --- a/torch/distributed/checkpoint/_dedup_save_plans.py +++ b/torch/distributed/checkpoint/_dedup_save_plans.py @@ -54,7 +54,8 @@ def dedup_save_plans( for plan_idx in plan_indices - {select_plan_idx}: plan_to_item_indices[plan_idx].discard(write_item_idx) # Sanity check - assert len(all_plans) == len(plan_to_item_indices) + if len(all_plans) != len(plan_to_item_indices): + raise AssertionError("len(all_plans) != len(plan_to_item_indices)") # Create new plans with the updated write items post deduplication return [ dataclasses.replace( diff --git a/torch/distributed/checkpoint/_experimental/barriers.py b/torch/distributed/checkpoint/_experimental/barriers.py index 18de93c81d13..bcea8ad91401 100644 --- a/torch/distributed/checkpoint/_experimental/barriers.py +++ 
b/torch/distributed/checkpoint/_experimental/barriers.py @@ -150,9 +150,8 @@ class DistBarrier(Barrier): Raises: AssertionError: If the distributed process group is not initialized. """ - assert dist.is_initialized(), ( - "DistBarrier requires an initialized process group." - ) + if not dist.is_initialized(): + raise AssertionError("DistBarrier requires an initialized process group.") def execute_barrier(self) -> None: """ diff --git a/torch/distributed/checkpoint/_experimental/checkpoint_process.py b/torch/distributed/checkpoint/_experimental/checkpoint_process.py index 96a62caa379f..4e1c8e7f8253 100644 --- a/torch/distributed/checkpoint/_experimental/checkpoint_process.py +++ b/torch/distributed/checkpoint/_experimental/checkpoint_process.py @@ -135,7 +135,8 @@ class CheckpointProcess: ) # wait for the timeout or a response from subprocess - assert self._parent_end is not None, "Parent end of pipe should be initialized" + if self._parent_end is None: + raise AssertionError("Parent end of pipe should be initialized") if not self._parent_end.poll(timeout=config.subprocess_init_timeout_secs): msg = f"Timed out after {config.subprocess_init_timeout_secs}s waiting for checkpoint subprocess to initialize" logger.error(msg) @@ -161,7 +162,8 @@ class CheckpointProcess: os.getpid(), ) - assert sub_rank == 0, "We need only one checkpointer per parent training" + if sub_rank != 0: + raise AssertionError("We need only one checkpointer per parent training") request = WorkerRequest(request_type=RequestType.PING, payload={}) try: @@ -226,9 +228,8 @@ class CheckpointProcess: def _send(self, request_type: RequestType, payload: dict[str, Any]) -> None: try: - assert self._parent_end is not None, ( - "Parent end of pipe should be initialized" - ) + if self._parent_end is None: + raise AssertionError("Parent end of pipe should be initialized") self._parent_end.send( WorkerRequest( request_type=request_type, @@ -244,9 +245,8 @@ class CheckpointProcess: def _recv(self) -> Optional[dict[str, Any]]: try: - assert self._parent_end is not None, ( - "Parent end of pipe should be initialized" - ) + if self._parent_end is None: + raise AssertionError("Parent end of pipe should be initialized") response = self._parent_end.recv() if response.success is False: error_msg = ( diff --git a/torch/distributed/checkpoint/_experimental/checkpoint_reader.py b/torch/distributed/checkpoint/_experimental/checkpoint_reader.py index 5f0abc4a40ed..fb1bcf46198b 100644 --- a/torch/distributed/checkpoint/_experimental/checkpoint_reader.py +++ b/torch/distributed/checkpoint/_experimental/checkpoint_reader.py @@ -134,11 +134,12 @@ class CheckpointReader: tensor_offset = source.untyped_storage()._checkpoint_offset - assert tensor_offset is not None, ( - "checkpoint_offset for tensor in torch serialized file is not set. This could" - "happen if the checkpoint was saved with a older version of Pytorch." - "Please make sure that the checkpoint was saved with Pytorch 2.7 or later." - ) + if tensor_offset is None: + raise AssertionError( + "checkpoint_offset for tensor in torch serialized file is not set. This could " + "happen if the checkpoint was saved with a older version of Pytorch. " + "Please make sure that the checkpoint was saved with Pytorch 2.7 or later." 
+ ) tensor_len = source.nelement() * source.element_size() file.seek( diff --git a/torch/distributed/checkpoint/_experimental/staging.py b/torch/distributed/checkpoint/_experimental/staging.py index b9de0696243f..199532e2d116 100644 --- a/torch/distributed/checkpoint/_experimental/staging.py +++ b/torch/distributed/checkpoint/_experimental/staging.py @@ -158,9 +158,10 @@ class DefaultStager(CheckpointStager): self._staging_stream = torch.Stream() if self._config.use_non_blocking_copy: - assert torch.accelerator.is_available(), ( - "Non-blocking copy requires that the current accelerator is available." - ) + if not torch.accelerator.is_available(): + raise AssertionError( + "Non-blocking copy requires that the current accelerator is available." + ) def stage( self, @@ -168,9 +169,10 @@ class DefaultStager(CheckpointStager): **kwargs: Any, ) -> Union[STATE_DICT, Future[STATE_DICT]]: if self._config.use_async_staging: - assert self._staging_executor is not None, ( - "Staging executor should be initialized for async staging" - ) + if self._staging_executor is None: + raise AssertionError( + "Staging executor should be initialized for async staging" + ) return self._staging_executor.submit( self._stage, state_dict, @@ -185,9 +187,10 @@ class DefaultStager(CheckpointStager): ) if self._config.use_non_blocking_copy: - assert self._staging_stream or not self._config.use_async_staging, ( - "Non-blocking copy in a background thread for async staging needs staging_stream to be initialized." - ) + if not (self._staging_stream or not self._config.use_async_staging): + raise AssertionError( + "Non-blocking copy in a background thread for async staging needs staging_stream to be initialized." + ) # waits for the enqued copy operations to finish. self._staging_stream.synchronize() if self._staging_stream else torch.accelerator.synchronize() diff --git a/torch/distributed/checkpoint/_fsspec_filesystem.py b/torch/distributed/checkpoint/_fsspec_filesystem.py index 377c34ae1e5d..e239bbe891fb 100644 --- a/torch/distributed/checkpoint/_fsspec_filesystem.py +++ b/torch/distributed/checkpoint/_fsspec_filesystem.py @@ -37,7 +37,8 @@ class FileSystem(FileSystemBase): def create_stream( self, path: Union[str, os.PathLike], mode: str ) -> Generator[io.IOBase, None, None]: - assert self.fs is not None + if self.fs is None: + raise AssertionError("fs should not be None") path = os.fspath(path) # fsspec does not support concurrent transactions, and not all diff --git a/torch/distributed/checkpoint/_pg_transport.py b/torch/distributed/checkpoint/_pg_transport.py index de5b2a2927fe..6a327afd445f 100644 --- a/torch/distributed/checkpoint/_pg_transport.py +++ b/torch/distributed/checkpoint/_pg_transport.py @@ -193,12 +193,12 @@ def _cast_tensor(tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor: caveat that the cast tensor may be larger than the original tensor due to the differences in striding. 
""" - assert type(tensor) is torch.Tensor, ( - f"can only cast standard tensors not {type(tensor)}" - ) + if type(tensor) is not torch.Tensor: + raise AssertionError(f"can only cast standard tensors not {type(tensor)}") storage = tensor.untyped_storage() ret = torch.tensor(storage, dtype=dtype, device=tensor.device) - assert ret.untyped_storage() is storage, "storage should be the same" + if ret.untyped_storage() is not storage: + raise AssertionError("storage should be the same") return ret @@ -317,9 +317,8 @@ class PGTransport: if isinstance(inplace, DTensor): inplace = inplace._local_tensor t = _cast_tensor(inplace, torch.uint8) - assert t.nbytes == v.nbytes, ( - "inplace tensor storage must be the same size" - ) + if t.nbytes != v.nbytes: + raise AssertionError("inplace tensor storage must be the same size") else: t = torch.empty(v.nbytes, dtype=torch.uint8, device=self._device) diff --git a/torch/distributed/checkpoint/_state_dict_stager.py b/torch/distributed/checkpoint/_state_dict_stager.py index 45fbd7686d89..1a5945657d26 100644 --- a/torch/distributed/checkpoint/_state_dict_stager.py +++ b/torch/distributed/checkpoint/_state_dict_stager.py @@ -123,12 +123,13 @@ class StateDictStager: # Check if we've already cached this storage if storage in self._cached_storage_mapping: cached_storage = self._cached_storage_mapping[storage] - assert cached_storage.size() == storage.size(), ( - "For async checkpointing, We cache storages in DRAM and reuse them." - "Cached storage size does not match original storage size." - "This should never happen as we track the original storage weakref " - "and clean up the cache storage. Please report this to PyTorch Distributed Checkpointing." - ) + if cached_storage.size() != storage.size(): + raise AssertionError( + "For async checkpointing, We cache storages in DRAM and reuse them. " + "Cached storage size does not match original storage size. " + "This should never happen as we track the original storage weakref " + "and clean up the cache storage. Please report this to PyTorch Distributed Checkpointing." + ) # Reuse cached storage but update with new data cached_storage.copy_(storage, non_blocking=non_blocking) return cached_storage diff --git a/torch/distributed/checkpoint/default_planner.py b/torch/distributed/checkpoint/default_planner.py index 0f76400acb67..ee0029ec7d63 100644 --- a/torch/distributed/checkpoint/default_planner.py +++ b/torch/distributed/checkpoint/default_planner.py @@ -313,7 +313,8 @@ class DefaultLoadPlanner(LoadPlanner): self.is_coordinator = is_coordinator def create_local_plan(self) -> LoadPlan: - assert self.metadata is not None + if self.metadata is None: + raise AssertionError("self.metadata is not None") if self.flatten_state_dict: # To support checkpoints that are saved before v2.4, we have to # differentiate if the missing keys are due to old checkpoints. 
@@ -432,8 +433,10 @@ class _EmptyStateDictLoadPlanner(DefaultLoadPlanner): metadata: Optional[Metadata] = None, is_coordinator: bool = False, ) -> None: - assert not state_dict - assert metadata is not None + if state_dict: + raise AssertionError("not state_dict") + if metadata is None: + raise AssertionError("metadata is not None") # rebuild the state dict from the metadata for k, v in metadata.state_dict_metadata.items(): @@ -549,13 +552,15 @@ def create_default_global_save_plan( new_items = [] for item in plan.items: if item.type != WriteItemType.SHARD: - assert item.index.fqn not in md + if item.index.fqn in md: + raise AssertionError("item.index.fqn not in md") if item.type == WriteItemType.BYTE_IO: md[item.index.fqn] = BytesStorageMetadata() new_items.append(item) else: - assert item.tensor_data is not None + if item.tensor_data is None: + raise AssertionError("item.tensor_data is not None") tensor_md = cast( TensorStorageMetadata, md.setdefault( @@ -575,10 +580,11 @@ def create_default_global_save_plan( new_item = dataclasses.replace(item, index=new_index) new_items.append(new_item) - assert item.tensor_data.chunk is not None, f""" + if item.tensor_data.chunk is None: + raise AssertionError(f""" Cannot create MD for tensor without bounds. FQN: {item.index.fqn} - """ + """) tensor_md.chunks.append(item.tensor_data.chunk) new_plans.append(dataclasses.replace(plan, items=new_items)) return (new_plans, Metadata(md)) diff --git a/torch/distributed/checkpoint/examples/async_checkpointing_example.py b/torch/distributed/checkpoint/examples/async_checkpointing_example.py index 5a0a6582b069..c3375c375437 100644 --- a/torch/distributed/checkpoint/examples/async_checkpointing_example.py +++ b/torch/distributed/checkpoint/examples/async_checkpointing_example.py @@ -109,7 +109,8 @@ def run(rank, world_size): if epoch % SAVE_PERIOD == 0: if f is not None: - assert isinstance(f, Future) + if not isinstance(f, Future): + raise AssertionError("f should be a Future instance") f.result() f = dcp.state_dict_saver.async_save( state_dict, checkpoint_id=CHECKPOINT_DIR @@ -126,7 +127,8 @@ def run(rank, world_size): _print("Reloading model from last checkpoint!") if f is not None: - assert isinstance(f, Future) + if not isinstance(f, Future): + raise AssertionError("f should be a Future instance") from None f.result() dcp.load(state_dict) diff --git a/torch/distributed/checkpoint/filesystem.py b/torch/distributed/checkpoint/filesystem.py index 80e40c27b2ab..5def6c13dc14 100644 --- a/torch/distributed/checkpoint/filesystem.py +++ b/torch/distributed/checkpoint/filesystem.py @@ -201,7 +201,8 @@ class _OverlappingCpuLoader(_TensorLoader): self.in_flight_data += tensor.numel() * tensor.element_size() def _finish(self) -> Iterable[tuple[torch.Tensor, object]]: - assert self._done + if not self._done: + raise AssertionError("_finish called before all items were processed") if len(self.current_items) > 0: self.stream.synchronize() return self.current_items @@ -281,7 +282,8 @@ class _StorageWriterTransforms: def _item_size(item: WriteItem) -> int: size = 1 - assert item.tensor_data is not None + if item.tensor_data is None: + raise AssertionError("WriteItem tensor_data must not be None") # can't use math.prod as PT needs to support older python for s in item.tensor_data.size: size *= s @@ -329,11 +331,16 @@ def _write_item( ) if write_item.type == WriteItemType.BYTE_IO: - assert isinstance(data, io.BytesIO) + if not isinstance(data, io.BytesIO): + raise AssertionError("Data must be io.BytesIO for BYTE_IO write 
items") transform_to.write(data.getbuffer()) else: - assert isinstance(data, torch.Tensor) - assert data.device == torch.device("cpu") + if not isinstance(data, torch.Tensor): + raise AssertionError( + "Data must be torch.Tensor for non-BYTE_IO write items" + ) + if data.device != torch.device("cpu"): + raise AssertionError("Tensor must be on CPU device") if serialization_format == SerializationFormat.TORCH_SAVE: torch.save(data, transform_to) @@ -428,7 +435,8 @@ def _write_files_from_queue( tensor_dict = {} metadata_dict = {} for tensor, write_item in loader.values(): - assert tensor.is_cpu + if not tensor.is_cpu: + raise AssertionError("Tensor must be on CPU") write_results.append( _write_item( transforms, @@ -903,9 +911,10 @@ class FileSystemReader(StorageReader): ) target_tensor = planner.resolve_tensor(req).detach() - assert target_tensor.size() == tensor.size(), ( - f"req {req.storage_index} mismatch sizes {target_tensor.size()} vs {tensor.size()}" - ) + if target_tensor.size() != tensor.size(): + raise AssertionError( + f"req {req.storage_index} mismatch sizes {target_tensor.size()} vs {tensor.size()}" + ) target_tensor.copy_(tensor) planner.commit_tensor(req, target_tensor) @@ -936,7 +945,8 @@ class FileSystemReader(StorageReader): self.storage_data = metadata.storage_data self.rank = kwargs.get("rank") self.use_collectives = kwargs.get("use_collectives", True) - assert self.storage_data is not None + if self.storage_data is None: + raise AssertionError("storage_data must not be None in metadata") def prepare_local_plan(self, plan: LoadPlan) -> LoadPlan: return plan diff --git a/torch/distributed/checkpoint/format_utils.py b/torch/distributed/checkpoint/format_utils.py index 383be3b30945..b61474f675db 100644 --- a/torch/distributed/checkpoint/format_utils.py +++ b/torch/distributed/checkpoint/format_utils.py @@ -84,7 +84,8 @@ class BroadcastingTorchSaveReader(StorageReader): # the entire checkpoint on each rank, hopefully preventing OOM issues # TODO: read on each host, instead of only the coordinator if self.is_coordinator: - assert self.checkpoint_id is not None + if self.checkpoint_id is None: + raise AssertionError("checkpoint_id must be set before reading data") torch_state_dict = torch.load( self.checkpoint_id, map_location="cpu", weights_only=False ) @@ -112,10 +113,11 @@ class BroadcastingTorchSaveReader(StorageReader): tensor = narrow_tensor_by_index(tensor, req.storage_offsets, req.lengths) target_tensor = planner.resolve_tensor(req).detach() - assert target_tensor.size() == tensor.size(), ( - f"req {req.storage_index} mismatch sizes, " - f"{target_tensor.size()} vs {tensor.size()}" - ) + if not target_tensor.size() == tensor.size(): + raise AssertionError( + f"req {req.storage_index} mismatch sizes, " + f"{target_tensor.size()} vs {tensor.size()}" + ) target_tensor.copy_(tensor) planner.commit_tensor(req, target_tensor) @@ -128,9 +130,16 @@ class BroadcastingTorchSaveReader(StorageReader): """Implementation of the StorageReader method""" self.is_coordinator = is_coordinator if self.is_coordinator: - assert dist.get_rank() == self.coordinator_rank + if not dist.get_rank() == self.coordinator_rank: + raise AssertionError( + f"Coordinator rank mismatch: expected {self.coordinator_rank}, " + f"got {dist.get_rank()}" + ) - assert self.checkpoint_id is not None + if self.checkpoint_id is None: + raise AssertionError( + "checkpoint_id must be set before setting up storage reader" + ) def prepare_local_plan(self, plan: LoadPlan) -> LoadPlan: """Implementation of the 
StorageReader method""" diff --git a/torch/distributed/checkpoint/hf_storage.py b/torch/distributed/checkpoint/hf_storage.py index 90720dac802b..c769565229b3 100644 --- a/torch/distributed/checkpoint/hf_storage.py +++ b/torch/distributed/checkpoint/hf_storage.py @@ -226,9 +226,10 @@ class HuggingFaceStorageReader(FileSystemReader): tensor = f.get_slice(req.storage_index.fqn)[slices] target_tensor = planner.resolve_tensor(req).detach() - assert target_tensor.size() == tensor.size(), ( - f"req {req.storage_index} mismatch sizes {target_tensor.size()} vs {tensor.size()}" - ) + if target_tensor.size() != tensor.size(): + raise AssertionError( + f"req {req.storage_index} mismatch sizes {target_tensor.size()} vs {tensor.size()}" + ) target_tensor.copy_(tensor) planner.commit_tensor(req, target_tensor) @@ -299,9 +300,10 @@ class HuggingFaceStorageReader(FileSystemReader): except queue.Empty: pass - assert processed_count == len(per_file), ( - f"Not all files were processed: {processed_count} out of {len(per_file)}" - ) + if processed_count != len(per_file): + raise AssertionError( + f"Not all files were processed: {processed_count} out of {len(per_file)}" + ) fut: Future = Future() fut.set_result(None) diff --git a/torch/distributed/checkpoint/optimizer.py b/torch/distributed/checkpoint/optimizer.py index 89c83a944b17..7d72633b6a94 100644 --- a/torch/distributed/checkpoint/optimizer.py +++ b/torch/distributed/checkpoint/optimizer.py @@ -137,12 +137,10 @@ def _get_state_dict_2d_layout( for key, value in state_dict.items(): specs[key] = (None, value.size()) if _is_nested_tensor(value): - assert len(value.local_shards()) == 1, ( - "Cannot handle ST with multiple shards" - ) - assert isinstance(value, ShardedTensor), ( - "Can only handle nested ShardedTensor" - ) + if not len(value.local_shards()) == 1: + raise AssertionError("Cannot handle ST with multiple shards") + if not isinstance(value, ShardedTensor): + raise AssertionError("Can only handle nested ShardedTensor") shard = value.local_shards()[0] specs[key] = ( shard.metadata.shard_offsets, @@ -184,7 +182,8 @@ class _ReaderWithOffset(DefaultLoadPlanner): offset = self.fqn_to_offset[fqn] - assert len(obj.local_shards()) == 1 + if not len(obj.local_shards()) == 1: + raise AssertionError("Expected exactly one local shard") original_shard = obj.local_shards()[0] local_chunks = [ ChunkStorageMetadata( @@ -201,7 +200,8 @@ class _ReaderWithOffset(DefaultLoadPlanner): # TODO: The ReadItems will have a displaced MetadataIndex, fix it. 
# TODO: we should change _create_sharded_read_items to have more ergonomic API for ri in reqs: - assert ri.dest_index.offset is not None + if ri.dest_index.offset is None: + raise AssertionError("dest_index.offset must not be None") original_offset = _element_wise_sub(ri.dest_index.offset, offset) original_index = dataclasses.replace( ri.dest_index, offset=torch.Size(original_offset) diff --git a/torch/distributed/checkpoint/quantized_hf_storage.py b/torch/distributed/checkpoint/quantized_hf_storage.py index 734d1a21a155..2cb189d515a8 100644 --- a/torch/distributed/checkpoint/quantized_hf_storage.py +++ b/torch/distributed/checkpoint/quantized_hf_storage.py @@ -107,9 +107,10 @@ class QuantizedHuggingFaceStorageReader(HuggingFaceStorageReader): target_tensor = planner.resolve_tensor(req).detach() - assert target_tensor.size() == tensor.size(), ( - f"req {req.storage_index} mismatch sizes {target_tensor.size()} vs {tensor.size()}" - ) + if target_tensor.size() != tensor.size(): + raise AssertionError( + f"req {req.storage_index} mismatch sizes {target_tensor.size()} vs {tensor.size()}" + ) target_tensor.copy_(tensor) planner.commit_tensor(req, target_tensor) diff --git a/torch/distributed/checkpoint/staging.py b/torch/distributed/checkpoint/staging.py index aa2f50da1b02..d3ea5334d68b 100644 --- a/torch/distributed/checkpoint/staging.py +++ b/torch/distributed/checkpoint/staging.py @@ -193,9 +193,10 @@ class DefaultStager(AsyncStager): self._staging_stream = torch.Stream() if self._config.use_non_blocking_copy: - assert torch.accelerator.is_available(), ( - "Non-blocking copy requires that the current accelerator is available." - ) + if not torch.accelerator.is_available(): + raise AssertionError( + "Non-blocking copy requires that the current accelerator is available." + ) self._staging_future: Optional[Future[STATE_DICT_TYPE]] = None @@ -215,7 +216,10 @@ class DefaultStager(AsyncStager): state_dict (STATE_DICT_TYPE): The state_dict to be staged. """ if self._config.use_async_staging: - assert self._staging_executor is not None + if self._staging_executor is None: + raise AssertionError( + "staging_executor should not be None for async staging" + ) self._staging_future = self._staging_executor.submit( self._stage, state_dict, @@ -227,9 +231,10 @@ class DefaultStager(AsyncStager): def _stage(self, state_dict: STATE_DICT_TYPE, **kwargs: Any) -> STATE_DICT_TYPE: if self._config.use_non_blocking_copy: - assert self._staging_stream or not self._config.use_async_staging, ( - "Non-blocking copy in a background thread for async staging needs staging_stream to be initialized." - ) + if not (self._staging_stream or not self._config.use_async_staging): + raise AssertionError( + "Non-blocking copy in a background thread for async staging needs staging_stream to be initialized." 
+ ) with ( self._staging_stream if self._staging_stream is not None diff --git a/torch/distributed/checkpoint/state_dict.py b/torch/distributed/checkpoint/state_dict.py index b1970a6a7418..d401db7a8460 100644 --- a/torch/distributed/checkpoint/state_dict.py +++ b/torch/distributed/checkpoint/state_dict.py @@ -186,7 +186,8 @@ def _get_fqns( curr_obj = model for i, curr_obj_name in enumerate(obj_names): if isinstance(curr_obj, DDP): - assert curr_obj_name == "module" + if curr_obj_name != "module": + raise AssertionError(f"Expected 'module', got '{curr_obj_name}'") curr_obj = curr_obj.module if not skip_ddp_prefix: fqn_obj_names.append(curr_obj_name) @@ -203,7 +204,8 @@ def _get_fqns( fqn_obj_names.append(curr_obj_name) curr_obj = getattr(curr_obj, curr_obj_name) elif isinstance(curr_obj, torch._dynamo.eval_frame.OptimizedModule): - assert curr_obj_name == "_orig_mod" + if curr_obj_name != "_orig_mod": + raise AssertionError(f"Expected '_orig_mod', got '{curr_obj_name}'") curr_obj = curr_obj._orig_mod if not skip_compiler_prefix: fqn_obj_names.append(curr_obj_name) @@ -329,7 +331,8 @@ def _verify_options( if module not in submodules: continue fqns = _get_fqns(model, name) - assert len(fqns) == 1, "Submodule FQN should only have 1 instance" + if len(fqns) != 1: + raise AssertionError("Submodule FQN should only have 1 instance") submodule_prefixes.update(f"{fqn}." for fqn in fqns) if options.broadcast_from_rank0 and not options.full_state_dict: @@ -408,7 +411,8 @@ def _verify_state_dict( ) -> None: for module in info.fsdp_modules: fsdp_state = _get_module_fsdp_state_if_fully_sharded_module(module) - assert fsdp_state is not None, "Expected a fsdp_state with a fsdp module." + if fsdp_state is None: + raise AssertionError("Expected a fsdp_state with a fsdp module.") # Verify if the model_state_dict and optim_state_dict are valid. This API # should give the users an explicit error message to debug or report. 
@@ -483,7 +487,10 @@ def _get_model_state_dict( for key in list(state_dict.keys()): fqns = _get_fqns(model, key) - assert len(fqns) == 1, (key, fqns) + if len(fqns) != 1: + raise AssertionError( + f"Expected 1 FQN for key '{key}', got {len(fqns)}: {fqns}" + ) fqn = next(iter(fqns)) if fqn != key: # As we only support FSDP, DDP, and TP, the only cases are @@ -746,7 +753,8 @@ def _unflatten_optim_state_dict( continue params = pg_state[-1][_PARAMS] - assert isinstance(params, list) # typing + if not isinstance(params, list): + raise AssertionError(f"Expected list, got {type(params)}") params.append(fqn) if not param.requires_grad: continue @@ -808,7 +816,10 @@ def _get_optim_state_dict( fqn_pid_mapping = {} for key, param in model.named_parameters(): fqns = _get_fqns(model, key) - assert len(fqns) == 1 + if len(fqns) != 1: + raise AssertionError( + f"Expected 1 FQN for key '{key}', got {len(fqns)}" + ) fqn = next(iter(fqns)) if param not in param_pid_mapping: continue @@ -886,7 +897,8 @@ def _split_optim_state_dict( continue params = pg_state[-1][_PARAMS] - assert isinstance(params, list) + if not isinstance(params, list): + raise AssertionError(f"Expected list, got {type(params)}") params.append(fqn) if param.requires_grad: state[fqn] = cast(DictValueType, optim_state_dict[_STATE])[fqn] @@ -965,7 +977,10 @@ def _load_optim_state_dict( if fqns == fqns_with_compiler: continue - assert len(fqns) == 1 + if len(fqns) != 1: + raise AssertionError( + f"Expected 1 FQN for '{original_fqn}', got {len(fqns)}" + ) fqn = fqns.pop() fqn_with_compiler = fqns_with_compiler.pop() for g in optim_state_dict[_PG]: @@ -999,7 +1014,8 @@ def _load_optim_state_dict( return t _ = tree_map_only(torch.Tensor, _device, local_state_dict) - assert device is not None + if device is None: + raise AssertionError("Expected device to be set") flatten_osd, osd_mapping = _flatten_state_dict(optim_state_dict) flatten_local_osd, local_osd_mapping = _flatten_state_dict(local_state_dict) if info.broadcast_from_rank0: @@ -1012,7 +1028,10 @@ def _load_optim_state_dict( # having additional parameters ultimately. for optim_key in flatten_osd.keys(): if optim_key not in flatten_local_osd: - assert optim_key in osd_mapping + if optim_key not in osd_mapping: + raise AssertionError( + f"Expected key '{optim_key}' in osd_mapping" + ) flatten_local_osd[optim_key] = flatten_osd[optim_key] local_osd_mapping[optim_key] = osd_mapping[optim_key] optim_state_dict = _unflatten_state_dict( @@ -1225,7 +1244,10 @@ def _unflatten_model_state_dict( continue fqns = _get_fqns(model, name) - assert len(fqns) == 1, "FQNs for a submodule should only have 1 element" + if len(fqns) != 1: + raise AssertionError( + "FQNs for a submodule should only have 1 element" + ) prefix = f"{next(iter(fqns))}." 
new_state_dict.update( {prefix + subfqn: value for subfqn, value in sub_state_dict.items()} diff --git a/torch/distributed/checkpoint/state_dict_loader.py b/torch/distributed/checkpoint/state_dict_loader.py index ae3c4df775ab..389dc0e5e571 100644 --- a/torch/distributed/checkpoint/state_dict_loader.py +++ b/torch/distributed/checkpoint/state_dict_loader.py @@ -246,8 +246,10 @@ def _load_state_dict( except Exception: logger.info("Rank local metadata is not found.") - assert planner is not None - assert metadata is not None + if planner is None: + raise AssertionError("planner is None") + if metadata is None: + raise AssertionError("metadata is None") planner.set_up_planner(state_dict, metadata, distW.is_coordinator) if ( @@ -269,7 +271,8 @@ def _load_state_dict( @_dcp_method_logger(**ckpt_kwargs) def global_step(all_local_plans): - assert planner is not None + if planner is None: + raise AssertionError("planner is None") all_local_plans = planner.create_global_plan(all_local_plans) all_local_plans = storage_reader.prepare_global_plan(all_local_plans) return all_local_plans @@ -284,8 +287,10 @@ def _load_state_dict( @_dcp_method_logger(**ckpt_kwargs) def read_data(): - assert planner is not None - assert central_plan is not None + if planner is None: + raise AssertionError("planner is None") + if central_plan is None: + raise AssertionError("central_plan is None") final_local_plan = planner.finish_plan(central_plan) all_reads = storage_reader.read_data(final_local_plan, planner) diff --git a/torch/distributed/checkpoint/state_dict_saver.py b/torch/distributed/checkpoint/state_dict_saver.py index d4fe0f5502ff..58a4bd0e85ef 100644 --- a/torch/distributed/checkpoint/state_dict_saver.py +++ b/torch/distributed/checkpoint/state_dict_saver.py @@ -292,11 +292,10 @@ def async_save( if dist.is_available() and dist.is_initialized(): pg = process_group or _get_default_group() - assert ( - torch.device("cpu") in pg._device_types # type: ignore[attr-defined] - ), ( - "A CPU backend must be enabled for async save; try initializing process group with 'cpu:gloo,cuda:nccl'" - ) + if torch.device("cpu") not in pg._device_types: + raise AssertionError( + "A CPU backend must be enabled for async save; try initializing process group with 'cpu:gloo,cuda:nccl'" + ) if async_stager is None: if storage_writer is not None and isinstance(storage_writer, AsyncStager): @@ -396,7 +395,8 @@ def _save_state_dict( distW = _DistWrapper(process_group, not no_dist, coordinator_rank) if planner is None: planner = DefaultSavePlanner() - assert planner is not None + if planner is None: + raise AssertionError("planner is None") global_metadata = None @@ -407,7 +407,8 @@ def _save_state_dict( @_dcp_method_logger(**ckpt_kwargs) def local_step(): - assert planner is not None + if planner is None: + raise AssertionError("planner is None") storage_meta = storage_writer.storage_meta() if "storage_meta" not in inspect.signature(planner.set_up_planner).parameters: warnings.warn( @@ -443,7 +444,8 @@ def _save_state_dict( def global_step(all_local_plans): nonlocal global_metadata - assert planner is not None + if planner is None: + raise AssertionError("planner is None") all_local_plans, global_metadata = planner.create_global_plan(all_local_plans) all_local_plans = storage_writer.prepare_global_plan(all_local_plans) return all_local_plans @@ -458,8 +460,10 @@ def _save_state_dict( @_dcp_method_logger(**ckpt_kwargs) def write_data(): - assert planner is not None - assert central_plan is not None + if planner is None: + raise 
AssertionError("planner is None") + if central_plan is None: + raise AssertionError("central_plan is None") final_local_plan = planner.finish_plan(central_plan) all_writes = storage_writer.write_data(final_local_plan, planner) @@ -468,7 +472,8 @@ def _save_state_dict( @_dcp_method_logger(**ckpt_kwargs) def finish_checkpoint(all_results): - assert global_metadata is not None + if global_metadata is None: + raise AssertionError("global_metadata is None") storage_writer.finish(metadata=global_metadata, results=all_results) return global_metadata diff --git a/torch/distributed/checkpoint/utils.py b/torch/distributed/checkpoint/utils.py index 0140d80bdcfa..c06c50223836 100644 --- a/torch/distributed/checkpoint/utils.py +++ b/torch/distributed/checkpoint/utils.py @@ -168,7 +168,8 @@ class _DistWrapper: local_reply = gather_result[0] else: - assert object_list is not None + if object_list is None: + raise AssertionError("object_list is None") local_reply = object_list[0] return local_reply @@ -196,7 +197,8 @@ class _DistWrapper: all_data = self.gather_object(local_data) all_results: Optional[list[Union[R, CheckpointException]]] = None if self.is_coordinator: - assert all_data is not None + if all_data is None: + raise AssertionError("all_data is None") node_failures = _get_failure_dict(all_data) if len(node_failures) == 0: @@ -243,7 +245,8 @@ class _DistWrapper: all_data = self.gather_object(local_data) result: Optional[Union[R, CheckpointException]] = None if self.is_coordinator: - assert all_data is not None + if all_data is None: + raise AssertionError("all_data is None") node_failures = _get_failure_dict(all_data) if len(node_failures) == 0: try: @@ -465,10 +468,12 @@ def _api_bc_check(func): p.name for p in sig.parameters.values() if p.kind == p.KEYWORD_ONLY ] if "storage_writer" in kwonlyargs: - assert "storage_writer" not in kwargs, (args, kwargs) + if "storage_writer" in kwargs: + raise AssertionError(f"storage_writer in kwargs: {(args, kwargs)}") kwargs["storage_writer"] = args[1] elif "storage_reader" in kwonlyargs: - assert "storage_reader" not in kwargs, (args, kwargs) + if "storage_reader" in kwargs: + raise AssertionError(f"storage_reader in kwargs: {(args, kwargs)}") kwargs["storage_reader"] = args[1] else: raise RuntimeError(f"Unexpected kwonlyargs = {kwonlyargs}") From 6c9c6e0936751116f6f988d7194eefe16a24e5a1 Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Fri, 17 Oct 2025 20:15:34 +0000 Subject: [PATCH 073/123] Enable C407 of flake8 (#165046) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR enables C407 on flake8. The description is `C407` is `Unnecessary list comprehension - ‘’ can take a generator`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165046 Approved by: https://github.com/albanD --- .flake8 | 2 -- 1 file changed, 2 deletions(-) diff --git a/.flake8 b/.flake8 index 2cac8d3009b7..2be8eab0dc83 100644 --- a/.flake8 +++ b/.flake8 @@ -13,8 +13,6 @@ ignore = EXE001, # these ignores are from flake8-bugbear; please fix! B007,B008,B017,B019,B023,B028,B903,B905,B906,B907,B908,B910 - # these ignores are from flake8-comprehensions; please fix! - C407, # these ignores are from flake8-logging-format; please fix! G100,G101,G200 # these ignores are from flake8-simplify. 
please fix or ignore with commented reason From 06d324365c24395b6d326b2c5e904460bb426dcd Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 17 Oct 2025 20:45:48 +0000 Subject: [PATCH 074/123] Revert "Escaped html tags name and target to appear as strings (#165543)" This reverts commit 080365b7d82a3c99c995cab6dc912b7dfe22aa41. Reverted https://github.com/pytorch/pytorch/pull/165543 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/165543#issuecomment-3417102048)) --- docs/source/export/ir_spec.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/export/ir_spec.md b/docs/source/export/ir_spec.md index 879df6ee04a0..562cae1e337f 100644 --- a/docs/source/export/ir_spec.md +++ b/docs/source/export/ir_spec.md @@ -158,11 +158,11 @@ This format captures everything present in the Node class, with the exception of Concretely: -- **\** is the name of the node as it would appear in `node.name`. -- **\** is the `node.op` field, which must be one of these: +- **** is the name of the node as it would appear in `node.name`. +- **** is the `node.op` field, which must be one of these: ``, ``, ``, or ``. -- **\** is the target of the node as `node.target`. The meaning of this +- **** is the target of the node as `node.target`. The meaning of this field depends on `op_name`. - **args1, … args 4…** are what is listed in the `node.args` tuple. If a value in the list is an {class}`torch.fx.Node`, then it will be especially From ab65498d71bf8626b6480fa3924b52ad93b4a046 Mon Sep 17 00:00:00 2001 From: zpcore Date: Fri, 17 Oct 2025 20:54:46 +0000 Subject: [PATCH 075/123] Fix `_StridedShard` incorrect split (#165533) https://github.com/pytorch/pytorch/pull/164820 introduced a bug that `_StridedShard` will call parent class `Shard`'s `split_tensor` method, thus results in incorrect data locality. (I think @ezyang spotted this issue, but we have no test to capture this) Meanwhile, I notice another bug that when we normalize a `_StridedShard`'s placement, it will also trigger parent class `Shard`'s `split_tensor` method because it will create a Shard class [here](https://github.com/pytorch/pytorch/blob/0c14f55de674790fd3b2b5808de9f1a523c4feec/torch/distributed/tensor/_api.py#L783). I think we never test `distribute_tensor` for `_StridedShard` before. So I added a test here to compare against ordered shard. Using classmethod because the _split_tensor logic is different between `Shard` and `_StridedShard`. 
Basically I want to shard on local tensors without initializing the Shard object: ``` local_tensor = _StridedShard._make_shard_tensor(dim, tensor, mesh, mesh_dim, split_factor=split_factor) local_tensor = Shard._make_shard_tensor(dim, tensor, mesh, mesh_dim) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165533 Approved by: https://github.com/XilunWu --- test/distributed/tensor/test_redistribute.py | 17 ++++ torch/distributed/tensor/_api.py | 34 +++++--- torch/distributed/tensor/placement_types.py | 83 ++++++++++---------- 3 files changed, 82 insertions(+), 52 deletions(-) diff --git a/test/distributed/tensor/test_redistribute.py b/test/distributed/tensor/test_redistribute.py index 8b5d031bccfd..1eb0830422f6 100644 --- a/test/distributed/tensor/test_redistribute.py +++ b/test/distributed/tensor/test_redistribute.py @@ -20,6 +20,7 @@ from torch.distributed.tensor._collective_utils import shard_dim_alltoall from torch.distributed.tensor._dtensor_spec import ShardOrderEntry from torch.distributed.tensor._redistribute import redistribute_local_tensor from torch.distributed.tensor.debug import CommDebugMode +from torch.distributed.tensor.placement_types import _StridedShard from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, parametrize, @@ -1145,6 +1146,22 @@ class DistributeWithDeviceOrderTest(DTensorTestBase): sharded_dt, mesh, tgt_placement, shard_order=None ) + @with_comms + def test_shard_order_same_data_as_strided_shard(self): + device_mesh = init_device_mesh(self.device_type, (4, 2)) + x = torch.randn(8, 4, device=self.device_type) + # specify right-to-left order use _StridedShard + strided_placement = [_StridedShard(-2, split_factor=2), Shard(-2)] + x_strided_dt = distribute_tensor(x, device_mesh, strided_placement) + # specify right-to-left order use ordered shard + x_ordered_dt = self.distribute_tensor( + x, + device_mesh, + placements=[Shard(0), Shard(0)], + shard_order=(ShardOrderEntry(tensor_dim=0, mesh_dims=(1, 0)),), + ) + self.assertEqual(x_ordered_dt.to_local(), x_strided_dt.to_local()) + if __name__ == "__main__": run_tests() diff --git a/torch/distributed/tensor/_api.py b/torch/distributed/tensor/_api.py index 03eec9c7d1d4..5fd66b2c5f8e 100644 --- a/torch/distributed/tensor/_api.py +++ b/torch/distributed/tensor/_api.py @@ -25,6 +25,7 @@ from torch.distributed.tensor._utils import ( normalize_to_torch_size, ) from torch.distributed.tensor.placement_types import ( + _StridedShard, Partial, Placement, Replicate, @@ -776,18 +777,29 @@ def distribute_tensor( # distribute the tensor according to the placements. 
placements = list(placements) for idx, placement in enumerate(placements): - if placement.is_shard(): - placement = cast(Shard, placement) - if placement.dim < 0: - # normalize shard placement dim - placement = Shard(placement.dim + tensor.ndim) - placements[idx] = placement - local_tensor = placement._shard_tensor( - local_tensor, device_mesh, idx, src_data_rank + if isinstance(placement, Shard): + placement_dim = ( + placement.dim + tensor.ndim if placement.dim < 0 else placement.dim ) - elif placement.is_replicate(): - placement = cast(Replicate, placement) - local_tensor = placement._replicate_tensor( + if isinstance(placement, _StridedShard): + local_tensor = _StridedShard._make_shard_tensor( + placement_dim, + local_tensor, + device_mesh, + idx, + src_data_rank, + split_factor=placement.split_factor, + ) + placements[idx] = _StridedShard( + placement_dim, split_factor=placement.split_factor + ) + else: + local_tensor = Shard._make_shard_tensor( + placement_dim, local_tensor, device_mesh, idx, src_data_rank + ) + placements[idx] = Shard(placement_dim) + elif isinstance(placement, Replicate): + local_tensor = Replicate._make_replicate_tensor( local_tensor, device_mesh, idx, src_data_rank ) else: diff --git a/torch/distributed/tensor/placement_types.py b/torch/distributed/tensor/placement_types.py index d6b7efadee6e..5f68ff03ee22 100644 --- a/torch/distributed/tensor/placement_types.py +++ b/torch/distributed/tensor/placement_types.py @@ -69,9 +69,8 @@ class Shard(Placement): else: return True - @staticmethod - def _make_split_tensor( - dim: int, + def _split_tensor( + self, tensor: torch.Tensor, num_chunks: int, *, @@ -87,47 +86,31 @@ class Shard(Placement): few ranks before calling the collectives (i.e. scatter/all_gather, etc.). This is because collectives usually require equal size tensor inputs """ - assert dim <= tensor.ndim, ( - f"Sharding dim {dim} greater than tensor ndim {tensor.ndim}" + assert self.dim <= tensor.ndim, ( + f"Sharding dim {self.dim} greater than tensor ndim {tensor.ndim}" ) # chunk tensor over dimension `dim` into n slices - tensor_list = list(torch.chunk(tensor, num_chunks, dim=dim)) + tensor_list = list(torch.chunk(tensor, num_chunks, dim=self.dim)) tensor_list = fill_empty_tensor_to_shards( - tensor_list, dim, num_chunks - len(tensor_list) + tensor_list, self.dim, num_chunks - len(tensor_list) ) # compute the chunk size inline with ``torch.chunk`` to calculate padding - full_chunk_size = (tensor.size(dim) + num_chunks - 1) // num_chunks + full_chunk_size = (tensor.size(self.dim) + num_chunks - 1) // num_chunks shard_list: list[torch.Tensor] = [] pad_sizes: list[int] = [] for shard in tensor_list: if with_padding: - pad_size = full_chunk_size - shard.size(dim) - shard = pad_tensor(shard, dim, pad_size) + pad_size = full_chunk_size - shard.size(self.dim) + shard = pad_tensor(shard, self.dim, pad_size) pad_sizes.append(pad_size) if contiguous: shard = shard.contiguous() shard_list.append(shard) return shard_list, pad_sizes - def _split_tensor( - self, - tensor: torch.Tensor, - num_chunks: int, - *, - with_padding: bool = True, - contiguous: bool = True, - ) -> tuple[list[torch.Tensor], list[int]]: - return Shard._make_split_tensor( - self.dim, - tensor, - num_chunks, - with_padding=with_padding, - contiguous=contiguous, - ) - @staticmethod @maybe_run_for_local_tensor def local_shard_size_and_offset( @@ -186,9 +169,8 @@ class Shard(Placement): local_tensor = local_tensor.contiguous() return local_tensor - @staticmethod - def _make_shard_tensor( - dim: int, + def 
_shard_tensor( + self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int, @@ -210,14 +192,14 @@ class Shard(Placement): if src_data_rank is None: # src_data_rank specified as None explicitly means to skip the # communications, simply split - scatter_list, _ = Shard._make_split_tensor( - dim, tensor, num_chunks, with_padding=False, contiguous=True + scatter_list, _ = self._split_tensor( + tensor, num_chunks, with_padding=False, contiguous=True ) - return Shard._select_shard(scatter_list, mesh_dim_local_rank) + return self._select_shard(scatter_list, mesh_dim_local_rank) - scatter_list, pad_sizes = Shard._make_split_tensor( - dim, tensor, num_chunks, with_padding=True, contiguous=True + scatter_list, pad_sizes = self._split_tensor( + tensor, num_chunks, with_padding=True, contiguous=True ) it = iter(scatter_list) @@ -234,17 +216,20 @@ class Shard(Placement): ) return Shard._maybe_unpad_tensor_with_sizes( - dim, output, pad_sizes, mesh_dim_local_rank, True + self.dim, output, pad_sizes, mesh_dim_local_rank, True ) - def _shard_tensor( - self, + @classmethod + def _make_shard_tensor( + cls, + dim: int, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int, src_data_rank: Optional[int] = 0, ) -> torch.Tensor: - return Shard._make_shard_tensor(self.dim, tensor, mesh, mesh_dim, src_data_rank) + shard_placement = cls(dim) + return shard_placement._shard_tensor(tensor, mesh, mesh_dim, src_data_rank) def _reduce_shard_tensor( self, @@ -267,8 +252,8 @@ class Shard(Placement): is_padded = tensor.size(self.dim) % num_chunks != 0 pad_sizes = None if is_padded: - scattered_list, pad_sizes = Shard._make_split_tensor( - self.dim, tensor, num_chunks, with_padding=True, contiguous=True + scattered_list, pad_sizes = self._split_tensor( + tensor, num_chunks, with_padding=True, contiguous=True ) tensor = torch.cat(scattered_list, dim=self.dim) elif not tensor.is_contiguous(): @@ -538,6 +523,21 @@ class _StridedShard(Shard): """human readable representation of the _StridedShard placement""" return f"_S({self.dim}, {self.split_factor})" + @classmethod + def _make_shard_tensor( + cls, + dim: int, + tensor: torch.Tensor, + mesh: DeviceMesh, + mesh_dim: int, + src_data_rank: Optional[int] = 0, + split_factor: int = 1, + ) -> torch.Tensor: + strided_shard_placement = cls(dim=dim, split_factor=split_factor) + return strided_shard_placement._shard_tensor( + tensor, mesh, mesh_dim, src_data_rank + ) + def _split_tensor( self, tensor: torch.Tensor, @@ -704,8 +704,9 @@ class Replicate(Placement): """ return "R" - @staticmethod + @classmethod def _make_replicate_tensor( + cls, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int, From 8cb2fb44f29f6b19400a04ea970807f651657b0c Mon Sep 17 00:00:00 2001 From: Nan Zhang Date: Fri, 17 Oct 2025 21:08:29 +0000 Subject: [PATCH 076/123] [Inductor] Support fallback for all gemm like ops (#165755) Summary: Fill op_override field for bmm aten ops so they can be converted properly in the wrapper_fxir backend Reviewed By: StellarrZ Differential Revision: D84840948 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165755 Approved by: https://github.com/blaine-rister --- torch/_inductor/kernel/bmm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/_inductor/kernel/bmm.py b/torch/_inductor/kernel/bmm.py index b22e7a1f6149..06c4a63497d7 100644 --- a/torch/_inductor/kernel/bmm.py +++ b/torch/_inductor/kernel/bmm.py @@ -119,7 +119,7 @@ bmm_template = TritonTemplate( cache_codegen_enabled_for_template=True, ) -aten_bmm = ExternKernelChoice(torch.bmm, 
"at::bmm_out") +aten_bmm = ExternKernelChoice(torch.bmm, "at::bmm_out", op_overload=aten.bmm.out) aten_bmm_dtype = ExternKernelChoice( torch.bmm, "at::_bmm_out_dtype_cuda", From 86ebce1766b6e20b269f35955fbc3e97332aa765 Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Fri, 17 Oct 2025 21:52:01 +0000 Subject: [PATCH 077/123] [precompile] Pass tensor_to_context to backend. (#165702) Summary: Fixing a VLLM issue https://github.com/vllm-project/vllm/issues/27040 where aot precompile fails on some models using symbolic shapes in inductor. Test Plan: pp HF_HUB_DISABLE_XET=1 VLLM_ENABLE_V1_MULTIPROCESSING=0 VLLM_USE_AOT_COMPILE=1 vllm bench latency --model microsoft/DialoGPT-small --input-len 128 --output-len 256 --num-iters 50 --dtype float16 Reviewers: Subscribers: Tasks: Tags: Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/165702 Approved by: https://github.com/tugsbayasgalan --- torch/_dynamo/aot_compile.py | 4 +++- torch/_dynamo/convert_frame.py | 9 +++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/torch/_dynamo/aot_compile.py b/torch/_dynamo/aot_compile.py index c49f54edfd3f..cc1391cb7748 100644 --- a/torch/_dynamo/aot_compile.py +++ b/torch/_dynamo/aot_compile.py @@ -247,8 +247,10 @@ def aot_compile_fullgraph( assert backend_input is not None backend_input.graph_module._backend_id = backend_input.backend_id # type: ignore[assignment] device_type = _graph_device_type(backend_input.graph_module.graph) + tracing_context = TracingContext(backend_input.fake_mode) + tracing_context.tensor_to_context = backend_input.tensor_to_context with ( - torch._guards.tracing(TracingContext(backend_input.fake_mode)), + torch._guards.tracing(tracing_context), torch._functorch.config.patch( { "bundled_autograd_cache": True, diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py index cf7392763e6c..6f87d1cd445e 100644 --- a/torch/_dynamo/convert_frame.py +++ b/torch/_dynamo/convert_frame.py @@ -176,6 +176,8 @@ except ModuleNotFoundError: if typing.TYPE_CHECKING: + from torch.utils.weak import WeakIdKeyDictionary + from .backends.registry import CompilerFn from .package import CompilePackage from .repro.after_dynamo import WrapBackendDebug @@ -909,6 +911,7 @@ class BackendInput: graph_module: torch.fx.GraphModule example_inputs: Any fake_mode: torch._subclasses.fake_tensor.FakeTensorMode + tensor_to_context: WeakIdKeyDictionary @dataclass @@ -1080,11 +1083,13 @@ def _fullgraph_capture_frame( gm: torch.fx.GraphModule, example_inputs: list[torch.Tensor] ) -> torch.fx.GraphModule: nonlocal backend_input - fake_mode = TracingContext.get().fake_mode + tracing_context = TracingContext.get() + fake_mode = tracing_context.fake_mode + tensor_to_context = tracing_context.tensor_to_context assert fake_mode is not None assert isinstance(gm.meta["backend_id"], str) backend_input = BackendInput( - gm.meta["backend_id"], gm, example_inputs, fake_mode + gm.meta["backend_id"], gm, example_inputs, fake_mode, tensor_to_context ) return gm From c18ddfc5721dd91bf29c769e850a99c4fdb6f380 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 17 Oct 2025 09:46:53 -0700 Subject: [PATCH 078/123] [dynamo][easy] Support torch.accelerator.current_accelerator (#165734) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165734 Approved by: https://github.com/Skylion007 --- test/dynamo/test_repros.py | 8 ++++++++ torch/_dynamo/variables/torch.py | 1 + 2 files changed, 9 insertions(+) diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py 
index 47692a4fa81b..362a541918c3 100644 --- a/test/dynamo/test_repros.py +++ b/test/dynamo/test_repros.py @@ -8101,6 +8101,14 @@ class ReproTestsDevice(torch._dynamo.test_case.TestCase): res = gm(x, y) self.assertEqual(res, ref) + def test_current_accelerator(self): + @torch.compile(backend="eager", fullgraph=True) + def fn(x): + torch.accelerator.current_accelerator() + return x + 1 + + self.assertEqual(fn(torch.ones(3)), torch.ones(3) + 1) + instantiate_parametrized_tests(ReproTests) diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py index 1c4bf8a72766..d659f3a24d86 100644 --- a/torch/_dynamo/variables/torch.py +++ b/torch/_dynamo/variables/torch.py @@ -146,6 +146,7 @@ REWRITE_OPS_TO_TENSOR_SIZE_METHOD = dict.fromkeys( constant_fold_functions_need_guards = [ torch.accelerator.current_device_index, + torch.accelerator.current_accelerator, torch.cuda.current_device, torch.cuda.is_initialized, torch.xpu.current_device, From 616c6bdf8ff5052a03f3bfa4e6258c3a527f93db Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 17 Oct 2025 11:11:57 -0700 Subject: [PATCH 079/123] [dynamo][ac] Config flag to allow eager and compile AC divergence for side-effects (#165775) Eager AC/SAC reapplies the mutations (like global dict mutations) in the backward during the recomputation of forward. torch.compile has no easy way to reapply python mutations in the backward. But many users might be ok to skip reapplication of side effects in the backward. They can set this config flag to accept this eager and compile divergence. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165775 Approved by: https://github.com/zou3519 ghstack dependencies: #165734 --- test/dynamo/test_activation_checkpointing.py | 23 ++++++++++++++++++++ torch/_dynamo/config.py | 8 +++++++ torch/_dynamo/side_effects.py | 5 ++++- torch/_dynamo/variables/higher_order_ops.py | 12 ++++++++++ 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/test/dynamo/test_activation_checkpointing.py b/test/dynamo/test_activation_checkpointing.py index 5dfaa14067d3..9c168a8e04ae 100644 --- a/test/dynamo/test_activation_checkpointing.py +++ b/test/dynamo/test_activation_checkpointing.py @@ -1647,6 +1647,29 @@ Non-primal fwd outputs from model w/o backward hook: {mod_no_hook_fwd_outputs_no self.assertEqual(opt_fn(x), fn(x)) + @torch._dynamo.config.patch(skip_fwd_side_effects_in_bwd_under_checkpoint=True) + def test_nonlocal_mutation(self): + counter = 0 + + def gn(x): + nonlocal counter + counter += 1 + return torch.sin(x) + + def fn(x): + return torch.utils.checkpoint.checkpoint(gn, x, use_reentrant=True) + + x = torch.randn(4, 4, requires_grad=True) + fn(x).sum().backward() + # The mutation is reapplied in the backward as well + self.assertEqual(counter, 2) + counter = 0 + + opt_fn = torch.compile(fn, backend="eager", fullgraph=True) + opt_fn(x).sum().backward() + # The mutation is not reapplied in the backward because the flag was on. + self.assertEqual(counter, 1) + devices = ["cuda", "hpu"] instantiate_device_type_tests( diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py index d62dd086f055..d35ba10ef1af 100644 --- a/torch/_dynamo/config.py +++ b/torch/_dynamo/config.py @@ -633,6 +633,14 @@ compiled_autograd = False # See https://github.com/pytorch/pytorch/issues/157452 for more context graph_break_on_nn_param_ctor = True +# Eager AC/SAC reapplies the mutations (like global dict mutations) in the +# backward during the recomputation of forward. 
torch.compile has no easy way to +# reapply python mutations in the backward. But many users might be ok to skip +# reapplication of side effects in the backward. They can set this config flag +# to accept this eager and compile divergence. +skip_fwd_side_effects_in_bwd_under_checkpoint = False + + # Overrides torch.compile() kwargs for Compiled Autograd: compiled_autograd_kwargs_override: dict[str, Any] = {} """Overrides torch.compile() kwargs for Compiled Autograd. diff --git a/torch/_dynamo/side_effects.py b/torch/_dynamo/side_effects.py index 4e45dc7446d2..47912dadb941 100644 --- a/torch/_dynamo/side_effects.py +++ b/torch/_dynamo/side_effects.py @@ -218,7 +218,10 @@ class SideEffects: return bool( output_graph and output_graph.current_tx.output.current_tracer.under_activation_checkpoint - and output_graph.current_tx.output.current_tracer.allow_side_effects_under_checkpoint + and ( + output_graph.current_tx.output.current_tracer.allow_side_effects_under_checkpoint + or torch._dynamo.config.skip_fwd_side_effects_in_bwd_under_checkpoint + ) ) def should_allow_externally_visible_side_effects_in_subtracer(self) -> bool: diff --git a/torch/_dynamo/variables/higher_order_ops.py b/torch/_dynamo/variables/higher_order_ops.py index 8c08a68e3b27..956eb4676018 100644 --- a/torch/_dynamo/variables/higher_order_ops.py +++ b/torch/_dynamo/variables/higher_order_ops.py @@ -2145,6 +2145,9 @@ class ReparametrizeModuleCallVariable(FunctorchHigherOrderVariable): class WrapHigherOrderVariable(TorchHigherOrderOperatorVariable): supports_input_mutation = True supports_aliasing = True + # TODO - Go through all subclasses of WrapHigherOrderVariable to see if + # restore_side_effects can be ignored. For now, this is conservative. + restore_side_effects = True def install_subgraph_in_output_graph( self, tx, fn_vt, fn_args_vt, kwargs, body_gmod, attr_name="wrap_body" @@ -2178,6 +2181,7 @@ class WrapHigherOrderVariable(TorchHigherOrderOperatorVariable): kwargs, description, source_target=self.value, + restore_side_effects=self.restore_side_effects, should_flatten_outputs=True, under_activation_checkpoint=under_activation_checkpoint, supports_input_mutation=self.supports_input_mutation, @@ -2565,6 +2569,14 @@ class StrictModeHigherOrderVariable(TorchHigherOrderOperatorVariable): class CheckpointHigherOrderVariable(WrapHigherOrderVariable): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + # If side effects are allowed under checkpoint, we should not restore + # the side effects after speculate subgraph. + self.restore_side_effects = ( + not torch._dynamo.config.skip_fwd_side_effects_in_bwd_under_checkpoint + ) + def _call_function( self, tx: "InstructionTranslator", From 2e22b1a61ea20a54448edf34a5d22fbe8391d626 Mon Sep 17 00:00:00 2001 From: Wes Bland Date: Fri, 17 Oct 2025 22:06:33 +0000 Subject: [PATCH 080/123] [pytorch] Composite backend potential fix for is_backend_available (#165061) Summary: `is_backend_available` takes in a string and expects it to only be backend, if its given a composite (device:backend) string, it fails. 
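For example, after this change the check should behave roughly as sketched below (illustrative only; actual return values depend on which backends PyTorch was built with):

```
import torch.distributed as dist

# plain backend name, same behavior as before
dist.is_backend_available("gloo")

# composite device:backend string: now parsed into its per-device
# backends ({"cpu": "gloo", "cuda": "nccl"}) and each backend is
# checked individually instead of failing on the full string
dist.is_backend_available("cpu:gloo,cuda:nccl")
```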
Reviewed By: prashrock Differential Revision: D81886736 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165061 Approved by: https://github.com/H-Huang --- torch/distributed/distributed_c10d.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index ea194a6ebe9a..2419e5aecca3 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1258,6 +1258,18 @@ def is_xccl_available() -> bool: return _XCCL_AVAILABLE +def _check_single_backend_availability(backend_name: str) -> bool: + """ + Helper function to check if a single backend is available. + """ + available_func = getattr( + torch.distributed, f"is_{str(backend_name).lower()}_available", None + ) + if available_func: + return available_func() + return str(backend_name).lower() in Backend.backend_list + + def is_backend_available(backend: str) -> bool: """ Check backend availability. @@ -1271,11 +1283,16 @@ def is_backend_available(backend: str) -> bool: bool: Returns true if the backend is available otherwise false. """ # If the backend has an ``is_backend_available`` function, return the result of that function directly - available_func = getattr(torch.distributed, f"is_{backend.lower()}_available", None) - if available_func: - return available_func() - - return backend.lower() in Backend.backend_list + if ":" in backend.lower(): # composite backend like "cpu:gloo" + backend_config = BackendConfig(Backend(backend)) + device_backend_map = backend_config.get_device_backend_map() + return all( + _check_single_backend_availability(str(backend_name)) + for backend_name in device_backend_map.values() + ) + else: + # Handle simple backend strings like "nccl", "gloo" + return _check_single_backend_availability(backend) def is_initialized() -> bool: From e50dc40d28ba409930023c77a031ec0dd20fd73b Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 17 Oct 2025 22:35:50 +0000 Subject: [PATCH 081/123] Revert "Update gm.print_readable to include Annotation (#165397)" This reverts commit 7a657700131f31577544e93587eb339618677e97. 
Reverted https://github.com/pytorch/pytorch/pull/165397 on behalf of https://github.com/malfet due to I don't know how/why, but it breaks windows tests, see https://hud.pytorch.org/hud/pytorch/pytorch/2e22b1a61ea20a54448edf34a5d22fbe8391d626/1?per_page=50&name_filter=win&mergeEphemeralLF=true ([comment](https://github.com/pytorch/pytorch/pull/165397#issuecomment-3417428128)) --- test/dynamo/test_higher_order_ops.py | 30 +++++++++++++++++ test/dynamo/test_subclasses.py | 1 + test/export/test_export.py | 2 -- test/functorch/test_control_flow.py | 5 +++ test/higher_order_ops/test_invoke_subgraph.py | 22 ++++++------- test/inductor/test_compiled_autograd.py | 1 + torch/fx/graph.py | 32 +++++++++---------- 7 files changed, 63 insertions(+), 30 deletions(-) diff --git a/test/dynamo/test_higher_order_ops.py b/test/dynamo/test_higher_order_ops.py index 693c90a10b3a..8b71fe398263 100644 --- a/test/dynamo/test_higher_order_ops.py +++ b/test/dynamo/test_higher_order_ops.py @@ -3802,6 +3802,7 @@ class GraphModule(torch.nn.Module): dual: "f32[4, 3, 4, 3]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[4, 3, 4, 3]" = torch._C._functorch._unwrap_for_grad(primal, 2); primal = primals_out_unflatten = None + tangents_out_unflatten: "f32[4, 3, 4, 3]" = torch._C._functorch._unwrap_for_grad(dual, 2); dual = None _exit_dual_level = torch._C._exit_dual_level(0); _exit_dual_level = None @@ -3932,6 +3933,7 @@ class GraphModule(torch.nn.Module): tangent: "f32[4, 3, 3, 4]" = torch.zeros_like(primal) child_8: "f32[4, 3, 3, 4]" = torch._C._functorch._unwrap_for_grad(primal, 2); primal = child_8 = None + child_9: "f32[4, 3, 3, 4]" = torch._C._functorch._unwrap_for_grad(tangent, 2); tangent = None _exit_dual_level = torch._C._exit_dual_level(0); _exit_dual_level = None @@ -4144,6 +4146,7 @@ class GraphModule(torch.nn.Module): primals_out: "f32[3, 4]" = diff_primals.sin() aux_1: "f32[4, 3]" = torch._C._functorch._unwrap_for_grad(aux, 1); aux = None + results: "f32[3, 4]" = torch._C._functorch._unwrap_for_grad(primals_out, 1) _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -4378,6 +4381,7 @@ class GraphModule(torch.nn.Module): primals_out: "f32[]" = sin.sum(); sin = None aux: "f32[5]" = torch._C._functorch._unwrap_for_grad(child, 1); child = aux = None + results: "f32[]" = torch._C._functorch._unwrap_for_grad(primals_out, 1); primals_out = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -4567,6 +4571,7 @@ class GraphModule(torch.nn.Module): grad_input: "f32[3, 3, 3]" = _autograd_grad[0]; _autograd_grad = None grad_input_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(grad_input, 1); grad_input = None + output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 1); output = output_1 = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -4634,6 +4639,7 @@ class GraphModule(torch.nn.Module): grad_input: "f32[3, 3, 3]" = _autograd_grad[0]; _autograd_grad = None grad_input_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(grad_input, 1); grad_input = None + output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 1); output = output_1 = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -4690,6 +4696,7 @@ class GraphModule(torch.nn.Module): grad_input: "f32[3, 3, 3]" = _autograd_grad[0]; _autograd_grad = None grad_input_1: "f32[3, 3, 3]" 
= torch._C._functorch._unwrap_for_grad(grad_input, 1); grad_input = None + output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 1); output = output_1 = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -4746,6 +4753,7 @@ class GraphModule(torch.nn.Module): grad_input: "f32[3, 3, 3]" = _autograd_grad[0]; _autograd_grad = None grad_input_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(grad_input, 1); grad_input = None + output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 1); output = output_1 = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -4800,7 +4808,9 @@ class GraphModule(torch.nn.Module): grad_input: "f32[3, 3, 3]" = _autograd_grad[0]; _autograd_grad = None grad_input_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(grad_input, 1); grad_input = None + output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 1); output = output_1 = None + aux_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(aux, 1); aux = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -4856,7 +4866,9 @@ class GraphModule(torch.nn.Module): grad_input: "f32[3, 3, 3]" = _autograd_grad[0]; _autograd_grad = None grad_input_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(grad_input, 1); grad_input = None + output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 1); output = output_1 = None + aux_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(aux, 1); aux = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -4930,7 +4942,9 @@ class GraphModule(torch.nn.Module): _unwrap_for_grad: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(child_2, 1); child_2 = None _unwrap_for_grad_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(child_3, 1); child_3 = None + output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 1); output = output_1 = None + aux_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(aux, 1); aux = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -4974,7 +4988,9 @@ class GraphModule(torch.nn.Module): _unwrap_for_grad: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(child_2, 1); child_2 = None _unwrap_for_grad_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(child_3, 1); child_3 = None + output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 1); output = output_1 = None + aux_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(aux, 1); aux = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -5034,6 +5050,7 @@ class GraphModule(torch.nn.Module): grad_input: "f32[]" = _autograd_grad[0]; _autograd_grad = None grad_input_1: "f32[]" = torch._C._functorch._unwrap_for_grad(grad_input, 2); grad_input = None + output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 2); output = output_1 = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -5043,6 +5060,7 @@ class GraphModule(torch.nn.Module): grad_input_2: "f32[]" = _autograd_grad_1[0]; _autograd_grad_1 = None grad_input_3: "f32[]" = torch._C._functorch._unwrap_for_grad(grad_input_2, 1); grad_input_2 = None + output_2: "f32[]" = torch._C._functorch._unwrap_for_grad(grad_input_1, 1); grad_input_1 = 
output_2 = None _grad_decrement_nesting_1 = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting_1 = None @@ -5148,6 +5166,7 @@ class GraphModule(torch.nn.Module): grad_input: "f32[3, 3, 3]" = _autograd_grad[0]; _autograd_grad = None grad_input_1: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(grad_input, 1); grad_input = None + output_1: "f32[]" = torch._C._functorch._unwrap_for_grad(output, 1); output = output_1 = None _grad_decrement_nesting = torch._C._functorch._grad_decrement_nesting(); _grad_decrement_nesting = None @@ -5226,6 +5245,7 @@ class GraphModule(torch.nn.Module): dual: "f32[4, 3]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[4, 3]" = torch._C._functorch._unwrap_for_grad(primal, 2); primal = primals_out_unflatten = None + tangents_out_unflatten: "f32[4, 3]" = torch._C._functorch._unwrap_for_grad(dual, 2); dual = None _exit_dual_level = torch._C._exit_dual_level(0); _exit_dual_level = None @@ -5307,6 +5327,7 @@ class GraphModule(torch.nn.Module): dual: "f32[3, 4]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[3, 4]" = torch._C._functorch._unwrap_for_grad(primal, 2); primal = primals_out_unflatten = None + tangents_out_unflatten: "f32[3, 4]" = torch._C._functorch._unwrap_for_grad(dual, 2); dual = None _exit_dual_level = torch._C._exit_dual_level(0); _exit_dual_level = None @@ -5390,6 +5411,7 @@ class GraphModule(torch.nn.Module): dual: "f32[3, 4]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[3, 4]" = torch._C._functorch._unwrap_for_grad(primal, 2); primal = primals_out_unflatten = None + tangents_out_unflatten: "f32[3, 4]" = torch._C._functorch._unwrap_for_grad(dual, 2); dual = None _exit_dual_level = torch._C._exit_dual_level(0); _exit_dual_level = None @@ -5480,6 +5502,7 @@ class GraphModule(torch.nn.Module): child_4: "f32[3, 4]" = torch._C._functorch._unwrap_for_grad(primal, 2); primal = child_4 = None child_5: "f32[4, 3]" = torch._C._functorch._unwrap_for_grad(primal_1, 2); primal_1 = child_5 = None + child_6: "f32[3, 4]" = torch._C._functorch._unwrap_for_grad(tangent, 2); tangent = None child_7: "f32[4, 3]" = torch._C._functorch._unwrap_for_grad(dual, 2); dual = None @@ -5549,6 +5572,7 @@ class GraphModule(torch.nn.Module): dual: "f32[]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[]" = torch._C._functorch._unwrap_for_grad(primal, 1); primal = None + tangents_out_unflatten: "f32[]" = torch._C._functorch._unwrap_for_grad(dual, 1); dual = None _exit_dual_level = torch._C._exit_dual_level(0); _exit_dual_level = None @@ -5602,6 +5626,7 @@ class GraphModule(torch.nn.Module): dual: "f32[]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[]" = torch._C._functorch._unwrap_for_grad(primal, 1); primal = None + tangents_out_unflatten: "f32[]" = torch._C._functorch._unwrap_for_grad(dual, 1); dual = None _exit_dual_level = torch._C._exit_dual_level(0); _exit_dual_level = None @@ -5663,6 +5688,7 @@ class GraphModule(torch.nn.Module): dual: "f32[3, 3]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[3, 3]" = torch._C._functorch._unwrap_for_grad(primal, 1); primal = None + tangents_out_unflatten: "f32[3, 3]" = torch._C._functorch._unwrap_for_grad(dual, 1); dual = None _exit_dual_level = torch._C._exit_dual_level(0); _exit_dual_level = None @@ -5716,6 +5742,7 @@ class GraphModule(torch.nn.Module): dual: "f32[]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[]" = torch._C._functorch._unwrap_for_grad(primal, 1); primal = 
None + tangents_out_unflatten: "f32[]" = torch._C._functorch._unwrap_for_grad(dual, 1); dual = None _exit_dual_level = torch._C._exit_dual_level(0); _exit_dual_level = None @@ -5783,6 +5810,7 @@ class GraphModule(torch.nn.Module): dual: "f32[]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[]" = torch._C._functorch._unwrap_for_grad(primal, 1); primal = None + tangents_out_unflatten: "f32[]" = torch._C._functorch._unwrap_for_grad(dual, 1); dual = None _exit_dual_level = torch._C._exit_dual_level(0); _exit_dual_level = None @@ -5859,6 +5887,7 @@ class GraphModule(torch.nn.Module): dual: "f32[3, 3, 3]" = _unpack_dual[1]; _unpack_dual = None primals_out_unflatten: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(primal, 2); primal = None + tangents_out_unflatten: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(dual, 2); dual = None _set_fwd_grad_enabled_2 = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled_2 = None @@ -5873,6 +5902,7 @@ class GraphModule(torch.nn.Module): _unwrap_for_grad_2: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(primal_1, 1); primal_1 = None _unwrap_for_grad_3: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(primal_2, 1); primal_2 = None + _unwrap_for_grad_4: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(dual_1, 1); dual_1 = None _unwrap_for_grad_5: "f32[3, 3, 3]" = torch._C._functorch._unwrap_for_grad(dual_2, 1); dual_2 = None diff --git a/test/dynamo/test_subclasses.py b/test/dynamo/test_subclasses.py index 39a0dc628bae..c590abe63788 100644 --- a/test/dynamo/test_subclasses.py +++ b/test/dynamo/test_subclasses.py @@ -3166,6 +3166,7 @@ class GraphModule(torch.nn.Module): ): slice_1: "f64[s64, s55]" = torch.ops.aten.slice.Tensor(tangents_1, 1, 0, primals_10) slice_2: "f64[s64, s55]" = torch.ops.aten.slice.Tensor(tangents_1, 1, primals_10, add_2); tangents_1 = add_2 = None + add_4: "f64[s64, s55]" = torch.ops.aten.add.Tensor(slice_1, slice_2); slice_1 = slice_2 = None return ( None, # None diff --git a/test/export/test_export.py b/test/export/test_export.py index 2842723ea25b..23a7ad9bff1e 100755 --- a/test/export/test_export.py +++ b/test/export/test_export.py @@ -16061,7 +16061,6 @@ class GraphModule(torch.nn.Module): add: "f32[2, 4]" = torch.ops.aten.add.Tensor(relu, arg1_1); relu = arg1_1 = None return (add,) """, - ignore_empty_lines=True, ) ep = export(M(), (x, y), strict=strict).run_decompositions({}) @@ -16094,7 +16093,6 @@ class GraphModule(torch.nn.Module): add: "f32[2, 4]" = torch.ops.aten.add.Tensor(relu, arg1_1); relu = arg1_1 = None return (add,) """, - ignore_empty_lines=True, ) @testing.expectedFailureStrict # test_hop doesn't have a dynamo implementation diff --git a/test/functorch/test_control_flow.py b/test/functorch/test_control_flow.py index cac6ae1ba36a..e47aaa9e9e2b 100644 --- a/test/functorch/test_control_flow.py +++ b/test/functorch/test_control_flow.py @@ -8104,6 +8104,7 @@ class GraphModule(torch.nn.Module): x, = fx_pytree.tree_flatten_spec(([x], {}), self._in_spec) _guards_fn = self._guards_fn(x); _guards_fn = None + sym_size_int_1: "Sym(s77)" = torch.ops.aten.sym_size.int(x, 0) while_loop_cond_graph_0 = self.while_loop_cond_graph_0 @@ -8403,6 +8404,7 @@ class GraphModule(torch.nn.Module): x, = fx_pytree.tree_flatten_spec(([x], {}), self._in_spec) _guards_fn = self._guards_fn(x); _guards_fn = None + sym_size_int_1: "Sym(s6)" = torch.ops.aten.sym_size.int(x, 0) sin: "f32[s6, 3]" = torch.ops.aten.sin.default(x); x = None @@ -8689,8 +8691,10 @@ class GraphModule(torch.nn.Module): 
t_4: "f32[3, 3]" = torch.ops.aten.t.default(t_3); t_3 = None mul_4: "f32[3, 3]" = torch.ops.aten.mul.Tensor(arg1_1, select) mul_5: "f32[3, 3]" = torch.ops.aten.mul.Tensor(arg1_1, select); arg1_1 = select = None + add_7: "f32[3, 3]" = torch.ops.aten.add.Tensor(mm, mul_5); mm = mul_5 = None add_8: "f32[3, 3]" = torch.ops.aten.add.Tensor(add_7, mul_4); add_7 = mul_4 = None + add_9: "i64[]" = torch.ops.aten.add.Tensor(arg0_1, 1); arg0_1 = None add_10: "f32[3]" = torch.ops.aten.add.Tensor(view, arg2_1); view = arg2_1 = None add_11: "f32[3, 3]" = torch.ops.aten.add.Tensor(t_4, arg3_1); t_4 = arg3_1 = None @@ -8905,6 +8909,7 @@ class GraphModule(torch.nn.Module): x, y, z, = fx_pytree.tree_flatten_spec(([x, y, z], {}), self._in_spec) _guards_fn = self._guards_fn(x, y, z); _guards_fn = None + sym_size_int_4: "Sym(s17)" = torch.ops.aten.sym_size.int(y, 0); y = None sym_size_int_5: "Sym(s68)" = torch.ops.aten.sym_size.int(z, 0) diff --git a/test/higher_order_ops/test_invoke_subgraph.py b/test/higher_order_ops/test_invoke_subgraph.py index 700751942ba1..ffbefe5cd9b4 100644 --- a/test/higher_order_ops/test_invoke_subgraph.py +++ b/test/higher_order_ops/test_invoke_subgraph.py @@ -17,7 +17,6 @@ from functorch.compile import aot_function, nop from torch._dynamo.testing import ( AotEagerAndRecordGraphs, EagerAndRecordGraphs, - empty_line_normalizer, InductorAndRecordGraphs, normalize_gm, ) @@ -352,8 +351,10 @@ class GraphModule(torch.nn.Module): getitem_14: "f32[8]" = invoke_subgraph_6[2] getitem_13: "f32[8]" = invoke_subgraph_6[1] getitem_1: "f32[8]" = invoke_subgraph_6[0]; invoke_subgraph_6 = None + add: "f32[8]" = torch.ops.aten.add.Tensor(getitem, getitem_1); getitem = getitem_1 = None return (add, getitem_12, getitem_11, getitem_10, getitem_15, getitem_14, getitem_13) + class partitioned_fw_subgraph_0_0(torch.nn.Module): def forward(self, primals_0: "f32[8]", primals_1: "f32[8]", primals_2: "f32[8]"): mul: "f32[8]" = torch.ops.aten.mul.Tensor(primals_0, primals_1) @@ -362,7 +363,6 @@ class GraphModule(torch.nn.Module): mul_2: "f32[8]" = torch.ops.aten.mul.Tensor(mul_1, primals_2); mul_1 = None return (mul_2, primals_0, primals_1, primals_2) """, - ignore_empty_lines=True, ) self.assertExpectedInline( normalize_gm(backend.bw_graphs[0].print_readable(print_output=False)), @@ -377,6 +377,7 @@ class GraphModule(torch.nn.Module): invoke_subgraph_5 = torch.ops.higher_order.invoke_subgraph(partitioned_bw_subgraph_0_0, 'partitioned_bw_subgraph_0_0', getitem_10, getitem_11, getitem_12, tangents_1); partitioned_bw_subgraph_0_0 = getitem_10 = getitem_11 = getitem_12 = tangents_1 = None getitem_6: "f32[8]" = invoke_subgraph_5[0] getitem_7: "f32[8]" = invoke_subgraph_5[1]; invoke_subgraph_5 = None + add_1: "f32[8]" = torch.ops.aten.add.Tensor(getitem_2, getitem_6); getitem_2 = getitem_6 = None add_2: "f32[8]" = torch.ops.aten.add.Tensor(getitem_3, getitem_7); getitem_3 = getitem_7 = None return (add_1, add_2, None) @@ -392,7 +393,6 @@ class GraphModule(torch.nn.Module): mul_7: "f32[8]" = torch.ops.aten.mul.Tensor(mul_5, primals_1); mul_5 = primals_1 = None return (mul_7, mul_6, None) """, - ignore_empty_lines=True, ) def test_buffer_mutation_works_under_no_grad(self): @@ -681,7 +681,6 @@ class GraphModule(torch.nn.Module): sin: "f32[8]" = torch.ops.aten.sin.default(primals_0) return (sin, primals_0) """, - ignore_empty_lines=True, ) @inductor_config.patch("fx_graph_cache", False) @@ -723,7 +722,6 @@ class (torch.nn.Module): mul_1: "f32[8]" = torch.ops.aten.mul.Tensor(mul, 2.0); mul = None return (mul_1,) """, - 
ignore_empty_lines=True, ) def test_dedupe(self): @@ -772,6 +770,7 @@ class GraphModule(torch.nn.Module): subgraph_0 = self.subgraph_0 invoke_subgraph = torch.ops.higher_order.invoke_subgraph(subgraph_0, 'subgraph_0', l_x_, l_y_); subgraph_0 = l_x_ = None a: "f32[8]" = invoke_subgraph[0]; invoke_subgraph = None + subgraph_1 = self.subgraph_0 invoke_subgraph_1 = torch.ops.higher_order.invoke_subgraph(subgraph_1, 'subgraph_0', a, l_y_); subgraph_1 = a = l_y_ = None getitem_1: "f32[8]" = invoke_subgraph_1[0]; invoke_subgraph_1 = None @@ -807,7 +806,6 @@ class GraphModule(torch.nn.Module): mul: "f32[8]" = torch.ops.aten.mul.Tensor(primals_0, primals_1) return (mul, primals_0, primals_1) """, - ignore_empty_lines=True, ) def test_dce(self): @@ -891,6 +889,7 @@ class GraphModule(torch.nn.Module): subgraph_0 = self.subgraph_0 invoke_subgraph = torch.ops.higher_order.invoke_subgraph(subgraph_0, 'subgraph_0', l_x_, l_y_); subgraph_0 = l_x_ = None a: "f32[8]" = invoke_subgraph[0]; invoke_subgraph = None + subgraph_1 = self.subgraph_1 invoke_subgraph_1 = torch.ops.higher_order.invoke_subgraph(subgraph_1, 'subgraph_1', a, l_y_); subgraph_1 = a = l_y_ = None getitem_1: "f32[8]" = invoke_subgraph_1[0]; invoke_subgraph_1 = None @@ -1536,6 +1535,7 @@ class GraphModule(torch.nn.Module): def forward(self, tangents_0: "f32[8, 8]", tangents_1: "f32[8, 8]"): mul_2: "f32[8, 8]" = torch.ops.aten.mul.Tensor(tangents_1, 3) mul_3: "f32[8, 8]" = torch.ops.aten.mul.Tensor(tangents_1, 2); tangents_1 = None + add: "f32[8, 8]" = torch.ops.aten.add.Tensor(mul_2, mul_3); mul_2 = mul_3 = None return (add,) """, @@ -2145,6 +2145,7 @@ class GraphModule(torch.nn.Module): subgraph_0 = self.subgraph_0 invoke_subgraph = torch.ops.higher_order.invoke_subgraph(subgraph_0, 'subgraph_0', x, y); subgraph_0 = x = None z: "f32[5]" = invoke_subgraph[0]; invoke_subgraph = None + subgraph_1 = self.subgraph_1 invoke_subgraph_1 = torch.ops.higher_order.invoke_subgraph(subgraph_1, 'subgraph_1', z, y); subgraph_1 = z = y = None getitem_1: "f32[5]" = invoke_subgraph_1[0]; invoke_subgraph_1 = None @@ -2282,7 +2283,6 @@ class GraphModule(torch.nn.Module): cos: "f32[s77, 16]" = torch.ops.aten.cos.default(primals_1) return (cos, primals_1, primals_0) """, - ignore_empty_lines=True, ) self.assertExpectedInline( normalize_gm(backend.bw_graphs[0].print_readable(print_output=False)), @@ -2294,6 +2294,7 @@ class GraphModule(torch.nn.Module): partitioned_bw_subgraph_0_0 = self.partitioned_bw_subgraph_0_0 invoke_subgraph_15 = torch.ops.higher_order.invoke_subgraph(partitioned_bw_subgraph_0_0, 'partitioned_bw_subgraph_0_0', getitem_23, getitem_22, expand); partitioned_bw_subgraph_0_0 = getitem_23 = getitem_22 = None getitem_5: "f32[s77, 16]" = invoke_subgraph_15[1]; invoke_subgraph_15 = None + add_16: "f32[s77, 16]" = torch.ops.aten.add.Tensor(expand, getitem_5); expand = getitem_5 = None partitioned_bw_subgraph_0_3 = self.partitioned_bw_subgraph_0_1 @@ -2325,7 +2326,6 @@ class GraphModule(torch.nn.Module): mul_10: "f32[s77, 16]" = torch.ops.aten.mul.Tensor(tangents_0, neg); tangents_0 = neg = None return (None, mul_10) """, - ignore_empty_lines=True, ) def test_div(self): @@ -2535,19 +2535,19 @@ class TestInvokeSubgraphExport(TestCase): self.assertEqual(len(list(ep.graph_module.named_modules())), 2) self.assertExpectedInline( - empty_line_normalizer( - normalize_gm(ep.graph_module.print_readable(print_output=False)) - ), + normalize_gm(ep.graph_module.print_readable(print_output=False)), """\ class GraphModule(torch.nn.Module): def forward(self, x: 
"f32[8]", y: "f32[8]"): repeated_subgraph0 = self.repeated_subgraph0 invoke_subgraph = torch.ops.higher_order.invoke_subgraph(repeated_subgraph0, 'subgraph_0', x, y); repeated_subgraph0 = x = None getitem: "f32[8]" = invoke_subgraph[0]; invoke_subgraph = None + repeated_subgraph0_1 = self.repeated_subgraph0 invoke_subgraph_1 = torch.ops.higher_order.invoke_subgraph(repeated_subgraph0_1, 'subgraph_0', getitem, y); repeated_subgraph0_1 = getitem = y = None getitem_1: "f32[8]" = invoke_subgraph_1[0]; invoke_subgraph_1 = None return (getitem_1,) + class repeated_subgraph0(torch.nn.Module): def forward(self, arg0_1: "f32[8]", arg1_1: "f32[8]"): mul: "f32[8]" = torch.ops.aten.mul.Tensor(arg0_1, arg1_1); arg0_1 = arg1_1 = None diff --git a/test/inductor/test_compiled_autograd.py b/test/inductor/test_compiled_autograd.py index fee2b289db90..2612af01f6ff 100644 --- a/test/inductor/test_compiled_autograd.py +++ b/test/inductor/test_compiled_autograd.py @@ -3621,6 +3621,7 @@ class CompiledAutograd0(torch.nn.Module): aot0_mul_2 = torch.ops.aten.mul.Tensor(aot0_tangents_1, aot0_primals_1); aot0_tangents_1 = aot0_primals_1 = None aot0_mul_3 = torch.ops.aten.mul.Tensor(aot0_tangents_2, aot0_primals_2); aot0_tangents_2 = aot0_primals_2 = None + aot0_add_2 = torch.ops.aten.add.Tensor(aot0_mul_2, aot0_mul_2); aot0_mul_2 = None aot0_add_3 = torch.ops.aten.add.Tensor(aot0_mul_3, aot0_mul_3); aot0_mul_3 = None diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 7577b6bc6148..940737e7e3a6 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -606,31 +606,29 @@ class CodeGen: else: body.append("\n") - prev_summary_str = None + prev_stacktrace = None def append_stacktrace_summary(node: Node): """ Append a summary of the stacktrace to the generated code. This is useful for debugging. 
""" - nonlocal prev_summary_str + nonlocal prev_stacktrace if node.op not in {"placeholder", "output"}: - annotation_str = "" - annotation = node.meta.get("custom", {}) - if annotation: - annotation_str = f" Annotation: {annotation}" - - stack_trace_str = "No stacktrace found for following nodes" - if stack_trace := node.stack_trace: - if parsed_stack_trace := _parse_stack_trace(stack_trace): - stack_trace_str = parsed_stack_trace.get_summary_str() - - summary_str = f"\n{dim(f'#{annotation_str} {stack_trace_str}')}\n" - - if summary_str != prev_summary_str: - prev_summary_str = summary_str - body.append(summary_str) + stack_trace = node.stack_trace + if stack_trace: + if stack_trace != prev_stacktrace: + prev_stacktrace = stack_trace + if parsed_stack_trace := _parse_stack_trace(stack_trace): + summary_str = parsed_stack_trace.get_summary_str() + else: + summary_str = "" + body.append(f"\n {dim(f'# {summary_str}')}\n") + elif prev_stacktrace != "": + prev_stacktrace = "" + no_stacktrace_msg = "# No stacktrace found for following nodes" + body.append(f"\n{dim(no_stacktrace_msg)}\n") def stringify_shape(shape: Iterable) -> str: return f"[{', '.join([str(x) for x in shape])}]" From fe80f03726a7a50439be063327b67c7fba6279b2 Mon Sep 17 00:00:00 2001 From: drisspg Date: Fri, 17 Oct 2025 17:00:44 +0000 Subject: [PATCH 082/123] Add B200 files to labeler and update codeowners (#165767) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165767 Approved by: https://github.com/slayton58 --- .github/labeler.yml | 29 +++++++++++++++++++++++++++++ CODEOWNERS | 14 ++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/.github/labeler.yml b/.github/labeler.yml index eb4076d81331..7b47b9fefb5d 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -133,3 +133,32 @@ "ciflow/vllm": - .github/ci_commit_pins/vllm.txt + +"ciflow/b200": +- test/test_matmul_cuda.py +- test/test_scaled_matmul_cuda.py +- test/inductor/test_fp8.py +- aten/src/ATen/native/cuda/Blas.cpp +- torch/**/*cublas* +- torch/_inductor/kernel/mm.py +- test/inductor/test_max_autotune.py +- third_party/fbgemm + +"ciflow/h100": +- test/test_matmul_cuda.py +- test/test_scaled_matmul_cuda.py +- test/inductor/test_fp8.py +- aten/src/ATen/native/cuda/Blas.cpp +- torch/**/*cublas* +- torch/_inductor/kernel/mm.py +- test/inductor/test_max_autotune.py +- third_party/fbgemm + +"ciflow/rocm": +- test/test_matmul_cuda.py +- test/test_scaled_matmul_cuda.py +- test/inductor/test_fp8.py +- aten/src/ATen/native/cuda/Blas.cpp +- torch/_inductor/kernel/mm.py +- test/inductor/test_max_autotune.py +- third_party/fbgemm diff --git a/CODEOWNERS b/CODEOWNERS index 1f0943d3ad54..cc249dc4f43a 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -201,3 +201,17 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A /torch/csrc/stable/ @janeyx99 @mikaylagawarecki /torch/headeronly/ @janeyx99 /torch/header_only_apis.txt @janeyx99 + +# FlexAttention +/torch/nn/attention/flex_attention.py @drisspg +/torch/_higher_order_ops/flex_attention.py @drisspg +/torch/_inductor/kernel/flex/ @drisspg +/torch/_inductor/codegen/cpp_flex_attention_template.py @drisspg +/test/inductor/test_flex_attention.py @drisspg +/test/inductor/test_flex_decoding.py @drisspg + +# Low Precision GEMMs +/aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58 +/aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58 +/aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58 +/test/test_scaled_matmul_cuda.py @drisspg @slayton58 From 1b397420f22b22f90a1093233ecd9167656e50cb Mon Sep 17 00:00:00 2001 From: 
Dzmitry Huba Date: Fri, 17 Oct 2025 09:01:44 -0700 Subject: [PATCH 083/123] Enable more DTensor tests in local tensor mode and fix more integration issues (#165716) - During op dispatch local tensor is supposed to collect rng state from CPU and CUDA devices so that it can be reset before execution of the op for each such that ops with randomness produces the same result for all ranks (note that we are planning a separate change to add support of per rank rng state). Previously we relied on op input arguments to deduce which devices to get rng state from. Which doesn't work for factory functions such torch.randn. Hence this changes switches to uncondionally collecting rng state from all devices. - Fixing per rank specific computations in _MaskedPartial and Shard placements discovered during test enablement. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165716 Approved by: https://github.com/ezyang --- test/distributed/tensor/test_tensor_ops.py | 15 +++- torch/distributed/_local_tensor/__init__.py | 78 +++++++++++++++++-- .../distributed/tensor/_ops/_embedding_ops.py | 41 ++++++---- torch/distributed/tensor/_sharding_prop.py | 3 + torch/distributed/tensor/debug/__init__.py | 11 +++ torch/distributed/tensor/placement_types.py | 18 ++++- torch/testing/_internal/common_distributed.py | 16 +++- .../distributed/_tensor/common_dtensor.py | 3 + 8 files changed, 155 insertions(+), 30 deletions(-) diff --git a/test/distributed/tensor/test_tensor_ops.py b/test/distributed/tensor/test_tensor_ops.py index eaa1969068c1..8368befabfec 100644 --- a/test/distributed/tensor/test_tensor_ops.py +++ b/test/distributed/tensor/test_tensor_ops.py @@ -17,6 +17,7 @@ from torch.distributed.tensor.debug import CommDebugMode from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from torch.testing._internal.common_utils import run_tests, skipIfRocm from torch.testing._internal.distributed._tensor.common_dtensor import ( + create_local_tensor_test_class, DTensorConverter, DTensorTestBase, with_comms, @@ -704,6 +705,12 @@ class DistTensorOpsTest(DTensorTestBase): @with_comms def test_dtensor_dtype_conversion(self): + from torch.distributed.tensor.debug import ( + _clear_sharding_prop_cache, + _get_sharding_prop_cache_info, + ) + + _clear_sharding_prop_cache() device_mesh = self.build_device_mesh() shard_spec = [Shard(0)] # by default we start from bf16 dtype @@ -722,8 +729,6 @@ class DistTensorOpsTest(DTensorTestBase): self.assertEqual(bf16_sharded_dtensor1.dtype, torch.bfloat16) self.assertEqual(bf16_sharded_dtensor1.to_local().dtype, torch.bfloat16) - from torch.distributed.tensor.debug import _get_sharding_prop_cache_info - # by this point we only have cache misses hits, misses, _, _ = _get_sharding_prop_cache_info() self.assertEqual(hits, 0) @@ -775,7 +780,7 @@ class DistTensorOpsTest(DTensorTestBase): ) def _test_split_on_partial(self, reduce_op: str, split_size: int, split_dim: int): - torch.manual_seed(self.rank) + self.init_manual_seed_for_rank() mesh = self.build_device_mesh() partial_tensor = torch.randn(8, 8, device=self.device_type) @@ -822,5 +827,9 @@ class DistTensorOpsTest(DTensorTestBase): self.assertEqual(x.full_tensor(), y) +DistTensorOpsTestWithLocalTensor = create_local_tensor_test_class( + DistTensorOpsTest, +) + if __name__ == "__main__": run_tests() diff --git a/torch/distributed/_local_tensor/__init__.py b/torch/distributed/_local_tensor/__init__.py index d9eb7b47e9a3..8121b367790a 100644 --- a/torch/distributed/_local_tensor/__init__.py +++ 
b/torch/distributed/_local_tensor/__init__.py @@ -104,6 +104,62 @@ def _map_to_rank_local_val(val: Any, rank: int) -> Any: return val +def collect_cuda_rng_states() -> list[torch.Tensor]: + """ + Collects RNG state from all available CUDA devices. + + Returns: + List of RNG state tensors, one for each CUDA device. + Returns empty list if CUDA is not available. + """ + if not torch.cuda.is_available(): + return [] + + num_devices = torch.cuda.device_count() + rng_states = [] + + for device_idx in range(num_devices): + with torch.cuda.device(device_idx): + rng_state = torch.cuda.get_rng_state() + rng_states.append(rng_state) + + return rng_states + + +def set_cuda_rng_states(rng_states: list[torch.Tensor]) -> None: + """ + Sets RNG state for all CUDA devices from a list of states. + + Args: + rng_states: List of RNG state tensors to restore. + """ + if not torch.cuda.is_available(): + return + + num_devices = min(len(rng_states), torch.cuda.device_count()) + + for device_idx in range(num_devices): + with torch.cuda.device(device_idx): + torch.cuda.set_rng_state(rng_states[device_idx]) + + +def _get_rng_state() -> tuple[torch.Tensor, list[torch.Tensor]]: + """ + Gets CPU and CUDA rng states from all devices. + """ + return (torch.get_rng_state(), collect_cuda_rng_states()) + + +def _set_rng_state(cpu_state: torch.Tensor, cuda_states: list[torch.Tensor]) -> None: + """ + Sets CPU and CUDA rng states for all devices. If the list of cuda states + is shorter than the number of devices only the first len(cuda_states) devices + will get their rng state set. + """ + torch.set_rng_state(cpu_state) + set_cuda_rng_states(cuda_states) + + def _for_each_rank_run_func( func: Callable[..., Any], ranks: frozenset[int], @@ -117,14 +173,15 @@ def _for_each_rank_run_func( a.wait() if isinstance(a, AsyncCollectiveTensor) else a for a in flat_args ] - cpu_state = torch.get_rng_state() - devices, states = get_device_states((args, kwargs)) - + # NB: Before invoking an op we are collecting rng states from CPU and + # CUDA devices such that we can reset to the same before invoking op + # for each rank. This is not very efficient and will likely be revisited + # to support per rank rng state. + rng_state = _get_rng_state() flat_rank_rets = {} for r in sorted(ranks): - torch.set_rng_state(cpu_state) - set_device_states(devices, states) + _set_rng_state(*rng_state) rank_flat_args = [_map_to_rank_local_val(a, r) for a in flat_args] rank_args, rank_kwargs = pytree.tree_unflatten(rank_flat_args, args_spec) rank_ret = func(*rank_args, **rank_kwargs) @@ -704,6 +761,11 @@ class _LocalDeviceMesh: @staticmethod def get_coordinate(self: DeviceMesh) -> Optional[list[int] | None]: + # NB: In order to support submeshes the code below recreates for each + # rank submesh with the same mesh dimensions as current mesh. We are + # doing this because when submesh is created it is created for a particular + # rank (therefore below we are patching get_rank method). We are trying to + # limit the invasiveness of local tensor. lm = local_tensor_mode() assert lm is not None, "Unexpectedly not in LocalTensorMode" @@ -716,7 +778,9 @@ class _LocalDeviceMesh: coords[d][r] = c out = [torch.SymInt(LocalIntNode(c)) for c in coords] - + # The output contains coordinates for each of the ranks with respect to + # their meshes formed from root mesh and selecting the same dimensions + # as the current mesh. 
return out # type: ignore[return-value] @@ -794,8 +858,6 @@ def maybe_run_for_local_tensor(func: Callable[..., Any]) -> Callable[..., Any]: with lm.disable(): ret = _for_each_rank_run_func(func, lm.ranks, args, kwargs, alias=False) - lm = local_tensor_mode() - assert lm is not None return ret return wrapper diff --git a/torch/distributed/tensor/_ops/_embedding_ops.py b/torch/distributed/tensor/_ops/_embedding_ops.py index 445b1830defe..283cffb78efd 100644 --- a/torch/distributed/tensor/_ops/_embedding_ops.py +++ b/torch/distributed/tensor/_ops/_embedding_ops.py @@ -6,6 +6,7 @@ from typing import cast, Optional import torch import torch.distributed._functional_collectives as funcol +from torch.distributed._local_tensor import maybe_run_for_local_tensor from torch.distributed.device_mesh import DeviceMesh from torch.distributed.tensor._op_schema import ( OpSchema, @@ -83,20 +84,11 @@ class _MaskPartial(Partial): offset_shape: Optional[torch.Size] = None offset_dim: int = 0 - def _partition_value( - self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int - ) -> torch.Tensor: - # override parent logic to perform partial mask for embedding - num_chunks = mesh.size(mesh_dim) - # get local shard size and offset on the embedding_dim - assert self.offset_shape is not None, ( - "offset_shape needs to be set for _MaskPartial" - ) - local_shard_size, local_offset_on_dim = Shard.local_shard_size_and_offset( - self.offset_shape[self.offset_dim], - num_chunks, - mesh.get_local_rank(mesh_dim), - ) + @staticmethod + @maybe_run_for_local_tensor + def _mask_tensor( + tensor: torch.Tensor, local_offset_on_dim: int, local_shard_size: int + ) -> tuple[torch.Tensor, torch.Tensor]: # Build the input mask and save it for the current partial placement # this is so that the output of embedding op can reuse the same partial # placement saved mask to perform mask + reduction @@ -106,6 +98,27 @@ class _MaskPartial(Partial): # mask the input tensor masked_tensor = tensor.clone() - local_offset_on_dim masked_tensor[mask] = 0 + return mask, masked_tensor + + def _partition_value( + self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int + ) -> torch.Tensor: + my_coordinate = mesh.get_coordinate() + assert my_coordinate is not None, "my_coordinate should not be None" + # override parent logic to perform partial mask for embedding + num_chunks = mesh.size(mesh_dim) + # get local shard size and offset on the embedding_dim + assert self.offset_shape is not None, ( + "offset_shape needs to be set for _MaskPartial" + ) + local_shard_size, local_offset_on_dim = Shard.local_shard_size_and_offset( + self.offset_shape[self.offset_dim], + num_chunks, + my_coordinate[mesh_dim], + ) + mask, masked_tensor = _MaskPartial._mask_tensor( + tensor, local_offset_on_dim, local_shard_size + ) # materialize the mask buffer to be used for reduction self.mask_buffer.materialize_mask(mask) return masked_tensor diff --git a/torch/distributed/tensor/_sharding_prop.py b/torch/distributed/tensor/_sharding_prop.py index 4af72b4d3d8f..c1af2c131717 100644 --- a/torch/distributed/tensor/_sharding_prop.py +++ b/torch/distributed/tensor/_sharding_prop.py @@ -48,6 +48,9 @@ class LocalLRUCache(threading.local): def cache_info(self): return self.cache.cache_info() + def cache_clear(self): + return self.cache.cache_clear() + class ShardingPropagator: def __init__(self) -> None: diff --git a/torch/distributed/tensor/debug/__init__.py b/torch/distributed/tensor/debug/__init__.py index e5bf3b833fe4..a74f1449ad12 100644 --- 
a/torch/distributed/tensor/debug/__init__.py +++ b/torch/distributed/tensor/debug/__init__.py @@ -19,6 +19,17 @@ def _get_sharding_prop_cache_info(): ) +def _clear_sharding_prop_cache(): + """ + Clears the cache for the sharding propagation cache, used for debugging purpose only. + """ + from torch.distributed.tensor._api import DTensor + + return ( + DTensor._op_dispatcher.sharding_propagator.propagate_op_sharding.cache_clear() # type:ignore[attr-defined] + ) + + # Set namespace for exposed private names CommDebugMode.__module__ = "torch.distributed.tensor.debug" visualize_sharding.__module__ = "torch.distributed.tensor.debug" diff --git a/torch/distributed/tensor/placement_types.py b/torch/distributed/tensor/placement_types.py index 5f68ff03ee22..8930d3b1b29c 100644 --- a/torch/distributed/tensor/placement_types.py +++ b/torch/distributed/tensor/placement_types.py @@ -359,6 +359,16 @@ class Shard(Placement): return Shard._select_shard(shards, shard_index) + @staticmethod + @maybe_run_for_local_tensor + def _get_shard_pad_size( + full_size: int, local_tensor: torch.Tensor, dim: int + ) -> int: + """ + Get the padding size of the local tensor on the shard dimension. + """ + return full_size - local_tensor.size(dim) + def _to_new_shard_dim( self, local_tensor: torch.Tensor, @@ -387,14 +397,16 @@ class Shard(Placement): old_dim_full_chunk_size = ( old_dim_logical_size + num_chunks - 1 ) // num_chunks - old_dim_pad_size = old_dim_full_chunk_size - local_tensor.size(self.dim) + old_dim_pad_size = Shard._get_shard_pad_size( + old_dim_full_chunk_size, local_tensor, self.dim + ) local_tensor = pad_tensor(local_tensor, self.dim, old_dim_pad_size) if new_dim_padding: new_dim_full_chunk_size = ( new_dim_logical_size + num_chunks - 1 ) // num_chunks - new_dim_pad_size = new_dim_full_chunk_size * num_chunks - local_tensor.size( - new_shard_dim + new_dim_pad_size = Shard._get_shard_pad_size( + new_dim_full_chunk_size * num_chunks, local_tensor, new_shard_dim ) local_tensor = pad_tensor(local_tensor, new_shard_dim, new_dim_pad_size) diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index 17a317463cb5..64ea87852a86 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -211,6 +211,14 @@ def at_least_x_gpu(x): return False +def _maybe_handle_skip_if_lt_x_gpu(args, msg) -> bool: + _handle_test_skip = getattr(args[0], "_handle_test_skip", None) + if len(args) == 0 or _handle_test_skip is None: + return False + _handle_test_skip(msg) + return True + + def skip_if_lt_x_gpu(x): def decorator(func): @wraps(func) @@ -221,7 +229,9 @@ def skip_if_lt_x_gpu(x): return func(*args, **kwargs) if TEST_XPU and torch.xpu.device_count() >= x: return func(*args, **kwargs) - sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code) + test_skip = TEST_SKIPS[f"multi-gpu-{x}"] + if _maybe_handle_skip_if_lt_x_gpu(args, test_skip.message): + sys.exit(test_skip.exit_code) return wrapper @@ -237,7 +247,9 @@ def nccl_skip_if_lt_x_gpu(backend, x): return func(*args, **kwargs) if torch.cuda.is_available() and torch.cuda.device_count() >= x: return func(*args, **kwargs) - sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code) + test_skip = TEST_SKIPS[f"multi-gpu-{x}"] + if _maybe_handle_skip_if_lt_x_gpu(args, test_skip.message): + sys.exit(test_skip.exit_code) return wrapper diff --git a/torch/testing/_internal/distributed/_tensor/common_dtensor.py b/torch/testing/_internal/distributed/_tensor/common_dtensor.py index 
6c506c51e68a..a9beb0e60865 100644 --- a/torch/testing/_internal/distributed/_tensor/common_dtensor.py +++ b/torch/testing/_internal/distributed/_tensor/common_dtensor.py @@ -701,6 +701,9 @@ class DTensorConverter: class LocalDTensorTestBase(DTensorTestBase): + def _handle_test_skip(self, msg: str) -> None: + self.skipTest(msg) + def _get_local_tensor_mode(self): return LocalTensorMode(frozenset(range(0, self.world_size))) From 69c33898fa99f7c4552401a630a77675119c7ce7 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 17 Oct 2025 23:33:17 +0000 Subject: [PATCH 084/123] Revert "[Inductor][CuTeDSL] Move load_template up two directories (#165347) (#165576)" This reverts commit febb60323018948b2b9d2cff35b3cc4e0d0c55c8. Reverted https://github.com/pytorch/pytorch/pull/165576 on behalf of https://github.com/seemethere due to This was actually reverted internally, current PR is linked to a stale diff so diff train tools think that this is landed via co-dev when it was actually reverted ([comment](https://github.com/pytorch/pytorch/pull/165576#issuecomment-3417510146)) --- torch/_inductor/kernel/flex/common.py | 12 ++++++++---- torch/_inductor/kernel/flex/flex_attention.py | 10 +++++----- torch/_inductor/kernel/flex/flex_decoding.py | 8 ++++---- torch/_inductor/kernel/flex/flex_flash_attention.py | 5 ++--- torch/_inductor/utils.py | 11 ----------- 5 files changed, 19 insertions(+), 27 deletions(-) diff --git a/torch/_inductor/kernel/flex/common.py b/torch/_inductor/kernel/flex/common.py index a83de2478a1d..3cd3056a7600 100644 --- a/torch/_inductor/kernel/flex/common.py +++ b/torch/_inductor/kernel/flex/common.py @@ -3,7 +3,6 @@ import math from collections.abc import Sequence -from functools import partial from pathlib import Path from typing import Any, Optional, Union @@ -37,7 +36,6 @@ from ...lowering import ( to_dtype, ) from ...select_algorithm import realize_inputs -from ...utils import load_template SubgraphResults = Union[list[Optional[ComputedBuffer]], Optional[ComputedBuffer]] @@ -339,7 +337,13 @@ def next_power_of_two(n): return 2 ** math.ceil(math.log2(n)) -_FLEX_TEMPLATE_DIR = Path(__file__).parent / "templates" -load_flex_template = partial(load_template, template_dir=_FLEX_TEMPLATE_DIR) +_TEMPLATE_DIR = Path(__file__).parent / "templates" + + +def load_template(name: str) -> str: + """Load a template file and return its content.""" + with open(_TEMPLATE_DIR / f"{name}.py.jinja") as f: + return f.read() + # Template strings have been moved to templates/common.py.jinja diff --git a/torch/_inductor/kernel/flex/flex_attention.py b/torch/_inductor/kernel/flex/flex_attention.py index e692b3237121..203ceeb112d1 100644 --- a/torch/_inductor/kernel/flex/flex_attention.py +++ b/torch/_inductor/kernel/flex/flex_attention.py @@ -29,7 +29,7 @@ from .common import ( freeze_irnodes, get_fwd_subgraph_outputs, infer_dense_strides, - load_flex_template, + load_template, maybe_realize, set_head_dim_values, SubgraphResults, @@ -79,9 +79,9 @@ def get_float32_precision(): flex_attention_template = TritonTemplate( name="flex_attention", grid=flex_attention_grid, - source=load_flex_template("flex_attention") - + load_flex_template("utilities") - + load_flex_template("common"), + source=load_template("flex_attention") + + load_template("utilities") + + load_template("common"), ) @@ -464,7 +464,7 @@ def flex_attention_backward_grid( flex_attention_backward_template = TritonTemplate( name="flex_attention_backward", grid=flex_attention_backward_grid, - source=load_flex_template("flex_backwards") + 
load_flex_template("utilities"), + source=load_template("flex_backwards") + load_template("utilities"), ) diff --git a/torch/_inductor/kernel/flex/flex_decoding.py b/torch/_inductor/kernel/flex/flex_decoding.py index bdab06eb0661..4374a93e8d0b 100644 --- a/torch/_inductor/kernel/flex/flex_decoding.py +++ b/torch/_inductor/kernel/flex/flex_decoding.py @@ -22,7 +22,7 @@ from .common import ( create_num_blocks_fake_generator, freeze_irnodes, get_fwd_subgraph_outputs, - load_flex_template, + load_template, maybe_realize, set_head_dim_values, ) @@ -97,9 +97,9 @@ def flex_decoding_grid(batch_size, kv_heads, gqa_group_size, n_keys, d_model, me flex_decoding_template = TritonTemplate( name="flex_decoding", grid=flex_decoding_grid, - source=load_flex_template("flex_decode") - + load_flex_template("utilities") - + load_flex_template("common"), + source=load_template("flex_decode") + + load_template("utilities") + + load_template("common"), ) diff --git a/torch/_inductor/kernel/flex/flex_flash_attention.py b/torch/_inductor/kernel/flex/flex_flash_attention.py index 5fedcedf6488..bcb235bd29d0 100644 --- a/torch/_inductor/kernel/flex/flex_flash_attention.py +++ b/torch/_inductor/kernel/flex/flex_flash_attention.py @@ -12,7 +12,7 @@ from torch.fx import GraphModule from ...ir import FixedLayout, ShapeAsConstantBuffer, Subgraph, TensorBox from ...lowering import empty_strided -from .common import infer_dense_strides, load_flex_template, SubgraphResults +from .common import infer_dense_strides, load_template, SubgraphResults aten = torch.ops.aten @@ -36,8 +36,7 @@ from ...codegen.cutedsl.cutedsl_template import CuteDSLTemplate flash_attention_cutedsl_template = CuteDSLTemplate( - name="flash_attention_cutedsl", - source=load_flex_template("flash_attention"), + name="flash_attention_cutedsl", source=load_template("flash_attention") ) diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py index 6d7b58a96a56..233a294aaed6 100644 --- a/torch/_inductor/utils.py +++ b/torch/_inductor/utils.py @@ -67,10 +67,6 @@ from torch.utils._ordered_set import OrderedSet from torch.utils._pytree import tree_flatten, tree_map_only -if TYPE_CHECKING: - from pathlib import Path - - OPTIMUS_EXCLUDE_POST_GRAD = [ "activation_quantization_aten_pass", "inductor_autotune_lookup_table", @@ -3890,10 +3886,3 @@ def is_nonfreeable_buffers(dep: Dep) -> bool: return dep_name.startswith( ("primals_", "arg", "fwd_rng_state", "bwd_rng_state", "tangents") ) - - -# Make sure to also include your jinja templates within torch_package_data in setup.py, or this function won't be able to find them -def load_template(name: str, template_dir: Path) -> str: - """Load a template file and return its content.""" - with open(template_dir / f"{name}.py.jinja") as f: - return f.read() From a25a649e705447b55f5c8b91157472c00c0c42cd Mon Sep 17 00:00:00 2001 From: Shivam Raikundalia Date: Fri, 17 Oct 2025 23:46:02 +0000 Subject: [PATCH 085/123] [Mem Snapshot] Add Metadata Field (#165490) Summary: The implementation adds the ability to: Set custom metadata strings that will be attached to all subsequent allocations Clear or change the metadata at any point View the metadata in memory snapshots via _dump_snapshot() Test Plan: Added test in test_cuda.py and check manually in snapshot to see that metadata was added. 
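For illustration, here is a rough usage sketch of the API added in this patch, mirroring the new test in test_cuda.py (the underscore-prefixed helpers are private, and the sketch assumes a CUDA build using the native caching allocator):

```python
import torch

# Tag every allocation made from this point on; pass "" to clear the tag.
torch.cuda.memory._set_memory_metadata("phase=warmup")
torch.cuda.memory._record_memory_history(context="all")

x = torch.rand(3, 4, device="cuda")
del x
torch.cuda.memory.empty_cache()
torch.cuda.memory._set_memory_metadata("")  # stop tagging new allocations

# Each trace entry recorded while the tag was active now carries it.
snapshot = torch.cuda.memory._snapshot()
for event in snapshot["device_traces"][0]:
    print(event.get("action"), event.get("user_metadata"))

torch.cuda.memory._record_memory_history(None)  # stop recording
```

The tag string "phase=warmup" is just an example value; any string can be used.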
Differential Revision: D84654933 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165490 Approved by: https://github.com/yushangdi --- c10/cuda/CUDACachingAllocator.cpp | 27 ++++++++++++++++++++++++++- c10/cuda/CUDACachingAllocator.h | 19 +++++++++++++++++-- test/test_cuda.py | 22 ++++++++++++++++++++++ torch/_C/__init__.pyi.in | 2 ++ torch/csrc/cuda/Module.cpp | 10 ++++++++++ torch/csrc/cuda/memory_snapshot.cpp | 2 ++ torch/cuda/memory.py | 24 ++++++++++++++++++++++++ 7 files changed, 103 insertions(+), 3 deletions(-) diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 48413e7a6f34..25058f87264f 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -1260,6 +1260,9 @@ class DeviceCachingAllocator { // thread local compile context for each device static thread_local std::stack compile_context; + // thread local user metadata for annotating allocations + static thread_local std::string user_metadata; + public: // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) explicit DeviceCachingAllocator(c10::DeviceIndex id) @@ -1302,6 +1305,14 @@ class DeviceCachingAllocator { } } + void setUserMetadata(const std::string& metadata) { + user_metadata = metadata; + } + + std::string getUserMetadata() { + return user_metadata; + } + bool checkPoolLiveAllocations( MempoolId_t mempool_id, const std::unordered_set& expected_live_allocations) const { @@ -3682,7 +3693,8 @@ class DeviceCachingAllocator { mempool_id, getApproximateTime(), record_context_ >= RecordContext::ALLOC ? std::move(context) : nullptr, - compile_string); + compile_string, + user_metadata); // Callbacks should not include any Pytorch call for (const auto& cb : trace_trackers_) { @@ -3737,6 +3749,7 @@ static void uncached_delete(void* ptr) { static void local_raw_delete(void* ptr); thread_local std::stack DeviceCachingAllocator::compile_context; +thread_local std::string DeviceCachingAllocator::user_metadata; #ifdef __cpp_lib_hardware_interference_size using std::hardware_destructive_interference_size; #else @@ -3934,6 +3947,18 @@ class NativeCachingAllocator : public CUDAAllocator { device_allocator[device]->popCompileContext(); } + void setUserMetadata(const std::string& metadata) override { + c10::DeviceIndex device = 0; + C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); + device_allocator[device]->setUserMetadata(metadata); + } + + std::string getUserMetadata() override { + c10::DeviceIndex device = 0; + C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); + return device_allocator[device]->getUserMetadata(); + } + bool isHistoryEnabled() override { c10::DeviceIndex device = 0; C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h index 89274c9f9946..fbe5dab18e0a 100644 --- a/c10/cuda/CUDACachingAllocator.h +++ b/c10/cuda/CUDACachingAllocator.h @@ -118,7 +118,8 @@ struct TraceEntry { MempoolId_t mempool, approx_time_t time, std::shared_ptr context = nullptr, - std::string compile_context = "") + std::string compile_context = "", + std::string user_metadata = "") : action_(action), device_(device), addr_(addr), @@ -126,7 +127,8 @@ struct TraceEntry { stream_(stream), size_(size), mempool_(std::move(mempool)), - compile_context_(std::move(compile_context)) { + compile_context_(std::move(compile_context)), + user_metadata_(std::move(user_metadata)) { time_.approx_t_ = time; } Action action_; @@ -138,6 +140,7 @@ struct TraceEntry { MempoolId_t mempool_; trace_time_ time_{}; std::string 
compile_context_; + std::string user_metadata_; }; // Calls made by record_function will save annotations @@ -297,6 +300,10 @@ class CUDAAllocator : public DeviceAllocator { const std::vector>& /*md*/) {} virtual void pushCompileContext(std::string& md) {} virtual void popCompileContext() {} + virtual void setUserMetadata(const std::string& metadata) {} + virtual std::string getUserMetadata() { + return ""; + } virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0; // Attached AllocatorTraceTracker callbacks will be called while the @@ -536,6 +543,14 @@ inline void enablePeerAccess( get()->enablePeerAccess(dev, dev_to_access); } +inline void setUserMetadata(const std::string& metadata) { + get()->setUserMetadata(metadata); +} + +inline std::string getUserMetadata() { + return get()->getUserMetadata(); +} + } // namespace c10::cuda::CUDACachingAllocator namespace c10::cuda { diff --git a/test/test_cuda.py b/test/test_cuda.py index fc52c2b92067..283b0fcf7bb8 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -4378,6 +4378,28 @@ class TestCudaMallocAsync(TestCase): finally: torch.cuda.memory._record_memory_history(None) + @unittest.skipIf( + TEST_CUDAMALLOCASYNC, "setContextRecorder not supported by CUDAMallocAsync" + ) + @requiresCppContext + def test_memory_plots_metadata(self): + for context in ["alloc", "all", "state"]: + try: + torch._C._cuda_clearCublasWorkspaces() + torch.cuda.memory.empty_cache() + torch.cuda.memory._set_memory_metadata("metadata test") + torch.cuda.memory._record_memory_history(context="all") + x = torch.rand(3, 4, device="cuda") + del x + torch.cuda.memory.empty_cache() + torch.cuda.memory._set_memory_metadata("") + + ss = torch.cuda.memory._snapshot() + for event in ss["device_traces"][0]: + self.assertTrue(event["user_metadata"] == "metadata test") + finally: + torch.cuda.memory._record_memory_history(None) + @unittest.skipIf( TEST_CUDAMALLOCASYNC, "setContextRecorder not supported by CUDAMallocAsync" ) diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 244200216ec9..b99fd3f2b80a 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -2081,6 +2081,8 @@ def _cuda_hostMemoryStats() -> dict[str, Any]: ... def _cuda_resetAccumulatedHostMemoryStats() -> None: ... def _cuda_resetPeakHostMemoryStats() -> None: ... def _cuda_memorySnapshot(mempool_id: tuple[_int, _int] | None) -> dict[str, Any]: ... +def _cuda_setMemoryMetadata(metadata: str) -> None: ... +def _cuda_getMemoryMetadata() -> str: ... 
def _cuda_record_memory_history_legacy( enabled: _bool, record_context: _bool, diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index 0950192457d6..32ade3680980 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -765,6 +765,7 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* arg) { py::str frames_s = "frames"; py::str time_us_s = "time_us"; py::str compile_context_s = "compile_context"; + py::str user_metadata_s = "user_metadata"; py::list empty_frames; std::vector to_gather_frames; @@ -882,6 +883,7 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* arg) { trace_entry[stream_s] = int64_t(te.stream_); trace_entry[time_us_s] = te.time_.t_; trace_entry[compile_context_s] = te.compile_context_; + trace_entry[user_metadata_s] = te.user_metadata_; trace.append(trace_entry); } traces.append(trace); @@ -1137,6 +1139,14 @@ static void registerCudaDeviceProperties(PyObject* module) { return c10::cuda::CUDACachingAllocator::isHistoryEnabled(); }); + m.def("_cuda_setMemoryMetadata", [](const std::string& metadata) { + c10::cuda::CUDACachingAllocator::setUserMetadata(metadata); + }); + + m.def("_cuda_getMemoryMetadata", []() { + return c10::cuda::CUDACachingAllocator::getUserMetadata(); + }); + m.def("_cuda_get_conv_benchmark_empty_cache", []() { return at::native::_cudnn_get_conv_benchmark_empty_cache(); }); diff --git a/torch/csrc/cuda/memory_snapshot.cpp b/torch/csrc/cuda/memory_snapshot.cpp index d4382aa8cb32..830159d0a919 100644 --- a/torch/csrc/cuda/memory_snapshot.cpp +++ b/torch/csrc/cuda/memory_snapshot.cpp @@ -311,6 +311,7 @@ std::string _memory_snapshot_pickled() { IValue is_expandable_s = "is_expandable"; IValue time_us_s = "time_us"; IValue compile_contexts_s = "compile_context"; + IValue user_metadata_s = "user_metadata"; auto empty_frames = new_list(); @@ -428,6 +429,7 @@ std::string _memory_snapshot_pickled() { trace_entry.insert(size_s, (int64_t)te.size_); trace_entry.insert(stream_s, int64_t(te.stream_)); trace_entry.insert(compile_contexts_s, te.compile_context_); + trace_entry.insert(user_metadata_s, te.user_metadata_); if (te.context_) { auto sc = getFromContext(te.context_); frame_tracebacks.push_back(sc); diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py index 5eeaf3a8253f..b38cd2fa59c7 100644 --- a/torch/cuda/memory.py +++ b/torch/cuda/memory.py @@ -1063,6 +1063,30 @@ def _dump_snapshot(filename="dump_snapshot.pickle"): pickle.dump(s, f) +def _set_memory_metadata(metadata: str): + """ + Set custom metadata that will be attached to all subsequent CUDA memory allocations. + + This metadata will be recorded in the memory snapshot for all allocations made + after this call until the metadata is cleared or changed. + + Args: + metadata (str): Custom metadata string to attach to allocations. + Pass an empty string to clear the metadata. + """ + torch._C._cuda_setMemoryMetadata(metadata) + + +def _get_memory_metadata() -> str: + """ + Get the current custom metadata that is being attached to CUDA memory allocations. + + Returns: + str: The current metadata string, or empty string if no metadata is set. 
+ """ + return torch._C._cuda_getMemoryMetadata() + + def _save_segment_usage(filename="output.svg", snapshot=None): if snapshot is None: snapshot = _snapshot() From 29b029648ed3871b83c28d4625bb5f969fe4cb41 Mon Sep 17 00:00:00 2001 From: Chris Leonard Date: Sat, 18 Oct 2025 01:00:50 +0000 Subject: [PATCH 086/123] Fixed issue with GradTrackingTensor not properly propagating sparse layout (#165765) Fixes #164286 Fixed issue with GradTrackingTensor not properly propagating sparse layout. @ezyang @jcaip Pull Request resolved: https://github.com/pytorch/pytorch/pull/165765 Approved by: https://github.com/ezyang --- aten/src/ATen/functorch/BatchedTensorImpl.h | 4 ++++ test/functorch/test_eager_transforms.py | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/aten/src/ATen/functorch/BatchedTensorImpl.h b/aten/src/ATen/functorch/BatchedTensorImpl.h index 3eccc94d3ea6..985b289b3fe0 100644 --- a/aten/src/ATen/functorch/BatchedTensorImpl.h +++ b/aten/src/ATen/functorch/BatchedTensorImpl.h @@ -160,6 +160,10 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({ DispatchKey::CUDA, DispatchKey::CPU, DispatchKey::PrivateUse1, + DispatchKey::SparseCPU, + DispatchKey::SparseCUDA, + DispatchKey::SparseCsrCPU, + DispatchKey::SparseCsrCUDA, }); inline DispatchKeySet getKeysToPropagateToWrapper(const Tensor& tensor, DispatchKeySet to_propagate=kKeysToPropagateToWrapper) { diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py index ca19be644466..0a5d03f9dd1f 100644 --- a/test/functorch/test_eager_transforms.py +++ b/test/functorch/test_eager_transforms.py @@ -313,6 +313,24 @@ class TestGradTransform(TestCase): def test_numel(self, device): self._test_attributes(lambda x: x.numel(), device) + def test_layout_sparse(self, device): + indices = torch.tensor([[0, 1, 1], [2, 0, 2]], device=device) + values = torch.tensor([3.0, 4.0, 5.0], device=device) + sparse_x = torch.sparse_coo_tensor(indices, values, (2, 3), device=device) + + # Verify the input is sparse + self.assertEqual(sparse_x.layout, torch.sparse_coo) + + def foo(x): + # assert GradTrackingTensor still reports sparse layout + self.assertEqual(x.layout, torch.sparse_coo) + return x.coalesce()._values().sum() + + result = grad(foo)(sparse_x) + + # The gradient should also be sparse + self.assertEqual(result.layout, torch.sparse_coo) + def test_inplace(self, device): x = torch.randn([], device=device) From e9f4999985c0aa1f3c2c5489cde5ae3614503154 Mon Sep 17 00:00:00 2001 From: orangeH25 <18085625039@163.com> Date: Sat, 18 Oct 2025 01:08:40 +0000 Subject: [PATCH 087/123] [Code Clean] Replace std::runtime_error with TORCH_CHECK (#165305) Fixes part of #148114 Including: - torch/csrc/distributed Pull Request resolved: https://github.com/pytorch/pytorch/pull/165305 Approved by: https://github.com/FFFrog, https://github.com/albanD --- .../distributed/c10d/TCPStoreLibUvBackend.cpp | 9 ++++---- .../control_collectives/StoreCollectives.cpp | 6 +++--- .../c10d/control_plane/PythonHandlers.cpp | 5 ++--- .../c10d/control_plane/WorkerServer.cpp | 21 +++++++++---------- 4 files changed, 19 insertions(+), 22 deletions(-) diff --git a/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp b/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp index 52354de93edf..2843107e547a 100644 --- a/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp +++ b/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp @@ -361,13 +361,12 @@ class UvTcpServer : public UvTcpSocket { int addr_len = sizeof(addr_s); - if (uv_tcp_getsockname( 
+ TORCH_CHECK( + uv_tcp_getsockname( (uv_tcp_t*)unsafeGetStream(), reinterpret_cast<::sockaddr*>(&addr_s), - &addr_len) != 0) { - throw std::runtime_error( - "The port number of the socket cannot be retrieved."); - } + &addr_len) == 0, + "The port number of the socket cannot be retrieved."); if (addr_s.ss_family == AF_INET) { portNum_ = ntohs(reinterpret_cast(&addr_s)->sin_port); diff --git a/torch/csrc/distributed/c10d/control_collectives/StoreCollectives.cpp b/torch/csrc/distributed/c10d/control_collectives/StoreCollectives.cpp index 995899441d46..b5bbe8351fb0 100644 --- a/torch/csrc/distributed/c10d/control_collectives/StoreCollectives.cpp +++ b/torch/csrc/distributed/c10d/control_collectives/StoreCollectives.cpp @@ -49,7 +49,7 @@ void StoreCollectives::barrier( msg += fmt::format("{}, ", i); } } - throw std::runtime_error(msg + e.what()); + TORCH_CHECK(false, msg, e.what()); } } } @@ -118,7 +118,7 @@ std::vector> StoreCollectives::gatherRecv( msg += fmt::format("{}, ", i); } } - throw std::runtime_error(msg + e.what()); + TORCH_CHECK(false, msg, e.what()); } // insert local data @@ -194,7 +194,7 @@ std::vector> StoreCollectives::allGather( msg += fmt::format("{}, ", i); } } - throw std::runtime_error(msg + e.what()); + TORCH_CHECK(false, msg, e.what()); } } diff --git a/torch/csrc/distributed/c10d/control_plane/PythonHandlers.cpp b/torch/csrc/distributed/c10d/control_plane/PythonHandlers.cpp index 3e89d8510710..f9fa068bed0d 100644 --- a/torch/csrc/distributed/c10d/control_plane/PythonHandlers.cpp +++ b/torch/csrc/distributed/c10d/control_plane/PythonHandlers.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -17,9 +18,7 @@ RegisterHandler tracebackHandler{ auto tmpfile = c10::make_tempfile("torch-dump_traceback"); auto cfile = ::fopen(tmpfile.name.c_str(), "w"); - if (!cfile) { - throw std::runtime_error("failed to open file for writing"); - } + TORCH_CHECK(cfile, "failed to open file for writing"); { py::gil_scoped_acquire guard{}; diff --git a/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp b/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp index a9a7722fe41f..02efb9ecbe02 100644 --- a/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp +++ b/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -144,21 +145,19 @@ WorkerServer::WorkerServer(const std::string& hostOrFile, int port) { if (port == -1) { // using unix sockets server_.set_address_family(AF_UNIX); - - if (c10::filesystem::exists(hostOrFile)) { - throw std::runtime_error(fmt::format("{} already exists", hostOrFile)); - } + TORCH_CHECK( + !c10::filesystem::exists(hostOrFile), + fmt::format("{} already exists", hostOrFile)); C10D_WARNING("Server listening to UNIX {}", hostOrFile); - if (!server_.bind_to_port(hostOrFile, 80)) { - throw std::runtime_error(fmt::format("Error binding to {}", hostOrFile)); - } + TORCH_CHECK( + server_.bind_to_port(hostOrFile, 80), + fmt::format("Error binding to {}", hostOrFile)); } else { C10D_WARNING("Server listening to TCP {}:{}", hostOrFile, port); - if (!server_.bind_to_port(hostOrFile, port)) { - throw std::runtime_error( - fmt::format("Error binding to {}:{}", hostOrFile, port)); - } + TORCH_CHECK( + server_.bind_to_port(hostOrFile, port), + fmt::format("Error binding to {}:{}", hostOrFile, port)); } serverThread_ = std::thread([this]() { From 543ddbf44c06640b424abf72a6469dddc829809f Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Sat, 18 Oct 2025 01:11:16 
+0000 Subject: [PATCH 088/123] [ONNX] Support renaming in dynamic axes to shapes conversion (#165769) Discovered in ##165748 This PR also deprecates the conversion. ONNX exporter team does not intend to maintain the conversion in long term. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165769 Approved by: https://github.com/justinchuby --- test/onnx/exporter/test_api.py | 45 +++++++++++++++++++ .../_internal/exporter/_dynamic_shapes.py | 14 ++++-- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/test/onnx/exporter/test_api.py b/test/onnx/exporter/test_api.py index 24a9176bbe5b..7e6a487e18f5 100644 --- a/test/onnx/exporter/test_api.py +++ b/test/onnx/exporter/test_api.py @@ -202,6 +202,51 @@ class TestExportAPIDynamo(common_utils.TestCase): dynamic_axes={"b": [0, 1, 2], "b_out": [0, 1, 2]}, ) + def test_from_dynamic_axes_to_dynamic_shapes_deprecation_warning(self): + with self.assertWarnsRegex( + DeprecationWarning, + "from_dynamic_axes_to_dynamic_shapes is deprecated and will be removed in a future release. " + "This function converts 'dynamic_axes' format \\(including custom axis names\\) to 'dynamic_shapes' format. " + "Instead of relying on this conversion, provide 'dynamic_shapes' directly with custom names.", + ): + self.assert_export( + SampleModelForDynamicShapes(), + (torch.randn(2, 2, 3), {"b": torch.randn(2, 2, 3)}), + dynamic_axes={ + "x": [0, 1, 2], + "b": [0, 1, 2], + }, + ) + + def test_from_dynamic_axes_to_dynamic_shapes_keeps_custom_axis_names(self): + model = SampleModelForDynamicShapes() + input = ( + torch.randn(2, 2, 3), + {"b": torch.randn(2, 2, 3)}, + ) + dynamic_axes = { + "x": {0: "customx_x_0", 1: "customx_x_1", 2: "customx_x_2"}, + "b": {0: "customb_b_0", 1: "customb_b_1", 2: "customb_b_2"}, + "x_out": {0: "customx_out_x_0", 1: "customx_out_x_1", 2: "customx_out_x_2"}, + "b_out": {0: "customb_out_b_0", 1: "customb_out_b_1", 2: "customb_out_b_2"}, + } + onnx_program = torch.onnx.export( + model, + input, + dynamic_axes=dynamic_axes, + input_names=["x", "b"], + output_names=["x_out", "b_out"], + dynamo=True, + ) + + # Check whether the dynamic dimension names are preserved + self.assertIs(onnx_program.model.graph.inputs[0].shape[0].value, "customx_x_0") + self.assertIs(onnx_program.model.graph.inputs[0].shape[1].value, "customx_x_1") + self.assertIs(onnx_program.model.graph.inputs[0].shape[2].value, "customx_x_2") + self.assertIs(onnx_program.model.graph.inputs[1].shape[0].value, "customb_b_0") + self.assertIs(onnx_program.model.graph.inputs[1].shape[1].value, "customb_b_1") + self.assertIs(onnx_program.model.graph.inputs[1].shape[2].value, "customb_b_2") + def test_saved_f_exists_after_export(self): with common_utils.TemporaryFileName(suffix=".onnx") as path: _ = torch.onnx.export( diff --git a/torch/onnx/_internal/exporter/_dynamic_shapes.py b/torch/onnx/_internal/exporter/_dynamic_shapes.py index 3b04ab85a886..20651017f3ea 100644 --- a/torch/onnx/_internal/exporter/_dynamic_shapes.py +++ b/torch/onnx/_internal/exporter/_dynamic_shapes.py @@ -39,6 +39,15 @@ def from_dynamic_axes_to_dynamic_shapes( Detail on Dim.DYNAMIC: `#133620 `_ """ + + warnings.warn( + "from_dynamic_axes_to_dynamic_shapes is deprecated and will be removed in a future release. " + "This function converts 'dynamic_axes' format (including custom axis names) to 'dynamic_shapes' format. 
" + "Instead of relying on this conversion, provide 'dynamic_shapes' directly with custom names.", + DeprecationWarning, + stacklevel=2, + ) + # https://github.com/pytorch/pytorch/pull/128371 # 1. The function does not need to provide dynamic_shapes to torch.export.export if dynamic_axes is None: @@ -62,9 +71,8 @@ def from_dynamic_axes_to_dynamic_shapes( raise ValueError( "The axis in dynamic_axes must be in the form of: dict[int, str] or list[int]." ) - dynamic_shapes[input_name] = { - k: torch.export.Dim.DYNAMIC for k, _ in axes.items() - } + # str will be converted to Dim.DYNAMIC in convert_str_to_export_dim + dynamic_shapes[input_name] = axes elif isinstance(axes, list): if any(not isinstance(k, int) for k in axes): raise ValueError( From de3da77cf7f51392be7c8ac9b9a0dab149be938d Mon Sep 17 00:00:00 2001 From: drisspg Date: Fri, 17 Oct 2025 20:26:45 +0000 Subject: [PATCH 089/123] Thread deterministic config vars to subproc compilation (#165729) # Summary TIL (AFTER WAYYYY TOO MUCH INSANITY), that we do not serialize the full set of configs for the subproc compilation. I found this while working on Flex-attention determinism: https://github.com/meta-pytorch/attention-gym/pull/168 might be good to audit if we need to thread through any more Pull Request resolved: https://github.com/pytorch/pytorch/pull/165729 Approved by: https://github.com/shunting314, https://github.com/eellison --- torch/_inductor/codegen/triton.py | 1 + torch/_inductor/runtime/triton_heuristics.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index a7d29a2fb736..e8d7996460fe 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -4762,6 +4762,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]): "spill_threshold": config.triton.spill_threshold, "store_cubin": config.triton.store_cubin, "deterministic": config.deterministic, + "force_filter_reduction_configs": config.test_configs.force_filter_reduction_configs, } if config.write_are_deterministic_algorithms_enabled: diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 0dec399de318..44b567bf5ecd 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -2962,7 +2962,7 @@ def filter_reduction_configs_for_determinism( def _do_filter_due_to_inductor_config(): return ( inductor_meta.get("deterministic", False) - or torch._inductor.config.test_configs.force_filter_reduction_configs + or inductor_meta.get("force_filter_reduction_configs", False) ) or inductor_meta.get("are_deterministic_algorithms_enabled") if not _do_filter_due_to_inductor_config() or len(configs) == 1: From cf3a787bbcf6dc4ca6d746aea1e9dd4ee0c0fbda Mon Sep 17 00:00:00 2001 From: Shangdi Yu Date: Sat, 18 Oct 2025 01:54:27 +0000 Subject: [PATCH 090/123] [annotate] Annotate bw nodes before eliminate dead code (#165782) Fixes https://github.com/pytorch/torchtitan/pull/1907 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165782 Approved by: https://github.com/SherlockNoMad --- torch/_functorch/_aot_autograd/graph_capture.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/torch/_functorch/_aot_autograd/graph_capture.py b/torch/_functorch/_aot_autograd/graph_capture.py index 91af2933cc28..132cf335b387 100644 --- a/torch/_functorch/_aot_autograd/graph_capture.py +++ b/torch/_functorch/_aot_autograd/graph_capture.py @@ -468,12 +468,16 @@ def 
aot_dispatch_autograd_graph( # a fake tensor. Unlikely. # See Note: [Fake Modules and AOTAutograd] torch._dynamo.utils.assert_no_fake_params_or_buffers(fx_g) + + # Have to copy before eliminate_dead_code otherwise the + # fw node match might be erased + copy_fwd_metadata_to_bw_nodes(fx_g) + fx_g.graph.eliminate_dead_code() if not aot_config.disable_functionalization: # There should be *NO* mutating ops in the graph at this point. assert_functional_graph(fx_g.graph) - copy_fwd_metadata_to_bw_nodes(fx_g) fx_g.recompile() # TODO: in AOTAutograd, we create metadata like _indices_of_inps_to_detach to detect From c137e222d42ee5f36670b3b2138243c1b12eae83 Mon Sep 17 00:00:00 2001 From: jmaczan Date: Sat, 18 Oct 2025 02:00:52 +0000 Subject: [PATCH 091/123] .venv/ in .gitignore (#165418) `uv venv` creates venv in `.venv/` directory. So, it's useful to have `.venv/` in `.gitignore`, since perhaps more people are using `uv` in their work. As per comment https://github.com/pytorch/pytorch/pull/164923/files/3592f5f4e5e536797cb042f03b048169661a428f#diff-bc37d034bad564583790a46f19d807abfe519c5671395fd494d8cce506c42947 uv docs that confirms it: https://docs.astral.sh/uv/pip/environments/#using-arbitrary-python-environments Pull Request resolved: https://github.com/pytorch/pytorch/pull/165418 Approved by: https://github.com/ezyang --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 3a4cae5d8290..447ef777e929 100644 --- a/.gitignore +++ b/.gitignore @@ -374,6 +374,7 @@ third_party/ruy/ third_party/glog/ # Virtualenv +.venv/ venv/ # Log files From de09bab4b66002a8a9a2195f50f96a78868a3d39 Mon Sep 17 00:00:00 2001 From: Aaron Gokaslan Date: Sat, 18 Oct 2025 02:23:22 +0000 Subject: [PATCH 092/123] [BE]: Update cudnn frontend submodule to 1.15.0 (#165776) Update cudnn frontend submodule to 1.15.0 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165776 Approved by: https://github.com/eqy --- aten/src/ATen/native/cudnn/MHA.cpp | 8 ++------ third_party/cudnn_frontend | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/aten/src/ATen/native/cudnn/MHA.cpp b/aten/src/ATen/native/cudnn/MHA.cpp index 366fd0ae3c3c..7604244997bc 100644 --- a/aten/src/ATen/native/cudnn/MHA.cpp +++ b/aten/src/ATen/native/cudnn/MHA.cpp @@ -487,9 +487,7 @@ std::unique_ptr build_graph( auto scaled_dot_product_flash_attention_options = fe::graph::SDPA_attributes() .set_name("CUDNN_SDPA") - .set_is_inference(return_softmaxstats == false) - // TODO(eqy): switch to this API once cuDNN FE is upgraded - // .set_generate_stats(return_softmaxstats) + .set_generate_stats(return_softmaxstats) .set_causal_mask(is_causal) .set_attn_scale(attn_scale); if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { @@ -707,9 +705,7 @@ std::unique_ptr build_graph_nestedtensor( auto scaled_dot_product_flash_attention_options = fe::graph::SDPA_attributes() .set_name("CUDNN_SDPA_NESTEDTENSOR") - .set_is_inference(return_softmaxstats == false) - // TODO(eqy): switch to this API once cuDNN FE is upgraded - // .set_generate_stats(return_softmaxstats) + .set_generate_stats(return_softmaxstats) .set_causal_mask(is_causal) .set_attn_scale(attn_scale) .set_seq_len_q(SEQ_LEN_Q_) diff --git a/third_party/cudnn_frontend b/third_party/cudnn_frontend index f937055efc6d..0b1577c8c834 160000 --- a/third_party/cudnn_frontend +++ b/third_party/cudnn_frontend @@ -1 +1 @@ -Subproject commit f937055efc6d414d11f4c6577e3977fe74f35fb6 +Subproject commit 0b1577c8c83401237d601d0d0db5210506705396 From 
c6a8db0b9acbefc66f02e7ff46ad6bbedabd8b4b Mon Sep 17 00:00:00 2001 From: Laith Sakka Date: Fri, 17 Oct 2025 11:00:15 -0700 Subject: [PATCH 093/123] Fix issues with generalized_scatter and setitem allocated unbacked symbols. (#164341) Three fixes: 1. When doing t[u0] += 1 with an unbacked u0, we could allocate a new unbacked symbol during the indexing of t[u0] (when we fake trace setitem), namely because meta_select allocates a new unbacked symbol for the storage offset when we do not know whether u0>=0 or u0<0. But the output size/stride of setitem() does not depend on that new symbol; it is self-consumed in setitem, so we shall ignore it. 2. Also, when we trace through generalized_scatter, the applications of the views could allocate unbacked symints, but those do not affect the final output, so we shall ignore them as well. 3. Before accessing strides in lowering we shall materialize. Addresses https://github.com/pytorch/pytorch/issues/114293 and https://github.com/pytorch/pytorch/issues/131911 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164341 Approved by: https://github.com/bobrenjc93 --- test/test_dynamic_shapes.py | 39 +++++++++++++++++++++++--- torch/_dynamo/variables/tensor.py | 15 +++++++++- torch/_inductor/fx_passes/reinplace.py | 12 +++++++- torch/_inductor/lowering.py | 13 ++++++++- 4 files changed, 72 insertions(+), 7 deletions(-) diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py index 94f2b3fcb0a5..6baaaf26b9c5 100644 --- a/test/test_dynamic_shapes.py +++ b/test/test_dynamic_shapes.py @@ -3398,7 +3398,7 @@ class TestUnbacked(TestCase): self.assertFalse("SYMBOLIC_SHAPE_GUARD" in guards) @skipIfTorchDynamo("mark_unbacked is not traceable") - def test_div_unabacked_eq_input_tensors(self): + def test_div_unbacked_eq_input_tensors(self): @torch.compile(fullgraph=True) def func(a, b): x = a.size()[0] @@ -3418,7 +3418,7 @@ class TestUnbacked(TestCase): func(a, b) @torch.compiler.config.patch(unbacked_sources="L['x'],L['y']") - def test_div_unabacked_eq_input_ints(self): + def test_div_unbacked_eq_input_ints(self): @torch.compile(fullgraph=True) def func(x, y): a = torch.rand(1) @@ -3433,7 +3433,7 @@ class TestUnbacked(TestCase): @skipIfTorchDynamo("mark_unbacked is not traceable") @torch.compiler.config.patch(unbacked_sources="L['y']") - def test_div_unabacked_eq_globals(self): + def test_div_unbacked_eq_globals(self): tensor = torch.rand(10, 44) y = 10 @@ -3452,7 +3452,7 @@ class TestUnbacked(TestCase): func() @torch._dynamo.config.patch("capture_scalar_outputs", True) - def test_div_unabacked_eq_item(self): + def test_div_unbacked_eq_item(self): @torch.compile(fullgraph=True) def func(a, b): x = a.item() @@ -4270,6 +4270,37 @@ def forward(self, arg0_1: "i64[1][1]cpu", arg1_1: "Sym(u1)", arg2_1: "i64[u1][1] result_compiled = compiled_program() self.assertEqual(result_original, result_compiled) + def test_unbacked_item_set_item(self): + def my_arithmetic(a, b): + wrk = torch.zeros(a.size(0)) + for i in range(a.size(0)): + idx = b[i].item() + wrk[idx] += 1 + + return wrk + + compiled = torch.compile(my_arithmetic, fullgraph=True, disable=False) + a = torch.randn([9]) + b = torch.ones(9, dtype=torch.int32) + compiled(a, b) + self.assertEqual(compiled(a, b), my_arithmetic(a, b)) + + @torch._dynamo.config.patch("capture_scalar_outputs", True) + def test_unbacked_item_set_item2(self): + def accumulate(X0, start): + start = start.item() + N = 3 + result = X0[start] + for i in range(0, N): + result += X0[start + 1 + i] + return result + + compiled = 
torch.compile(accumulate, fullgraph=True) + X0 = torch.randn(10, 10) + self.assertEqual( + accumulate(X0, torch.tensor([1])), compiled(X0, torch.tensor([1])) + ) + instantiate_parametrized_tests(TestUnbacked) diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py index d331f1238b3c..437aded89235 100644 --- a/torch/_dynamo/variables/tensor.py +++ b/torch/_dynamo/variables/tensor.py @@ -23,6 +23,7 @@ import operator import textwrap import traceback import types +from contextlib import nullcontext from typing import TYPE_CHECKING import sympy @@ -1109,7 +1110,19 @@ class TensorVariable(VariableTracker): # value.requires_grad is True => self.has_grad_fn becomes True # Not sure if __setitem__ can ever save activations, disabling just in case - with torch._dynamo.utils._disable_saved_tensors_hooks_during_tracing(): + + # Ignore fresh unbacked symbols that could arise from the internal indexing (selection), + # that happen in code like t[idx] += 1 when idx is unbacked. Namely the selection + # during 'setitem'. + # When the selection happens if idx is unbacked we allocate a new unbacked symbol for the + # storage offset in select_meta, but the output of the operation 'setitem' does not depend + # on the selection. + with ( + torch._dynamo.utils._disable_saved_tensors_hooks_during_tracing(), + tx.fake_mode.shape_env.ignore_fresh_unbacked_symbols() + if tx.fake_mode and tx.fake_mode.shape_env + else nullcontext(), + ): get_fake_value(proxy.node, tx, allow_non_graph_fake=False) vt = value diff --git a/torch/_inductor/fx_passes/reinplace.py b/torch/_inductor/fx_passes/reinplace.py index 8ba3779b4fd8..3a4900900540 100644 --- a/torch/_inductor/fx_passes/reinplace.py +++ b/torch/_inductor/fx_passes/reinplace.py @@ -4,6 +4,7 @@ import logging import operator from collections import defaultdict from collections.abc import Sequence +from contextlib import nullcontext from dataclasses import dataclass from typing import Any, Callable, cast @@ -12,6 +13,7 @@ import torch.fx.node from torch._C._dynamo.guards import compute_overlapping_tensors from torch._dispatch.python import enable_python_dispatcher from torch._dynamo.utils import ReinplaceCounters, ReInplaceTrigger +from torch._guards import detect_fake_mode from torch._higher_order_ops.triton_kernel_wrap import ( kernel_side_table, triton_kernel_wrapper_functional, @@ -78,7 +80,15 @@ def _inplace_generalized_scatter( lambda node: node.meta["val"] if isinstance(node, torch.fx.Node) else node, (view.args, view.kwargs), ) - tmp = view.target(tmp, *fake_args, **fake_kwargs) + # slice and select can allocate new unbacked symints, but those won't be reflected + # in the output of this function, hence shall be ignored. + fake_mode = detect_fake_mode(fake_args) + with ( + fake_mode.shape_env.ignore_fresh_unbacked_symbols() + if fake_mode and fake_mode.shape_env + else nullcontext() + ): + tmp = view.target(tmp, *fake_args, **fake_kwargs) try: tmp.copy_(src) except RuntimeError as e: diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py index 6df8f06cc02e..e6a9d4f27635 100644 --- a/torch/_inductor/lowering.py +++ b/torch/_inductor/lowering.py @@ -1956,6 +1956,9 @@ def select(x, dim, idx): # Additionally, we want to avoid accidental unbacked unsqueeze semantics. To resolve this, # we use as_strided instead. # Removing this branch will cause test_unbacked_select_index_with_check to fail. + + # before accessing size, stride, and offset we need to realize. 
+ x.realize() new_size = x.get_size() new_stride = x.get_stride() new_storage_offset = x.get_layout().offset + new_stride[dim] * actual_index @@ -1979,6 +1982,8 @@ def select(x, dim, idx): assert len(unbacked_bindings) == 1, unbacked_bindings unbacked_offset_sym, _ = next(iter(unbacked_bindings.items())) + # before accessing size, stride, and offset we need to realize. + x.realize() new_size = x.get_size() new_stride = x.get_stride() new_storage_offset = unbacked_offset_sym @@ -3159,8 +3164,14 @@ def select_scatter(x, src, dim: int, index: int): assert x.get_dtype() == src.get_dtype() x_loader = x.make_loader() dim = _validate_dim(x, dim, 0) - if V.graph.sizevars.evaluate_expr(sympy.Lt(index, 0)): + if V.graph.sizevars.guard_or_false(sympy.Lt(index, 0)): index = index + x.get_size()[dim] + elif V.graph.sizevars.guard_or_false(sympy.Ge(index, 0)): + pass + else: + # unbacked index + return fallback_handler(aten.select_scatter.default)(x, src, dim, index) + V.graph.sizevars.check_leq(0, index) # type: ignore[arg-type] V.graph.sizevars.check_lt(index, x.get_size()[dim]) # type: ignore[arg-type] src = expand(unsqueeze(src, dim), x.get_size()) From 017d2985f3a66955ae4a3fba217f2edca369fca4 Mon Sep 17 00:00:00 2001 From: Laith Sakka Date: Fri, 17 Oct 2025 11:01:15 -0700 Subject: [PATCH 094/123] set unbacked bindings in reinplace pass for newly created nodes during generalize_scatter decomp (#164948) Two fixes: 1. in rein_place pass, set unbacked bindings for newly created nodes. 2. In inductor, ComputeBuffer used to miss detecting some used symbols, fixed that. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164948 Approved by: https://github.com/bobrenjc93 ghstack dependencies: #164341 --- test/test_dynamic_shapes.py | 28 ++++++++++++++++++++++++++ torch/_inductor/fx_passes/reinplace.py | 14 ++++++++++++- torch/_inductor/ir.py | 4 +--- 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py index 6baaaf26b9c5..fcc45521fbb1 100644 --- a/test/test_dynamic_shapes.py +++ b/test/test_dynamic_shapes.py @@ -4301,6 +4301,34 @@ def forward(self, arg0_1: "i64[1][1]cpu", arg1_1: "Sym(u1)", arg2_1: "i64[u1][1] accumulate(X0, torch.tensor([1])), compiled(X0, torch.tensor([1])) ) + @torch._dynamo.config.patch("capture_scalar_outputs", True) + def test_unbacked_item_set_item3(self): + def func(x, y): + u0 = y.item() + x[u0] = 0 + return x + + compiled = torch.compile(func, fullgraph=True, disable=False) + b = torch.tensor([0]) + a = torch.ones(9, dtype=torch.int32) + + compiled(a, b) + self.assertEqual(compiled(a, b), func(a, b)) + + @torch._dynamo.config.patch("capture_scalar_outputs", True) + def test_select_scatter_unbacked_index(self): + def func(x, y): + u0 = y.item() + # Create a scalar tensor to scatter into the selected index + scalar_src = torch.tensor(42, dtype=x.dtype) + return x.select_scatter(scalar_src, 0, u0) + + compiled = torch.compile(func, fullgraph=True, dynamic=True, backend="inductor") + b = torch.tensor([0]) + a = torch.ones(9, dtype=torch.int32) + + self.assertEqual(compiled(a, b), func(a, b)) + instantiate_parametrized_tests(TestUnbacked) diff --git a/torch/_inductor/fx_passes/reinplace.py b/torch/_inductor/fx_passes/reinplace.py index 3a4900900540..8b9deac6ba5a 100644 --- a/torch/_inductor/fx_passes/reinplace.py +++ b/torch/_inductor/fx_passes/reinplace.py @@ -24,7 +24,10 @@ from torch._inductor.lowering import ( inplaceable_foreach_ops as inplaceable_foreach_ops_lowerings, ) from 
torch._inductor.virtualized import V -from torch.fx.experimental.symbolic_shapes import GuardOnDataDependentSymNode +from torch.fx.experimental.symbolic_shapes import ( + compute_unbacked_bindings, + GuardOnDataDependentSymNode, +) from torch.fx.immutable_collections import immutable_dict, immutable_list from torch.fx.passes.reinplace import _is_view_op from torch.utils import _pytree as pytree @@ -60,7 +63,9 @@ def graph_call_function(graph: torch.fx.Graph, fn, *args, **kwargs): fake_result = fn(*fake_args, **fake_kwargs) node = graph.call_function(fn, args, kwargs) + node.meta["val"] = fake_result + return node @@ -171,6 +176,13 @@ def _decompose_scatter_mutating( tmp = inp for view in view_ops: # type: ignore[union-attr] tmp = graph_call_function(graph, view.target, tmp, *view.args, **view.kwargs) # type: ignore[union-attr] + # we need to set unbacked bindings that could have been created in the view ops. + if (V.fake_mode.shape_env) and ( + symbol_to_path := compute_unbacked_bindings( + V.fake_mode.shape_env, tmp.meta["val"] + ) + ): + tmp.meta["unbacked_bindings"] = symbol_to_path graph_call_function(graph, aten.copy_.default, tmp, src) return inp # type: ignore[return-value] diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py index 4c28ee8faf59..56a88caf6c7d 100644 --- a/torch/_inductor/ir.py +++ b/torch/_inductor/ir.py @@ -4542,9 +4542,7 @@ class ComputedBuffer(OperationBuffer): unbacked_only ) | self.data.get_free_symbol_uses(unbacked_only) - if self.has_store_function() and isinstance( - self.get_store_function(), LoopBody - ): + if self.has_store_function(): result |= self.get_read_writes().get_free_symbol_uses(unbacked_only) return result From e4d6c56ffb3d680d3874f0dd01907aee7ed2d3c5 Mon Sep 17 00:00:00 2001 From: Yiming Zhou Date: Sat, 18 Oct 2025 03:48:18 +0000 Subject: [PATCH 095/123] Improve dynamo graph capture stack trace for custom ops (#165693) For a custom op ``` @torch.library.custom_op("my_lib::foo", mutates_args={}) def foo(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: return x + y ``` ppl could call `torch.ops.my_lib.foo()` or directly call `foo()` in the `forward` of an `nn.Module` These two calling conventions will lead to the same node in the output graph, but different stack traces. When directly calling `foo()`, the displayed stack_trace in the graph will be ``` # File: .../pytorch/torch/_library/custom_ops.py:687 in __call__, code: return self._opoverload(*args, **kwargs) ``` This is not useful so we filter it out. 
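The fix, shown in the torch/_dynamo/output_graph.py hunk below, amounts to dropping frames whose files are classified as uninteresting before rendering the stack trace. A condensed sketch of that idea (`render_stack_trace` is an illustrative helper name, not a function added by this patch):

```python
import traceback

from torch.fx.experimental.symbolic_shapes import uninteresting_files


def render_stack_trace(frame_summaries):
    # Drop framework-internal frames (e.g. the custom_ops.py __call__ wrapper)
    # so node.stack_trace points at the user's `return foo(x, y)` line instead.
    filtered = [f for f in frame_summaries if f.filename not in uninteresting_files()]
    filtered.reverse()  # innermost frame last
    return "".join(traceback.StackSummary.from_list(filtered).format())
```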
``` python test/functorch/test_aot_joint_with_descriptors.py -k test_custom_op_stack_trace ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165693 Approved by: https://github.com/SherlockNoMad, https://github.com/williamwen42 --- .../test_aot_joint_with_descriptors.py | 46 ++++++++++++++++++- torch/_dynamo/output_graph.py | 12 ++++- 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/test/functorch/test_aot_joint_with_descriptors.py b/test/functorch/test_aot_joint_with_descriptors.py index d797b36748d0..24d9042bc9c9 100644 --- a/test/functorch/test_aot_joint_with_descriptors.py +++ b/test/functorch/test_aot_joint_with_descriptors.py @@ -38,7 +38,12 @@ from torch._functorch.aot_autograd import ( ) from torch._guards import tracing, TracingContext from torch.nn.attention.flex_attention import create_block_mask, flex_attention -from torch.testing._internal.common_utils import requires_cuda, run_tests, TestCase +from torch.testing._internal.common_utils import ( + requires_cuda, + run_tests, + skipIfCrossRef, + TestCase, +) def graph_capture(model, inputs, with_export): @@ -962,6 +967,45 @@ class inner_f(torch.nn.Module): ('call_function', 't_3', {'pp_stage': 0})""", ) + @skipIfCrossRef + def test_custom_op_stack_trace(self): + @torch.library.custom_op("my_lib::foo", mutates_args={}) + def foo(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + return x + y + + @foo.register_fake + def foo_fake_impl(x, y): + return torch.empty_like(x) + + def foo_setup_context(ctx, inputs, output): + pass + + def foo_backward(ctx, grad_output): + return grad_output, grad_output + + foo.register_autograd(foo_backward, setup_context=foo_setup_context) + + class CustomOpModule(torch.nn.Module): + def forward(self, x, y): + return foo(x, y) + + model = CustomOpModule() + inputs = (torch.randn(4, 3), torch.randn(4, 3)) + + gm = graph_capture(model, inputs, with_export=True) + + foo_node = None + for node in gm.graph.nodes: + if node.op == "call_function" and node.name == "foo": + foo_node = node + break + + self.assertTrue(foo_node is not None) + self.assertTrue("return foo(x, y)" in foo_node.meta.get("stack_trace", None)) + self.assertTrue("return foo(x, y)" in gm.print_readable(print_output=False)) + self.assertFalse("self._opoverload" in foo_node.meta.get("stack_trace", None)) + self.assertFalse("self._opoverload" in gm.print_readable(print_output=False)) + if __name__ == "__main__": run_tests() diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py index feeeed32b9d1..9bce964c3f1a 100644 --- a/torch/_dynamo/output_graph.py +++ b/torch/_dynamo/output_graph.py @@ -67,6 +67,7 @@ from torch.fx.experimental.symbolic_shapes import ( is_symbolic, ShapeEnv, Specialization, + uninteresting_files, ) from torch.fx.node import Target from torch.fx.passes.runtime_assert import insert_deferred_runtime_asserts @@ -3170,11 +3171,18 @@ class SubgraphTracer(fx.Tracer): if not tx.is_co_filename_from_nn_modules(): frame_summaries.append(tx.frame_summary()) tx = getattr(tx, "parent", None) + + filtered_frame_summaries = [ + frame + for frame in frame_summaries + if frame.filename not in uninteresting_files() + ] + # Reverse the frame_summaries, such that the innermost frame is at the last - frame_summaries.reverse() + filtered_frame_summaries.reverse() # official from_list stub doesn't have new-style type - msgs = traceback.StackSummary.from_list(frame_summaries).format() + msgs = traceback.StackSummary.from_list(filtered_frame_summaries).format() rv.node.stack_trace = "".join(msgs) 
if ( From 23417ae50f5d9bc02e988d916c103ff3a03c5903 Mon Sep 17 00:00:00 2001 From: Simon Layton Date: Fri, 17 Oct 2025 23:11:36 +0000 Subject: [PATCH 096/123] [Submodule] Bump FBGEMM to latest (#165544) Summary: * FBGEMM submodule updated to main * CMake updated to reflect necessary changes * Notably pulls in NVFP4 grouped gemm kernels Test Plan: Reviewers: Subscribers: Tasks: Tags: Signed-off-by: Simon Layton Pull Request resolved: https://github.com/pytorch/pytorch/pull/165544 Approved by: https://github.com/cyyever, https://github.com/jeffdaily --- aten/src/ATen/CMakeLists.txt | 5 +++-- third_party/fbgemm | 2 +- tools/amd_build/build_amd.py | 32 +++++++++++++++++++++++++++++ torch/utils/hipify/hipify_python.py | 2 ++ 4 files changed, 38 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index a9b836189012..a4786d681b73 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -289,14 +289,15 @@ IF(USE_FBGEMM_GENAI) set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) - set(fbgemm_genai_mx8mx8bf16_grouped + set(fbgemm_genai_cuh "${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/" + "${FBGEMM_GENAI_SRCS}/" ) target_include_directories(fbgemm_genai PRIVATE ${FBGEMM_THIRD_PARTY}/cutlass/include ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include - ${fbgemm_genai_mx8mx8bf16_grouped} + ${fbgemm_genai_cuh} ${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp ${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h ) diff --git a/third_party/fbgemm b/third_party/fbgemm index 3cefe0564a8c..c0b988d39a9e 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit 3cefe0564a8c3de514a152d40a2b4770f2ee5be0 +Subproject commit c0b988d39a9e47c794d699f29930ed4d7c7e13a4 diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py index 504bb01e4739..ba1486a093f6 100755 --- a/tools/amd_build/build_amd.py +++ b/tools/amd_build/build_amd.py @@ -201,6 +201,19 @@ for hip_platform_file in hip_platform_files: sources.write(line) print(f"{hip_platform_file} updated") +# NOTE: fbgemm sources needing hipify +# fbgemm is its own project with its own build system. pytorch uses fbgemm as +# a submodule to acquire some gpu source files but compiles only those sources +# instead of using fbgemm's own build system. One of the source files refers +# to a header file that is the result of running hipify, but fbgemm uses +# slightly different hipify settings than pytorch. fbgemm normally hipifies +# and renames tuning_cache.cuh to tuning_cache_hip.cuh, but pytorch's settings +# for hipify puts it into its own 'hip' directory. After hipify runs below with +# the added fbgemm file, we move it to its expected location. 
+fbgemm_dir = "third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include/fbgemm_gpu/quantize" +fbgemm_original = f"{fbgemm_dir}/tuning_cache.cuh" +fbgemm_move_src = f"{fbgemm_dir}/hip/tuning_cache.cuh" +fbgemm_move_dst = f"{fbgemm_dir}/tuning_cache_hip.cuh" hipify_python.hipify( project_directory=proj_dir, @@ -212,7 +225,26 @@ hipify_python.hipify( "torch/_inductor/codegen/cpp_wrapper_cpu.py", "torch/_inductor/codegen/cpp_wrapper_gpu.py", "torch/_inductor/codegen/wrapper.py", + fbgemm_original, ], out_of_place_only=args.out_of_place_only, hip_clang_launch=is_hip_clang(), ) + +# only update the file if it changes or doesn't exist +do_write = True +src_lines = None +with open(fbgemm_move_src) as src: + src_lines = src.readlines() +if os.path.exists(fbgemm_move_dst): + dst_lines = None + with open(fbgemm_move_dst) as dst: + dst_lines = dst.readlines() + if src_lines == dst_lines: + print(f"{fbgemm_move_dst} skipped") + do_write = False +if do_write: + with open(fbgemm_move_dst, "w") as dst: + for line in src_lines: + dst.write(line) + print(f"{fbgemm_move_dst} updated") diff --git a/torch/utils/hipify/hipify_python.py b/torch/utils/hipify/hipify_python.py index 2b19198f0c58..7e245262ea74 100755 --- a/torch/utils/hipify/hipify_python.py +++ b/torch/utils/hipify/hipify_python.py @@ -639,6 +639,8 @@ def is_pytorch_file(rel_filepath): return True if rel_filepath.startswith("third_party/nvfuser/"): return True + if rel_filepath.startswith("third_party/fbgemm/"): + return True if rel_filepath.startswith("tools/autograd/templates/"): return True return False From d9f94e0d7d96e52a636899a1b104cf610dd1a905 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 17 Oct 2025 16:38:12 -0700 Subject: [PATCH 097/123] [dynamo] Support fx.traceback.annotate as decorator (#165805) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165805 Approved by: https://github.com/Lucaskabela, https://github.com/SherlockNoMad, https://github.com/yushangdi --- test/dynamo/test_fx_annotate.py | 50 ++++++++++++++++++++++++++++++++ torch/_dynamo/variables/torch.py | 6 +++- 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/test/dynamo/test_fx_annotate.py b/test/dynamo/test_fx_annotate.py index ede0b51ef123..337ce0f5764c 100644 --- a/test/dynamo/test_fx_annotate.py +++ b/test/dynamo/test_fx_annotate.py @@ -238,6 +238,56 @@ class AnnotateTests(torch._dynamo.test_case.TestCase): ('call_function', 'getitem_5', {'compile_inductor': 0})""", # noqa: B950 ) + def test_as_decorator(self): + class Mod(torch.nn.Module): + @fx_traceback.annotate({"fdsp_bucket": 0}) + def sin(self, x): + return torch.sin(x) + + def forward(self, x): + with fx_traceback.annotate({"pp_stage": 0}): + sin = self.sin(x) + sub = sin - 2 + mul = sub * 2 + div = mul / 3 + return div + + m = Mod() + backend = AotEagerAndRecordGraphs() + opt_m = torch.compile(m, backend=backend, fullgraph=True) + x = torch.randn(10, requires_grad=True) + m(x) + opt_m(x).sum().backward() + + self.assertEqual(len(backend.fw_graphs), 1) + self.assertEqual(len(backend.bw_graphs), 1) + + dynamo_metadata = fx_traceback._get_custom_metadata(backend.graphs[0]) + fw_metadata = fx_traceback._get_custom_metadata(backend.fw_graphs[0]) + bw_metadata = fx_traceback._get_custom_metadata(backend.bw_graphs[0]) + self.assertExpectedInline( + str(dynamo_metadata), + """\ +('placeholder', 'l_x_', {'pp_stage': 0, 'fdsp_bucket': 0}) +('call_function', 'sin', {'pp_stage': 0, 'fdsp_bucket': 0}) +('call_function', 'sub', {'pp_stage': 0}) +('call_function', 'mul', 
{'pp_stage': 0})""", # noqa: B950 + ) + self.assertExpectedInline( + str(fw_metadata), + """\ +('call_function', 'sin', {'pp_stage': 0, 'fdsp_bucket': 0}) +('call_function', 'sub', {'pp_stage': 0}) +('call_function', 'mul', {'pp_stage': 0})""", # noqa: B950 + ) + self.assertExpectedInline( + str(bw_metadata), + """\ +('call_function', 'mul_1', {'pp_stage': 0}) +('call_function', 'cos', {'pp_stage': 0, 'fdsp_bucket': 0}) +('call_function', 'mul_2', {'pp_stage': 0, 'fdsp_bucket': 0})""", # noqa: B950 + ) + if __name__ == "__main__": run_tests() diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py index d659f3a24d86..1e39187274cc 100644 --- a/torch/_dynamo/variables/torch.py +++ b/torch/_dynamo/variables/torch.py @@ -126,6 +126,7 @@ supported_ctx_manager_classes = dict.fromkeys( torch.cpu.amp.autocast_mode.autocast, torch.cuda.amp.autocast_mode.autocast, torch.fx.traceback.annotate, + torch.fx.traceback.annotate.__wrapped__, # type: ignore[attr-defined] # We'll let Dynamo inline into the contextlib part of these context # manager instances, all the way till it invokes the wrapped function # itself (at which point we wrap it back to special context manager @@ -364,7 +365,10 @@ class TorchCtxManagerClassVariable(BaseTorchVariable): assert len(args) <= 1 and len(kwargs) == 0 inf_mode = args[0].as_python_constant() if len(args) == 1 else True return InferenceModeVariable.create(tx, inf_mode) - elif self.value is torch.fx.traceback.annotate: + elif self.value in ( + torch.fx.traceback.annotate, + torch.fx.traceback.annotate.__wrapped__, # type: ignore[attr-defined] + ): assert len(args) <= 1 and len(kwargs) == 0 return FxTracebackAnnotateVariable( args[0].as_python_constant(), source=self.source From 9095a9dfae39ad3064a999558f2fd393ff78bd3e Mon Sep 17 00:00:00 2001 From: Huy Do Date: Sat, 18 Oct 2025 04:16:24 +0000 Subject: [PATCH 098/123] [CD] Apply the fix from #162455 to aarch64+cu129 build (#165794) When trying to bring cu129 back in https://github.com/pytorch/pytorch/pull/163029, I mainly looked at https://github.com/pytorch/pytorch/pull/163029 and missed another tweak coming from https://github.com/pytorch/pytorch/pull/162455 I discover this issue when testing aarch64+cu129 builds in https://github.com/pytorch/test-infra/actions/runs/18603342105/job/53046883322?pr=7373. Surprisingly, there is no test running for aarch64 CUDA build from what I see in https://hud.pytorch.org/pytorch/pytorch/commit/79a37055e790482c12bf32e69b28c8e473d0209d. 
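For context on the change below: the `PYTORCH_EXTRA_INSTALL_REQUIREMENTS` values are PEP 508 requirement strings, and pip only installs the `nvidia-*` wheels whose environment markers evaluate to true on the target machine. A minimal sketch of how the old and new markers behave on an aarch64 Linux host (illustrative only, not part of this patch; it assumes the third-party `packaging` library is available):

```python
# Illustrative only: shows why "platform_machine == 'x86_64'" must be dropped
# for aarch64 CUDA wheels to pull in their nvidia-* pip dependencies.
from packaging.markers import Marker

old_marker = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")
new_marker = Marker("platform_system == 'Linux'")

# Environment overrides mimicking an aarch64 Linux machine at install time.
aarch64_linux = {"platform_system": "Linux", "platform_machine": "aarch64"}

print(old_marker.evaluate(aarch64_linux))  # False -> dependency is skipped
print(new_marker.evaluate(aarch64_linux))  # True  -> dependency is installed
```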
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165794 Approved by: https://github.com/malfet --- .../scripts/generate_binary_build_matrix.py | 30 +++++++++---------- ...linux-aarch64-binary-manywheel-nightly.yml | 14 ++++----- ...nerated-linux-binary-manywheel-nightly.yml | 14 ++++----- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index 242c1a6fcbcf..154b5a6f0b90 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -79,21 +79,21 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'" ), "12.9": ( - "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'" + "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | " + "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | " + "nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | " + "nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | " + "nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | " + "nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | " + "nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | " + "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " + "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | " + "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | " + "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | " + "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | " + "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'" ), "13.0": ( "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | " diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml index f2f43722a146..fd31e4819bb9 100644 --- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml +++ 
b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml @@ -224,7 +224,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -473,7 +473,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -722,7 +722,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; 
platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -971,7 +971,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1220,7 +1220,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1469,7 +1469,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1718,7 +1718,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; 
platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index 12117a7cb36a..a4a1e3cea95c 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -259,7 +259,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_9 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_9-test: # Testing @@ -925,7 +925,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" 
build_name: manywheel-py3_11-cuda12_9 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_9-test: # Testing @@ -1591,7 +1591,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_9 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 
'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_9-test: # Testing @@ -2257,7 +2257,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_9 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 
'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_9-test: # Testing @@ -2923,7 +2923,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_9 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_9-test: # Testing @@ -3589,7 +3589,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda12_9 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_9-test: # Testing @@ -4255,7 +4255,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cuda12_9 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_9-test: # Testing From f02e3947f65cd3d6509224af8e5efdaaa348ef32 Mon Sep 17 00:00:00 2001 From: Maggie Moss Date: Sat, 18 Oct 2025 04:34:41 +0000 Subject: [PATCH 099/123] Expand type checking to mypy strict files (#165697) Expands Pyrefly type checking to check the files outlined in the mypy-strict.ini configuration file: Pull Request resolved: https://github.com/pytorch/pytorch/pull/165697 Approved by: https://github.com/ezyang --- pyrefly.toml | 16 +++++----------- tools/autograd/gen_autograd_functions.py | 2 ++ tools/autograd/gen_trace_type.py | 1 + tools/autograd/gen_variable_type.py | 4 ++++ tools/autograd/load_derivatives.py | 6 ++++++ .../package/tool/summarize_jsons.py | 1 + tools/download_mnist.py | 1 + tools/dynamo/gb_id_mapping.py | 3 +++ .../torchfuzz/multi_process_fuzzer.py | 1 + .../experimental/torchfuzz/operators/constant.py | 1 + .../flight_recorder/components/config_manager.py | 7 +++++++ tools/flight_recorder/components/utils.py | 2 ++ tools/flight_recorder/fr_trace.py | 6 ++++++ tools/gdb/pytorch-gdb.py | 1 + tools/gen_vulkan_spv.py | 6 ++++++ tools/jit/gen_unboxing.py | 2 ++ tools/linter/adapters/_linter/file_linter.py | 1 + tools/linter/adapters/_linter/sets.py | 1 + tools/linter/adapters/clangtidy_linter.py | 2 ++ tools/linter/adapters/codespell_linter.py | 1 + tools/linter/adapters/pyfmt_linter.py | 1 + tools/linter/adapters/s3_init.py | 1 + tools/linter/adapters/test_has_main_linter.py | 3 +++ .../adapters/workflow_consistency_linter.py | 1 + .../gen_selected_mobile_ops_header.py | 1 + tools/nightly.py | 4 ++++ tools/nightly_hotpatch.py | 1 + tools/pyi/gen_pyi.py | 1 + tools/setup_helpers/cmake.py | 1 + tools/setup_helpers/generate_linker_script.py | 2 ++ .../upload_utilization_stats.py | 1 + tools/test/gen_operators_yaml_test.py | 2 ++ tools/test/test_selective_build.py | 3 +++ .../historical_class_failure_correlation.py | 2 ++ tools/testing/upload_artifacts.py | 2 ++ torch/_inductor/codegen/common.py | 1 + torch/_inductor/codegen/cpp_gemm_template.py | 2 ++ torch/_inductor/codegen/cpp_wrapper_gpu.py 
| 1 + torch/_inductor/codegen/mps.py | 2 ++ torch/_inductor/codegen/simd.py | 1 + torch/_inductor/codegen/wrapper_fxir.py | 1 + torch/fx/experimental/proxy_tensor.py | 1 + 42 files changed, 89 insertions(+), 11 deletions(-) diff --git a/pyrefly.toml b/pyrefly.toml index ad74e4df084c..5516963d2622 100644 --- a/pyrefly.toml +++ b/pyrefly.toml @@ -5,6 +5,7 @@ python-version = "3.12" project-includes = [ "torch", "caffe2", + "tools", "test/test_bundled_images.py", "test/test_bundled_inputs.py", "test/test_complex.py", @@ -24,8 +25,11 @@ project-excludes = [ # ==== to test Pyrefly on a specific directory, simply comment it out ==== "torch/_inductor/runtime", "torch/_inductor/codegen/triton.py", + "tools/linter/adapters/test_device_bias_linter.py", + "tools/code_analyzer/gen_operators_yaml.py", # formatting issues, will turn on after adjusting where suppressions can be # in import statements + "tools/flight_recorder/components/types.py", "torch/linalg/__init__.py", "torch/package/importer.py", "torch/package/_package_pickler.py", @@ -40,17 +44,6 @@ project-excludes = [ "torch/distributed/elastic/metrics/__init__.py", "torch/_inductor/fx_passes/bucketing.py", # ==== - "benchmarks/instruction_counts/main.py", - "benchmarks/instruction_counts/definitions/setup.py", - "benchmarks/instruction_counts/applications/ci.py", - "benchmarks/instruction_counts/core/api.py", - "benchmarks/instruction_counts/core/expand.py", - "benchmarks/instruction_counts/core/types.py", - "benchmarks/instruction_counts/core/utils.py", - "benchmarks/instruction_counts/definitions/standard.py", - "benchmarks/instruction_counts/definitions/setup.py", - "benchmarks/instruction_counts/execution/runner.py", - "benchmarks/instruction_counts/execution/work.py", "torch/include/**", "torch/csrc/**", "torch/distributed/elastic/agent/server/api.py", @@ -137,3 +130,4 @@ errors.bad-param-name-override = false errors.implicit-import = false permissive-ignores = true replace-imports-with-any = ["!sympy.printing.*", "sympy.*", "onnxscript.onnx_opset.*"] +search-path = ["tools/experimental"] diff --git a/tools/autograd/gen_autograd_functions.py b/tools/autograd/gen_autograd_functions.py index cdc805d5a4b5..2bd33cf8df9c 100644 --- a/tools/autograd/gen_autograd_functions.py +++ b/tools/autograd/gen_autograd_functions.py @@ -863,6 +863,7 @@ static PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) { saved_variables.append(f"{type.cpp_type()} {name};") if type in MISC_GETTER_DEFS: + # pyrefly: ignore # index-error getter_def, body = MISC_GETTER_DEFS[type] getter_definitions.append( getter_def.substitute(op=info.op, name=name, body=body) @@ -1033,6 +1034,7 @@ static PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) { unpack_ivalues = [] for typ, name in zip(apply_functional_args_ref_types, apply_functional_args): typ = typ.removesuffix("&") + # pyrefly: ignore # bad-argument-type unpack_ivalues.append(f"auto {name} = packed_args.unpack<{typ}>();") schema_args = [f"std::array"] diff --git a/tools/autograd/gen_trace_type.py b/tools/autograd/gen_trace_type.py index 21069b4671e2..fb20c7872f85 100644 --- a/tools/autograd/gen_trace_type.py +++ b/tools/autograd/gen_trace_type.py @@ -182,6 +182,7 @@ def format_trace_inputs(f: NativeFunction) -> str: ADD_TRACE_INPUT.substitute( name=f.func.arguments.out[i].name, input=f.func.arguments.out[i].name ) + # pyrefly: ignore # unbound-name for i in range(num_out_args) ] diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 
5ce3b06af145..df43f8060cea 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -1495,6 +1495,7 @@ def emit_body( else: expr = f"SavedVariable({var}, {str(is_output).lower()})" if foreacharg is not None and "original_selfs" not in expr: + # pyrefly: ignore # unbound-name expr = expr.replace(src_name, name_in_expr) elif ( type == BaseCType(tensorListT) @@ -1844,12 +1845,14 @@ def emit_body( ) ) cur_derivative_conditions.append( + # pyrefly: ignore # bad-argument-type FW_DERIVATIVE_CHECK_TEMPLATE.substitute( req_inp=inp_name + "[i]" ) ) else: cur_derivative_conditions.append( + # pyrefly: ignore # bad-argument-type FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp=inp_name) ) @@ -1920,6 +1923,7 @@ def emit_body( unpacked_arguments += FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE.substitute( inp_name="original_self", inp="original_self" + input_suffix, + # pyrefly: ignore # unbound-name zeros_fn=zeros_fn, ) unpacked_arguments += FW_DERIVATIVE_DEFINED_PRIMAL_TEMPLATE.substitute( diff --git a/tools/autograd/load_derivatives.py b/tools/autograd/load_derivatives.py index f61226f25fb9..c8a621bf950f 100644 --- a/tools/autograd/load_derivatives.py +++ b/tools/autograd/load_derivatives.py @@ -95,8 +95,11 @@ def add_view_copy_derivatives( else: break # prefer manually-defined derivatives if any + # pyrefly: ignore # unbound-name if len(view_copy_differentiability_infos) > 0 and fn_schema not in infos: + # pyrefly: ignore # unbound-name assert fn_schema is not None + # pyrefly: ignore # unbound-name view_infos[fn_schema] = view_copy_differentiability_infos infos.update(view_infos) @@ -398,6 +401,7 @@ def postprocess_forward_derivatives( for arg_name in all_arg_names: if arg_name in diff_arg_names: arg_name = arg_name + "_t" + # pyrefly: ignore # bad-argument-type new_args.append(arg_name) # TODO we are trolling @@ -938,6 +942,7 @@ def saved_variables( + f".sym_strides(), which returned a c10::SymIntArrayRef. 
formula={formula}" ) for nctype in nctypes: + # pyrefly: ignore # bad-assignment name = ( nctype.name.name if isinstance(nctype.name, SpecialArgName) else nctype.name ) @@ -947,6 +952,7 @@ def saved_variables( def repl(m: re.Match[str]) -> str: suffix: str = ( + # pyrefly: ignore # bad-assignment info["suffix"](m) if callable(info["suffix"]) else info["suffix"] ) expr: str = info["expr"](name) if "expr" in info else m.group(0) diff --git a/tools/code_coverage/package/tool/summarize_jsons.py b/tools/code_coverage/package/tool/summarize_jsons.py index 3d53b37bcf6a..b41b5760e716 100644 --- a/tools/code_coverage/package/tool/summarize_jsons.py +++ b/tools/code_coverage/package/tool/summarize_jsons.py @@ -67,6 +67,7 @@ def is_intrested_file( # ignore files that are not belong to pytorch if platform == TestPlatform.OSS: + # pyrefly: ignore # import-error from package.oss.utils import get_pytorch_folder if not file_path.startswith(get_pytorch_folder()): diff --git a/tools/download_mnist.py b/tools/download_mnist.py index d9bbe1f413f2..206753a61cce 100644 --- a/tools/download_mnist.py +++ b/tools/download_mnist.py @@ -24,6 +24,7 @@ def report_download_progress( file_size: int, ) -> None: if file_size != -1: + # pyrefly: ignore # no-matching-overload percent = min(1, (chunk_number * chunk_size) / file_size) bar = "#" * int(64 * percent) sys.stdout.write(f"\r0% |{bar:<64}| {int(percent * 100)}%") diff --git a/tools/dynamo/gb_id_mapping.py b/tools/dynamo/gb_id_mapping.py index 8fef79bd8077..cb9cbc0dce63 100644 --- a/tools/dynamo/gb_id_mapping.py +++ b/tools/dynamo/gb_id_mapping.py @@ -105,8 +105,10 @@ def extract_info_from_keyword(source: str, kw: ast.keyword) -> Any: evaluated_context = [] for value in kw.value.values: if isinstance(value, ast.FormattedValue): + # pyrefly: ignore # bad-argument-type evaluated_context.append(f"{{{ast.unparse(value.value)}}}") elif isinstance(value, ast.Constant): + # pyrefly: ignore # bad-argument-type evaluated_context.append(value.value) return "".join(evaluated_context) else: @@ -152,6 +154,7 @@ def find_unimplemented_v2_calls( for kw in node.keywords: if kw.arg in info: + # pyrefly: ignore # unsupported-operation info[kw.arg] = extract_info_from_keyword(source, kw) if info["gb_type"] is None: diff --git a/tools/experimental/torchfuzz/multi_process_fuzzer.py b/tools/experimental/torchfuzz/multi_process_fuzzer.py index 520c03271fe7..bbaf7d669b5d 100644 --- a/tools/experimental/torchfuzz/multi_process_fuzzer.py +++ b/tools/experimental/torchfuzz/multi_process_fuzzer.py @@ -296,6 +296,7 @@ def run_multi_process_fuzzer( ) def write_func(msg): + # pyrefly: ignore # missing-attribute pbar.write(msg) else: persist_print("Progress: (install tqdm for better progress bar)") diff --git a/tools/experimental/torchfuzz/operators/constant.py b/tools/experimental/torchfuzz/operators/constant.py index 8fb0b33a4c1a..65f6d9c9c42b 100644 --- a/tools/experimental/torchfuzz/operators/constant.py +++ b/tools/experimental/torchfuzz/operators/constant.py @@ -111,6 +111,7 @@ class ConstantOperator(Operator): ]: # Clamp integer values to [0, 3] to avoid index overflow in multiplication # Even with multiplication, indices should stay in reasonable range + # pyrefly: ignore # bad-argument-type fill_value = max(0, min(3, abs(fill_value))) tensor_creation = ( diff --git a/tools/flight_recorder/components/config_manager.py b/tools/flight_recorder/components/config_manager.py index 1b4eafc3631d..6f7c93c0b58f 100644 --- a/tools/flight_recorder/components/config_manager.py +++ 
b/tools/flight_recorder/components/config_manager.py @@ -78,15 +78,22 @@ class JobConfig: def parse_args( self: "JobConfig", args: Optional[Sequence[str]] ) -> argparse.Namespace: + # pyrefly: ignore # bad-assignment args = self.parser.parse_args(args) + # pyrefly: ignore # missing-attribute if args.selected_ranks is not None: + # pyrefly: ignore # missing-attribute assert args.just_print_entries, ( "Not support selecting ranks without printing entries" ) + # pyrefly: ignore # missing-attribute if args.pg_filters is not None: + # pyrefly: ignore # missing-attribute assert args.just_print_entries, ( "Not support selecting pg filters without printing entries" ) + # pyrefly: ignore # missing-attribute if args.verbose: logger.set_log_level(logging.DEBUG) + # pyrefly: ignore # bad-return return args diff --git a/tools/flight_recorder/components/utils.py b/tools/flight_recorder/components/utils.py index 69455a5a433b..c65a6b98c3c0 100644 --- a/tools/flight_recorder/components/utils.py +++ b/tools/flight_recorder/components/utils.py @@ -41,6 +41,7 @@ def format_frame(frame: dict[str, str]) -> str: def format_frames(frames: list[dict[str, str]]) -> str: formatted_frames = [] for frame in frames: + # pyrefly: ignore # bad-argument-type formatted_frames.append(format_frame(frame)) return "\n".join(formatted_frames) @@ -695,6 +696,7 @@ def check_version(version_by_ranks: dict[str, str], version: str) -> None: def get_version_detail(version: str) -> tuple[int, int]: + # pyrefly: ignore # bad-assignment version = version.split(".") assert len(version) == 2, f"Invalid version {version}" major, minor = map(int, version) diff --git a/tools/flight_recorder/fr_trace.py b/tools/flight_recorder/fr_trace.py index 1d8abcefabfa..3bb64a12120a 100644 --- a/tools/flight_recorder/fr_trace.py +++ b/tools/flight_recorder/fr_trace.py @@ -40,11 +40,17 @@ from tools.flight_recorder.components.types import types def main(args: Optional[Sequence[str]] = None) -> None: config = JobConfig() + # pyrefly: ignore # bad-assignment args = config.parse_args(args) + # pyrefly: ignore # missing-attribute assert args.trace_dir, "Trace directory trace_dir is required" + # pyrefly: ignore # bad-argument-type details, version = read_dir(args) + # pyrefly: ignore # bad-argument-type db = build_db(details, args, version) + # pyrefly: ignore # missing-attribute if args.output: + # pyrefly: ignore # no-matching-overload with open(args.output, "wb") as f: pickle.dump((types, db), f) diff --git a/tools/gdb/pytorch-gdb.py b/tools/gdb/pytorch-gdb.py index b205afdc45d4..bb3f7e51f027 100644 --- a/tools/gdb/pytorch-gdb.py +++ b/tools/gdb/pytorch-gdb.py @@ -34,6 +34,7 @@ class TensorRepr(gdb.Command): # type: ignore[misc, no-any-unimported] on it. 
""" + # pyrefly: ignore # bad-argument-type __doc__ = textwrap.dedent(__doc__).strip() def __init__(self) -> None: diff --git a/tools/gen_vulkan_spv.py b/tools/gen_vulkan_spv.py index 3c7539b21d86..6772e690a02c 100644 --- a/tools/gen_vulkan_spv.py +++ b/tools/gen_vulkan_spv.py @@ -118,6 +118,7 @@ def extract_filename(path: str, keep_ext: bool = True) -> Any: # https://gist.github.com/pypt/94d747fe5180851196eb +# pyrefly: ignore # invalid-inheritance class UniqueKeyLoader(Loader): def construct_mapping(self, node, deep=False): # type: ignore[no-untyped-def] if not isinstance(node, MappingNode): @@ -233,6 +234,7 @@ def preprocess( last_indent = input_indent while blank_lines != 0: + # pyrefly: ignore # unbound-name python_lines.append(python_indent + "print(file=OUT_STREAM)") blank_lines -= 1 @@ -667,6 +669,7 @@ def generateShaderDispatchStr(shader_info: ShaderInfo, name: str) -> str: " ", ) + # pyrefly: ignore # unbound-name return shader_dispatch_str @@ -681,15 +684,18 @@ def genCppFiles( name = getName(spvPath).replace("_spv", "") sizeBytes, spv_bin_str = generateSpvBinStr(spvPath, name) + # pyrefly: ignore # bad-argument-type spv_bin_strs.append(spv_bin_str) shader_info = getShaderInfo(srcPath) register_shader_info_strs.append( + # pyrefly: ignore # bad-argument-type generateShaderInfoStr(shader_info, name, sizeBytes) ) if shader_info.register_for is not None: + # pyrefly: ignore # bad-argument-type shader_registry_strs.append(generateShaderDispatchStr(shader_info, name)) spv_bin_arrays = "\n".join(spv_bin_strs) diff --git a/tools/jit/gen_unboxing.py b/tools/jit/gen_unboxing.py index b63b6f5ed251..6ff4d393f2f7 100644 --- a/tools/jit/gen_unboxing.py +++ b/tools/jit/gen_unboxing.py @@ -131,12 +131,14 @@ class ComputeCodegenUnboxedKernels: else: arg_cpp = f"c10::IValue({arg_default})" args_code.append( + # pyrefly: ignore # bad-argument-type f"""c10::Argument("{arg.name}", nullptr, ::std::nullopt, {arg_cpp})""" ) returns = f.func.returns returns_code = [] for ret in returns: + # pyrefly: ignore # bad-argument-type returns_code.append(f"""c10::Argument("{ret.name if ret.name else ""}")""") return f""" // aten::{schema} diff --git a/tools/linter/adapters/_linter/file_linter.py b/tools/linter/adapters/_linter/file_linter.py index 7f9c0890fbf6..94b4dd33ac5e 100644 --- a/tools/linter/adapters/_linter/file_linter.py +++ b/tools/linter/adapters/_linter/file_linter.py @@ -112,6 +112,7 @@ class FileLinter: first_results = None original = replacement = pf.contents + # pyrefly: ignore # bad-assignment while True: try: results = sorted(self._lint(pf), key=LintResult.sort_key) diff --git a/tools/linter/adapters/_linter/sets.py b/tools/linter/adapters/_linter/sets.py index 0aab76876acf..24792301d754 100644 --- a/tools/linter/adapters/_linter/sets.py +++ b/tools/linter/adapters/_linter/sets.py @@ -41,6 +41,7 @@ class LineWithSets: t = self.tokens[i] after = i < len(self.tokens) - 1 and self.tokens[i + 1] if t.string == "Set" and t.type == token.NAME: + # pyrefly: ignore # bad-return return after and after.string == "[" and after.type == token.OP return ( (t.string == "set" and t.type == token.NAME) diff --git a/tools/linter/adapters/clangtidy_linter.py b/tools/linter/adapters/clangtidy_linter.py index c550f3e6db1d..61456c39993d 100644 --- a/tools/linter/adapters/clangtidy_linter.py +++ b/tools/linter/adapters/clangtidy_linter.py @@ -19,11 +19,13 @@ from typing import NamedTuple # PyTorch directory root def scm_root() -> str: path = os.path.abspath(os.getcwd()) + # pyrefly: ignore # bad-assignment while 
True: if os.path.exists(os.path.join(path, ".git")): return path if os.path.isdir(os.path.join(path, ".hg")): return path + # pyrefly: ignore # bad-argument-type n = len(path) path = os.path.dirname(path) if len(path) == n: diff --git a/tools/linter/adapters/codespell_linter.py b/tools/linter/adapters/codespell_linter.py index 13498cff1320..ce0dd8b6692c 100644 --- a/tools/linter/adapters/codespell_linter.py +++ b/tools/linter/adapters/codespell_linter.py @@ -101,6 +101,7 @@ def check_dictionary(filename: str) -> list[LintMessage]: words_set = set(words) if len(words) != len(words_set): raise ValueError("The dictionary file contains duplicate entries.") + # pyrefly: ignore # no-matching-overload uncased_words = list(map(str.lower, words)) if uncased_words != sorted(uncased_words): raise ValueError( diff --git a/tools/linter/adapters/pyfmt_linter.py b/tools/linter/adapters/pyfmt_linter.py index ce5f8252a20f..7d70067b4913 100644 --- a/tools/linter/adapters/pyfmt_linter.py +++ b/tools/linter/adapters/pyfmt_linter.py @@ -12,6 +12,7 @@ from enum import Enum from pathlib import Path from typing import NamedTuple +# pyrefly: ignore # import-error import isort import usort diff --git a/tools/linter/adapters/s3_init.py b/tools/linter/adapters/s3_init.py index 80e61efb612f..b33497d2ce6a 100644 --- a/tools/linter/adapters/s3_init.py +++ b/tools/linter/adapters/s3_init.py @@ -55,6 +55,7 @@ def report_download_progress( Pretty printer for file download progress. """ if file_size != -1: + # pyrefly: ignore # no-matching-overload percent = min(1, (chunk_number * chunk_size) / file_size) bar = "#" * int(64 * percent) sys.stdout.write(f"\r0% |{bar:<64}| {int(percent * 100)}%") diff --git a/tools/linter/adapters/test_has_main_linter.py b/tools/linter/adapters/test_has_main_linter.py index e648a96e0df5..5ba653c3ff95 100644 --- a/tools/linter/adapters/test_has_main_linter.py +++ b/tools/linter/adapters/test_has_main_linter.py @@ -15,7 +15,10 @@ import multiprocessing as mp from enum import Enum from typing import NamedTuple +# pyrefly: ignore # import-error import libcst as cst + +# pyrefly: ignore # import-error import libcst.matchers as m diff --git a/tools/linter/adapters/workflow_consistency_linter.py b/tools/linter/adapters/workflow_consistency_linter.py index 46ec00b1a1f2..54a98df699ca 100644 --- a/tools/linter/adapters/workflow_consistency_linter.py +++ b/tools/linter/adapters/workflow_consistency_linter.py @@ -69,6 +69,7 @@ def print_lint_message(path: Path, job: dict[str, Any], sync_tag: str) -> None: lint_message = LintMessage( path=str(path), + # pyrefly: ignore # unbound-name line=line_number, char=None, code="WORKFLOWSYNC", diff --git a/tools/lite_interpreter/gen_selected_mobile_ops_header.py b/tools/lite_interpreter/gen_selected_mobile_ops_header.py index f90d33c5ba45..5c25d0934ee1 100644 --- a/tools/lite_interpreter/gen_selected_mobile_ops_header.py +++ b/tools/lite_interpreter/gen_selected_mobile_ops_header.py @@ -73,6 +73,7 @@ def get_selected_kernel_dtypes_code( for kernel_tag, dtypes in selective_builder.kernel_metadata.items(): conditions = ["scalar_type == at::ScalarType::" + x for x in dtypes] body_parts.append( + # pyrefly: ignore # bad-argument-type if_condition_template.substitute( kernel_tag_name=kernel_tag, dtype_checks=" || ".join(conditions), diff --git a/tools/nightly.py b/tools/nightly.py index ab60c71ae9b7..a365bff1e6a1 100755 --- a/tools/nightly.py +++ b/tools/nightly.py @@ -311,6 +311,7 @@ class Venv: python=python, capture_output=True, ).stdout + # pyrefly: ignore # 
no-matching-overload candidates = list(map(Path, filter(None, map(str.strip, output.splitlines())))) candidates = [p for p in candidates if p.is_dir() and p.name == "site-packages"] if not candidates: @@ -480,6 +481,7 @@ class Venv: cmd = [str(python), *args] env = popen_kwargs.pop("env", None) or {} check = popen_kwargs.pop("check", True) + # pyrefly: ignore # no-matching-overload return subprocess.run( cmd, check=check, @@ -531,6 +533,7 @@ class Venv: cmd = [str(self.bindir / "uv"), *args] env = popen_kwargs.pop("env", None) or {} check = popen_kwargs.pop("check", True) + # pyrefly: ignore # no-matching-overload return subprocess.run( cmd, check=check, @@ -938,6 +941,7 @@ def _move_single( def _copy_files(listing: list[Path], source_dir: Path, target_dir: Path) -> None: for src in listing: + # pyrefly: ignore # bad-argument-type _move_single(src, source_dir, target_dir, shutil.copy2, "Copying") diff --git a/tools/nightly_hotpatch.py b/tools/nightly_hotpatch.py index c956de267651..52833ea2cffa 100644 --- a/tools/nightly_hotpatch.py +++ b/tools/nightly_hotpatch.py @@ -118,6 +118,7 @@ def download_patch(pr_number: int, repo_url: str, download_dir: str) -> str: urllib.request.urlopen(patch_url) as response, open(patch_file, "wb") as out_file, ): + # pyrefly: ignore # bad-specialization shutil.copyfileobj(response, out_file) if not os.path.isfile(patch_file): print(f"Failed to download patch for PR #{pr_number}") diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index cb5d69009f74..38a83694a3c2 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -994,6 +994,7 @@ def add_docstr_to_hint(docstr: str, hint: str) -> str: hint = hint.removesuffix("...").rstrip() # remove "..." content = hint + "\n" + textwrap.indent(f'r"""\n{docstr}\n"""', prefix=" ") # Remove trailing whitespace on each line + # pyrefly: ignore # no-matching-overload return "\n".join(map(str.rstrip, content.splitlines())).rstrip() # attribute or property diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py index 0fd6de50a56b..9dc22cc37531 100644 --- a/tools/setup_helpers/cmake.py +++ b/tools/setup_helpers/cmake.py @@ -100,6 +100,7 @@ class CMake: if ver is not None: eprint(f"Found {cmd} ({command}) version: {ver}", end="") cmake_versions.append(f"{cmd}=={ver}") + # pyrefly: ignore # unsupported-operation if ver >= CMAKE_MINIMUM_VERSION: eprint(f" (>={CMAKE_MINIMUM_VERSION})") valid_cmake_versions[cmd] = ver diff --git a/tools/setup_helpers/generate_linker_script.py b/tools/setup_helpers/generate_linker_script.py index b5a7a4ce7dec..bed5d8d742f1 100644 --- a/tools/setup_helpers/generate_linker_script.py +++ b/tools/setup_helpers/generate_linker_script.py @@ -31,7 +31,9 @@ def gen_linker_script( text_line_start = text_line_start[0] # ensure that parent directory exists before writing + # pyrefly: ignore # bad-assignment fout = Path(fout) + # pyrefly: ignore # missing-attribute fout.parent.mkdir(parents=True, exist_ok=True) with open(fout, "w") as f: diff --git a/tools/stats/upload_utilization_stats/upload_utilization_stats.py b/tools/stats/upload_utilization_stats/upload_utilization_stats.py index a0ad34c92205..9aa2935815f7 100644 --- a/tools/stats/upload_utilization_stats/upload_utilization_stats.py +++ b/tools/stats/upload_utilization_stats/upload_utilization_stats.py @@ -60,6 +60,7 @@ class SegmentGenerator: df[time_col_name] = pd.to_datetime(df[time_col_name], unit="s", utc=True) # get unique cmd names + # pyrefly: ignore # bad-argument-type unique_cmds_df = 
pd.DataFrame(df[cmd_col_name].unique(), columns=[cmd_col_name]) # get all detected python cmds diff --git a/tools/test/gen_operators_yaml_test.py b/tools/test/gen_operators_yaml_test.py index 815c8bf9fb5a..3c905a2bf269 100644 --- a/tools/test/gen_operators_yaml_test.py +++ b/tools/test/gen_operators_yaml_test.py @@ -7,6 +7,7 @@ import unittest from collections import defaultdict from unittest.mock import Mock, patch +# pyrefly: ignore # import-error from gen_operators_yaml import ( fill_output, get_parser_options, @@ -241,5 +242,6 @@ class GenOperatorsYAMLTest(unittest.TestCase): fill_output(output, options) + # pyrefly: ignore # missing-attribute for op_val in output["operators"].values(): self.assertFalse(op_val["include_all_overloads"]) diff --git a/tools/test/test_selective_build.py b/tools/test/test_selective_build.py index fac6ca6c8b50..8f9b467b2017 100644 --- a/tools/test/test_selective_build.py +++ b/tools/test/test_selective_build.py @@ -88,6 +88,7 @@ operators: self.assertTrue(selector2.is_operator_selected("aten::sub.int")) selector_legacy_v1 = SelectiveBuilder.from_legacy_op_registration_allow_list( + # pyrefly: ignore # bad-argument-type ["aten::add", "aten::add.int", "aten::mul.int"], False, False, @@ -103,6 +104,7 @@ operators: ) selector_legacy_v1 = SelectiveBuilder.from_legacy_op_registration_allow_list( + # pyrefly: ignore # bad-argument-type ["aten::add", "aten::add.int", "aten::mul.int"], True, False, @@ -118,6 +120,7 @@ operators: ) selector_legacy_v1 = SelectiveBuilder.from_legacy_op_registration_allow_list( + # pyrefly: ignore # bad-argument-type ["aten::add", "aten::add.int", "aten::mul.int"], False, True, diff --git a/tools/testing/target_determination/heuristics/historical_class_failure_correlation.py b/tools/testing/target_determination/heuristics/historical_class_failure_correlation.py index 6665301f01bb..58c85352db39 100644 --- a/tools/testing/target_determination/heuristics/historical_class_failure_correlation.py +++ b/tools/testing/target_determination/heuristics/historical_class_failure_correlation.py @@ -83,7 +83,9 @@ def _rank_correlated_tests( ) -> list[str]: # Find the tests failures that are correlated with the edited files. # Filter the list to only include tests we want to run. 
+ # pyrefly: ignore # bad-assignment tests_to_run = set(tests_to_run) + # pyrefly: ignore # bad-argument-type ratings = _get_ratings_for_tests(tests_to_run) prioritize = sorted(ratings, key=lambda x: -ratings[x]) return prioritize diff --git a/tools/testing/upload_artifacts.py b/tools/testing/upload_artifacts.py index bcc5b221f30a..57aefd9996d2 100644 --- a/tools/testing/upload_artifacts.py +++ b/tools/testing/upload_artifacts.py @@ -36,11 +36,13 @@ def concated_logs() -> str: for log_file in glob.glob( f"{REPO_ROOT}/test/test-reports/**/*.log", recursive=True ): + # pyrefly: ignore # bad-argument-type logs.append(f"=== {log_file} ===") with open(log_file) as f: # For every line, prefix with fake timestamp for log classifier for line in f: line = line.rstrip("\n") # Remove any trailing newline + # pyrefly: ignore # bad-argument-type logs.append(f"2020-01-01T00:00:00.0000000Z {line}") return "\n".join(logs) diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py index 36ded3aea2fe..743baec01dfa 100644 --- a/torch/_inductor/codegen/common.py +++ b/torch/_inductor/codegen/common.py @@ -1739,6 +1739,7 @@ class KernelArgs: for outer, inner in chain( # pyrefly: ignore # bad-argument-type self.input_buffers.items(), + # pyrefly: ignore # bad-argument-type self.output_buffers.items(), ): if outer in self.inplace_buffers or isinstance(inner, RemovedArg): diff --git a/torch/_inductor/codegen/cpp_gemm_template.py b/torch/_inductor/codegen/cpp_gemm_template.py index 9b26105bab10..cb17b5a7deb0 100644 --- a/torch/_inductor/codegen/cpp_gemm_template.py +++ b/torch/_inductor/codegen/cpp_gemm_template.py @@ -1480,6 +1480,7 @@ class CppGemmTemplate(CppTemplate): gemm_output_buffer = ir.Buffer( # pyrefly: ignore # missing-attribute name=gemm_output_name, + # pyrefly: ignore # missing-attribute layout=template_buffer.layout, ) current_input_buffer = gemm_output_buffer @@ -1503,6 +1504,7 @@ class CppGemmTemplate(CppTemplate): current_input_buffer = ir.Buffer( # pyrefly: ignore # missing-attribute name=buffer_name, + # pyrefly: ignore # missing-attribute layout=template_buffer.layout, ) diff --git a/torch/_inductor/codegen/cpp_wrapper_gpu.py b/torch/_inductor/codegen/cpp_wrapper_gpu.py index d1ddc7e1cd40..dd4a3a984d34 100644 --- a/torch/_inductor/codegen/cpp_wrapper_gpu.py +++ b/torch/_inductor/codegen/cpp_wrapper_gpu.py @@ -824,6 +824,7 @@ class CppWrapperGpu(CppWrapperCpu): call_args, arg_types = self.prepare_triton_wrapper_args( # pyrefly: ignore # bad-argument-type call_args, + # pyrefly: ignore # bad-argument-type arg_types, ) wrapper_name = f"call_{kernel_name}" diff --git a/torch/_inductor/codegen/mps.py b/torch/_inductor/codegen/mps.py index a74506d7247a..fb3939531b71 100644 --- a/torch/_inductor/codegen/mps.py +++ b/torch/_inductor/codegen/mps.py @@ -683,6 +683,7 @@ class MetalKernel(SIMDKernel): # pyrefly: ignore # missing-argument t for t in self.range_tree_nodes.values() + # pyrefly: ignore # missing-argument if t.is_reduction ) cmp_op = ">" if reduction_type == "argmax" else "<" @@ -865,6 +866,7 @@ class MetalKernel(SIMDKernel): # pyrefly: ignore # missing-argument t.numel for t in self.range_trees + # pyrefly: ignore # missing-argument if t.is_reduction ) # If using dynamic shapes, set the threadgroup size to be the diff --git a/torch/_inductor/codegen/simd.py b/torch/_inductor/codegen/simd.py index e2294f05ddca..79d0b603220a 100644 --- a/torch/_inductor/codegen/simd.py +++ b/torch/_inductor/codegen/simd.py @@ -968,6 +968,7 @@ class SIMDKernel(Kernel[CSEVariableType], 
Generic[CSEVariableType]): # pyrefly: ignore # missing-argument t for t in self.range_trees + # pyrefly: ignore # missing-argument if not t.is_reduction or self.inside_reduction ] diff --git a/torch/_inductor/codegen/wrapper_fxir.py b/torch/_inductor/codegen/wrapper_fxir.py index 72c8e0335508..e123f9592770 100644 --- a/torch/_inductor/codegen/wrapper_fxir.py +++ b/torch/_inductor/codegen/wrapper_fxir.py @@ -1004,6 +1004,7 @@ class FxConverter: # pyrefly: ignore # missing-attribute call_kwargs[key] for key in signature + # pyrefly: ignore # missing-attribute if key not in cfg.kwargs ] diff --git a/torch/fx/experimental/proxy_tensor.py b/torch/fx/experimental/proxy_tensor.py index 805d59008e02..28a60bafcac8 100644 --- a/torch/fx/experimental/proxy_tensor.py +++ b/torch/fx/experimental/proxy_tensor.py @@ -421,6 +421,7 @@ def get_proxy_slot( else: # Attempt to build it from first principles. _build_proxy_for_sym_expr(tracer, obj.node.expr, obj) + # pyrefly: ignore # no-matching-overload value = tracker.get(obj) if value is None: From b8194268a6fbc369cce413990826492d36d88bdc Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Sat, 18 Oct 2025 04:52:41 +0000 Subject: [PATCH 100/123] Remove unnecessary noqa suppressions (#164106) This PR removes unused `noqa` suppressions in Python code. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164106 Approved by: https://github.com/albanD --- torch/_inductor/fuzzer.py | 8 ++++---- torch/_logging/_internal.py | 2 +- torch/nn/intrinsic/qat/modules/conv_fused.py | 1 - torch/nn/intrinsic/qat/modules/linear_fused.py | 1 - torch/nn/intrinsic/qat/modules/linear_relu.py | 1 - torch/nn/modules/pooling.py | 2 +- torch/nn/parallel/distributed.py | 2 +- torch/nn/utils/_named_member_accessor.py | 2 +- torch/utils/_triton.py | 2 +- 9 files changed, 9 insertions(+), 12 deletions(-) diff --git a/torch/_inductor/fuzzer.py b/torch/_inductor/fuzzer.py index 403e1c2eca9e..55e49b61f7c7 100644 --- a/torch/_inductor/fuzzer.py +++ b/torch/_inductor/fuzzer.py @@ -310,7 +310,7 @@ class SamplingMethod(Enum): ) try: new_default = new_type() - except Exception: # noqa: E722 + except Exception: # if default constructor doesn't work, try None new_default = None @@ -779,7 +779,7 @@ class ConfigFuzzer: test_model_fn = self.test_model_fn_factory() try: test_model_fn() - except Exception as exc: # noqa: E722 + except Exception as exc: return handle_return( "Eager exception", Status.FAILED_RUN_EAGER_EXCEPTION, True, exc ) @@ -788,7 +788,7 @@ class ConfigFuzzer: try: test_model_fn2 = self.test_model_fn_factory() comp = torch.compile(test_model_fn2, backend="inductor") - except Exception as exc: # noqa: E722 + except Exception as exc: return handle_return( "Exception compiling", Status.FAILED_COMPILE, True, exc ) @@ -796,7 +796,7 @@ class ConfigFuzzer: # try running compiled try: compile_result = comp() - except Exception as exc: # noqa: E722 + except Exception as exc: return handle_return( "Exception running compiled", Status.FAILED_RUN_COMPILE_EXCEPTION, diff --git a/torch/_logging/_internal.py b/torch/_logging/_internal.py index 87fe5836b147..a84268610263 100644 --- a/torch/_logging/_internal.py +++ b/torch/_logging/_internal.py @@ -699,7 +699,7 @@ Examples: TORCH_LOGS_OUT=/tmp/output.txt will output the logs to /tmp/output.txt as well. This is useful when the output is long. 
-""" # flake8: noqa: B950 +""" msg = f""" TORCH_LOGS Info {examples} diff --git a/torch/nn/intrinsic/qat/modules/conv_fused.py b/torch/nn/intrinsic/qat/modules/conv_fused.py index 79c7dc116a67..f8dc1d49aad3 100644 --- a/torch/nn/intrinsic/qat/modules/conv_fused.py +++ b/torch/nn/intrinsic/qat/modules/conv_fused.py @@ -1,4 +1,3 @@ -# flake8: noqa: F401 r"""Intrinsic QAT Modules. This file is in the process of migration to `torch/ao/nn/intrinsic/qat`, and diff --git a/torch/nn/intrinsic/qat/modules/linear_fused.py b/torch/nn/intrinsic/qat/modules/linear_fused.py index 2c961557daff..79567d67bd1f 100644 --- a/torch/nn/intrinsic/qat/modules/linear_fused.py +++ b/torch/nn/intrinsic/qat/modules/linear_fused.py @@ -1,4 +1,3 @@ -# flake8: noqa: F401 r"""Intrinsic QAT Modules. This file is in the process of migration to `torch/ao/nn/intrinsic/qat`, and diff --git a/torch/nn/intrinsic/qat/modules/linear_relu.py b/torch/nn/intrinsic/qat/modules/linear_relu.py index 1b9fad39f646..71705320075e 100644 --- a/torch/nn/intrinsic/qat/modules/linear_relu.py +++ b/torch/nn/intrinsic/qat/modules/linear_relu.py @@ -1,4 +1,3 @@ -# flake8: noqa: F401 r"""Intrinsic QAT Modules. This file is in the process of migration to `torch/ao/nn/intrinsic/qat`, and diff --git a/torch/nn/modules/pooling.py b/torch/nn/modules/pooling.py index ed270a812eaf..777e6b0abd8c 100644 --- a/torch/nn/modules/pooling.py +++ b/torch/nn/modules/pooling.py @@ -298,7 +298,7 @@ class MaxPool3d(_MaxPoolNd): .. _link: https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md - """ # noqa: E501 + """ kernel_size: _size_3_t stride: _size_3_t diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index d630771d6e8f..3436a97400ff 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -775,7 +775,7 @@ class DistributedDataParallel(Module, Joinable): "DistributedDataParallel device_ids and output_device arguments " "only work with single-device/multiple-device GPU modules or CPU modules, " f"but got device_ids {device_ids}, output_device {output_device}, " - f"and module parameters { ({p.device for p in self._module_parameters}) }.", # noqa: E201,E202 + f"and module parameters { ({p.device for p in self._module_parameters}) }.", ) self.device_ids = None diff --git a/torch/nn/utils/_named_member_accessor.py b/torch/nn/utils/_named_member_accessor.py index 7178b11d00d8..111a24ec1863 100644 --- a/torch/nn/utils/_named_member_accessor.py +++ b/torch/nn/utils/_named_member_accessor.py @@ -146,7 +146,7 @@ class NamedMemberAccessor: f"{module._get_name()} has no attribute `{attr}`" ) from ex if not isinstance(submodule, torch.nn.Module): - raise TypeError( # noqa: B904 + raise TypeError( f"submodule `{name}`: {submodule} is not an instance of torch.nn.Module" ) self.memo[name] = submodule diff --git a/torch/utils/_triton.py b/torch/utils/_triton.py index 5f0ca5b4eff8..f062f7e7508c 100644 --- a/torch/utils/_triton.py +++ b/torch/utils/_triton.py @@ -16,7 +16,7 @@ def has_triton_package() -> bool: @functools.cache def get_triton_version(fallback: tuple[int, int] = (0, 0)) -> tuple[int, int]: try: - import triton # noqa: F401 + import triton major, minor = tuple(int(v) for v in triton.__version__.split(".")[:2]) return (major, minor) From 0f0b4bf0295f988b62283efd72f08a5180d905c4 Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Sat, 18 Oct 2025 05:23:07 +0000 Subject: [PATCH 101/123] [1/N] Remove unused header inclusion (#165763) This PR removes unused header inclusion in C++ files. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165763 Approved by: https://github.com/Skylion007 --- c10/core/AllocatorConfig.cpp | 1 - c10/core/SymInt.cpp | 1 - c10/core/TensorImpl.cpp | 1 - c10/core/TensorOptions.cpp | 4 ---- c10/core/impl/COW.cpp | 1 - c10/core/impl/TorchDispatchModeTLS.cpp | 1 - c10/cuda/CUDADeviceAssertionHost.cpp | 2 -- c10/cuda/CUDAMallocAsyncAllocator.cpp | 1 - c10/cuda/CUDAMiscFunctions.cpp | 1 - c10/cuda/driver_api.cpp | 1 - c10/util/ApproximateClock.cpp | 3 +-- c10/util/complex_math.cpp | 2 -- c10/util/signal_handler.cpp | 1 - torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp | 1 - torch/csrc/profiler/util.cpp | 1 - 15 files changed, 1 insertion(+), 21 deletions(-) diff --git a/c10/core/AllocatorConfig.cpp b/c10/core/AllocatorConfig.cpp index 750336d143f0..de09037113c2 100644 --- a/c10/core/AllocatorConfig.cpp +++ b/c10/core/AllocatorConfig.cpp @@ -1,5 +1,4 @@ #include -#include #include namespace c10::CachingAllocator { diff --git a/c10/core/SymInt.cpp b/c10/core/SymInt.cpp index 8b8ffedc23f8..7ad5cdfb629e 100644 --- a/c10/core/SymInt.cpp +++ b/c10/core/SymInt.cpp @@ -4,7 +4,6 @@ #include #include #include -#include namespace c10 { diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index cd0321d3bb6f..c59524a0932c 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include diff --git a/c10/core/TensorOptions.cpp b/c10/core/TensorOptions.cpp index 599868aea8fd..d3282ae7114e 100644 --- a/c10/core/TensorOptions.cpp +++ b/c10/core/TensorOptions.cpp @@ -1,9 +1,5 @@ #include -#include -#include -#include - #include namespace c10 { diff --git a/c10/core/impl/COW.cpp b/c10/core/impl/COW.cpp index 81bc86e64bda..78aa267d1254 100644 --- a/c10/core/impl/COW.cpp +++ b/c10/core/impl/COW.cpp @@ -2,7 +2,6 @@ #include #include -#include #include #include #include diff --git a/c10/core/impl/TorchDispatchModeTLS.cpp b/c10/core/impl/TorchDispatchModeTLS.cpp index c8bdc1bb59ba..55d9e24a5721 100644 --- a/c10/core/impl/TorchDispatchModeTLS.cpp +++ b/c10/core/impl/TorchDispatchModeTLS.cpp @@ -1,5 +1,4 @@ #include -#include #include #include #include diff --git a/c10/cuda/CUDADeviceAssertionHost.cpp b/c10/cuda/CUDADeviceAssertionHost.cpp index a6d4c3fe9079..d67ee4b23e69 100644 --- a/c10/cuda/CUDADeviceAssertionHost.cpp +++ b/c10/cuda/CUDADeviceAssertionHost.cpp @@ -1,8 +1,6 @@ #include #include #include -#include -#include #include #include #include diff --git a/c10/cuda/CUDAMallocAsyncAllocator.cpp b/c10/cuda/CUDAMallocAsyncAllocator.cpp index ce0f3d885543..2e9ad7d78d17 100644 --- a/c10/cuda/CUDAMallocAsyncAllocator.cpp +++ b/c10/cuda/CUDAMallocAsyncAllocator.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include #include diff --git a/c10/cuda/CUDAMiscFunctions.cpp b/c10/cuda/CUDAMiscFunctions.cpp index b1b6170f891e..b305008d44f8 100644 --- a/c10/cuda/CUDAMiscFunctions.cpp +++ b/c10/cuda/CUDAMiscFunctions.cpp @@ -1,7 +1,6 @@ #include #include #include -#include #include namespace c10::cuda { diff --git a/c10/cuda/driver_api.cpp b/c10/cuda/driver_api.cpp index d545bf5477b6..887c2d06347b 100644 --- a/c10/cuda/driver_api.cpp +++ b/c10/cuda/driver_api.cpp @@ -1,7 +1,6 @@ #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) #include #include -#include #include #include #include diff --git a/c10/util/ApproximateClock.cpp b/c10/util/ApproximateClock.cpp index a69128a44831..53a7b7aa1446 100644 --- a/c10/util/ApproximateClock.cpp +++ 
b/c10/util/ApproximateClock.cpp @@ -1,7 +1,6 @@ #include -#include +#include #include -#include namespace c10 { diff --git a/c10/util/complex_math.cpp b/c10/util/complex_math.cpp index 886aadb14151..d1d690917a9b 100644 --- a/c10/util/complex_math.cpp +++ b/c10/util/complex_math.cpp @@ -1,7 +1,5 @@ #include -#include - // Note [ Complex Square root in libc++] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // In libc++ complex square root is computed using polar form diff --git a/c10/util/signal_handler.cpp b/c10/util/signal_handler.cpp index 7c2bd055c58d..831c0d024524 100644 --- a/c10/util/signal_handler.cpp +++ b/c10/util/signal_handler.cpp @@ -11,7 +11,6 @@ #include #include -#include #include #include #include diff --git a/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp b/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp index 02efb9ecbe02..908540e6852a 100644 --- a/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp +++ b/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp @@ -5,7 +5,6 @@ #include #include #include -#include #include #include diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp index 0b2979e6fb7e..d266958e2cb6 100644 --- a/torch/csrc/profiler/util.cpp +++ b/torch/csrc/profiler/util.cpp @@ -1,6 +1,5 @@ #include #include -#include #include #include From aaac8cb0f5852bd52be558b59eca35c6e722313c Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Sat, 18 Oct 2025 05:26:29 +0000 Subject: [PATCH 102/123] [1/N] Add strict parameter to Python zip calls (#165531) Add `strict=True/False` to zip calls in test utils. `strict=True` is passed when possible. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165531 Approved by: https://github.com/Skylion007 --- torch/testing/_comparison.py | 4 +- .../testing/_internal/autocast_test_lists.py | 2 +- torch/testing/_internal/common_cuda.py | 4 +- torch/testing/_internal/common_distributed.py | 2 +- torch/testing/_internal/common_fsdp.py | 6 +- torch/testing/_internal/common_jit.py | 2 +- .../_internal/common_methods_invocations.py | 8 +- torch/testing/_internal/common_mkldnn.py | 2 +- torch/testing/_internal/common_modules.py | 2 +- torch/testing/_internal/common_nn.py | 20 ++--- torch/testing/_internal/common_utils.py | 6 +- .../testing/_internal/composite_compliance.py | 10 +-- torch/testing/_internal/custom_tensor.py | 4 +- .../distributed/common_state_dict.py | 4 +- .../ddp_under_dist_autograd_test.py | 4 +- .../_internal/distributed/distributed_test.py | 79 +++++++++++++------ .../distributed/multi_threaded_pg.py | 6 +- .../distributed/rpc/dist_autograd_test.py | 2 +- .../rpc/examples/parameter_server_test.py | 2 +- .../reinforcement_learning_rpc_test.py | 2 +- torch/testing/_internal/jit_utils.py | 4 +- torch/testing/_internal/logging_utils.py | 4 +- .../_internal/opinfo/definitions/_masked.py | 4 +- torch/testing/_internal/two_tensor.py | 2 +- 24 files changed, 111 insertions(+), 74 deletions(-) diff --git a/torch/testing/_comparison.py b/torch/testing/_comparison.py index 6c4506f1a8a9..1d4a050b8047 100644 --- a/torch/testing/_comparison.py +++ b/torch/testing/_comparison.py @@ -92,7 +92,9 @@ def default_tolerances( f"Expected a torch.Tensor or a torch.dtype, but got {type(input)} instead." 
) dtype_precisions = dtype_precisions or _DTYPE_PRECISIONS - rtols, atols = zip(*[dtype_precisions.get(dtype, (0.0, 0.0)) for dtype in dtypes]) + rtols, atols = zip( + *[dtype_precisions.get(dtype, (0.0, 0.0)) for dtype in dtypes], strict=True + ) return max(rtols), max(atols) diff --git a/torch/testing/_internal/autocast_test_lists.py b/torch/testing/_internal/autocast_test_lists.py index 11cfb179a97e..b3616fede6ce 100644 --- a/torch/testing/_internal/autocast_test_lists.py +++ b/torch/testing/_internal/autocast_test_lists.py @@ -437,7 +437,7 @@ class TestAutocast(TestCase): if isinstance(first, torch.Tensor): return torch.equal(first, second) elif isinstance(first, collections.abc.Iterable): - return all(compare(f, s) for f, s in zip(first, second)) + return all(compare(f, s) for f, s in zip(first, second, strict=False)) else: return first == second diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py index 916221d33651..8202a32ae8ad 100644 --- a/torch/testing/_internal/common_cuda.py +++ b/torch/testing/_internal/common_cuda.py @@ -252,7 +252,7 @@ def tf32_on_and_off(tf32_precision=1e-5, *, only_if=True): @functools.wraps(f) def wrapped(*args, **kwargs): - kwargs.update(zip(arg_names, args)) + kwargs.update(zip(arg_names, args, strict=False)) cond = torch.cuda.is_tf32_supported() and only_if if 'device' in kwargs: cond = cond and (torch.device(kwargs['device']).type == 'cuda') @@ -325,7 +325,7 @@ def _create_scaling_models_optimizers(device="cuda", optimizer_ctor=torch.optim. mod_control = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)).to(device=device) mod_scaling = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)).to(device=device) with torch.no_grad(): - for c, s in zip(mod_control.parameters(), mod_scaling.parameters()): + for c, s in zip(mod_control.parameters(), mod_scaling.parameters(), strict=True): s.copy_(c) kwargs = {"lr": 1.0} diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index 64ea87852a86..719713e7c9f6 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -1153,7 +1153,7 @@ def run_subtests( subtest_config_values: list[list[Any]] = [item[1] for item in subtest_config_items] for values in itertools.product(*subtest_config_values): # Map keyword to chosen value - subtest_kwargs = dict(zip(subtest_config_keys, values)) + subtest_kwargs = dict(zip(subtest_config_keys, values, strict=True)) with cls_inst.subTest(**subtest_kwargs): torch._dynamo.reset() test_fn(*test_args, **test_kwargs, **subtest_kwargs) diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py index c18fbccb795d..dd211599cf14 100644 --- a/torch/testing/_internal/common_fsdp.py +++ b/torch/testing/_internal/common_fsdp.py @@ -157,7 +157,7 @@ def _assert_module_states( assert rank0_states is not None # mypy for state in olist[1:]: assert state is not None # mypy - for (_, p1), (_, p2) in zip(rank0_states, state): + for (_, p1), (_, p2) in zip(rank0_states, state, strict=True): assert_fn(p1, p2) @@ -1135,7 +1135,9 @@ def check_sharded_parity( prefixes_to_ignore: tuple[str, ...] 
= (), ): for (replicated_name, replicated_param), (sharded_name, sharded_param) in zip( - replicated_module.named_parameters(), sharded_module.named_parameters() + replicated_module.named_parameters(), + sharded_module.named_parameters(), + strict=True, ): clean_sharded_name = sharded_name for prefix in prefixes_to_ignore: diff --git a/torch/testing/_internal/common_jit.py b/torch/testing/_internal/common_jit.py index 6ca05c51189b..ac6e851d7e28 100644 --- a/torch/testing/_internal/common_jit.py +++ b/torch/testing/_internal/common_jit.py @@ -135,7 +135,7 @@ def check_against_reference(self, func, reference_func, output_func, args, kwarg self.assertEqual(outputs, outputs_test) self.assertEqual(grads, grads_test) - for g2, g2_test in zip(grads2, grads2_test): + for g2, g2_test in zip(grads2, grads2_test, strict=True): if g2 is None and g2_test is None: continue self.assertEqual(g2, g2_test, atol=5e-4, rtol=1e-4) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index bafe4b241d3c..82e630519eb8 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -449,7 +449,7 @@ def sample_inputs_batch_norm(op_info, device, dtype, requires_grad, **kwargs): biases = [None, channels, None] is_training = [True, False, False] - for weight, bias, training in zip(weights, biases, is_training): + for weight, bias, training in zip(weights, biases, is_training, strict=True): yield SampleInput( make_arg(input_shape), args=( @@ -3631,7 +3631,7 @@ class _TestParamsMaxPoolBase: def _gen_kwargs(self): keys = self.kwargs.keys() for values in product(*self.kwargs.values()): - yield dict(zip(keys, values)) + yield dict(zip(keys, values, strict=True)) def gen_input_params(self): yield from product(self._gen_shape(), self._gen_kwargs()) @@ -4400,7 +4400,7 @@ def sample_inputs_instance_norm(opinfo, device, dtype, requires_grad, **kwargs): weights = [channels, None] biases = [None, None] - for weight_channels, bias_channels in zip(weights, biases): + for weight_channels, bias_channels in zip(weights, biases, strict=True): running_mean = make_arg_without_requires_grad(channels, low=0) running_var = make_arg_without_requires_grad(channels, low=0) yield SampleInput( @@ -11625,7 +11625,7 @@ def reference_searchsorted(sorted_sequence, boundary, out_int32=False, right=Fal split_sorter = [sorter[i] if (sorter is not None) else None for i in splits] split_ret = [np.searchsorted(s_seq, b, side=side, sorter=s_sort) - for (s_seq, b, s_sort) in zip(split_sequence, split_boundary, split_sorter)] + for (s_seq, b, s_sort) in zip(split_sequence, split_boundary, split_sorter, strict=True)] split_ret = [i.astype(np.int32) for i in split_ret] if out_int32 else split_ret return np.stack(split_ret).reshape(orig_shape) diff --git a/torch/testing/_internal/common_mkldnn.py b/torch/testing/_internal/common_mkldnn.py index 44da60a5ad1f..70ab98137bd7 100644 --- a/torch/testing/_internal/common_mkldnn.py +++ b/torch/testing/_internal/common_mkldnn.py @@ -91,7 +91,7 @@ def reduced_f32_on_and_off(bf32_precision=1e-2, tf32_precision=1e-5): @functools.wraps(f) def wrapped(*args, **kwargs): - kwargs.update(zip(arg_names, args)) + kwargs.update(zip(arg_names, args, strict=False)) cond = True if "device" in kwargs: cond = cond and (torch.device(kwargs["device"]).type == "cpu") diff --git a/torch/testing/_internal/common_modules.py b/torch/testing/_internal/common_modules.py index 2cd6a89a0452..120a76eb5ef3 100644 
--- a/torch/testing/_internal/common_modules.py +++ b/torch/testing/_internal/common_modules.py @@ -1413,7 +1413,7 @@ def module_inputs_torch_nn_L1Loss(module_info, device, dtype, requires_grad, tra forward_input=FunctionInput(make_input((2, 3, 4)), make_input((2, 3, 4))), reference_fn=lambda m, p, i, t: 1. / i.numel() * sum((a - b).abs().sum() - for a, b in zip(i, t))), + for a, b in zip(i, t, strict=True))), ModuleInput(constructor_input=FunctionInput(), forward_input=FunctionInput(make_input(()), make_input(())), reference_fn=lambda m, p, i, t: 1. / i.numel() * (i - t).abs().sum(), diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index aaca0efe1eb4..68a35e8c40a1 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -2633,7 +2633,7 @@ def get_new_module_tests(): # add conv padding mode tests: for padding_mode, cpp_padding_mode in zip( ['reflect', 'circular', 'replicate', 'zeros'], - ['torch::kReflect', 'torch::kCircular', 'torch::kReplicate', 'torch::kZeros']): + ['torch::kReflect', 'torch::kCircular', 'torch::kReplicate', 'torch::kZeros'], strict=True): # conv signature: # in_channels, out_channels, kernel_size, stride=1, # padding=0, dilation=1, groups=1, @@ -2848,8 +2848,8 @@ def nllloss_reference(input, target, weight=None, ignore_index=-100, return (result, norm) losses_and_weights = [nll_loss_helper(i, t, weight, ignore_index) - for i, t in zip(input, target)] - losses, weights = zip(*losses_and_weights) + for i, t in zip(input, target, strict=True)] + losses, weights = zip(*losses_and_weights, strict=True) losses_tensor = input.new_tensor(losses) if reduction == 'mean': return sum(losses_tensor) / sum(weights) @@ -3268,7 +3268,7 @@ class NNTestCase(TestCase): for i in range(output_size): param, d_param = self._get_parameters(module) # make non grad zeros - d_param = [torch.zeros_like(p) if d is None else d for (p, d) in zip(param, d_param)] + d_param = [torch.zeros_like(p) if d is None else d for (p, d) in zip(param, d_param, strict=True)] d_out = torch.zeros_like(output) flat_d_out = d_out.view(-1) @@ -3282,7 +3282,7 @@ class NNTestCase(TestCase): d_input = self._backward(module, input, output, d_out) if jacobian_input: - for jacobian_x, d_x in zip(flat_jacobian_input, _iter_tensors(d_input)): + for jacobian_x, d_x in zip(flat_jacobian_input, _iter_tensors(d_input), strict=True): jacobian_x[:, i] = d_x.contiguous().view(-1) if jacobian_parameters: jacobian_param[:, i] = torch.cat(self._flatten_tensors(d_param), 0) @@ -3320,7 +3320,7 @@ class NNTestCase(TestCase): numerical_t = list(_iter_tensors(numerical)) differences = [] - for a, n in zip(analytical_t, numerical_t): + for a, n in zip(analytical_t, numerical_t, strict=True): if a.numel() != 0: differences.append(a.add(n, alpha=-1).abs().max()) # TODO: compare structure (ensure analytic jacobian has correct shape) @@ -3528,7 +3528,7 @@ class ModuleTest(TestBase): gpu_module = self.constructor(*self.constructor_args).float().cuda() cpu_param = test_case._get_parameters(cpu_module) gpu_param = test_case._get_parameters(gpu_module) - for cpu_p, gpu_p in zip(cpu_param[0], gpu_param[0]): + for cpu_p, gpu_p in zip(cpu_param[0], gpu_param[0], strict=True): gpu_p.data.copy_(cpu_p) test_case._zero_grad_input(cpu_input_tuple) @@ -3549,7 +3549,7 @@ class ModuleTest(TestBase): cpu_gradInput = test_case._backward(cpu_module, cpu_input_tuple, cpu_output, cpu_gradOutput) gpu_gradInput = test_case._backward(gpu_module, gpu_input_tuple, gpu_output, gpu_gradOutput) 
test_case.assertEqual(cpu_gradInput, gpu_gradInput, atol=self.precision, rtol=0, exact_dtype=False) - for cpu_d_p, gpu_d_p in zip(cpu_param[1], gpu_param[1]): + for cpu_d_p, gpu_d_p in zip(cpu_param[1], gpu_param[1], strict=True): test_case.assertEqual(cpu_d_p, gpu_d_p, atol=self.precision, rtol=0) # Run double-backwards on CPU and GPU and compare results @@ -3575,7 +3575,7 @@ class ModuleTest(TestBase): gpu_gradOutput, create_graph=True) - for cpu_d_i, gpu_d_i in zip(cpu_gradInputs, gpu_gradInputs): + for cpu_d_i, gpu_d_i in zip(cpu_gradInputs, gpu_gradInputs, strict=True): test_case.assertEqual(cpu_d_i, gpu_d_i, atol=self.precision, rtol=0, exact_dtype=False) # We mix output into the second backwards computation so that @@ -3598,7 +3598,7 @@ class ModuleTest(TestBase): gpu_input_tuple + (gpu_gradOutput,) + tuple(gpu_module.parameters()), retain_graph=True) test_case.assertEqual(cpu_gradInput, gpu_gradInput, atol=self.precision, rtol=0, exact_dtype=False) - for cpu_d_p, gpu_d_p in zip(cpu_gg, gpu_gg): + for cpu_d_p, gpu_d_p in zip(cpu_gg, gpu_gg, strict=True): test_case.assertEqual(cpu_d_p, gpu_d_p, atol=self.precision, rtol=0, exact_dtype=False) self.test_noncontig(test_case, gpu_module, gpu_input_tuple) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 0146f37e4baf..284a3bdcfbd7 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -692,7 +692,7 @@ class parametrize(_TestParametrizer): return f"{name}{idx}" def _default_subtest_name(self, idx, values): - return '_'.join([self._formatted_str_repr(idx, a, v) for a, v in zip(self.arg_names, values)]) + return '_'.join([self._formatted_str_repr(idx, a, v) for a, v in zip(self.arg_names, values, strict=True)]) def _get_subtest_name(self, idx, values, explicit_name=None): if explicit_name: @@ -736,7 +736,7 @@ class parametrize(_TestParametrizer): raise RuntimeError(f'Expected # values == # arg names, but got: {len(values)} ' f'values and {len(self.arg_names)} names for test "{test.__name__}"') - param_kwargs = dict(zip(self.arg_names, values)) + param_kwargs = dict(zip(self.arg_names, values, strict=True)) test_name = self._get_subtest_name(idx, values, explicit_name=maybe_name) @@ -3696,7 +3696,7 @@ class TestCase(expecttest.TestCase): n_compressed_dims, n_plain_dims = size[-1 - dense_dims] // blocksize1, size[-2 - dense_dims] // blocksize0 blocknnz = nnz // (blocksize0 * blocksize1) sparse_tensors = [random_sparse_compressed(n_compressed_dims, n_plain_dims, blocknnz) for _ in range(n_batch)] - sparse_tensors_it = map(list, zip(*sparse_tensors)) + sparse_tensors_it = map(list, zip(*sparse_tensors, strict=True)) values = torch.stack(next(sparse_tensors_it)).reshape(*batch_shape, blocknnz, *blocksize, *dense_size) compressed_indices = torch.stack(next(sparse_tensors_it)).reshape(*batch_shape, -1) diff --git a/torch/testing/_internal/composite_compliance.py b/torch/testing/_internal/composite_compliance.py index c44c0f50ff5d..527fc8a5826e 100644 --- a/torch/testing/_internal/composite_compliance.py +++ b/torch/testing/_internal/composite_compliance.py @@ -234,7 +234,7 @@ def generate_cct_and_mode(autograd_view_consistency=True): # tensor results to be that of the tensors that alias the input result = func(*args, **kwargs) if isinstance(result, (tuple, list)): - for a, b in zip(rs, result): + for a, b in zip(rs, result, strict=True): a.set_(b) else: rs.set_(result) @@ -303,7 +303,7 @@ def generate_subclass_choices(flat_args, CCT, cct_mode): for 
which_args_are_wrapped in itertools.product(*subclass_options): result = [maybe_map(partial(wrap, CCT=CCT, cct_mode=cct_mode), should_wrap_arg, arg) - for should_wrap_arg, arg in zip(which_args_are_wrapped, flat_args)] + for should_wrap_arg, arg in zip(which_args_are_wrapped, flat_args, strict=True)] yield result, which_args_are_wrapped @@ -539,11 +539,11 @@ def check_forward_ad_formula(op: Callable, args, kwargs, gradcheck_wrapper=None, return fwAD.make_dual(primal.detach(), tangent) elif is_tensorlist(primal): return tuple(fwAD.make_dual(pri.detach(), tang) if tang is not None else pri - for pri, tang in zip(primal, tangent)) + for pri, tang in zip(primal, tangent, strict=True)) return primal def compute_expected_grad(args, tangent_args, kwargs, tangent_kwargs): - op_args = tuple(map(maybe_make_dual, zip(args, tangent_args))) + op_args = tuple(map(maybe_make_dual, zip(args, tangent_args, strict=True))) op_kwargs = {k: maybe_make_dual((v, tangent_kwargs[k])) for k, v in kwargs.items()} if gradcheck_wrapper is None: @@ -572,7 +572,7 @@ def check_forward_ad_formula(op: Callable, args, kwargs, gradcheck_wrapper=None, new_tang_args, new_tang_kwargs, \ which_tang_args_are_wrapped, which_tang_kwargs_are_wrapped = tang_choice - op_args = tuple(map(maybe_make_dual, zip(new_args, new_tang_args))) + op_args = tuple(map(maybe_make_dual, zip(new_args, new_tang_args, strict=True))) op_kwargs = {k: maybe_make_dual((v, new_tang_kwargs[k])) for k, v in new_kwargs.items()} try: diff --git a/torch/testing/_internal/custom_tensor.py b/torch/testing/_internal/custom_tensor.py index 9fa6f79ec68a..de1b44ba8dac 100644 --- a/torch/testing/_internal/custom_tensor.py +++ b/torch/testing/_internal/custom_tensor.py @@ -144,7 +144,9 @@ class CustomTensorPlainOut(torch.Tensor): new_out = pytree.tree_unflatten( ( CustomTensorPlainOut(tensor1, tensor2) - for tensor1, tensor2 in zip(out_inner_flat_1, out_inner_flat_2) + for tensor1, tensor2 in zip( + out_inner_flat_1, out_inner_flat_2, strict=True + ) ), spec, ) diff --git a/torch/testing/_internal/distributed/common_state_dict.py b/torch/testing/_internal/distributed/common_state_dict.py index 76b7800a8d2a..a78e312306ba 100644 --- a/torch/testing/_internal/distributed/common_state_dict.py +++ b/torch/testing/_internal/distributed/common_state_dict.py @@ -60,7 +60,7 @@ class VerifyStateDictMixin: dist_osd: dict[str, Any], ) -> None: params = list(chain.from_iterable(g["params"] for g in optim.param_groups)) - param_pid_mapping = dict(zip(params, range(len(params)))) + param_pid_mapping = dict(zip(params, range(len(params)), strict=True)) fqn_pid_mapping = {} for fqn, param in model.named_parameters(): pid = param_pid_mapping[param] @@ -90,7 +90,7 @@ class VerifyStateDictMixin: dist_osd[_PG] = [new_pg] self.assertEqual(len(osd[_PG]), len(dist_osd[_PG])) - for group, dist_group in zip(osd[_PG], dist_osd[_PG]): + for group, dist_group in zip(osd[_PG], dist_osd[_PG], strict=True): self.assertEqual(len(group), len(dist_group)) for key, value in group.items(): # Below doesn't work because param_groups can have None diff --git a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py index 428224022a45..ca9bc297010a 100644 --- a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py +++ b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py @@ -238,7 +238,9 @@ class Trainer: sparse_microbatch = torch.split(sparse_features, 2) values_microbatch = torch.split(values, 2) 
batches = [] - for d, s, v in zip(dense_microbatch, sparse_microbatch, values_microbatch): + for d, s, v in zip( + dense_microbatch, sparse_microbatch, values_microbatch, strict=True + ): feature_set = FeatureSet(dense_features=d, sparse_features=s, values=v) batches.append(feature_set) diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 62ef8d4a5eca..c41602d43994 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -678,7 +678,7 @@ class DistributedTest: # Verify buffers across ranks. m1_buffers = list(m1.buffers()) m2_buffers = list(m2.buffers()) - for buf1, buf2 in zip(m1_buffers, m2_buffers): + for buf1, buf2 in zip(m1_buffers, m2_buffers, strict=True): gathered_bufs = [ torch.empty_like(buf1) for _ in range(dist.get_world_size()) ] @@ -3045,7 +3045,7 @@ class DistributedTest: curr_values = master_values if rank == src else worker_values tensors = [ _build_tensor(src + 1, val, dtype=dtype) - for dtype, val in zip(dtypes, curr_values) + for dtype, val in zip(dtypes, curr_values, strict=True) ] if cuda: tensors = [t.cuda(rank_to_GPU[rank][0]) for t in tensors] @@ -3066,7 +3066,9 @@ class DistributedTest: ) expected_tensors = [ _build_tensor(src + 1, expected_value, dtype=dtype) - for dtype, expected_value in zip(dtypes, expected_values) + for dtype, expected_value in zip( + dtypes, expected_values, strict=True + ) ] self.assertEqual(tensors, expected_tensors) @@ -3338,7 +3340,7 @@ class DistributedTest: ) if rank == dest: expected_tensors = [_build_tensor(dest + 1, i) for i in group] - for t1, t2 in zip(tensors, expected_tensors): + for t1, t2 in zip(tensors, expected_tensors, strict=True): self.assertEqual(t1, t2) self._barrier() @@ -3440,7 +3442,7 @@ class DistributedTest: expected_tensors = [ _build_tensor(dest + 1, i, dtype=dtype) for i in group ] - for t1, t2 in zip(tensors, expected_tensors): + for t1, t2 in zip(tensors, expected_tensors, strict=True): self.assertEqual(t1, t2) self._barrier() @@ -3624,8 +3626,8 @@ class DistributedTest: tensor_shapes=tensor_shapes, ) - for l1, l2 in zip(output_tensor_lists, expected_tensors): - for t1, t2 in zip(l1, l2): + for l1, l2 in zip(output_tensor_lists, expected_tensors, strict=True): + for t1, t2 in zip(l1, l2, strict=True): if not torch.equal(t1, t2): return False return True @@ -3824,7 +3826,7 @@ class DistributedTest: ] out_tensors = [t.cuda(rank_to_GPU[rank][0]) for t in out_tensors] dist.all_to_all(out_tensors, in_tensors, group=group_id) - for t1, t2 in zip(out_tensors, expected_tensors): + for t1, t2 in zip(out_tensors, expected_tensors, strict=True): self.assertEqual(t1, t2) self._barrier() @@ -4203,7 +4205,7 @@ class DistributedTest: def _assert_equal_param(self, param_gpu, param_DDP): self.assertEqual(len(param_gpu), len(param_DDP)) - for p_gpu, p_DDP in zip(param_gpu, param_DDP): + for p_gpu, p_DDP in zip(param_gpu, param_DDP, strict=True): self.assertEqual(p_gpu, p_DDP) def _test_DDP_niter( @@ -4618,6 +4620,7 @@ class DistributedTest: for hook_param, allreduce_param in zip( ddp_model_with_optimizer_hook.parameters(), ddp_model_with_no_hook.parameters(), + strict=True, ): self.assertEqual(hook_param, allreduce_param) @@ -4649,6 +4652,7 @@ class DistributedTest: for hook_param, allreduce_param in zip( ddp_model_with_optimizer_hook.parameters(), ddp_model_with_no_hook.parameters(), + strict=True, ): self.assertEqual(hook_param, allreduce_param) @@ -4825,7 +4829,9 @@ class 
DistributedTest: optimizer_kwargs=optim_kwargs, ) - for p1, p2 in zip(model.parameters(), model_optim_in_bwd.parameters()): + for p1, p2 in zip( + model.parameters(), model_optim_in_bwd.parameters(), strict=True + ): self.assertEqual(p1, p2, "Parameters not initially equal!") # Enable determinism in cudnn operators with torch.backends.cudnn.flags( @@ -4843,7 +4849,9 @@ class DistributedTest: inp ).sum().backward() # runs optimizer as well for p1, p2 in zip( - model.parameters(), model_optim_in_bwd.parameters() + model.parameters(), + model_optim_in_bwd.parameters(), + strict=True, ): self.assertEqual( p1, p2, f"Params not equal at iteration {i}" @@ -5323,7 +5331,9 @@ class DistributedTest: # sync grads step_model(ddp_model, ddp_input, ddp_target) - for i, j in zip(model.parameters(), ddp_model.parameters()): + for i, j in zip( + model.parameters(), ddp_model.parameters(), strict=True + ): if not i.requires_grad: continue if iteration % 2 == 0: @@ -5562,6 +5572,7 @@ class DistributedTest: for i, j in zip( ddp_model_grad_not_view.parameters(), ddp_model_grad_is_view.parameters(), + strict=True, ): self.assertEqual(i, j) @@ -5667,7 +5678,9 @@ class DistributedTest: target, ) for p1, p2 in zip( - net.parameters(), net_using_post_localSGD_opt.parameters() + net.parameters(), + net_using_post_localSGD_opt.parameters(), + strict=True, ): self.assertEqual(p1.data, p2.data) @@ -6817,7 +6830,7 @@ class DistributedTest: # they are the same as new_model on rank_to_broadcast. if rank == rank_to_broadcast: expected_states = new_model.state_dict().values() - for t, expected in zip(net_module_states, expected_states): + for t, expected in zip(net_module_states, expected_states, strict=True): self.assertEqual(t, expected) @skip_if_lt_x_gpu(2) @@ -7134,7 +7147,9 @@ class DistributedTest: # Validate model state dicts are equal for (_, local_tensor), (_, dist_tensor) in zip( - local_model.state_dict().items(), net.module.state_dict().items() + local_model.state_dict().items(), + net.module.state_dict().items(), + strict=True, ): self.assertEqual(local_tensor, dist_tensor) @@ -7722,13 +7737,17 @@ class DistributedTest: # materialized param grad is not touched by DDP, so its grad should # be the same as if running locally. for materialized_param, local_param in zip( - ddp.module.fc2.parameters(), local_model.fc2.parameters() + ddp.module.fc2.parameters(), + local_model.fc2.parameters(), + strict=True, ): self.assertEqual(materialized_param.grad, local_param.grad) # fc1 parameter grad should still be different, due to allreduce. 
for synced_param, local_param in zip( - ddp.module.fc1.parameters(), local_model.fc1.parameters() + ddp.module.fc1.parameters(), + local_model.fc1.parameters(), + strict=True, ): self.assertFalse(synced_param.grad == local_param.grad) @@ -8581,7 +8600,7 @@ class DistributedTest: # Verify grads are the same for local_param, dist_param in zip( - local_net.parameters(), net.parameters() + local_net.parameters(), net.parameters(), strict=True ): local_grad = local_param.grad dist_grad = dist_param.grad @@ -8631,7 +8650,7 @@ class DistributedTest: torch._C._functions.UndefinedGrad()(out).backward() torch._C._functions.UndefinedGrad()(local_out).backward() for (dist_param_name, dist_param), (local_param_name, local_param) in zip( - net.named_parameters(), local_net.named_parameters() + net.named_parameters(), local_net.named_parameters(), strict=True ): dist_grad = dist_param.grad local_grad = local_param.grad @@ -8689,7 +8708,9 @@ class DistributedTest: self.assertTrue( static_model._get_ddp_logging_data().get("has_rebuilt_buckets", 0) ) - for i, j in zip(base_model.parameters(), static_model.parameters()): + for i, j in zip( + base_model.parameters(), static_model.parameters(), strict=True + ): self.assertEqual(i, j) @require_backend_is_available({"gloo"}) @@ -9297,7 +9318,7 @@ class DistributedTest: loss_static.backward() self._model_step(model_static_graph) for p, p_static in zip( - model.parameters(), model_static_graph.parameters() + model.parameters(), model_static_graph.parameters(), strict=True ): self.assertEqual(p, p_static) @@ -9974,7 +9995,7 @@ class DistributedTest: p.grad.data = p.grad / iters for p_ddp, p_local in zip( - model.parameters(), local_model.parameters() + model.parameters(), local_model.parameters(), strict=True ): self.assertTrue( torch.allclose(p_ddp.grad, p_local.grad), @@ -10191,7 +10212,9 @@ class DistributedTest: # (refer to https://github.com/numpy/numpy/blob/266aad7478bc7fbcc55eea7f942a0d373b838396/numpy/random/mtrand.pyi) # To make sure random state was restored properly, all entries should equal the original for entry1, entry2 in zip( - hook_state.rng.get_state(), dummy_hook_state.rng.get_state() + hook_state.rng.get_state(), + dummy_hook_state.rng.get_state(), + strict=True, ): np.testing.assert_array_equal(entry1, entry2) @@ -10212,7 +10235,7 @@ class DistributedTest: # Check that gradients after 10 epochs are the same for orig_param, dummy_param in zip( - ddp_model.parameters(), dummy_ddp_model.parameters() + ddp_model.parameters(), dummy_ddp_model.parameters(), strict=True ): self.assertEqual(orig_param.grad, dummy_param.grad) @@ -10299,7 +10322,9 @@ class DistributedTest: self.assertEqual(out_ddp, out_ddp_static) out_ddp.backward() out_ddp_static.backward() - for p1, p2 in zip(ddp.parameters(), ddp_static.parameters()): + for p1, p2 in zip( + ddp.parameters(), ddp_static.parameters(), strict=True + ): self.assertEqual(p1.grad, p2.grad) @skip_if_lt_x_gpu(2) @@ -10392,7 +10417,9 @@ class DistributedTest: test_model_1._get_ddp_logging_data().get("num_buckets_reduced"), 1 ) - for i, j in zip(base_model.parameters(), test_model_1.parameters()): + for i, j in zip( + base_model.parameters(), test_model_1.parameters(), strict=True + ): self.assertEqual(i, j) diff --git a/torch/testing/_internal/distributed/multi_threaded_pg.py b/torch/testing/_internal/distributed/multi_threaded_pg.py index 1f5d1ef1bdbd..2cc22cb7c23a 100644 --- a/torch/testing/_internal/distributed/multi_threaded_pg.py +++ b/torch/testing/_internal/distributed/multi_threaded_pg.py @@ -457,7 
+457,9 @@ class ProcessLocalGroup(dist.ProcessGroup): ): works = [ self._reduce_scatter_base(output_tensor, input_tensor, opts) - for output_tensor, input_tensor in zip(output_tensors, input_tensors) + for output_tensor, input_tensor in zip( + output_tensors, input_tensors, strict=True + ) ] for work in works[:-1]: work.wait() @@ -467,7 +469,7 @@ class ProcessLocalGroup(dist.ProcessGroup): self, output_tensor_list, input_tensor_list, opts=AllgatherOptions() ): res = None - for o_t, i_t in zip(output_tensor_list, input_tensor_list): + for o_t, i_t in zip(output_tensor_list, input_tensor_list, strict=True): res = self._allgather_base(o_t, i_t) return res diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py index f7cb2075e373..1d6c7500c5ad 100644 --- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py +++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py @@ -2749,7 +2749,7 @@ class TensorPipeCudaDistAutogradTest(RpcAgentTestFixture): for i in range(len(futs)): local_gradients = [p.grad for p in local_layers[i].parameters()] - for g1, g2 in zip(futs[i].wait(), local_gradients): + for g1, g2 in zip(futs[i].wait(), local_gradients, strict=True): self.assertEqual(g1, g2) rpc.shutdown() diff --git a/torch/testing/_internal/distributed/rpc/examples/parameter_server_test.py b/torch/testing/_internal/distributed/rpc/examples/parameter_server_test.py index f84ba5225c6e..ad0b7fbe2207 100644 --- a/torch/testing/_internal/distributed/rpc/examples/parameter_server_test.py +++ b/torch/testing/_internal/distributed/rpc/examples/parameter_server_test.py @@ -46,7 +46,7 @@ class BatchUpdateParameterServer: @rpc.functions.async_execution def update_and_fetch_model(ps_rref, grads): self = ps_rref.local_value() - for p, g in zip(self.model.parameters(), grads): + for p, g in zip(self.model.parameters(), grads, strict=True): if p.grad is None: p.grad = g else: diff --git a/torch/testing/_internal/distributed/rpc/examples/reinforcement_learning_rpc_test.py b/torch/testing/_internal/distributed/rpc/examples/reinforcement_learning_rpc_test.py index beb08a25484d..57008aed17db 100644 --- a/torch/testing/_internal/distributed/rpc/examples/reinforcement_learning_rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/examples/reinforcement_learning_rpc_test.py @@ -216,7 +216,7 @@ class Agent: returns.insert(0, R) returns = torch.tensor(returns) returns = (returns - returns.mean()) / (returns.std() + self.eps) - for log_prob, R in zip(probs, returns): + for log_prob, R in zip(probs, returns, strict=True): policy_loss.append(-log_prob * R) self.optimizer.zero_grad() policy_loss = torch.cat(policy_loss).sum() diff --git a/torch/testing/_internal/jit_utils.py b/torch/testing/_internal/jit_utils.py index 4bc0738ec2f3..e98d0e482683 100644 --- a/torch/testing/_internal/jit_utils.py +++ b/torch/testing/_internal/jit_utils.py @@ -249,7 +249,7 @@ class JitTestCase(JitCommonTestCase): saved_module_buffer_2.seek(0) code_files_2, _debug_files_2 = extract_files(saved_module_buffer_2) - for a, b in zip(code_files, code_files_2): + for a, b in zip(code_files, code_files_2, strict=True): self.assertMultiLineEqual(a, b) if isinstance(m, torch._C.ScriptModule): @@ -617,7 +617,7 @@ class JitTestCase(JitCommonTestCase): self.assertEqual(outputs, outputs_ge) if inputs_require_grads: self.assertEqual(grads, grads_ge, atol=grad_atol, rtol=grad_rtol) - for g2, g2_ge in zip(grads2, grads2_ge): + for g2, g2_ge in zip(grads2, grads2_ge, 
strict=True): if g2 is None and g2_ge is None: continue self.assertEqual(g2, g2_ge, atol=8e-4, rtol=8e-4) diff --git a/torch/testing/_internal/logging_utils.py b/torch/testing/_internal/logging_utils.py index 1632149c6584..1e1ecf8f4f70 100644 --- a/torch/testing/_internal/logging_utils.py +++ b/torch/testing/_internal/logging_utils.py @@ -228,11 +228,11 @@ def multiple_logs_to_string(module: str, *log_options: str) -> tuple[list[io.Str def tmp_redirect_logs(): loggers = [torch._logging.getArtifactLogger(module, option) for option in log_options] try: - for logger, handler in zip(loggers, handlers): + for logger, handler in zip(loggers, handlers, strict=True): logger.addHandler(handler) yield finally: - for logger, handler in zip(loggers, handlers): + for logger, handler in zip(loggers, handlers, strict=True): logger.removeHandler(handler) def ctx_manager() -> AbstractContextManager[None]: diff --git a/torch/testing/_internal/opinfo/definitions/_masked.py b/torch/testing/_internal/opinfo/definitions/_masked.py index 4ff16b343715..d65fbef658a4 100644 --- a/torch/testing/_internal/opinfo/definitions/_masked.py +++ b/torch/testing/_internal/opinfo/definitions/_masked.py @@ -402,9 +402,9 @@ def sample_inputs_masked_logaddexp(op_info, device, dtype, requires_grad, **kwar make_tensor, dtype=dtype, device=device, requires_grad=requires_grad ) for shape, input_masks, other_masks in zip( - shapes, input_mask_lists, other_mask_lists + shapes, input_mask_lists, other_mask_lists, strict=True ): - for input_mask, other_mask in zip(input_masks, other_masks): + for input_mask, other_mask in zip(input_masks, other_masks, strict=True): yield SampleInput( make_arg(shape), make_arg(shape), diff --git a/torch/testing/_internal/two_tensor.py b/torch/testing/_internal/two_tensor.py index 3a503c741e88..8197829ac7f4 100644 --- a/torch/testing/_internal/two_tensor.py +++ b/torch/testing/_internal/two_tensor.py @@ -78,7 +78,7 @@ class TwoTensor(torch.Tensor): # our two inner tensors return the same value out_flat = [ cls(o_a, o_b) if isinstance(o_a, torch.Tensor) else o_a - for o_a, o_b in zip(out_a_flat, out_b_flat) + for o_a, o_b in zip(out_a_flat, out_b_flat, strict=True) ] out = pytree.tree_unflatten(out_flat, spec) from torch._higher_order_ops.cond import cond_op From e59513618727068a949b670312b09634b90fae5e Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Sat, 18 Oct 2025 05:44:10 +0000 Subject: [PATCH 103/123] Enable PLC1802 on ruff (#165813) This PR enables ruff check `PLC1802`, which detects len calls on sequences in a boolean test context. 
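For illustration only (not part of the diff that follows), a minimal sketch of the pattern this rule flags and the preferred form; the `items` list here is a made-up example:

```
# Flagged by PLC1802: len() used purely as a boolean test
items = [1, 2, 3]
if len(items):
    print("non-empty")

# Preferred: a sequence is already truthy when non-empty
if items:
    print("non-empty")
```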
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165813 Approved by: https://github.com/ezyang --- benchmarks/dynamo/huggingface.py | 2 +- pyproject.toml | 1 + test/quantization/core/test_quantized_tensor.py | 2 +- torch/_dynamo/backends/distributed.py | 6 +++--- torch/_dynamo/output_graph.py | 4 ++-- torch/_dynamo/variables/builtin.py | 2 +- torch/_inductor/comms.py | 2 +- torch/_inductor/runtime/triton_heuristics.py | 2 +- torch/_inductor/utils.py | 2 +- torch/distributed/fsdp/_fully_shard/_fsdp_param_group.py | 6 ++---- torch/distributed/pipelining/schedules.py | 2 +- torch/hub.py | 2 +- torch/testing/_internal/opinfo/core.py | 2 +- torch/utils/data/datapipes/dataframe/datapipes.py | 6 +++--- torch/utils/data/datapipes/iter/combining.py | 2 +- torch/utils/data/datapipes/iter/selecting.py | 2 +- torch/utils/weak.py | 2 +- 17 files changed, 23 insertions(+), 24 deletions(-) diff --git a/benchmarks/dynamo/huggingface.py b/benchmarks/dynamo/huggingface.py index 2c774bbb1d2e..d856a241ccac 100755 --- a/benchmarks/dynamo/huggingface.py +++ b/benchmarks/dynamo/huggingface.py @@ -124,7 +124,7 @@ with open(MODELS_FILENAME) as fh: continue batch_size = int(batch_size) BATCH_SIZE_KNOWN_MODELS[model_name] = batch_size -assert len(BATCH_SIZE_KNOWN_MODELS) +assert BATCH_SIZE_KNOWN_MODELS try: diff --git a/pyproject.toml b/pyproject.toml index 8e29c1c81d56..e42f08d296f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -212,6 +212,7 @@ select = [ "PIE810", "PLC0131", # type bivariance "PLC0132", # type param mismatch + "PLC1802", # len({expression}) used as condition without comparison "PLC0205", # string as __slots__ "PLC3002", # unnecessary-direct-lambda-call "PLE", diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py index f241cc438757..b46e2df1d9ee 100644 --- a/test/quantization/core/test_quantized_tensor.py +++ b/test/quantization/core/test_quantized_tensor.py @@ -100,7 +100,7 @@ def param_search_greedy(x, bit_rate, n_bins=200, ratio=0.16): cur_min, cur_max, cur_loss = cur_min + stepsize, cur_max, loss1 else: cur_min, cur_max, cur_loss = cur_min, cur_max - stepsize, loss2 - if len(solutions): + if solutions: best = solutions[0] for solution in solutions: if solution[-1] < best[-1]: diff --git a/torch/_dynamo/backends/distributed.py b/torch/_dynamo/backends/distributed.py index b282a6218816..6be9690c6a1c 100644 --- a/torch/_dynamo/backends/distributed.py +++ b/torch/_dynamo/backends/distributed.py @@ -98,14 +98,14 @@ def pretty_print_buckets(buckets: list[Bucket], bucket_bytes_cap: int) -> None: ) ) - if len(rows): + if rows: log.info( "\nDDPOptimizer used bucket cap %s and created %d buckets. Enable debug logs for detailed bucket info.", bucket_bytes_cap, len(buckets), ) - if len(extended_buckets): + if extended_buckets: log.warning( "Some buckets were extended beyond their requested parameter capacities" " in order to ensure each subgraph has an output node, required for fx graph partitioning." 
@@ -122,7 +122,7 @@ def pretty_print_buckets(buckets: list[Bucket], bucket_bytes_cap: int) -> None: tabulate(rows, headers=headers, tablefmt="simple_grid"), ) - if len(extended_buckets): + if extended_buckets: log.warning( "DDPOptimizer extended these buckets to ensure per-subgraph output nodes:\n%s", tabulate( diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py index 9bce964c3f1a..f39d80f89b45 100644 --- a/torch/_dynamo/output_graph.py +++ b/torch/_dynamo/output_graph.py @@ -1867,7 +1867,7 @@ class OutputGraph(OutputGraphCommon): _get_source_debug_name(var.source) for var in potential_side_effects ] - if len(side_effect_refs): + if side_effect_refs: warnings.warn( f"While exporting, we found certain side effects happened in the model.forward. " f"Here are the list of potential sources you can double check: {side_effect_refs}" @@ -3736,7 +3736,7 @@ class SubgraphTracer(fx.Tracer): if v1 != v2 ] - if len(mutated_inputs): + if mutated_inputs: mutated_nodes = [input_nodes[i] for i in mutated_inputs] msg = f"Input mutation detected at {mutated_nodes}" return MutationInfo(True, msg) diff --git a/torch/_dynamo/variables/builtin.py b/torch/_dynamo/variables/builtin.py index a03f7d0f4d74..09bdb81150e6 100644 --- a/torch/_dynamo/variables/builtin.py +++ b/torch/_dynamo/variables/builtin.py @@ -1847,7 +1847,7 @@ class BuiltinVariable(VariableTracker): polyfills.builtins.iter_ ).call_function(tx, [obj, *args], {}) - if len(args): + if args: # iter(obj, sentinel) returns an object that implements # __iter__ and __next__ methods (UserDefinedObjectVariable) # Wrap the return value in a IteratorVariable subclass (LazyObjectIteratorVariable) diff --git a/torch/_inductor/comms.py b/torch/_inductor/comms.py index 86f272c8b24e..3cf0156e043a 100644 --- a/torch/_inductor/comms.py +++ b/torch/_inductor/comms.py @@ -834,7 +834,7 @@ def _schedule_for_comm( collective_cost -= snode_to_cost[candidate.snode] heapq.heapify(ready) - while len(ready): + while ready: snode = heapq.heappop(ready).snode if reorder_for_overlap and contains_collective(snode): schedule_collective_for_overlap(snode) diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 44b567bf5ecd..2ae2880fb018 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -2895,7 +2895,7 @@ def match_target_block_product( relative_scores[dim] = score / total_score # Scale up dimensions by their relative scores until we reach the target - while curr_block_product < target_block_product and len(relative_scores): + while curr_block_product < target_block_product and relative_scores: dim, score = max(relative_scores.items(), key=lambda item: item[1]) # Check if we've hit the max for this dimension diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py index 233a294aaed6..f1c7f23cf719 100644 --- a/torch/_inductor/utils.py +++ b/torch/_inductor/utils.py @@ -792,7 +792,7 @@ def get_kernel_metadata( # where `inductor_nodes` contains nodes from multiple graph instances # is not supported. An example of this is conditional statements. 
single_graph = None - if len(inductor_nodes): + if inductor_nodes: unique_graphs = OrderedSet(n.graph for n in inductor_nodes) if len(unique_graphs) == 1: single_graph = inductor_nodes[0].graph diff --git a/torch/distributed/fsdp/_fully_shard/_fsdp_param_group.py b/torch/distributed/fsdp/_fully_shard/_fsdp_param_group.py index 39d5711ef33b..32939a554503 100644 --- a/torch/distributed/fsdp/_fully_shard/_fsdp_param_group.py +++ b/torch/distributed/fsdp/_fully_shard/_fsdp_param_group.py @@ -237,7 +237,7 @@ class FSDPParamGroup: raise AssertionError( f"FSDP expects uniform original parameter dtype but got {orig_dtypes}" ) - self._orig_dtype = next(iter(orig_dtypes)) if len(trainable_params) else None + self._orig_dtype = next(iter(orig_dtypes)) if trainable_params else None if len(trainable_params) > 0 and len(reduce_dtypes) != 1: # This can be relaxed if we issue one reduce-scatter per reduce # dtype (but we would need a way for users to specify multiple @@ -245,9 +245,7 @@ class FSDPParamGroup: raise AssertionError( f"FSDP expects uniform reduce dtype but got {reduce_dtypes}" ) - self._reduce_dtype = ( - next(iter(reduce_dtypes)) if len(trainable_params) else None - ) + self._reduce_dtype = next(iter(reduce_dtypes)) if trainable_params else None def lazy_init(self): # Lazy init should be idempotent diff --git a/torch/distributed/pipelining/schedules.py b/torch/distributed/pipelining/schedules.py index 589505de4e4a..067a9351d823 100644 --- a/torch/distributed/pipelining/schedules.py +++ b/torch/distributed/pipelining/schedules.py @@ -2178,7 +2178,7 @@ BACKWARD_INPUT, BACKWARD_WEIGHT, and OVERLAP_F_B are supported." raise e # Mostly these operations should have finished long ago, but there isn't an obvious time when to wait for them - while len(send_ops): + while send_ops: _wait_batch_p2p(send_ops.pop()) assert len(self.unshard_ops) == 0, "Unused unshard operations" diff --git a/torch/hub.py b/torch/hub.py index 4b68e997162a..d3328d1abe6e 100644 --- a/torch/hub.py +++ b/torch/hub.py @@ -372,7 +372,7 @@ def _check_dependencies(m): if dependencies is not None: missing_deps = [pkg for pkg in dependencies if not _check_module_exists(pkg)] - if len(missing_deps): + if missing_deps: raise RuntimeError(f"Missing dependencies: {', '.join(missing_deps)}") diff --git a/torch/testing/_internal/opinfo/core.py b/torch/testing/_internal/opinfo/core.py index 4a31fb454b5a..685fa2fd2efd 100644 --- a/torch/testing/_internal/opinfo/core.py +++ b/torch/testing/_internal/opinfo/core.py @@ -166,7 +166,7 @@ class SampleInput: A SampleInput can be constructed "naturally" with *args and **kwargs or by explicitly setting the "args" and "kwargs" parameters, but the two methods of construction cannot be mixed!""" - elif len(var_args) or len(var_kwargs): + elif var_args or var_kwargs: assert ( output_process_fn_grad is None and broadcasts_input is None diff --git a/torch/utils/data/datapipes/dataframe/datapipes.py b/torch/utils/data/datapipes/dataframe/datapipes.py index 2bf0dda77752..0c1b416e99c2 100644 --- a/torch/utils/data/datapipes/dataframe/datapipes.py +++ b/torch/utils/data/datapipes/dataframe/datapipes.py @@ -53,7 +53,7 @@ class ConcatDataFramesPipe(DFIterDataPipe): if len(buffer) == self.n_batch: yield df_wrapper.concat(buffer) buffer = [] - if len(buffer): + if buffer: yield df_wrapper.concat(buffer) @@ -78,7 +78,7 @@ class ShuffleDataFramesPipe(DFIterDataPipe): if len(buffer) == size: yield df_wrapper.concat(buffer) buffer = [] - if len(buffer): + if buffer: yield df_wrapper.concat(buffer) @@ -107,7 +107,7 @@ 
class FilterDataFramesPipe(DFIterDataPipe): if len(buffer) == size: yield df_wrapper.concat(buffer) buffer = [] - if len(buffer): + if buffer: yield df_wrapper.concat(buffer) diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py index 36afe6769eb1..22f27327b2ee 100644 --- a/torch/utils/data/datapipes/iter/combining.py +++ b/torch/utils/data/datapipes/iter/combining.py @@ -626,7 +626,7 @@ class MultiplexerIterDataPipe(IterDataPipe): def __iter__(self): iterators = [iter(x) for x in self.datapipes] - while len(iterators): + while iterators: for it in iterators: try: value = next(it) diff --git a/torch/utils/data/datapipes/iter/selecting.py b/torch/utils/data/datapipes/iter/selecting.py index 78d1820cb6aa..afb0e91d8557 100644 --- a/torch/utils/data/datapipes/iter/selecting.py +++ b/torch/utils/data/datapipes/iter/selecting.py @@ -88,7 +88,7 @@ class FilterIterDataPipe(IterDataPipe[_T_co]): for idx, mask in enumerate(df_wrapper.iterate(condition)): if mask: result.append(df_wrapper.get_item(data, idx)) - if len(result): + if result: return True, df_wrapper.concat(result) else: return False, None # type: ignore[return-value] diff --git a/torch/utils/weak.py b/torch/utils/weak.py index cb8862e64531..ed311cd05956 100644 --- a/torch/utils/weak.py +++ b/torch/utils/weak.py @@ -309,7 +309,7 @@ class WeakIdKeyDictionary(MutableMapping): dict = type({})(dict) for key, value in dict.items(): d[self.ref_type(key, self._remove)] = value # CHANGED - if len(kwargs): + if kwargs: self.update(kwargs) def __ior__(self, other): From c79dfdc6550e872783aa5cb5fc9e86589bf18872 Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Sat, 18 Oct 2025 06:40:12 +0000 Subject: [PATCH 104/123] Enable all PIE rules on ruff (#165814) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR enables all PIE rules on ruff, there are already some enabled rules from this family, the new added rules are ``` PIE796 Enum contains duplicate value: {value} PIE808 Unnecessary start argument in range ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165814 Approved by: https://github.com/ezyang --- benchmarks/gpt_fast/mixtral_moe_quantize.py | 2 +- pyproject.toml | 7 +--- .../ao/sparsity/test_activation_sparsifier.py | 4 +- test/ao/sparsity/test_data_scheduler.py | 2 +- test/ao/sparsity/test_data_sparsifier.py | 2 +- test/ao/sparsity/test_sparsifier.py | 4 +- .../quantization/test_quantization.py | 12 +++--- test/distributed/checkpoint/test_planner.py | 2 +- test/distributed/checkpoint/test_utils.py | 2 +- .../elastic/agent/server/test/api_test.py | 2 +- .../elastic/multiprocessing/api_test.py | 2 +- .../timer/file_based_local_timer_test.py | 2 +- .../elastic/timer/local_timer_example.py | 4 +- .../elastic/timer/local_timer_test.py | 2 +- .../utils/data/cycling_iterator_test.py | 4 +- .../fsdp/test_fsdp_hybrid_shard.py | 4 +- test/distributed/tensor/test_dtensor_ops.py | 4 +- test/distributed/test_device_mesh.py | 2 +- test/distributions/test_distributions.py | 34 ++++++++--------- test/dynamo/test_export.py | 8 ++-- test/dynamo/test_functions.py | 2 +- test/dynamo/test_modules.py | 2 +- test/dynamo/test_repros.py | 6 +-- test/functorch/test_ac.py | 4 +- test/inductor/test_codecache.py | 2 +- test/inductor/test_compiled_autograd.py | 2 +- test/inductor/test_max_autotune.py | 2 +- test/inductor/test_triton_kernels.py | 4 +- test/jit/xnnpack/test_xnnpack_delegate.py | 2 +- test/nn/test_convolution.py | 2 +- test/nn/test_embedding.py 
| 2 +- test/nn/test_multihead_attention.py | 2 +- test/nn/test_pooling.py | 2 +- test/onnx/test_onnx_opset.py | 4 +- test/optim/test_lrscheduler.py | 2 +- test/profiler/test_profiler.py | 6 +-- .../core/experimental/test_floatx.py | 2 +- test/test_dataloader.py | 2 +- test/test_datapipe.py | 6 +-- test/test_dynamic_shapes.py | 2 +- test/test_indexing.py | 2 +- test/test_jit.py | 8 ++-- test/test_jit_fuser_te.py | 8 ++-- test/test_matmul_cuda.py | 2 +- test/test_mps.py | 14 +++---- test/test_numa_binding.py | 6 +-- test/test_reductions.py | 4 +- test/test_serialization.py | 2 +- test/test_sparse.py | 2 +- test/test_sparse_csr.py | 2 +- test/test_static_runtime.py | 2 +- test/test_tensorboard.py | 2 +- test/test_tensorexpr.py | 2 +- test/test_torch.py | 2 +- test/test_view_ops.py | 2 +- test/test_xnnpack_integration.py | 4 +- torch/_decomp/decompositions_for_jvp.py | 2 +- torch/_dynamo/eval_frame.py | 4 +- torch/_inductor/dependencies.py | 2 +- torch/_meta_registrations.py | 2 +- torch/_numpy/_funcs_impl.py | 2 +- torch/_refs/__init__.py | 2 +- torch/_tensor_str.py | 6 +-- torch/ao/ns/fx/pattern_utils.py | 2 +- .../activation_sparsifier.py | 6 +-- .../benchmarks/evaluate_disk_savings.py | 2 +- .../lightning/tests/test_callbacks.py | 2 +- .../sparsifier/nearly_diagonal_sparsifier.py | 2 +- .../ao/quantization/experimental/observer.py | 4 +- torch/ao/quantization/fx/_decomposed.py | 2 +- torch/autograd/profiler.py | 2 +- torch/distributed/_pycute/layout.py | 16 ++++---- .../distributed/_symmetric_memory/__init__.py | 6 +-- .../elastic/multiprocessing/api.py | 2 +- .../distributed/elastic/timer/local_timer.py | 2 +- torch/distributed/tensor/_dtensor_spec.py | 2 +- torch/distributed/tensor/parallel/fsdp.py | 2 +- torch/nested/_internal/ops.py | 2 +- .../torchscript_exporter/symbolic_helper.py | 2 +- .../torchscript_exporter/symbolic_opset12.py | 2 +- .../torchscript_exporter/symbolic_opset8.py | 2 +- .../torchscript_exporter/symbolic_opset9.py | 18 ++++----- .../_internal/common_methods_invocations.py | 4 +- torch/testing/_internal/common_nn.py | 10 ++--- .../distributed/_tensor/common_dtensor.py | 2 +- .../_internal/distributed/distributed_test.py | 38 +++++++++---------- .../distributed/multi_threaded_pg.py | 2 +- .../distributed/rpc/dist_autograd_test.py | 6 +-- .../_internal/distributed/rpc/rpc_test.py | 4 +- torch/testing/_internal/jit_utils.py | 2 +- torch/testing/_internal/triton_utils.py | 2 +- 91 files changed, 195 insertions(+), 200 deletions(-) diff --git a/benchmarks/gpt_fast/mixtral_moe_quantize.py b/benchmarks/gpt_fast/mixtral_moe_quantize.py index 50ffd61bdb83..fd0342ce3d59 100644 --- a/benchmarks/gpt_fast/mixtral_moe_quantize.py +++ b/benchmarks/gpt_fast/mixtral_moe_quantize.py @@ -85,7 +85,7 @@ class WeightOnlyInt8QuantHandler: cur_state_dict[f"{fqn}.weight"] = int8_weight cur_state_dict[f"{fqn}.scales"] = scales.to(mod.weight.dtype) elif isinstance(mod, ConditionalFeedForward): - for weight_idx in range(0, 3): + for weight_idx in range(3): weight_name = f"w{weight_idx + 1}" scales_name = f"scales{weight_idx + 1}" weight = getattr(mod, weight_name) diff --git a/pyproject.toml b/pyproject.toml index e42f08d296f3..f18368b90d8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -204,12 +204,7 @@ select = [ "NPY", "PERF", "PGH004", - "PIE790", - "PIE794", - "PIE800", - "PIE804", - "PIE807", - "PIE810", + "PIE", "PLC0131", # type bivariance "PLC0132", # type param mismatch "PLC1802", # len({expression}) used as condition without comparison diff --git 
a/test/ao/sparsity/test_activation_sparsifier.py b/test/ao/sparsity/test_activation_sparsifier.py index 0f3f36ecda9f..079f5e1941d2 100644 --- a/test/ao/sparsity/test_activation_sparsifier.py +++ b/test/ao/sparsity/test_activation_sparsifier.py @@ -190,7 +190,7 @@ class TestActivationSparsifier(TestCase): if features is None: assert torch.all(mask * input_data == output) else: - for feature_idx in range(0, len(features)): + for feature_idx in range(len(features)): feature = torch.Tensor( [features[feature_idx]], device=input_data.device ).long() @@ -378,7 +378,7 @@ class TestActivationSparsifier(TestCase): # some dummy data data_list = [] num_data_points = 5 - for _ in range(0, num_data_points): + for _ in range(num_data_points): rand_data = torch.randn(16, 1, 28, 28) activation_sparsifier.model(rand_data) data_list.append(rand_data) diff --git a/test/ao/sparsity/test_data_scheduler.py b/test/ao/sparsity/test_data_scheduler.py index de0a885f0153..47a85e1edda1 100644 --- a/test/ao/sparsity/test_data_scheduler.py +++ b/test/ao/sparsity/test_data_scheduler.py @@ -143,7 +143,7 @@ class TestBaseDataScheduler(TestCase): # checking step count step_cnt = 5 - for _ in range(0, step_cnt): + for _ in range(step_cnt): sparsifier.step() scheduler.step() diff --git a/test/ao/sparsity/test_data_sparsifier.py b/test/ao/sparsity/test_data_sparsifier.py index dce04292763f..fa08e8c90ac2 100644 --- a/test/ao/sparsity/test_data_sparsifier.py +++ b/test/ao/sparsity/test_data_sparsifier.py @@ -123,7 +123,7 @@ class _BaseDataSparsiferTestCase(TestCase): step_count = 3 - for _ in range(0, step_count): + for _ in range(step_count): sparsifier.step() for some_data in all_data: name, data, _ = self._get_name_data_config(some_data) diff --git a/test/ao/sparsity/test_sparsifier.py b/test/ao/sparsity/test_sparsifier.py index d5010b7abccd..a940a3e9feba 100644 --- a/test/ao/sparsity/test_sparsifier.py +++ b/test/ao/sparsity/test_sparsifier.py @@ -472,8 +472,8 @@ class TestNearlyDiagonalSparsifier(TestCase): else: height, width = mask.shape dist_to_diagonal = nearliness // 2 - for row in range(0, height): - for col in range(0, width): + for row in range(height): + for col in range(width): if abs(row - col) <= dist_to_diagonal: assert mask[row, col] == 1 else: diff --git a/test/distributed/algorithms/quantization/test_quantization.py b/test/distributed/algorithms/quantization/test_quantization.py index b65e0a747405..6044eac70b51 100644 --- a/test/distributed/algorithms/quantization/test_quantization.py +++ b/test/distributed/algorithms/quantization/test_quantization.py @@ -79,7 +79,7 @@ if BACKEND == "gloo" or BACKEND == "nccl": dist.init_process_group( store=store, rank=self.rank, world_size=self.world_size, backend="gloo" ) - group = list(range(0, self.world_size)) + group = list(range(self.world_size)) group_id = dist.group.WORLD self._test_all_gather( group, group_id, self.rank, dtype=torch.float32, qtype=DQuantType.FP16 @@ -94,7 +94,7 @@ if BACKEND == "gloo" or BACKEND == "nccl": dist.init_process_group( store=store, rank=self.rank, world_size=self.world_size, backend="gloo" ) - group = list(range(0, self.world_size)) + group = list(range(self.world_size)) group_id = dist.group.WORLD self._test_all_gather( group, group_id, self.rank, dtype=torch.float32, qtype=DQuantType.BFP16 @@ -111,7 +111,7 @@ if BACKEND == "gloo" or BACKEND == "nccl": dist.init_process_group( store=store, rank=self.rank, world_size=self.world_size, backend="nccl" ) - group = list(range(0, self.world_size)) + group = list(range(self.world_size)) 
group_id = dist.new_group(range(self.world_size)) rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND) self._test_all_to_all( @@ -135,7 +135,7 @@ if BACKEND == "gloo" or BACKEND == "nccl": dist.init_process_group( store=store, rank=self.rank, world_size=self.world_size, backend="nccl" ) - group = list(range(0, self.world_size)) + group = list(range(self.world_size)) group_id = dist.new_group(range(self.world_size)) rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND) self._test_all_to_all( @@ -158,7 +158,7 @@ if BACKEND == "gloo" or BACKEND == "nccl": dist.init_process_group( store=store, rank=self.rank, world_size=self.world_size, backend="nccl" ) - group = list(range(0, self.world_size)) + group = list(range(self.world_size)) group_id = dist.new_group(range(self.world_size)) rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND) self._test_all_to_all_single( @@ -181,7 +181,7 @@ if BACKEND == "gloo" or BACKEND == "nccl": dist.init_process_group( store=store, rank=self.rank, world_size=self.world_size, backend="nccl" ) - group = list(range(0, self.world_size)) + group = list(range(self.world_size)) group_id = dist.new_group(range(self.world_size)) rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND) self._test_all_to_all_single( diff --git a/test/distributed/checkpoint/test_planner.py b/test/distributed/checkpoint/test_planner.py index edf043301ed2..86bed29de998 100644 --- a/test/distributed/checkpoint/test_planner.py +++ b/test/distributed/checkpoint/test_planner.py @@ -66,7 +66,7 @@ if TEST_WITH_DEV_DBG_ASAN: def create_sharded_tensor(rank, world_size, shards_per_rank, shard_size=8): shards_metadata = [] local_shards = [] - for idx in range(0, world_size * shards_per_rank): + for idx in range(world_size * shards_per_rank): shard_rank = idx // shards_per_rank shard_md = ShardMetadata( shard_offsets=[idx * shard_size], diff --git a/test/distributed/checkpoint/test_utils.py b/test/distributed/checkpoint/test_utils.py index 722670c95f18..79dbe741822c 100644 --- a/test/distributed/checkpoint/test_utils.py +++ b/test/distributed/checkpoint/test_utils.py @@ -45,7 +45,7 @@ if TEST_WITH_DEV_DBG_ASAN: def create_sharded_tensor(rank, world_size, shards_per_rank): shards_metadata = [] local_shards = [] - for idx in range(0, world_size * shards_per_rank): + for idx in range(world_size * shards_per_rank): shard_rank = idx // shards_per_rank shard_md = ShardMetadata( shard_offsets=[idx * 8], shard_sizes=[8], placement=f"rank:{shard_rank}/cpu" diff --git a/test/distributed/elastic/agent/server/test/api_test.py b/test/distributed/elastic/agent/server/test/api_test.py index 11776324ed7f..dd96f9b6dfb0 100644 --- a/test/distributed/elastic/agent/server/test/api_test.py +++ b/test/distributed/elastic/agent/server/test/api_test.py @@ -633,7 +633,7 @@ class SimpleElasticAgentTest(unittest.TestCase): worker_group = agent.get_worker_group() num_restarts = 3 - for _ in range(0, num_restarts): + for _ in range(num_restarts): agent._restart_workers(worker_group) self.assertEqual(WorkerState.HEALTHY, worker_group.state) diff --git a/test/distributed/elastic/multiprocessing/api_test.py b/test/distributed/elastic/multiprocessing/api_test.py index 4ac0dcacb4b8..19d941e0d9c6 100644 --- a/test/distributed/elastic/multiprocessing/api_test.py +++ b/test/distributed/elastic/multiprocessing/api_test.py @@ -146,7 +146,7 @@ def echo_large(size: int) -> dict[int, str]: returns a large output ({0: test0", 1: "test1", ..., (size-1):f"test{size-1}"}) """ out = {} - for idx in range(0, size): + for idx 
in range(size): out[idx] = f"test{idx}" return out diff --git a/test/distributed/elastic/timer/file_based_local_timer_test.py b/test/distributed/elastic/timer/file_based_local_timer_test.py index cf597eb6a37a..0125ce5cd25a 100644 --- a/test/distributed/elastic/timer/file_based_local_timer_test.py +++ b/test/distributed/elastic/timer/file_based_local_timer_test.py @@ -191,7 +191,7 @@ if not (IS_WINDOWS or IS_MACOS or IS_ARM64): """ client = timer.FileTimerClient(file_path) sem.release() - for _ in range(0, n): + for _ in range(n): client.acquire("test_scope", 0) time.sleep(interval) diff --git a/test/distributed/elastic/timer/local_timer_example.py b/test/distributed/elastic/timer/local_timer_example.py index 09421f4b38f5..6d438f2536d6 100644 --- a/test/distributed/elastic/timer/local_timer_example.py +++ b/test/distributed/elastic/timer/local_timer_example.py @@ -102,7 +102,7 @@ if not (IS_WINDOWS or IS_MACOS or IS_ARM64): world_size = 8 processes = [] - for i in range(0, world_size): + for i in range(world_size): if i % 2 == 0: p = spawn_ctx.Process(target=_stuck_function, args=(i, mp_queue)) else: @@ -110,7 +110,7 @@ if not (IS_WINDOWS or IS_MACOS or IS_ARM64): p.start() processes.append(p) - for i in range(0, world_size): + for i in range(world_size): p = processes[i] p.join() if i % 2 == 0: diff --git a/test/distributed/elastic/timer/local_timer_test.py b/test/distributed/elastic/timer/local_timer_test.py index b65b202d5ec6..8818b1788c62 100644 --- a/test/distributed/elastic/timer/local_timer_test.py +++ b/test/distributed/elastic/timer/local_timer_test.py @@ -127,7 +127,7 @@ if not INVALID_PLATFORMS: interval seconds. Releases the given semaphore once before going to work. """ sem.release() - for i in range(0, n): + for i in range(n): mp_queue.put(TimerRequest(i, "test_scope", 0)) time.sleep(interval) diff --git a/test/distributed/elastic/utils/data/cycling_iterator_test.py b/test/distributed/elastic/utils/data/cycling_iterator_test.py index c9cb055a2c22..835ed6ebbd01 100644 --- a/test/distributed/elastic/utils/data/cycling_iterator_test.py +++ b/test/distributed/elastic/utils/data/cycling_iterator_test.py @@ -15,7 +15,7 @@ class CyclingIteratorTest(unittest.TestCase): def generator(self, epoch, stride, max_epochs): # generate an continuously incrementing list each epoch # e.g. [0,1,2] [3,4,5] [6,7,8] ... 
- return iter([stride * epoch + i for i in range(0, stride)]) + return iter([stride * epoch + i for i in range(stride)]) def test_cycling_iterator(self): stride = 3 @@ -25,7 +25,7 @@ class CyclingIteratorTest(unittest.TestCase): return self.generator(epoch, stride, max_epochs) it = CyclingIterator(n=max_epochs, generator_fn=generator_fn) - for i in range(0, stride * max_epochs): + for i in range(stride * max_epochs): self.assertEqual(i, next(it)) with self.assertRaises(StopIteration): diff --git a/test/distributed/fsdp/test_fsdp_hybrid_shard.py b/test/distributed/fsdp/test_fsdp_hybrid_shard.py index 26a05bbc4171..e2ea4c5fc9af 100644 --- a/test/distributed/fsdp/test_fsdp_hybrid_shard.py +++ b/test/distributed/fsdp/test_fsdp_hybrid_shard.py @@ -124,7 +124,7 @@ class TestFSDPHybridShard(FSDPTest): model = MyModel().to(device_type) num_node_devices = torch.accelerator.device_count() shard_rank_lists = ( - list(range(0, num_node_devices // 2)), + list(range(num_node_devices // 2)), list(range(num_node_devices // 2, num_node_devices)), ) shard_groups = ( @@ -175,7 +175,7 @@ class TestFSDPHybridShard(FSDPTest): model = MyModel().to(device_type) num_node_devices = torch.accelerator.device_count() shard_rank_lists = ( - list(range(0, num_node_devices // 2)), + list(range(num_node_devices // 2)), list(range(num_node_devices // 2, num_node_devices)), ) shard_groups = ( diff --git a/test/distributed/tensor/test_dtensor_ops.py b/test/distributed/tensor/test_dtensor_ops.py index c4373773d662..df51152a9030 100644 --- a/test/distributed/tensor/test_dtensor_ops.py +++ b/test/distributed/tensor/test_dtensor_ops.py @@ -802,7 +802,7 @@ class TestLocalDTensorOps(TestDTensorOps): self.run_opinfo_test(dtype, op) def test_mean(self): - with LocalTensorMode(frozenset(range(0, self.world_size))): + with LocalTensorMode(frozenset(range(self.world_size))): self.run_mean() def test_one_hot(self): @@ -811,7 +811,7 @@ class TestLocalDTensorOps(TestDTensorOps): def run_opinfo_test( self, dtype, op, requires_grad=True, sample_inputs_filter=lambda s: True ): - with LocalTensorMode(frozenset(range(0, self.world_size))): + with LocalTensorMode(frozenset(range(self.world_size))): super().run_opinfo_test(dtype, op, requires_grad, sample_inputs_filter) def assertEqualOnRank(self, x, y, msg=None, *, rank=0): diff --git a/test/distributed/test_device_mesh.py b/test/distributed/test_device_mesh.py index 0ed4651d3ec5..2db674a458ed 100644 --- a/test/distributed/test_device_mesh.py +++ b/test/distributed/test_device_mesh.py @@ -536,7 +536,7 @@ class DeviceMeshTestNDim(DTensorTestBase): # Create shard groups (e.g. 
(0, 1, 2, 3), (4, 5, 6, 7)) # and assign the correct shard group to each rank shard_rank_lists = ( - list(range(0, self.world_size // 2)), + list(range(self.world_size // 2)), list(range(self.world_size // 2, self.world_size)), ) shard_groups = ( diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py index b588589d81ba..550589002003 100644 --- a/test/distributions/test_distributions.py +++ b/test/distributions/test_distributions.py @@ -5722,11 +5722,11 @@ class TestKL(DistributionsTestCase): def test_kl_multivariate_normal(self): set_rng_seed(0) # see Note [Randomized statistical tests] n = 5 # Number of tests for multivariate_normal - for i in range(0, n): - loc = [torch.randn(4) for _ in range(0, 2)] + for i in range(n): + loc = [torch.randn(4) for _ in range(2)] scale_tril = [ transform_to(constraints.lower_cholesky)(torch.randn(4, 4)) - for _ in range(0, 2) + for _ in range(2) ] p = MultivariateNormal(loc=loc[0], scale_tril=scale_tril[0]) q = MultivariateNormal(loc=loc[1], scale_tril=scale_tril[1]) @@ -5755,10 +5755,10 @@ class TestKL(DistributionsTestCase): def test_kl_multivariate_normal_batched(self): b = 7 # Number of batches - loc = [torch.randn(b, 3) for _ in range(0, 2)] + loc = [torch.randn(b, 3) for _ in range(2)] scale_tril = [ transform_to(constraints.lower_cholesky)(torch.randn(b, 3, 3)) - for _ in range(0, 2) + for _ in range(2) ] expected_kl = torch.stack( [ @@ -5766,7 +5766,7 @@ class TestKL(DistributionsTestCase): MultivariateNormal(loc[0][i], scale_tril=scale_tril[0][i]), MultivariateNormal(loc[1][i], scale_tril=scale_tril[1][i]), ) - for i in range(0, b) + for i in range(b) ] ) actual_kl = kl_divergence( @@ -5777,7 +5777,7 @@ class TestKL(DistributionsTestCase): def test_kl_multivariate_normal_batched_broadcasted(self): b = 7 # Number of batches - loc = [torch.randn(b, 3) for _ in range(0, 2)] + loc = [torch.randn(b, 3) for _ in range(2)] scale_tril = [ transform_to(constraints.lower_cholesky)(torch.randn(b, 3, 3)), transform_to(constraints.lower_cholesky)(torch.randn(3, 3)), @@ -5788,7 +5788,7 @@ class TestKL(DistributionsTestCase): MultivariateNormal(loc[0][i], scale_tril=scale_tril[0][i]), MultivariateNormal(loc[1][i], scale_tril=scale_tril[1]), ) - for i in range(0, b) + for i in range(b) ] ) actual_kl = kl_divergence( @@ -5800,15 +5800,15 @@ class TestKL(DistributionsTestCase): def test_kl_lowrank_multivariate_normal(self): set_rng_seed(0) # see Note [Randomized statistical tests] n = 5 # Number of tests for lowrank_multivariate_normal - for i in range(0, n): - loc = [torch.randn(4) for _ in range(0, 2)] - cov_factor = [torch.randn(4, 3) for _ in range(0, 2)] + for i in range(n): + loc = [torch.randn(4) for _ in range(2)] + cov_factor = [torch.randn(4, 3) for _ in range(2)] cov_diag = [ - transform_to(constraints.positive)(torch.randn(4)) for _ in range(0, 2) + transform_to(constraints.positive)(torch.randn(4)) for _ in range(2) ] covariance_matrix = [ cov_factor[i].matmul(cov_factor[i].t()) + cov_diag[i].diag() - for i in range(0, 2) + for i in range(2) ] p = LowRankMultivariateNormal(loc[0], cov_factor[0], cov_diag[0]) q = LowRankMultivariateNormal(loc[1], cov_factor[1], cov_diag[1]) @@ -5861,10 +5861,10 @@ class TestKL(DistributionsTestCase): def test_kl_lowrank_multivariate_normal_batched(self): b = 7 # Number of batches - loc = [torch.randn(b, 3) for _ in range(0, 2)] - cov_factor = [torch.randn(b, 3, 2) for _ in range(0, 2)] + loc = [torch.randn(b, 3) for _ in range(2)] + cov_factor = [torch.randn(b, 3, 2) for _ in 
range(2)] cov_diag = [ - transform_to(constraints.positive)(torch.randn(b, 3)) for _ in range(0, 2) + transform_to(constraints.positive)(torch.randn(b, 3)) for _ in range(2) ] expected_kl = torch.stack( [ @@ -5876,7 +5876,7 @@ class TestKL(DistributionsTestCase): loc[1][i], cov_factor[1][i], cov_diag[1][i] ), ) - for i in range(0, b) + for i in range(b) ] ) actual_kl = kl_divergence( diff --git a/test/dynamo/test_export.py b/test/dynamo/test_export.py index 112da727ec61..f3f438d241af 100644 --- a/test/dynamo/test_export.py +++ b/test/dynamo/test_export.py @@ -49,9 +49,9 @@ class ExportTests(torch._dynamo.test_case.TestCase): lc_key = state[0] lc_val = state[1] bar = [] - for _ in range(0, 4): + for _ in range(4): bar2 = [] - for _ in range(0, 3): + for _ in range(3): bar2.append( lc_key + lc_val + torch.tensor([0.1, 0.25, 0.4, 0.5, 0.1]) ) @@ -665,9 +665,9 @@ def forward(self, x, y): lc_key = state[0] lc_val = state[1] bar = [] - for _ in range(0, 4): + for _ in range(4): bar2 = [] - for _ in range(0, 3): + for _ in range(3): bar2.append( lc_key + lc_val + torch.tensor([0.1, 0.25, 0.4, 0.5, 0.1]) ) diff --git a/test/dynamo/test_functions.py b/test/dynamo/test_functions.py index d16676cda8ee..647033e63e4c 100644 --- a/test/dynamo/test_functions.py +++ b/test/dynamo/test_functions.py @@ -3627,7 +3627,7 @@ class GraphModule(torch.nn.Module): ) test(range(10), slice(1, 10, 2), expected=range(1, 10, 2)) - test(range(10), slice(None, 10, None), expected=range(0, 10)) + test(range(10), slice(None, 10, None), expected=range(10)) test(range(10), slice(-1, 7, None), expected=range(9, 7)) test(range(10), slice(-1, 7, 2), expected=range(9, 7, 2)) test(range(1, 10, 2), slice(3, 7, 2), expected=range(7, 11, 4)) diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py index 7cac7eca7239..c251ce28bac4 100644 --- a/test/dynamo/test_modules.py +++ b/test/dynamo/test_modules.py @@ -3047,7 +3047,7 @@ class OptimizedModuleTest(torch._dynamo.test_case.TestCase): def generate(x, c): return mod(x) + c - for _ in range(0, 10): + for _ in range(10): generate(torch.randn(10, 10), 0) generate(torch.randn(10, 10), 1) self.assertEqual(cnt.frame_count, 2) diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py index 362a541918c3..ac0515ac6ba8 100644 --- a/test/dynamo/test_repros.py +++ b/test/dynamo/test_repros.py @@ -4471,7 +4471,7 @@ class ReproTests(torch._dynamo.test_case.TestCase): compiled_fn = torch.compile(func, backend=cnt, fullgraph=True) requires_grad = func is not func1 - for _ in range(0, 5): + for _ in range(5): # Inputs eager_a = torch.ones([6], requires_grad=requires_grad) compiled_a = torch.ones([6], requires_grad=requires_grad) @@ -4623,7 +4623,7 @@ class ReproTests(torch._dynamo.test_case.TestCase): x = torch.rand([2, 2]) self.assertEqual(opt_fn(x, counter), fn(x, counter)) self.assertEqual(counter[0], 2) - for _ in range(0, 10): + for _ in range(10): opt_fn(x, counter) self.assertEqual(counter[0], 12) if torch._dynamo.config.assume_static_by_default: @@ -4784,7 +4784,7 @@ class ReproTests(torch._dynamo.test_case.TestCase): def test_contains_range_constprop(self): def fn(x): # dynamo should const prop to False - if 3 in range(0, 10): + if 3 in range(10): return x + 1 else: return x + 2 diff --git a/test/functorch/test_ac.py b/test/functorch/test_ac.py index fde84b6683ed..d0611f19cf2a 100644 --- a/test/functorch/test_ac.py +++ b/test/functorch/test_ac.py @@ -106,7 +106,7 @@ class MemoryBudgetTest(TestCase): return f(x, ws) _, eager_flops = get_mem_and_flops(call) - for budget 
in range(0, 11): + for budget in range(11): mem, flops = get_mem_and_flops(call, memory_budget=budget / 10) if budget <= 5: # We start saving the matmuls @@ -251,7 +251,7 @@ class MemoryBudgetTest(TestCase): return f(x, ws) expected = call() - for budget in range(0, 11): + for budget in range(11): memory_budget = budget / 10 torch._dynamo.reset() with config.patch(activation_memory_budget=memory_budget): diff --git a/test/inductor/test_codecache.py b/test/inductor/test_codecache.py index 78c2dd3de852..ca2e9007109d 100644 --- a/test/inductor/test_codecache.py +++ b/test/inductor/test_codecache.py @@ -1146,7 +1146,7 @@ class TestFxGraphCache(TestCase): raise unittest.SkipTest(f"requires {GPU_TYPE}") def fn1(x): - return x + torch.tensor(list(range(0, 12)), device=device) + return x + torch.tensor(list(range(12)), device=device) def fn2(x): return x + torch.tensor(list(range(1, 13)), device=device) diff --git a/test/inductor/test_compiled_autograd.py b/test/inductor/test_compiled_autograd.py index 2612af01f6ff..716d3bfafee2 100644 --- a/test/inductor/test_compiled_autograd.py +++ b/test/inductor/test_compiled_autograd.py @@ -1599,7 +1599,7 @@ main() eager_check() - for i in range(0, 5): + for i in range(5): with compiled_autograd._enable(compiler_fn): eager_check() diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py index 6645f17fb9ee..85405283e4bd 100644 --- a/test/inductor/test_max_autotune.py +++ b/test/inductor/test_max_autotune.py @@ -2095,7 +2095,7 @@ class TestMaxAutotune(TestCase): # Test loop. def test_func2(x): - for i in range(0, 10): + for i in range(10): x = torch.matmul(x, x) return x diff --git a/test/inductor/test_triton_kernels.py b/test/inductor/test_triton_kernels.py index 9a21220ce4d9..4739d00f1f4a 100644 --- a/test/inductor/test_triton_kernels.py +++ b/test/inductor/test_triton_kernels.py @@ -3005,7 +3005,7 @@ class MutationTests(torch._inductor.test_case.TestCase): mask = offsets < n_elements x = tl.load(in_ptr0 + offsets, mask=mask) y = tl.load(in_ptr1 + offsets, mask=mask) - for i in range(0, BLOCK_SIZE): + for i in range(BLOCK_SIZE): i = tl.multiple_of(i, 1) output = x + y tl.store(out_ptr + offsets, output, mask=mask) @@ -3160,7 +3160,7 @@ class MutationTests(torch._inductor.test_case.TestCase): x = tl.load(x_block_ptr) # Compute gating - for c2 in range(0, tl.cdiv(C2, BLOCK_SIZE_C2)): + for c2 in range(tl.cdiv(C2, BLOCK_SIZE_C2)): # Compute block pointers offs_c2 = c2 * BLOCK_SIZE_C2 + tl.arange(0, BLOCK_SIZE_C2) o_block_ptr = O_ptr + offs_m[:, None] * C2 + offs_c2[None, :] diff --git a/test/jit/xnnpack/test_xnnpack_delegate.py b/test/jit/xnnpack/test_xnnpack_delegate.py index b97765ed5bb0..f6c7832d5b28 100644 --- a/test/jit/xnnpack/test_xnnpack_delegate.py +++ b/test/jit/xnnpack/test_xnnpack_delegate.py @@ -32,7 +32,7 @@ class TestXNNPackBackend(unittest.TestCase): }, ) - for _ in range(0, 20): + for _ in range(20): sample_input = torch.randn(4, 4, 4) actual_output = scripted_module(sample_input) expected_output = lowered_module(sample_input) diff --git a/test/nn/test_convolution.py b/test/nn/test_convolution.py index 4cdcac707644..3c3b3f53e528 100644 --- a/test/nn/test_convolution.py +++ b/test/nn/test_convolution.py @@ -1292,7 +1292,7 @@ class TestConvolutionNN(NNTestCase): kernel_x = torch.zeros([3, 1, 1, radius * 2 + 1], device=image.device) image = torch.nn.functional.conv2d(image, kernel_x, groups=image.shape[-3]) - for i in range(0, 128): + for i in range(128): # This should not fail reproducer(radius=i) diff --git 
a/test/nn/test_embedding.py b/test/nn/test_embedding.py index fb9d842ce476..f21184290fa1 100644 --- a/test/nn/test_embedding.py +++ b/test/nn/test_embedding.py @@ -551,7 +551,7 @@ class TestEmbeddingNNDeviceType(NNTestCase): # Pull out the bag's indices from indices_1D, and fill any # remaining space with padding indices indices_in_bag = [] - for item_pos in range(0, max_indices_per_bag): + for item_pos in range(max_indices_per_bag): if (start + item_pos) < end: indices_in_bag.append(indices_1D[start + item_pos]) else: diff --git a/test/nn/test_multihead_attention.py b/test/nn/test_multihead_attention.py index 0c04e3b86b88..3dc6a586ced6 100644 --- a/test/nn/test_multihead_attention.py +++ b/test/nn/test_multihead_attention.py @@ -485,7 +485,7 @@ class TestMultiheadAttentionNN(NNTestCase): )[0] output_3d = output_3d.transpose(0, 1) # [N, T, D] - for i in range(0, batch_size): + for i in range(batch_size): output_2d = mta_model( query[i].unsqueeze(0).transpose(0, 1), key[i].unsqueeze(0).transpose(0, 1), diff --git a/test/nn/test_pooling.py b/test/nn/test_pooling.py index d282a885f4ed..c3a7b829b2b1 100644 --- a/test/nn/test_pooling.py +++ b/test/nn/test_pooling.py @@ -1135,7 +1135,7 @@ torch.cuda.synchronize() for size, kernel_size, stride, dilation, ceil_mode in itertools.product( sizes, kernel_sizes, strides, dilations, ceil_modes ): - padding = random.sample(range(0, math.floor(kernel_size / 2) + 1), 1) + padding = random.sample(range(math.floor(kernel_size / 2) + 1), 1) check( torch.randn(size, device=device, dtype=dtype), kernel_size, diff --git a/test/onnx/test_onnx_opset.py b/test/onnx/test_onnx_opset.py index 75de1f3fab83..16ca93dbfe2c 100644 --- a/test/onnx/test_onnx_opset.py +++ b/test/onnx/test_onnx_opset.py @@ -36,12 +36,12 @@ def check_onnx_opset_operator( # but the op's attributes can optionally be # specified as well assert len(ops) == len(graph.node) - for i in range(0, len(ops)): + for i in range(len(ops)): assert graph.node[i].op_type == ops[i]["op_name"] if "attributes" in ops[i]: attributes = ops[i]["attributes"] assert len(attributes) == len(graph.node[i].attribute) - for j in range(0, len(attributes)): + for j in range(len(attributes)): for attribute_field in attributes[j].keys(): assert attributes[j][attribute_field] == getattr( graph.node[i].attribute[j], attribute_field diff --git a/test/optim/test_lrscheduler.py b/test/optim/test_lrscheduler.py index cea85b07646f..3e65720a45b6 100644 --- a/test/optim/test_lrscheduler.py +++ b/test/optim/test_lrscheduler.py @@ -1509,7 +1509,7 @@ class TestLRScheduler(TestCase): 14.0 / 3, 29.0 / 6, ] - deltas = [2 * i for i in range(0, 2)] + deltas = [2 * i for i in range(2)] base_lrs = [1 + delta for delta in deltas] max_lrs = [5 + delta for delta in deltas] lr_targets = [[x + delta for x in lr_base_target] for delta in deltas] diff --git a/test/profiler/test_profiler.py b/test/profiler/test_profiler.py index 1461731a5998..a9321da3fbd3 100644 --- a/test/profiler/test_profiler.py +++ b/test/profiler/test_profiler.py @@ -1930,7 +1930,7 @@ assert KinetoStepTracker.current_step() == initial_step + 2 * niters event_list.table() def _check_all_gpu_present(self, gpu_dict, max_gpu_count): - for i in range(0, max_gpu_count): + for i in range(max_gpu_count): self.assertEqual(gpu_dict["GPU " + str(i)], 1) # Do json sanity testing. 
Checks that all events are between profiler start and end @@ -2139,8 +2139,8 @@ assert KinetoStepTracker.current_step() == initial_step + 2 * niters step_helper_funcs.append(event) self.assertEqual(len(prof_steps), 5) self.assertEqual(len(step_helper_funcs), 5) - for i in range(0, len(step_helper_funcs)): - for j in range(0, len(step_helper_funcs)): + for i in range(len(step_helper_funcs)): + for j in range(len(step_helper_funcs)): self.assertTrue( not self._partial_overlap(prof_steps[i], step_helper_funcs[j]) ) diff --git a/test/quantization/core/experimental/test_floatx.py b/test/quantization/core/experimental/test_floatx.py index ee7fe0a9d186..c4cea4073a5c 100644 --- a/test/quantization/core/experimental/test_floatx.py +++ b/test/quantization/core/experimental/test_floatx.py @@ -275,7 +275,7 @@ class TestFloat8Dtype(TestCase): IMO simpler to special case e8m0 here. """ - for biased_exponent in range(0, 256): + for biased_exponent in range(256): # iterate through all the possible options of guard, round, sticky bits # for the current exponent for grs in range(8): diff --git a/test/test_dataloader.py b/test/test_dataloader.py index da0c12082244..b9000a2c68d3 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -3494,7 +3494,7 @@ class TestIndividualWorkerQueue(TestCase): max_num_workers = 1 for batch_size in (8, 16, 32, 64): - for num_workers in range(0, min(6, max_num_workers)): + for num_workers in range(min(6, max_num_workers)): self._run_ind_worker_queue_test( batch_size=batch_size, num_workers=num_workers + 1 ) diff --git a/test/test_datapipe.py b/test/test_datapipe.py index e92fa2b0615d..2790145665b1 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -520,7 +520,7 @@ class TestIterableDataPipeBasic(TestCase): self.assertEqual(list(range(9)), list(n)) # Functional Test: Uneven DataPipes - source_numbers = list(range(0, 10)) + [10, 12] + source_numbers = list(range(10)) + [10, 12] numbers_dp = dp.iter.IterableWrapper(source_numbers) n1, n2 = numbers_dp.demux(2, lambda x: x % 2) self.assertEqual([0, 2, 4, 6, 8, 10, 12], list(n1)) @@ -1257,7 +1257,7 @@ class TestFunctionalIterDataPipe(TestCase): ) output1, output2 = list(dp1), list(dp2) self.assertEqual(list(range(5, 10)), output1) - self.assertEqual(list(range(0, 5)), output2) + self.assertEqual(list(range(5)), output2) # Functional Test: values of the same classification are lumped together, and unlimited buffer with warnings.catch_warnings(record=True) as wa: @@ -1271,7 +1271,7 @@ class TestFunctionalIterDataPipe(TestCase): self.assertRegex(str(wa[-1].message), r"Unlimited buffer size is set") output1, output2 = list(dp1), list(dp2) self.assertEqual(list(range(5, 10)), output1) - self.assertEqual(list(range(0, 5)), output2) + self.assertEqual(list(range(5)), output2) # Functional Test: classifier returns a value outside of [0, num_instance - 1] dp0 = input_dp.demux(num_instances=1, classifier_fn=lambda x: x % 2) diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py index fcc45521fbb1..b8fa4ffbd421 100644 --- a/test/test_dynamic_shapes.py +++ b/test/test_dynamic_shapes.py @@ -1385,7 +1385,7 @@ class f(torch.nn.Module): self.assertEqual(x.storage_offset(), y.storage_offset()) def test_tensor_factory_with_symint(self): - args = list(range(0, 3)) + args = list(range(3)) expected = torch.tensor(args) shape_env = ShapeEnv() diff --git a/test/test_indexing.py b/test/test_indexing.py index fa91b5903410..99d84a65abca 100644 --- a/test/test_indexing.py +++ b/test/test_indexing.py @@ -902,7 
+902,7 @@ class TestIndexing(TestCase): # Set window size W = 10 # Generate a list of lists, containing overlapping window indices - indices = [range(i, i + W) for i in range(0, N - W)] + indices = [range(i, i + W) for i in range(N - W)] for i in [len(indices), 100, 32]: windowed_data = t[indices[:i]] diff --git a/test/test_jit.py b/test/test_jit.py index 6a3c968f86dd..613903e9a116 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -3153,7 +3153,7 @@ class TestScript(JitTestCase): eplan = get_execution_plan(dstate) num_bailouts = eplan.code.num_bailouts() - for i in range(0, num_bailouts): + for i in range(num_bailouts): eplan.code.request_bailout(i) self.assertEqual(jitted(x), expected) @@ -5950,7 +5950,7 @@ a") # type: (int) -> int prev = 1 v = 1 - for i in range(0, x): + for i in range(x): save = v v = v + prev prev = save @@ -10938,7 +10938,7 @@ dedent """ # Test symbolic differentiation # Run Forward and Backward thrice to trigger autodiff graph - for i in range(0, 3): + for i in range(3): y = jit_module(x) y.backward(grad) x.grad.zero_() @@ -11802,7 +11802,7 @@ dedent """ def fn_zip_enumerate(x, y): # type: (List[int], List[int]) -> int sum = 0 - for (i, (j, v), k) in zip(x, enumerate(y), range(0, 100)): + for (i, (j, v), k) in zip(x, enumerate(y), range(100)): sum += i * j * v * k return sum diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index 1bda41f7f8f1..dba28f98cbf9 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -243,7 +243,7 @@ class TestTEFuser(JitTestCase): return x2.sum() with texpr_reductions_enabled(): - a = torch.tensor(list(range(0, 15)), dtype=torch.float, device="cpu") + a = torch.tensor(list(range(15)), dtype=torch.float, device="cpu") a = a.reshape(5, 3) scripted = self.checkScript(func, (a,)) self.assertLastGraphAllFused() @@ -259,7 +259,7 @@ class TestTEFuser(JitTestCase): return x.sum((-2,)) * 2 with texpr_reductions_enabled(): - a = torch.tensor(list(range(0, 15)), dtype=torch.float, device="cpu") + a = torch.tensor(list(range(15)), dtype=torch.float, device="cpu") a = a.reshape(5, 3) scripted = self.checkScript(func, (a,)) self.assertLastGraphAllFused() @@ -271,7 +271,7 @@ class TestTEFuser(JitTestCase): return x.sum((0,), keepdim=True, dtype=torch.double) * 2 with texpr_reductions_enabled(): - a = torch.tensor(list(range(0, 15)), dtype=torch.float, device="cpu") + a = torch.tensor(list(range(15)), dtype=torch.float, device="cpu") a = a.reshape(5, 3) self.checkScript(func, (a,)) @@ -2234,7 +2234,7 @@ class TestTEFuser(JitTestCase): indices = [0, 1, 2, 3] sets = [] - for i in range(0, len(indices) + 1): + for i in range(len(indices) + 1): for subset in combinations(indices, i): sets.append(subset) # noqa: PERF402 diff --git a/test/test_matmul_cuda.py b/test/test_matmul_cuda.py index 61f5642830dd..bf46ee0709fc 100644 --- a/test/test_matmul_cuda.py +++ b/test/test_matmul_cuda.py @@ -231,7 +231,7 @@ class TestMatmulCuda(InductorTestCase): def test_cublas_addmm_alignment(self, dtype): device = 'cuda' # perturb X, A, or B alignment - for idx in range(0, 3): + for idx in range(3): for offset in range(1, 3): offsets = [0, 0, 0] offsets[idx] = offset diff --git a/test/test_mps.py b/test/test_mps.py index 7346d1d26d44..e825fa77aa89 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -1900,7 +1900,7 @@ class TestMPS(TestCaseMPS): res_cpu = torch.linalg.vector_norm(B_cpu, ord=3.5) self.assertEqual(res_mps, res_cpu) - for dim in range(0, B_mps.dim()): + for dim in range(B_mps.dim()): res_mps = 
torch.linalg.vector_norm(B_mps, ord=3.5, dim=dim) res_cpu = torch.linalg.vector_norm(B_cpu, ord=3.5, dim=dim) self.assertEqual(res_mps, res_cpu) @@ -2871,8 +2871,8 @@ class TestMPS(TestCaseMPS): def test_contiguous_slice_2d(self): def helper(shape): - for i in range(0, shape[0]): - for j in range(0, shape[1]): + for i in range(shape[0]): + for j in range(shape[1]): t_mps = torch.randn(shape, device="mps") t_cpu = t_mps.detach().clone().cpu() @@ -3432,12 +3432,12 @@ class TestMPS(TestCaseMPS): elems = torch.arange(n_tensors * n_tensor_elems, dtype=torch.float32) tensor_list = [] - for i in range(0, n_tensors - 1): + for i in range(n_tensors - 1): # create a list of contiguous view tensors (view tensor created by the slice op) t = elems[n_tensor_elems * i : n_tensor_elems * (i + 1)] tensor_list.append(t) - for i in range(0, n_tensors - 1): + for i in range(n_tensors - 1): t = tensor_list[i].view(1, n_tensor_elems) t_mps = t.to("mps") self.assertEqual(t, t_mps.cpu(), f"i={i}") @@ -4942,7 +4942,7 @@ class TestMPS(TestCaseMPS): x_mps = fn(torch.zeros(shape, device="mps"), dim=dim) self.assertEqual(x_cpu, x_mps.cpu()) for fn in [torch.any, torch.all]: - for dim in range(0, 4): + for dim in range(4): helper(fn, dim) # 6D tensor reductions @@ -9750,7 +9750,7 @@ class TestGatherScatter(TestCaseMPS): self.assertEqual(x_cpu, x_mps) def test_cast_gather_scatter(self): - for _ in range(0, 50): + for _ in range(50): input = np.random.randint(0, 255, size=(5, 5, 4), dtype=np.uint8) with torch.no_grad(): s = torch.tensor(input, dtype=torch.uint8, device="mps").unsqueeze(0) diff --git a/test/test_numa_binding.py b/test/test_numa_binding.py index 764156ff9b98..c599587e281d 100644 --- a/test/test_numa_binding.py +++ b/test/test_numa_binding.py @@ -549,7 +549,7 @@ class NumaBindingTest(TestCase): bound_logical_cpu_indices_0, # Gets an extra physical core due to odd number of physical cores on numa node # 3 physical cores total, 2 GPUs: GPU 0 gets 2 physical cores (CPUs 0-3) - set(range(0, 4)), + set(range(4)), ) bound_logical_cpu_indices_1 = ( @@ -677,7 +677,7 @@ class NumaBindingTest(TestCase): # 1 numa node, 2 L3 caches, 1 physical core per L3 cache = 2 logical CPUs per cache # L3 cache 0: CPUs 0-1, L3 cache 1: CPUs 2-3 # Both have same number of CPUs, so prefer lower cache key (0) - set(range(0, 2)), + set(range(2)), ) def test_binds_to_node_0_if_node_stored_as_minus_one(self) -> None: @@ -709,7 +709,7 @@ class NumaBindingTest(TestCase): # GPU 0 has numa node stored as -1, which is treated as numa node 0 # Each numa node has 1 * 1 * 2 = 2 logical CPUs # Numa node 0 has CPUs 0-1 - set(range(0, 2)), + set(range(2)), ) def test_callable_entrypoint_basic(self) -> None: diff --git a/test/test_reductions.py b/test/test_reductions.py index e4fa54491dd0..4a3235fbc50c 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -1710,7 +1710,7 @@ class TestReductions(TestCase): with_extremal=False, atol=None, rtol=None, exact_dtype=True, with_keepdim=False): # Test 0-d to 3-d tensors. 
- for ndims in range(0, 4): + for ndims in range(4): shape = _rand_shape(ndims, min_size=5, max_size=10) for n in range(ndims + 1): for c in combinations(list(range(ndims)), n): @@ -2623,7 +2623,7 @@ class TestReductions(TestCase): # Generate some random test cases ops = ['quantile', 'nanquantile'] inputs = [tuple(np.random.randint(2, 10, size=i)) for i in range(1, 4)] - quantiles = [tuple(np.random.rand(i)) for i in range(0, 5)] + quantiles = [tuple(np.random.rand(i)) for i in range(5)] keepdims = [True, False] # Add corner cases diff --git a/test/test_serialization.py b/test/test_serialization.py index 7c4208b6a0d6..a6e3ef23580d 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -295,7 +295,7 @@ class SerializationMixin: 5, 6 ] - for i in range(0, 100): + for i in range(100): data.append(0) t = torch.tensor(data, dtype=torch.uint8) diff --git a/test/test_sparse.py b/test/test_sparse.py index 866f38a316d7..196506a8e13d 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -5300,7 +5300,7 @@ class TestSparseAny(TestCase): x_dense = torch.eye(dense_dim, dtype=dtype, device=device) for sparse_dim_in in range(1, dense_dim): x_sparse = x_dense.to_sparse(sparse_dim_in) - for sparse_dim_out in range(0, dense_dim): + for sparse_dim_out in range(dense_dim): if sparse_dim_out == sparse_dim_in: self.assertTrue(x_sparse.to_sparse(sparse_dim_out).sparse_dim() == sparse_dim_out) else: diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py index 65e800f6eba1..45748c683621 100644 --- a/test/test_sparse_csr.py +++ b/test/test_sparse_csr.py @@ -135,7 +135,7 @@ class TestSparseCSRSampler(TestCase): index_dtype = torch.int32 for n_rows in range(1, 10): for n_cols in range(1, 10): - for nnz in range(0, n_rows * n_cols + 1): + for nnz in range(n_rows * n_cols + 1): crow_indices = self._make_crow_indices( n_rows, n_cols, nnz, device=device, dtype=index_dtype) diff --git a/test/test_static_runtime.py b/test/test_static_runtime.py index 893aea8e3130..df1e0c3e34fa 100644 --- a/test/test_static_runtime.py +++ b/test/test_static_runtime.py @@ -60,7 +60,7 @@ class MultiHeadAttentionLayer(nn.Module): # Taken from https://github.com/facebookresearch/dlrm/blob/master/dlrm_s_pytorch.py def create_mlp(ln, sigmoid_layer): layers = nn.ModuleList() - for i in range(0, len(ln) - 1): + for i in range(len(ln) - 1): n = ln[i] m = ln[i + 1] diff --git a/test/test_tensorboard.py b/test/test_tensorboard.py index cd527db88441..8ff6913887c8 100644 --- a/test/test_tensorboard.py +++ b/test/test_tensorboard.py @@ -200,7 +200,7 @@ class TestTensorBoardPyTorchNumpy(BaseTestCase): bucket_counts=counts.tolist(), ) - ints = torch.tensor(range(0, 100)).float() + ints = torch.tensor(range(100)).float() nbins = 100 counts = torch.histc(ints, bins=nbins, min=0, max=99) limits = torch.tensor(range(nbins)) diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py index 17d3a58535d6..57be409ab6b4 100644 --- a/test/test_tensorexpr.py +++ b/test/test_tensorexpr.py @@ -1216,7 +1216,7 @@ class TestTensorExprFuser(BaseTestClass): @torch.jit.script def test(x: torch.Tensor, y: torch.Tensor, z: int) -> torch.Tensor: b = y - for i in range(0, z): + for i in range(z): a = x + y b = b + y return b diff --git a/test/test_torch.py b/test/test_torch.py index 05ea6ea61db1..9b28b801348a 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -8424,7 +8424,7 @@ tensor([[[1.+1.j, 1.+1.j, 1.+1.j, ..., 1.+1.j, 1.+1.j, 1.+1.j], def test_Size_iter(self): for sizes in [iter([1, 2, 3, 4, 5]), range(1, 6)]: x = 
torch.Size(sizes) - for i in range(0, 5): + for i in range(5): self.assertEqual(x[i], i + 1) def test_t_not_2d_error(self): diff --git a/test/test_view_ops.py b/test/test_view_ops.py index 5bec225787cc..174632b07988 100644 --- a/test/test_view_ops.py +++ b/test/test_view_ops.py @@ -1559,7 +1559,7 @@ class TestOldViewOps(TestCase): self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) def _test_atleast_dim(self, torch_fn, np_fn, device, dtype): - for ndims in range(0, 5): + for ndims in range(5): shape = _rand_shape(ndims, min_size=5, max_size=10) for _ in range(ndims + 1): for with_extremal in [False, True]: diff --git a/test/test_xnnpack_integration.py b/test/test_xnnpack_integration.py index 481bd3c76a50..62e257790fd4 100644 --- a/test/test_xnnpack_integration.py +++ b/test/test_xnnpack_integration.py @@ -1316,7 +1316,7 @@ class TestXNNPACKConv1dTransformPass(TestCase): groups_list = range(1, 3) kernel_list = range(1, 4) stride_list = range(1, 3) - padding_list = range(0, 3) + padding_list = range(3) dilation_list = range(1, 3) for hparams in itertools.product( @@ -1401,7 +1401,7 @@ class TestXNNPACKConv1dTransformPass(TestCase): groups_list = range(1, 3) kernel_list = range(1, 4) stride_list = range(1, 3) - padding_list = range(0, 3) + padding_list = range(3) dilation_list = range(1, 3) output_features_list = range(1, 3) diff --git a/torch/_decomp/decompositions_for_jvp.py b/torch/_decomp/decompositions_for_jvp.py index e11540e0c2ba..fb4a4d85faa2 100644 --- a/torch/_decomp/decompositions_for_jvp.py +++ b/torch/_decomp/decompositions_for_jvp.py @@ -147,7 +147,7 @@ def native_layer_norm_backward( inner_dims = input_shape[axis:] outer_dims = input_shape[:axis] inner_dim_indices = list(range(axis, input_ndim)) - outer_dim_indices = list(range(0, axis)) + outer_dim_indices = list(range(axis)) N = 1 for i in inner_dims: diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py index 036f1ba7d01a..451776ef25fd 100644 --- a/torch/_dynamo/eval_frame.py +++ b/torch/_dynamo/eval_frame.py @@ -1248,7 +1248,7 @@ def argument_names( # signature. Assign names as {varargs}_0, {varargs}_1, ... assert fullargspec.varargs is not None, "More arguments than expected" input_strs += [ - f"{fullargspec.varargs}_{i}" for i in range(0, len(args) - len(input_strs)) + f"{fullargspec.varargs}_{i}" for i in range(len(args) - len(input_strs)) ] elif len(args) < len(fullargspec.args): # 3. 
If there are fewer arguments in `args` than `fullargspec.args`, @@ -1538,7 +1538,7 @@ class FlattenInputOutputSignature(torch.fx.Transformer): } self.new_args = [] - for i in range(0, len(flat_args)): + for i in range(len(flat_args)): arg = super().placeholder(f"arg{i}", (), {}) if i in matched_input_elements_to_fake: arg.node.meta["val"] = matched_input_elements_to_fake[i] diff --git a/torch/_inductor/dependencies.py b/torch/_inductor/dependencies.py index 0547b6b1db90..b431972521da 100644 --- a/torch/_inductor/dependencies.py +++ b/torch/_inductor/dependencies.py @@ -151,7 +151,7 @@ class MemoryDep(Dep): stride_to_index = {s: i for i, s in enumerate(self_strides)} order = [stride_to_index[s] for s in other_strides] - assert OrderedSet(order) == OrderedSet(range(0, self.num_vars)) + assert OrderedSet(order) == OrderedSet(range(self.num_vars)) return order def get_offset(self) -> sympy.Expr: diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py index e89be2299434..1ad443ff387e 100644 --- a/torch/_meta_registrations.py +++ b/torch/_meta_registrations.py @@ -1787,7 +1787,7 @@ def _padding_check_valid_input(input, padding, *, dim): for d in range(1, input_dim): valid_batch_mode = valid_batch_mode and input.size(d) != 0 else: - for d in range(0, input_dim): + for d in range(input_dim): valid_non_batch_mode = valid_non_batch_mode and input.size(d) != 0 # allow empty batch size but not other dimensions. diff --git a/torch/_numpy/_funcs_impl.py b/torch/_numpy/_funcs_impl.py index 4ab3b29d34b8..f57e7fb001fb 100644 --- a/torch/_numpy/_funcs_impl.py +++ b/torch/_numpy/_funcs_impl.py @@ -1449,7 +1449,7 @@ def rollaxis(a: ArrayLike, axis, start=0): # numpy returns a view, here we try returning the tensor itself # return tensor[...] return a - axes = list(range(0, n)) + axes = list(range(n)) axes.remove(axis) axes.insert(start, axis) return a.view(axes) diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py index 13d6efd4ac67..822f949d536f 100644 --- a/torch/_refs/__init__.py +++ b/torch/_refs/__init__.py @@ -4738,7 +4738,7 @@ def transpose(a: TensorLikeType, dim0: int, dim1: int) -> TensorLikeType: if a.ndim <= 1 or dim0 == dim1: return aten.alias.default(a) - _permutation = list(range(0, a.ndim)) + _permutation = list(range(a.ndim)) _permutation[_dim0] = _dim1 _permutation[_dim1] = _dim0 return torch.permute(a, _permutation) diff --git a/torch/_tensor_str.py b/torch/_tensor_str.py index af4deb471db2..86a745f09b44 100644 --- a/torch/_tensor_str.py +++ b/torch/_tensor_str.py @@ -307,7 +307,7 @@ def _tensor_str_with_formatter(self, indent, summarize, formatter1, formatter2=N _tensor_str_with_formatter( self[i], indent + 1, summarize, formatter1, formatter2 ) - for i in range(0, PRINT_OPTS.edgeitems) + for i in range(PRINT_OPTS.edgeitems) ] + ["..."] + [ @@ -322,7 +322,7 @@ def _tensor_str_with_formatter(self, indent, summarize, formatter1, formatter2=N _tensor_str_with_formatter( self[i], indent + 1, summarize, formatter1, formatter2 ) - for i in range(0, self.size(0)) + for i in range(self.size(0)) ] tensor_str = ("," + "\n" * (dim - 1) + " " * (indent + 1)).join(slices) @@ -406,7 +406,7 @@ def get_summarized_data(self): if not PRINT_OPTS.edgeitems: return self.new_empty([0] * self.dim()) elif self.size(0) > 2 * PRINT_OPTS.edgeitems: - start = [self[i] for i in range(0, PRINT_OPTS.edgeitems)] + start = [self[i] for i in range(PRINT_OPTS.edgeitems)] end = [self[i] for i in range(len(self) - PRINT_OPTS.edgeitems, len(self))] return torch.stack([get_summarized_data(x) for x in 
(start + end)]) else: diff --git a/torch/ao/ns/fx/pattern_utils.py b/torch/ao/ns/fx/pattern_utils.py index 242d1740d91b..8339ce8f57c1 100644 --- a/torch/ao/ns/fx/pattern_utils.py +++ b/torch/ao/ns/fx/pattern_utils.py @@ -28,7 +28,7 @@ def get_type_a_related_to_b( for s in base_name_to_sets_of_related_ops.values(): s_list = list(s) # add every bidirectional pair - for idx_0 in range(0, len(s_list)): + for idx_0 in range(len(s_list)): for idx_1 in range(idx_0, len(s_list)): type_a_related_to_b.add((s_list[idx_0], s_list[idx_1])) type_a_related_to_b.add((s_list[idx_1], s_list[idx_0])) diff --git a/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py b/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py index ef6a35686c7d..4330b0e24253 100644 --- a/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py +++ b/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py @@ -158,9 +158,9 @@ class ActivationSparsifier: # data should be a list [aggregated over each feature only] if data is None: out_data = [ - 0 for _ in range(0, len(features)) + 0 for _ in range(len(features)) ] # create one in case of 1st forward - self.state[name]["mask"] = [0 for _ in range(0, len(features))] + self.state[name]["mask"] = [0 for _ in range(len(features))] else: out_data = data # a list @@ -336,7 +336,7 @@ class ActivationSparsifier: return input_data * mask else: # apply per feature, feature_dim - for feature_idx in range(0, len(features)): + for feature_idx in range(len(features)): feature = ( torch.Tensor([features[feature_idx]]) .long() diff --git a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_disk_savings.py b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_disk_savings.py index 8192b617139b..0e25f59cea64 100644 --- a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_disk_savings.py +++ b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_disk_savings.py @@ -99,7 +99,7 @@ def sparsify_model(path_to_model, sparsified_model_dump_path): sparse_block_shapes (List of tuples) List of sparse block shapes to be sparsified on """ - sparsity_levels = [sl / 10 for sl in range(0, 10)] + sparsity_levels = [sl / 10 for sl in range(10)] sparsity_levels += [0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0] norms = ["L1", "L2"] diff --git a/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py b/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py index 442639be9b21..5a36e13c7b46 100644 --- a/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py +++ b/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py @@ -299,7 +299,7 @@ class TestTrainingAwareCallback(TestCase): self._check_on_train_start(pl_module, callback, sparsifier_args, scheduler_args) num_epochs = 5 - for _ in range(0, num_epochs): + for _ in range(num_epochs): self._check_on_train_epoch_start(pl_module, callback) self._simulate_update_param_model(pl_module) self._check_on_train_epoch_end(pl_module, callback) diff --git a/torch/ao/pruning/sparsifier/nearly_diagonal_sparsifier.py b/torch/ao/pruning/sparsifier/nearly_diagonal_sparsifier.py index a4d42ea80328..26fb3a98b8fb 100644 --- a/torch/ao/pruning/sparsifier/nearly_diagonal_sparsifier.py +++ b/torch/ao/pruning/sparsifier/nearly_diagonal_sparsifier.py @@ -53,7 +53,7 @@ class NearlyDiagonalSparsifier(base_sparsifier.BaseSparsifier): 
"nearliness cannot be larger than the dimensions of tensor." ) - for row in range(0, height): + for row in range(height): # Bounds of entries that needs to be set to 1 low = max(0, row - dist_to_diagonal) high = min(width, row + dist_to_diagonal + 1) diff --git a/torch/ao/quantization/experimental/observer.py b/torch/ao/quantization/experimental/observer.py index 7d9432ab27ec..e61fcb67c94a 100644 --- a/torch/ao/quantization/experimental/observer.py +++ b/torch/ao/quantization/experimental/observer.py @@ -68,10 +68,10 @@ class APoTObserver(ObserverBase): p_all = [] # create levels - for i in range(0, self.n): + for i in range(self.n): p_curr = torch.tensor([0]) - for j in range(0, (2**self.k - 2) + 1): + for j in range((2**self.k - 2) + 1): curr_ele = 2 ** (-(i + j * self.n)) p_append = torch.tensor([curr_ele]) p_curr = torch.cat((p_curr, p_append)) diff --git a/torch/ao/quantization/fx/_decomposed.py b/torch/ao/quantization/fx/_decomposed.py index 160e9aa3afef..b145cbfaeeba 100644 --- a/torch/ao/quantization/fx/_decomposed.py +++ b/torch/ao/quantization/fx/_decomposed.py @@ -1159,7 +1159,7 @@ class FakeQuantPerChannel(torch.autograd.Function): f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}" ) assert axis < input.dim(), f"Expecting axis to be < {input.dim()}" - broadcast_dims = list(range(0, axis)) + list(range(axis + 1, input.ndim)) + broadcast_dims = list(range(axis)) + list(range(axis + 1, input.ndim)) unsqueeze_scales = _unsqueeze_multiple(scales, broadcast_dims) unsqueeze_zero_points = _unsqueeze_multiple(zero_points, broadcast_dims) temp = torch.round(input * (1.0 / unsqueeze_scales)) + unsqueeze_zero_points diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 322d39f72202..cdab6259d85b 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -1212,7 +1212,7 @@ class KinetoStepTracker: "Profiler step count has increased more than 1 - " f"current_step = {cls._current_step} step dict = {cls._step_dict}" ) - for _ in range(0, delta): + for _ in range(delta): _kineto_step() cls._current_step = new_step return cls._current_step diff --git a/torch/distributed/_pycute/layout.py b/torch/distributed/_pycute/layout.py index be25cad2e953..04ae5d1fa5fd 100644 --- a/torch/distributed/_pycute/layout.py +++ b/torch/distributed/_pycute/layout.py @@ -162,7 +162,7 @@ def coalesce(layout: Layout, profile: LayoutProfile = None) -> Layout: assert len(layout) >= len(profile) return make_layout( chain( - (coalesce(layout[i], profile[i]) for i in range(0, len(profile))), # type: ignore[arg-type] + (coalesce(layout[i], profile[i]) for i in range(len(profile))), # type: ignore[arg-type] (layout[i] for i in range(len(profile), len(layout))), ) ) @@ -203,7 +203,7 @@ def filter(layout: Layout, profile: LayoutProfile = None) -> Layout: assert len(layout) >= len(profile) return make_layout( chain( - (filter(layout[i], profile[i]) for i in range(0, len(profile))), # type: ignore[arg-type] + (filter(layout[i], profile[i]) for i in range(len(profile))), # type: ignore[arg-type] (layout[i] for i in range(len(profile), len(layout))), ) ) @@ -233,7 +233,7 @@ def composition(layoutA: Layout, layoutB: LayoutInput) -> Layout: assert len(layoutA) >= len(layoutB) return make_layout( chain( - (composition(layoutA[i], layoutB[i]) for i in range(0, len(layoutB))), # type: ignore[arg-type] + (composition(layoutA[i], layoutB[i]) for i in range(len(layoutB))), # type: ignore[arg-type] (layoutA[i] for i in range(len(layoutB), len(layoutA))), ) ) @@ -371,7 +371,7 
@@ def logical_divide(layoutA: Layout, layoutB: LayoutInput) -> Layout: chain( ( logical_divide(layoutA[i], layoutB[i]) # type: ignore[arg-type] - for i in range(0, len(layoutB)) + for i in range(len(layoutB)) ), (layoutA[i] for i in range(len(layoutB), len(layoutA))), ) @@ -396,7 +396,7 @@ def logical_product(layoutA: Layout, layoutB: LayoutInput) -> Layout: chain( ( logical_product(layoutA[i], layoutB[i]) # type: ignore[arg-type] - for i in range(0, len(layoutB)) + for i in range(len(layoutB)) ), (layoutA[i] for i in range(len(layoutB), len(layoutA))), ) @@ -421,14 +421,14 @@ def hier_unzip( # A layout with shape ((A,a),(B,b),(C,c)) split = make_layout( hier_unzip(splitter, layoutA[i], layoutB[i]) # type: ignore[arg-type] - for i in range(0, len(layoutB)) + for i in range(len(layoutB)) ) # Gather to shape ((A,B,C,...),(a,b,c,...,y,z)) return make_layout( - make_layout(split[i][0] for i in range(0, len(layoutB))), # type: ignore[arg-type] + make_layout(split[i][0] for i in range(len(layoutB))), # type: ignore[arg-type] make_layout( chain( # type: ignore[arg-type] - (split[i][1] for i in range(0, len(layoutB))), + (split[i][1] for i in range(len(layoutB))), (layoutA[i] for i in range(len(layoutB), len(layoutA))), ) ), diff --git a/torch/distributed/_symmetric_memory/__init__.py b/torch/distributed/_symmetric_memory/__init__.py index 1c576e886fe1..132a40977f85 100644 --- a/torch/distributed/_symmetric_memory/__init__.py +++ b/torch/distributed/_symmetric_memory/__init__.py @@ -1671,7 +1671,7 @@ def _low_contention_all_gather( local_buf.copy_(tensor) # pull symm_mem.barrier() - for step in range(0, world_size): + for step in range(world_size): remote_rank = (rank - step) % world_size src_buf = symm_mem.get_buffer(remote_rank, tensor.shape, tensor.dtype) chunks[remote_rank].copy_(src_buf) @@ -1706,7 +1706,7 @@ def _low_contention_reduce_scatter_with_symm_mem_input( with _get_backend_stream(): # pull + offline reduction symm_mem.barrier() - for step in range(0, world_size): + for step in range(world_size): remote_rank = (rank - step) % world_size src_buf = symm_mem.get_buffer( remote_rank, @@ -1743,7 +1743,7 @@ def _low_contention_reduce_scatter_with_workspace( with _get_backend_stream(): # push + offline reduction workspace.barrier() - for step in range(0, world_size): + for step in range(world_size): remote_rank = (rank - step) % world_size dst_buf = workspace.get_buffer( remote_rank, chunks[0].shape, chunks[0].dtype, chunks[0].numel() * rank diff --git a/torch/distributed/elastic/multiprocessing/api.py b/torch/distributed/elastic/multiprocessing/api.py index d91974548221..9bb580c5bf78 100644 --- a/torch/distributed/elastic/multiprocessing/api.py +++ b/torch/distributed/elastic/multiprocessing/api.py @@ -727,7 +727,7 @@ class MultiprocessContext(PContext): # pipe. 
Hence to prevent deadlocks on large return values, # we opportunistically try queue.get on each join call # See: https://docs.python.org/2/library/multiprocessing.html#all-platforms - for local_rank in range(0, self.nprocs): + for local_rank in range(self.nprocs): return_queue = self._ret_vals[local_rank] if not return_queue.empty(): # save the return values temporarily into a member var diff --git a/torch/distributed/elastic/timer/local_timer.py b/torch/distributed/elastic/timer/local_timer.py index d55cc6ac6e37..5e66ef3fae34 100644 --- a/torch/distributed/elastic/timer/local_timer.py +++ b/torch/distributed/elastic/timer/local_timer.py @@ -59,7 +59,7 @@ class MultiprocessingRequestQueue(RequestQueue): def get(self, size, timeout: float) -> list[TimerRequest]: requests = [] wait = timeout - for _ in range(0, size): + for _ in range(size): start = time.time() try: diff --git a/torch/distributed/tensor/_dtensor_spec.py b/torch/distributed/tensor/_dtensor_spec.py index e12f41c4858b..42cb7fcd7c33 100644 --- a/torch/distributed/tensor/_dtensor_spec.py +++ b/torch/distributed/tensor/_dtensor_spec.py @@ -107,7 +107,7 @@ class DTensorSpec: # follow default left-to-right device order if shard_order is not specified tensor_dim_to_mesh_dims: defaultdict[int, list[int]] = defaultdict(list) mesh_ndim = len(placements) - for mesh_dim in range(0, mesh_ndim): + for mesh_dim in range(mesh_ndim): # shard_order doesn't work with _StridedShard if isinstance(placements[mesh_dim], _StridedShard): return () diff --git a/torch/distributed/tensor/parallel/fsdp.py b/torch/distributed/tensor/parallel/fsdp.py index 6cffbdb83d2f..f5367397cc80 100644 --- a/torch/distributed/tensor/parallel/fsdp.py +++ b/torch/distributed/tensor/parallel/fsdp.py @@ -306,7 +306,7 @@ def _all_gather_dtensor( placements = list(copy.deepcopy(tensor.placements)) # FSDP + TP: [Shard(0), tp_placement] -> [Replicate(), tp_placement] # HSDP + TP: [Replicate(), Shard(0), tp_placement] -> [Replicate(), Replicate(), tp_placement] - for i in range(0, len(placements) - 1): + for i in range(len(placements) - 1): placements[i] = Replicate() tensor = tensor.redistribute( device_mesh=tensor.device_mesh, diff --git a/torch/nested/_internal/ops.py b/torch/nested/_internal/ops.py index f52bfab2a8b3..bdca74c13b1d 100644 --- a/torch/nested/_internal/ops.py +++ b/torch/nested/_internal/ops.py @@ -1112,7 +1112,7 @@ def chunk_default(func, *args, **kwargs): # the input number; it can be counter-intuitive, but it matches dense behavior. 
return [ NestedTensor(values=chunk_values[i], **(nested_kwargs[i])) - for i in range(0, len(chunk_values)) + for i in range(len(chunk_values)) ] else: return [ diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py b/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py index bcd36a6ac41b..3f92f6418c89 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py @@ -1005,7 +1005,7 @@ def _interpolate_size_to_scales(g: jit_utils.GraphContext, input, output_size, d if i < 2 else float(output_size[-(dim - i)]) / float(input.type().sizes()[-(dim - i)]) - for i in range(0, dim) + for i in range(dim) ] scales = g.op( "Constant", value_t=torch.tensor(scales_constant, dtype=torch.float32) diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_opset12.py b/torch/onnx/_internal/torchscript_exporter/symbolic_opset12.py index 822e14556768..d4b887560f9b 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_opset12.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_opset12.py @@ -331,7 +331,7 @@ def unfold(g: jit_utils.GraphContext, input, dimension, size, step): ndim = symbolic_helper._get_tensor_rank(input) assert ndim is not None - perm = list(range(0, ndim)) + perm = list(range(ndim)) perm.append(perm.pop(dimension)) unsqueeze_list = [] diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py b/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py index bde072608088..8ba8e6ee6622 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py @@ -116,7 +116,7 @@ def _interpolate(name, dim, interpolate_mode): if i < 2 else float(output_size[-(dim - i)]) / float(input.type().sizes()[-(dim - i)]) - for i in range(0, dim) + for i in range(dim) ] return g.op("Upsample", input, mode_s=interpolate_mode, scales_f=scales) diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py b/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py index 9b7aba64ef31..16e94b91f89f 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py @@ -840,7 +840,7 @@ def t(g: jit_utils.GraphContext, self): def numpy_T(g: jit_utils.GraphContext, input): ndim = symbolic_helper._get_tensor_rank(input) assert ndim is not None - perm = list(reversed(range(0, ndim))) + perm = list(reversed(range(ndim))) return g.op("Transpose", input, perm_i=perm) @@ -990,7 +990,7 @@ def transpose(g: jit_utils.GraphContext, self, dim0, dim1): @_onnx_symbolic("aten::permute") @symbolic_helper.parse_args("v", "is") def permute(g: jit_utils.GraphContext, self, dims): - if dims == list(range(0, len(dims))): + if dims == list(range(len(dims))): return self return g.op("Transpose", self, perm_i=dims) @@ -1368,7 +1368,7 @@ def get_pool_ceil_padding(input, kernel_size, stride, padding): ) ceiled_output_dim = [ math.ceil((dim[i] + 2 * padding[i] - kernel_size[i]) / float(stride[i])) + 1 - for i in range(0, len(padding)) + for i in range(len(padding)) ] # ensure last pooling starts inside ceiled_output_dim = [ @@ -1377,7 +1377,7 @@ def get_pool_ceil_padding(input, kernel_size, stride, padding): if (((ceiled_output_dim[i] - 1) * stride[i]) >= (dim[i] + padding[i])) else ceiled_output_dim[i] ) - for i in range(0, len(ceiled_output_dim)) + for i in range(len(ceiled_output_dim)) ] padding_ceil = [ ( @@ -1392,7 +1392,7 @@ def 
get_pool_ceil_padding(input, kernel_size, stride, padding): ) ) ) - for i in range(0, len(padding)) + for i in range(len(padding)) ] # ensure padding is not > kernel_size padding_ceil = [ @@ -1405,7 +1405,7 @@ def get_pool_ceil_padding(input, kernel_size, stride, padding): if ((padding_ceil[i] + 2 * padding[i]) >= (kernel_size[i])) else int(padding_ceil[i]) ) - for i in range(0, len(padding_ceil)) + for i in range(len(padding_ceil)) ] return padding_ceil @@ -1697,14 +1697,14 @@ def _adaptive_pool(name, type, tuple_fn, fn=None): name, "input size not accessible", input ) # verify if output size % input size = 0 for all dim - mod = [dim[i] % output_size[i] for i in range(0, len(dim))] + mod = [dim[i] % output_size[i] for i in range(len(dim))] if mod != [0] * len(mod): if output_size == [1] * len(output_size): return g.op("GlobalMaxPool", input), None return symbolic_helper._unimplemented( name, "output size that are not factor of input size", output_size_value ) - k = [int(dim[i] / output_size[i]) for i in range(0, len(dim))] + k = [int(dim[i] / output_size[i]) for i in range(len(dim))] # call max_poolxd_with_indices to get indices in the output if type == "MaxPool": # pyrefly: ignore # not-callable @@ -2906,7 +2906,7 @@ def unfold(g: jit_utils.GraphContext, input, dimension, size, step): for low, hi in zip(low_indices, hi_indices) ] ndim = len(sizes) - perm = list(range(0, ndim)) + perm = list(range(ndim)) perm.append(perm.pop(dimension)) unsqueeze = [ symbolic_helper._unsqueeze_helper( diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 82e630519eb8..0cecc762bce4 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -11615,7 +11615,7 @@ def reference_searchsorted(sorted_sequence, boundary, out_int32=False, right=Fal # numpy searchsorted only supports 1D inputs so we split up ND inputs orig_shape = boundary.shape num_splits = np.prod(sorted_sequence.shape[:-1]) - splits = range(0, num_splits) + splits = range(num_splits) sorted_sequence, boundary = sorted_sequence.reshape(num_splits, -1), boundary.reshape(num_splits, -1) if sorter is not None: sorter = sorter.reshape(num_splits, -1) @@ -16258,7 +16258,7 @@ op_db: list[OpInfo] = [ aten_backward_name='_prelu_kernel_backward', ref=lambda x, weight: np.maximum(0., x) + np.minimum(0., x) * - (weight if x.ndim == 1 else weight.reshape([weight.size if i == 1 else 1 for i in range(0, x.ndim)])), + (weight if x.ndim == 1 else weight.reshape([weight.size if i == 1 else 1 for i in range(x.ndim)])), dtypes=floating_types_and(torch.bfloat16, torch.float16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index 68a35e8c40a1..3153359326dc 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -2896,7 +2896,7 @@ def _multilabelmarginloss_reference(input, target): sum = 0 for target_index in targets: - for i in range(0, len(input)): + for i in range(len(input)): if i not in targets: sum += max(0, 1 - input[target_index] + input[i]) @@ -2914,7 +2914,7 @@ def multilabelmarginloss_reference(input, target, reduction='mean'): n = input.size(0) dim = input.size(1) output = input.new(n).zero_() - for i in range(0, n): + for i in range(n): output[i] = _multilabelmarginloss_reference(input[i], target[i]) if reduction == 'mean': @@ -2955,7 +2955,7 @@ def 
_multimarginloss_reference(input, target_idx, p, margin, weight): weight = input.new(len(input)).fill_(1) output = 0 - for i in range(0, len(input)): + for i in range(len(input)): if i != target_idx: output += weight[target_idx] * (max(0, (margin - input[target_idx] + input[i])) ** p) return output @@ -2972,7 +2972,7 @@ def multimarginloss_reference(input, target, p=1, margin=1, weight=None, reducti n = input.size(0) dim = input.size(1) output = input.new(n) - for x in range(0, n): + for x in range(n): output[x] = _multimarginloss_reference(input[x], target[x], p, margin, weight) if reduction == 'mean': @@ -2987,7 +2987,7 @@ def multimarginloss_reference(input, target, p=1, margin=1, weight=None, reducti def cosineembeddingloss_reference(input1, input2, target, margin=0, reduction='mean'): def _cos(a, b): cos = a.new(a.size(0)) - for i in range(0, a.size(0)): + for i in range(a.size(0)): cos[i] = (a[i] * b[i]).sum() / ((((a[i] * a[i]).sum() + 1e-12) * ((b[i] * b[i]).sum() + 1e-12)) ** 0.5) return cos diff --git a/torch/testing/_internal/distributed/_tensor/common_dtensor.py b/torch/testing/_internal/distributed/_tensor/common_dtensor.py index a9beb0e60865..22d6d8e7dede 100644 --- a/torch/testing/_internal/distributed/_tensor/common_dtensor.py +++ b/torch/testing/_internal/distributed/_tensor/common_dtensor.py @@ -705,7 +705,7 @@ class LocalDTensorTestBase(DTensorTestBase): self.skipTest(msg) def _get_local_tensor_mode(self): - return LocalTensorMode(frozenset(range(0, self.world_size))) + return LocalTensorMode(frozenset(range(self.world_size))) def setUp(self) -> None: super().setUp() diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index c41602d43994..499341b07951 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -658,13 +658,13 @@ class DistributedTest: return (group, group_id, rank) def _init_full_group_test(self, **kwargs): - group = list(range(0, dist.get_world_size())) + group = list(range(dist.get_world_size())) group_id = dist.new_group(**kwargs) rank = dist.get_rank() return (group, group_id, rank) def _init_global_test(self): - group = list(range(0, dist.get_world_size())) + group = list(range(dist.get_world_size())) group_id = dist.group.WORLD rank = dist.get_rank() return (group, group_id, rank) @@ -1114,7 +1114,7 @@ class DistributedTest: averager = averagers.PeriodicModelAverager( period=period, warmup_steps=warmup_steps ) - for step in range(0, 20): + for step in range(20): # Reset the parameters at every step. param.data = copy.deepcopy(tensor) for params in model.parameters(): @@ -1143,7 +1143,7 @@ class DistributedTest: averager = averagers.PeriodicModelAverager( period=period, warmup_steps=warmup_steps ) - for step in range(0, 20): + for step in range(20): # Reset the parameters at every step. for param_group in opt.param_groups: for params in param_group["params"]: @@ -1203,7 +1203,7 @@ class DistributedTest: averager = averagers.PeriodicModelAverager( period=period, warmup_steps=warmup_steps ) - for step in range(0, 20): + for step in range(20): # Reset the parameters at every step. param.data = copy.deepcopy(tensor) for params in model.parameters(): @@ -1284,7 +1284,7 @@ class DistributedTest: expected_global_avg_tensor = ( torch.ones_like(param.data) * sum(range(world_size)) / world_size ) - for step in range(0, 25): + for step in range(25): # Reset the parameters at every step. 
param.data = copy.deepcopy(tensor) for params in model.parameters(): @@ -1390,7 +1390,7 @@ class DistributedTest: for val in ["1", "0"]: os.environ["TORCH_NCCL_BLOCKING_WAIT"] = val - for src in range(0, world_size): + for src in range(world_size): send_tensor = _build_tensor(rank + 1, device_id=device_id).fill_( src ) @@ -1409,7 +1409,7 @@ class DistributedTest: for req in reqs: req.wait() - for src in range(0, world_size): + for src in range(world_size): self.assertEqual(recv_tensors[src], expected_tensors[src]) self._barrier() @@ -1505,7 +1505,7 @@ class DistributedTest: rank = dist.get_rank() p2p_op_list = [] - for src in range(0, dist.get_world_size()): + for src in range(dist.get_world_size()): if src == rank: continue send_tensor = _build_tensor(rank + 1) @@ -1528,7 +1528,7 @@ class DistributedTest: rank = dist.get_rank() p2p_op_list = [] - for src in range(0, dist.get_world_size()): + for src in range(dist.get_world_size()): if src == rank: continue send_tensor = _build_tensor(rank + 1) @@ -1602,10 +1602,10 @@ class DistributedTest: tensor = _build_tensor(rank + 1, device_id=device_id) profiler_cls = profiler_ctx if profiler_ctx is not None else nullcontext() with profiler_cls as prof: - for src in range(0, world_size): + for src in range(world_size): if src == rank: # Send mode - for dst in range(0, world_size): + for dst in range(world_size): if dst == rank: continue dist.send(tensor, dst) @@ -1674,10 +1674,10 @@ class DistributedTest: tensor = _build_tensor(send_size) ctx = profiler_ctx if profiler_ctx is not None else nullcontext() with ctx as prof: - for src in range(0, dist.get_world_size()): + for src in range(dist.get_world_size()): if src == rank: # Send mode - for dst in range(0, dist.get_world_size()): + for dst in range(dist.get_world_size()): if dst == rank: continue dist.send(tensor, dst) @@ -1742,10 +1742,10 @@ class DistributedTest: ctx = profiler_ctx if profiler_ctx is not None else nullcontext() with ctx as prof: - for dst in range(0, dist.get_world_size()): + for dst in range(dist.get_world_size()): if dst == rank: # Recv mode - for dst in range(0, dist.get_world_size()): + for dst in range(dist.get_world_size()): if dst == rank: continue @@ -1846,10 +1846,10 @@ class DistributedTest: tensor = _build_tensor(send_recv_size, value=rank) ctx = profiler_ctx if profiler_ctx is not None else nullcontext() with ctx as prof: - for dst in range(0, world_size): + for dst in range(world_size): if dst == rank: # Recv mode - for src in range(0, world_size): + for src in range(world_size): if src == rank: continue output_tensor = _build_tensor(send_recv_size, value=-1) @@ -7480,7 +7480,7 @@ class DistributedTest: for baseline_iter in baseline_num_iters: for offset in iteration_offsets: mapping = dict.fromkeys( - range(0, num_early_join_ranks), baseline_iter + range(num_early_join_ranks), baseline_iter ) # if num_early_join_ranks > 1, ranks > 0 that will join early # iterate offset//2 more times than rank 0, to test nodes diff --git a/torch/testing/_internal/distributed/multi_threaded_pg.py b/torch/testing/_internal/distributed/multi_threaded_pg.py index 2cc22cb7c23a..79aff05b3421 100644 --- a/torch/testing/_internal/distributed/multi_threaded_pg.py +++ b/torch/testing/_internal/distributed/multi_threaded_pg.py @@ -166,7 +166,7 @@ class AllReduce: # collect all data to the list and make them # all on rank 0 device tensors = [ - data[src_rank][i].to(rank_0_device) for src_rank in range(0, len(data)) + data[src_rank][i].to(rank_0_device) for src_rank in range(len(data)) ] # now 
mimic reduce across all ranks diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py index 1d6c7500c5ad..3c5c9101e43c 100644 --- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py +++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py @@ -266,7 +266,7 @@ class CommonDistAutogradTest(RpcAgentTestFixture): grads = dist_autograd.get_gradients(context_id) nargs = len(args) ngrads = 0 - for i in range(0, nargs): + for i in range(nargs): if local_grads[i] is not None: self.assertIn(args[i], grads) self.assertEqual(local_grads[i], grads[args[i]]) @@ -1973,7 +1973,7 @@ class DistAutogradTest(CommonDistAutogradTest): DistAutogradTest._test_clean_context_backward_context_id = context_id # Send the context id to all nodes. - for i in range(0, self.world_size): + for i in range(self.world_size): if i != self.rank: rank_distance = (i - self.rank + self.world_size) % self.world_size rpc.rpc_sync( @@ -1988,7 +1988,7 @@ class DistAutogradTest(CommonDistAutogradTest): self.assertEqual(self.world_size - 1, len(known_context_ids)) t1 = torch.rand((3, 3), requires_grad=True) - for i in range(0, 100): + for i in range(100): dst = self._next_rank() t1 = rpc.rpc_sync(worker_name(dst), torch.add, args=(t1, t1)) diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index 4ec964092b39..03469e473921 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -1818,7 +1818,7 @@ class RpcTest(RpcAgentTestFixture, RpcTestCommon): # Spawn multiple threads that send RPCs to ensure keys are correctly # prefixed when there are multiple RPCs being created/in flight at the # same time. 
- dst_ranks = [rank for rank in range(0, self.world_size) if rank != self.rank] + dst_ranks = [rank for rank in range(self.world_size) if rank != self.rank] def rpc_with_profiling(dst_worker): with _profile() as prof: @@ -1884,7 +1884,7 @@ class RpcTest(RpcAgentTestFixture, RpcTestCommon): if self.rank != 1: return - dst_ranks = [rank for rank in range(0, self.world_size) if rank != self.rank] + dst_ranks = [rank for rank in range(self.world_size) if rank != self.rank] for dst in dst_ranks: dst_worker = worker_name(dst) with _profile() as prof: diff --git a/torch/testing/_internal/jit_utils.py b/torch/testing/_internal/jit_utils.py index e98d0e482683..ce8e68ae1e2c 100644 --- a/torch/testing/_internal/jit_utils.py +++ b/torch/testing/_internal/jit_utils.py @@ -439,7 +439,7 @@ class JitTestCase(JitCommonTestCase): state = model.get_debug_state() plan = get_execution_plan(state) num_bailouts = plan.code.num_bailouts() - for i in range(0, num_bailouts): + for i in range(num_bailouts): plan.code.request_bailout(i) bailout_outputs = model(*inputs) self.assertEqual(bailout_outputs, expected) diff --git a/torch/testing/_internal/triton_utils.py b/torch/testing/_internal/triton_utils.py index 4edaf86dd1d7..0964c68ebb20 100644 --- a/torch/testing/_internal/triton_utils.py +++ b/torch/testing/_internal/triton_utils.py @@ -912,7 +912,7 @@ if has_triton(): b_ptrs = b_ptr + (offs_k[:, None] + offs_bn[None, :]) accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + for k in range(tl.cdiv(K, BLOCK_SIZE_K)): a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) accumulator = tl.dot(a, b, accumulator) From 24520b8386af5f8f95dfe0c1b7d59f506d673bf0 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Sat, 18 Oct 2025 07:21:08 +0000 Subject: [PATCH 105/123] Revert "Enable all PIE rules on ruff (#165814)" This reverts commit c79dfdc6550e872783aa5cb5fc9e86589bf18872. 
Reverted https://github.com/pytorch/pytorch/pull/165814 on behalf of https://github.com/cyyever due to Need to cover more files ([comment](https://github.com/pytorch/pytorch/pull/165814#issuecomment-3417931863)) --- benchmarks/gpt_fast/mixtral_moe_quantize.py | 2 +- pyproject.toml | 7 +++- .../ao/sparsity/test_activation_sparsifier.py | 4 +- test/ao/sparsity/test_data_scheduler.py | 2 +- test/ao/sparsity/test_data_sparsifier.py | 2 +- test/ao/sparsity/test_sparsifier.py | 4 +- .../quantization/test_quantization.py | 12 +++--- test/distributed/checkpoint/test_planner.py | 2 +- test/distributed/checkpoint/test_utils.py | 2 +- .../elastic/agent/server/test/api_test.py | 2 +- .../elastic/multiprocessing/api_test.py | 2 +- .../timer/file_based_local_timer_test.py | 2 +- .../elastic/timer/local_timer_example.py | 4 +- .../elastic/timer/local_timer_test.py | 2 +- .../utils/data/cycling_iterator_test.py | 4 +- .../fsdp/test_fsdp_hybrid_shard.py | 4 +- test/distributed/tensor/test_dtensor_ops.py | 4 +- test/distributed/test_device_mesh.py | 2 +- test/distributions/test_distributions.py | 34 ++++++++--------- test/dynamo/test_export.py | 8 ++-- test/dynamo/test_functions.py | 2 +- test/dynamo/test_modules.py | 2 +- test/dynamo/test_repros.py | 6 +-- test/functorch/test_ac.py | 4 +- test/inductor/test_codecache.py | 2 +- test/inductor/test_compiled_autograd.py | 2 +- test/inductor/test_max_autotune.py | 2 +- test/inductor/test_triton_kernels.py | 4 +- test/jit/xnnpack/test_xnnpack_delegate.py | 2 +- test/nn/test_convolution.py | 2 +- test/nn/test_embedding.py | 2 +- test/nn/test_multihead_attention.py | 2 +- test/nn/test_pooling.py | 2 +- test/onnx/test_onnx_opset.py | 4 +- test/optim/test_lrscheduler.py | 2 +- test/profiler/test_profiler.py | 6 +-- .../core/experimental/test_floatx.py | 2 +- test/test_dataloader.py | 2 +- test/test_datapipe.py | 6 +-- test/test_dynamic_shapes.py | 2 +- test/test_indexing.py | 2 +- test/test_jit.py | 8 ++-- test/test_jit_fuser_te.py | 8 ++-- test/test_matmul_cuda.py | 2 +- test/test_mps.py | 14 +++---- test/test_numa_binding.py | 6 +-- test/test_reductions.py | 4 +- test/test_serialization.py | 2 +- test/test_sparse.py | 2 +- test/test_sparse_csr.py | 2 +- test/test_static_runtime.py | 2 +- test/test_tensorboard.py | 2 +- test/test_tensorexpr.py | 2 +- test/test_torch.py | 2 +- test/test_view_ops.py | 2 +- test/test_xnnpack_integration.py | 4 +- torch/_decomp/decompositions_for_jvp.py | 2 +- torch/_dynamo/eval_frame.py | 4 +- torch/_inductor/dependencies.py | 2 +- torch/_meta_registrations.py | 2 +- torch/_numpy/_funcs_impl.py | 2 +- torch/_refs/__init__.py | 2 +- torch/_tensor_str.py | 6 +-- torch/ao/ns/fx/pattern_utils.py | 2 +- .../activation_sparsifier.py | 6 +-- .../benchmarks/evaluate_disk_savings.py | 2 +- .../lightning/tests/test_callbacks.py | 2 +- .../sparsifier/nearly_diagonal_sparsifier.py | 2 +- .../ao/quantization/experimental/observer.py | 4 +- torch/ao/quantization/fx/_decomposed.py | 2 +- torch/autograd/profiler.py | 2 +- torch/distributed/_pycute/layout.py | 16 ++++---- .../distributed/_symmetric_memory/__init__.py | 6 +-- .../elastic/multiprocessing/api.py | 2 +- .../distributed/elastic/timer/local_timer.py | 2 +- torch/distributed/tensor/_dtensor_spec.py | 2 +- torch/distributed/tensor/parallel/fsdp.py | 2 +- torch/nested/_internal/ops.py | 2 +- .../torchscript_exporter/symbolic_helper.py | 2 +- .../torchscript_exporter/symbolic_opset12.py | 2 +- .../torchscript_exporter/symbolic_opset8.py | 2 +- .../torchscript_exporter/symbolic_opset9.py | 18 
++++----- .../_internal/common_methods_invocations.py | 4 +- torch/testing/_internal/common_nn.py | 10 ++--- .../distributed/_tensor/common_dtensor.py | 2 +- .../_internal/distributed/distributed_test.py | 38 +++++++++---------- .../distributed/multi_threaded_pg.py | 2 +- .../distributed/rpc/dist_autograd_test.py | 6 +-- .../_internal/distributed/rpc/rpc_test.py | 4 +- torch/testing/_internal/jit_utils.py | 2 +- torch/testing/_internal/triton_utils.py | 2 +- 91 files changed, 200 insertions(+), 195 deletions(-) diff --git a/benchmarks/gpt_fast/mixtral_moe_quantize.py b/benchmarks/gpt_fast/mixtral_moe_quantize.py index fd0342ce3d59..50ffd61bdb83 100644 --- a/benchmarks/gpt_fast/mixtral_moe_quantize.py +++ b/benchmarks/gpt_fast/mixtral_moe_quantize.py @@ -85,7 +85,7 @@ class WeightOnlyInt8QuantHandler: cur_state_dict[f"{fqn}.weight"] = int8_weight cur_state_dict[f"{fqn}.scales"] = scales.to(mod.weight.dtype) elif isinstance(mod, ConditionalFeedForward): - for weight_idx in range(3): + for weight_idx in range(0, 3): weight_name = f"w{weight_idx + 1}" scales_name = f"scales{weight_idx + 1}" weight = getattr(mod, weight_name) diff --git a/pyproject.toml b/pyproject.toml index f18368b90d8d..e42f08d296f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -204,7 +204,12 @@ select = [ "NPY", "PERF", "PGH004", - "PIE", + "PIE790", + "PIE794", + "PIE800", + "PIE804", + "PIE807", + "PIE810", "PLC0131", # type bivariance "PLC0132", # type param mismatch "PLC1802", # len({expression}) used as condition without comparison diff --git a/test/ao/sparsity/test_activation_sparsifier.py b/test/ao/sparsity/test_activation_sparsifier.py index 079f5e1941d2..0f3f36ecda9f 100644 --- a/test/ao/sparsity/test_activation_sparsifier.py +++ b/test/ao/sparsity/test_activation_sparsifier.py @@ -190,7 +190,7 @@ class TestActivationSparsifier(TestCase): if features is None: assert torch.all(mask * input_data == output) else: - for feature_idx in range(len(features)): + for feature_idx in range(0, len(features)): feature = torch.Tensor( [features[feature_idx]], device=input_data.device ).long() @@ -378,7 +378,7 @@ class TestActivationSparsifier(TestCase): # some dummy data data_list = [] num_data_points = 5 - for _ in range(num_data_points): + for _ in range(0, num_data_points): rand_data = torch.randn(16, 1, 28, 28) activation_sparsifier.model(rand_data) data_list.append(rand_data) diff --git a/test/ao/sparsity/test_data_scheduler.py b/test/ao/sparsity/test_data_scheduler.py index 47a85e1edda1..de0a885f0153 100644 --- a/test/ao/sparsity/test_data_scheduler.py +++ b/test/ao/sparsity/test_data_scheduler.py @@ -143,7 +143,7 @@ class TestBaseDataScheduler(TestCase): # checking step count step_cnt = 5 - for _ in range(step_cnt): + for _ in range(0, step_cnt): sparsifier.step() scheduler.step() diff --git a/test/ao/sparsity/test_data_sparsifier.py b/test/ao/sparsity/test_data_sparsifier.py index fa08e8c90ac2..dce04292763f 100644 --- a/test/ao/sparsity/test_data_sparsifier.py +++ b/test/ao/sparsity/test_data_sparsifier.py @@ -123,7 +123,7 @@ class _BaseDataSparsiferTestCase(TestCase): step_count = 3 - for _ in range(step_count): + for _ in range(0, step_count): sparsifier.step() for some_data in all_data: name, data, _ = self._get_name_data_config(some_data) diff --git a/test/ao/sparsity/test_sparsifier.py b/test/ao/sparsity/test_sparsifier.py index a940a3e9feba..d5010b7abccd 100644 --- a/test/ao/sparsity/test_sparsifier.py +++ b/test/ao/sparsity/test_sparsifier.py @@ -472,8 +472,8 @@ class TestNearlyDiagonalSparsifier(TestCase): 
else: height, width = mask.shape dist_to_diagonal = nearliness // 2 - for row in range(height): - for col in range(width): + for row in range(0, height): + for col in range(0, width): if abs(row - col) <= dist_to_diagonal: assert mask[row, col] == 1 else: diff --git a/test/distributed/algorithms/quantization/test_quantization.py b/test/distributed/algorithms/quantization/test_quantization.py index 6044eac70b51..b65e0a747405 100644 --- a/test/distributed/algorithms/quantization/test_quantization.py +++ b/test/distributed/algorithms/quantization/test_quantization.py @@ -79,7 +79,7 @@ if BACKEND == "gloo" or BACKEND == "nccl": dist.init_process_group( store=store, rank=self.rank, world_size=self.world_size, backend="gloo" ) - group = list(range(self.world_size)) + group = list(range(0, self.world_size)) group_id = dist.group.WORLD self._test_all_gather( group, group_id, self.rank, dtype=torch.float32, qtype=DQuantType.FP16 @@ -94,7 +94,7 @@ if BACKEND == "gloo" or BACKEND == "nccl": dist.init_process_group( store=store, rank=self.rank, world_size=self.world_size, backend="gloo" ) - group = list(range(self.world_size)) + group = list(range(0, self.world_size)) group_id = dist.group.WORLD self._test_all_gather( group, group_id, self.rank, dtype=torch.float32, qtype=DQuantType.BFP16 @@ -111,7 +111,7 @@ if BACKEND == "gloo" or BACKEND == "nccl": dist.init_process_group( store=store, rank=self.rank, world_size=self.world_size, backend="nccl" ) - group = list(range(self.world_size)) + group = list(range(0, self.world_size)) group_id = dist.new_group(range(self.world_size)) rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND) self._test_all_to_all( @@ -135,7 +135,7 @@ if BACKEND == "gloo" or BACKEND == "nccl": dist.init_process_group( store=store, rank=self.rank, world_size=self.world_size, backend="nccl" ) - group = list(range(self.world_size)) + group = list(range(0, self.world_size)) group_id = dist.new_group(range(self.world_size)) rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND) self._test_all_to_all( @@ -158,7 +158,7 @@ if BACKEND == "gloo" or BACKEND == "nccl": dist.init_process_group( store=store, rank=self.rank, world_size=self.world_size, backend="nccl" ) - group = list(range(self.world_size)) + group = list(range(0, self.world_size)) group_id = dist.new_group(range(self.world_size)) rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND) self._test_all_to_all_single( @@ -181,7 +181,7 @@ if BACKEND == "gloo" or BACKEND == "nccl": dist.init_process_group( store=store, rank=self.rank, world_size=self.world_size, backend="nccl" ) - group = list(range(self.world_size)) + group = list(range(0, self.world_size)) group_id = dist.new_group(range(self.world_size)) rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND) self._test_all_to_all_single( diff --git a/test/distributed/checkpoint/test_planner.py b/test/distributed/checkpoint/test_planner.py index 86bed29de998..edf043301ed2 100644 --- a/test/distributed/checkpoint/test_planner.py +++ b/test/distributed/checkpoint/test_planner.py @@ -66,7 +66,7 @@ if TEST_WITH_DEV_DBG_ASAN: def create_sharded_tensor(rank, world_size, shards_per_rank, shard_size=8): shards_metadata = [] local_shards = [] - for idx in range(world_size * shards_per_rank): + for idx in range(0, world_size * shards_per_rank): shard_rank = idx // shards_per_rank shard_md = ShardMetadata( shard_offsets=[idx * shard_size], diff --git a/test/distributed/checkpoint/test_utils.py b/test/distributed/checkpoint/test_utils.py index 79dbe741822c..722670c95f18 
100644 --- a/test/distributed/checkpoint/test_utils.py +++ b/test/distributed/checkpoint/test_utils.py @@ -45,7 +45,7 @@ if TEST_WITH_DEV_DBG_ASAN: def create_sharded_tensor(rank, world_size, shards_per_rank): shards_metadata = [] local_shards = [] - for idx in range(world_size * shards_per_rank): + for idx in range(0, world_size * shards_per_rank): shard_rank = idx // shards_per_rank shard_md = ShardMetadata( shard_offsets=[idx * 8], shard_sizes=[8], placement=f"rank:{shard_rank}/cpu" diff --git a/test/distributed/elastic/agent/server/test/api_test.py b/test/distributed/elastic/agent/server/test/api_test.py index dd96f9b6dfb0..11776324ed7f 100644 --- a/test/distributed/elastic/agent/server/test/api_test.py +++ b/test/distributed/elastic/agent/server/test/api_test.py @@ -633,7 +633,7 @@ class SimpleElasticAgentTest(unittest.TestCase): worker_group = agent.get_worker_group() num_restarts = 3 - for _ in range(num_restarts): + for _ in range(0, num_restarts): agent._restart_workers(worker_group) self.assertEqual(WorkerState.HEALTHY, worker_group.state) diff --git a/test/distributed/elastic/multiprocessing/api_test.py b/test/distributed/elastic/multiprocessing/api_test.py index 19d941e0d9c6..4ac0dcacb4b8 100644 --- a/test/distributed/elastic/multiprocessing/api_test.py +++ b/test/distributed/elastic/multiprocessing/api_test.py @@ -146,7 +146,7 @@ def echo_large(size: int) -> dict[int, str]: returns a large output ({0: test0", 1: "test1", ..., (size-1):f"test{size-1}"}) """ out = {} - for idx in range(size): + for idx in range(0, size): out[idx] = f"test{idx}" return out diff --git a/test/distributed/elastic/timer/file_based_local_timer_test.py b/test/distributed/elastic/timer/file_based_local_timer_test.py index 0125ce5cd25a..cf597eb6a37a 100644 --- a/test/distributed/elastic/timer/file_based_local_timer_test.py +++ b/test/distributed/elastic/timer/file_based_local_timer_test.py @@ -191,7 +191,7 @@ if not (IS_WINDOWS or IS_MACOS or IS_ARM64): """ client = timer.FileTimerClient(file_path) sem.release() - for _ in range(n): + for _ in range(0, n): client.acquire("test_scope", 0) time.sleep(interval) diff --git a/test/distributed/elastic/timer/local_timer_example.py b/test/distributed/elastic/timer/local_timer_example.py index 6d438f2536d6..09421f4b38f5 100644 --- a/test/distributed/elastic/timer/local_timer_example.py +++ b/test/distributed/elastic/timer/local_timer_example.py @@ -102,7 +102,7 @@ if not (IS_WINDOWS or IS_MACOS or IS_ARM64): world_size = 8 processes = [] - for i in range(world_size): + for i in range(0, world_size): if i % 2 == 0: p = spawn_ctx.Process(target=_stuck_function, args=(i, mp_queue)) else: @@ -110,7 +110,7 @@ if not (IS_WINDOWS or IS_MACOS or IS_ARM64): p.start() processes.append(p) - for i in range(world_size): + for i in range(0, world_size): p = processes[i] p.join() if i % 2 == 0: diff --git a/test/distributed/elastic/timer/local_timer_test.py b/test/distributed/elastic/timer/local_timer_test.py index 8818b1788c62..b65b202d5ec6 100644 --- a/test/distributed/elastic/timer/local_timer_test.py +++ b/test/distributed/elastic/timer/local_timer_test.py @@ -127,7 +127,7 @@ if not INVALID_PLATFORMS: interval seconds. Releases the given semaphore once before going to work. 
""" sem.release() - for i in range(n): + for i in range(0, n): mp_queue.put(TimerRequest(i, "test_scope", 0)) time.sleep(interval) diff --git a/test/distributed/elastic/utils/data/cycling_iterator_test.py b/test/distributed/elastic/utils/data/cycling_iterator_test.py index 835ed6ebbd01..c9cb055a2c22 100644 --- a/test/distributed/elastic/utils/data/cycling_iterator_test.py +++ b/test/distributed/elastic/utils/data/cycling_iterator_test.py @@ -15,7 +15,7 @@ class CyclingIteratorTest(unittest.TestCase): def generator(self, epoch, stride, max_epochs): # generate an continuously incrementing list each epoch # e.g. [0,1,2] [3,4,5] [6,7,8] ... - return iter([stride * epoch + i for i in range(stride)]) + return iter([stride * epoch + i for i in range(0, stride)]) def test_cycling_iterator(self): stride = 3 @@ -25,7 +25,7 @@ class CyclingIteratorTest(unittest.TestCase): return self.generator(epoch, stride, max_epochs) it = CyclingIterator(n=max_epochs, generator_fn=generator_fn) - for i in range(stride * max_epochs): + for i in range(0, stride * max_epochs): self.assertEqual(i, next(it)) with self.assertRaises(StopIteration): diff --git a/test/distributed/fsdp/test_fsdp_hybrid_shard.py b/test/distributed/fsdp/test_fsdp_hybrid_shard.py index e2ea4c5fc9af..26a05bbc4171 100644 --- a/test/distributed/fsdp/test_fsdp_hybrid_shard.py +++ b/test/distributed/fsdp/test_fsdp_hybrid_shard.py @@ -124,7 +124,7 @@ class TestFSDPHybridShard(FSDPTest): model = MyModel().to(device_type) num_node_devices = torch.accelerator.device_count() shard_rank_lists = ( - list(range(num_node_devices // 2)), + list(range(0, num_node_devices // 2)), list(range(num_node_devices // 2, num_node_devices)), ) shard_groups = ( @@ -175,7 +175,7 @@ class TestFSDPHybridShard(FSDPTest): model = MyModel().to(device_type) num_node_devices = torch.accelerator.device_count() shard_rank_lists = ( - list(range(num_node_devices // 2)), + list(range(0, num_node_devices // 2)), list(range(num_node_devices // 2, num_node_devices)), ) shard_groups = ( diff --git a/test/distributed/tensor/test_dtensor_ops.py b/test/distributed/tensor/test_dtensor_ops.py index df51152a9030..c4373773d662 100644 --- a/test/distributed/tensor/test_dtensor_ops.py +++ b/test/distributed/tensor/test_dtensor_ops.py @@ -802,7 +802,7 @@ class TestLocalDTensorOps(TestDTensorOps): self.run_opinfo_test(dtype, op) def test_mean(self): - with LocalTensorMode(frozenset(range(self.world_size))): + with LocalTensorMode(frozenset(range(0, self.world_size))): self.run_mean() def test_one_hot(self): @@ -811,7 +811,7 @@ class TestLocalDTensorOps(TestDTensorOps): def run_opinfo_test( self, dtype, op, requires_grad=True, sample_inputs_filter=lambda s: True ): - with LocalTensorMode(frozenset(range(self.world_size))): + with LocalTensorMode(frozenset(range(0, self.world_size))): super().run_opinfo_test(dtype, op, requires_grad, sample_inputs_filter) def assertEqualOnRank(self, x, y, msg=None, *, rank=0): diff --git a/test/distributed/test_device_mesh.py b/test/distributed/test_device_mesh.py index 2db674a458ed..0ed4651d3ec5 100644 --- a/test/distributed/test_device_mesh.py +++ b/test/distributed/test_device_mesh.py @@ -536,7 +536,7 @@ class DeviceMeshTestNDim(DTensorTestBase): # Create shard groups (e.g. 
(0, 1, 2, 3), (4, 5, 6, 7)) # and assign the correct shard group to each rank shard_rank_lists = ( - list(range(self.world_size // 2)), + list(range(0, self.world_size // 2)), list(range(self.world_size // 2, self.world_size)), ) shard_groups = ( diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py index 550589002003..b588589d81ba 100644 --- a/test/distributions/test_distributions.py +++ b/test/distributions/test_distributions.py @@ -5722,11 +5722,11 @@ class TestKL(DistributionsTestCase): def test_kl_multivariate_normal(self): set_rng_seed(0) # see Note [Randomized statistical tests] n = 5 # Number of tests for multivariate_normal - for i in range(n): - loc = [torch.randn(4) for _ in range(2)] + for i in range(0, n): + loc = [torch.randn(4) for _ in range(0, 2)] scale_tril = [ transform_to(constraints.lower_cholesky)(torch.randn(4, 4)) - for _ in range(2) + for _ in range(0, 2) ] p = MultivariateNormal(loc=loc[0], scale_tril=scale_tril[0]) q = MultivariateNormal(loc=loc[1], scale_tril=scale_tril[1]) @@ -5755,10 +5755,10 @@ class TestKL(DistributionsTestCase): def test_kl_multivariate_normal_batched(self): b = 7 # Number of batches - loc = [torch.randn(b, 3) for _ in range(2)] + loc = [torch.randn(b, 3) for _ in range(0, 2)] scale_tril = [ transform_to(constraints.lower_cholesky)(torch.randn(b, 3, 3)) - for _ in range(2) + for _ in range(0, 2) ] expected_kl = torch.stack( [ @@ -5766,7 +5766,7 @@ class TestKL(DistributionsTestCase): MultivariateNormal(loc[0][i], scale_tril=scale_tril[0][i]), MultivariateNormal(loc[1][i], scale_tril=scale_tril[1][i]), ) - for i in range(b) + for i in range(0, b) ] ) actual_kl = kl_divergence( @@ -5777,7 +5777,7 @@ class TestKL(DistributionsTestCase): def test_kl_multivariate_normal_batched_broadcasted(self): b = 7 # Number of batches - loc = [torch.randn(b, 3) for _ in range(2)] + loc = [torch.randn(b, 3) for _ in range(0, 2)] scale_tril = [ transform_to(constraints.lower_cholesky)(torch.randn(b, 3, 3)), transform_to(constraints.lower_cholesky)(torch.randn(3, 3)), @@ -5788,7 +5788,7 @@ class TestKL(DistributionsTestCase): MultivariateNormal(loc[0][i], scale_tril=scale_tril[0][i]), MultivariateNormal(loc[1][i], scale_tril=scale_tril[1]), ) - for i in range(b) + for i in range(0, b) ] ) actual_kl = kl_divergence( @@ -5800,15 +5800,15 @@ class TestKL(DistributionsTestCase): def test_kl_lowrank_multivariate_normal(self): set_rng_seed(0) # see Note [Randomized statistical tests] n = 5 # Number of tests for lowrank_multivariate_normal - for i in range(n): - loc = [torch.randn(4) for _ in range(2)] - cov_factor = [torch.randn(4, 3) for _ in range(2)] + for i in range(0, n): + loc = [torch.randn(4) for _ in range(0, 2)] + cov_factor = [torch.randn(4, 3) for _ in range(0, 2)] cov_diag = [ - transform_to(constraints.positive)(torch.randn(4)) for _ in range(2) + transform_to(constraints.positive)(torch.randn(4)) for _ in range(0, 2) ] covariance_matrix = [ cov_factor[i].matmul(cov_factor[i].t()) + cov_diag[i].diag() - for i in range(2) + for i in range(0, 2) ] p = LowRankMultivariateNormal(loc[0], cov_factor[0], cov_diag[0]) q = LowRankMultivariateNormal(loc[1], cov_factor[1], cov_diag[1]) @@ -5861,10 +5861,10 @@ class TestKL(DistributionsTestCase): def test_kl_lowrank_multivariate_normal_batched(self): b = 7 # Number of batches - loc = [torch.randn(b, 3) for _ in range(2)] - cov_factor = [torch.randn(b, 3, 2) for _ in range(2)] + loc = [torch.randn(b, 3) for _ in range(0, 2)] + cov_factor = [torch.randn(b, 3, 2) for _ in 
range(0, 2)] cov_diag = [ - transform_to(constraints.positive)(torch.randn(b, 3)) for _ in range(2) + transform_to(constraints.positive)(torch.randn(b, 3)) for _ in range(0, 2) ] expected_kl = torch.stack( [ @@ -5876,7 +5876,7 @@ class TestKL(DistributionsTestCase): loc[1][i], cov_factor[1][i], cov_diag[1][i] ), ) - for i in range(b) + for i in range(0, b) ] ) actual_kl = kl_divergence( diff --git a/test/dynamo/test_export.py b/test/dynamo/test_export.py index f3f438d241af..112da727ec61 100644 --- a/test/dynamo/test_export.py +++ b/test/dynamo/test_export.py @@ -49,9 +49,9 @@ class ExportTests(torch._dynamo.test_case.TestCase): lc_key = state[0] lc_val = state[1] bar = [] - for _ in range(4): + for _ in range(0, 4): bar2 = [] - for _ in range(3): + for _ in range(0, 3): bar2.append( lc_key + lc_val + torch.tensor([0.1, 0.25, 0.4, 0.5, 0.1]) ) @@ -665,9 +665,9 @@ def forward(self, x, y): lc_key = state[0] lc_val = state[1] bar = [] - for _ in range(4): + for _ in range(0, 4): bar2 = [] - for _ in range(3): + for _ in range(0, 3): bar2.append( lc_key + lc_val + torch.tensor([0.1, 0.25, 0.4, 0.5, 0.1]) ) diff --git a/test/dynamo/test_functions.py b/test/dynamo/test_functions.py index 647033e63e4c..d16676cda8ee 100644 --- a/test/dynamo/test_functions.py +++ b/test/dynamo/test_functions.py @@ -3627,7 +3627,7 @@ class GraphModule(torch.nn.Module): ) test(range(10), slice(1, 10, 2), expected=range(1, 10, 2)) - test(range(10), slice(None, 10, None), expected=range(10)) + test(range(10), slice(None, 10, None), expected=range(0, 10)) test(range(10), slice(-1, 7, None), expected=range(9, 7)) test(range(10), slice(-1, 7, 2), expected=range(9, 7, 2)) test(range(1, 10, 2), slice(3, 7, 2), expected=range(7, 11, 4)) diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py index c251ce28bac4..7cac7eca7239 100644 --- a/test/dynamo/test_modules.py +++ b/test/dynamo/test_modules.py @@ -3047,7 +3047,7 @@ class OptimizedModuleTest(torch._dynamo.test_case.TestCase): def generate(x, c): return mod(x) + c - for _ in range(10): + for _ in range(0, 10): generate(torch.randn(10, 10), 0) generate(torch.randn(10, 10), 1) self.assertEqual(cnt.frame_count, 2) diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py index ac0515ac6ba8..362a541918c3 100644 --- a/test/dynamo/test_repros.py +++ b/test/dynamo/test_repros.py @@ -4471,7 +4471,7 @@ class ReproTests(torch._dynamo.test_case.TestCase): compiled_fn = torch.compile(func, backend=cnt, fullgraph=True) requires_grad = func is not func1 - for _ in range(5): + for _ in range(0, 5): # Inputs eager_a = torch.ones([6], requires_grad=requires_grad) compiled_a = torch.ones([6], requires_grad=requires_grad) @@ -4623,7 +4623,7 @@ class ReproTests(torch._dynamo.test_case.TestCase): x = torch.rand([2, 2]) self.assertEqual(opt_fn(x, counter), fn(x, counter)) self.assertEqual(counter[0], 2) - for _ in range(10): + for _ in range(0, 10): opt_fn(x, counter) self.assertEqual(counter[0], 12) if torch._dynamo.config.assume_static_by_default: @@ -4784,7 +4784,7 @@ class ReproTests(torch._dynamo.test_case.TestCase): def test_contains_range_constprop(self): def fn(x): # dynamo should const prop to False - if 3 in range(10): + if 3 in range(0, 10): return x + 1 else: return x + 2 diff --git a/test/functorch/test_ac.py b/test/functorch/test_ac.py index d0611f19cf2a..fde84b6683ed 100644 --- a/test/functorch/test_ac.py +++ b/test/functorch/test_ac.py @@ -106,7 +106,7 @@ class MemoryBudgetTest(TestCase): return f(x, ws) _, eager_flops = get_mem_and_flops(call) - for 
budget in range(11): + for budget in range(0, 11): mem, flops = get_mem_and_flops(call, memory_budget=budget / 10) if budget <= 5: # We start saving the matmuls @@ -251,7 +251,7 @@ class MemoryBudgetTest(TestCase): return f(x, ws) expected = call() - for budget in range(11): + for budget in range(0, 11): memory_budget = budget / 10 torch._dynamo.reset() with config.patch(activation_memory_budget=memory_budget): diff --git a/test/inductor/test_codecache.py b/test/inductor/test_codecache.py index ca2e9007109d..78c2dd3de852 100644 --- a/test/inductor/test_codecache.py +++ b/test/inductor/test_codecache.py @@ -1146,7 +1146,7 @@ class TestFxGraphCache(TestCase): raise unittest.SkipTest(f"requires {GPU_TYPE}") def fn1(x): - return x + torch.tensor(list(range(12)), device=device) + return x + torch.tensor(list(range(0, 12)), device=device) def fn2(x): return x + torch.tensor(list(range(1, 13)), device=device) diff --git a/test/inductor/test_compiled_autograd.py b/test/inductor/test_compiled_autograd.py index 716d3bfafee2..2612af01f6ff 100644 --- a/test/inductor/test_compiled_autograd.py +++ b/test/inductor/test_compiled_autograd.py @@ -1599,7 +1599,7 @@ main() eager_check() - for i in range(5): + for i in range(0, 5): with compiled_autograd._enable(compiler_fn): eager_check() diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py index 85405283e4bd..6645f17fb9ee 100644 --- a/test/inductor/test_max_autotune.py +++ b/test/inductor/test_max_autotune.py @@ -2095,7 +2095,7 @@ class TestMaxAutotune(TestCase): # Test loop. def test_func2(x): - for i in range(10): + for i in range(0, 10): x = torch.matmul(x, x) return x diff --git a/test/inductor/test_triton_kernels.py b/test/inductor/test_triton_kernels.py index 4739d00f1f4a..9a21220ce4d9 100644 --- a/test/inductor/test_triton_kernels.py +++ b/test/inductor/test_triton_kernels.py @@ -3005,7 +3005,7 @@ class MutationTests(torch._inductor.test_case.TestCase): mask = offsets < n_elements x = tl.load(in_ptr0 + offsets, mask=mask) y = tl.load(in_ptr1 + offsets, mask=mask) - for i in range(BLOCK_SIZE): + for i in range(0, BLOCK_SIZE): i = tl.multiple_of(i, 1) output = x + y tl.store(out_ptr + offsets, output, mask=mask) @@ -3160,7 +3160,7 @@ class MutationTests(torch._inductor.test_case.TestCase): x = tl.load(x_block_ptr) # Compute gating - for c2 in range(tl.cdiv(C2, BLOCK_SIZE_C2)): + for c2 in range(0, tl.cdiv(C2, BLOCK_SIZE_C2)): # Compute block pointers offs_c2 = c2 * BLOCK_SIZE_C2 + tl.arange(0, BLOCK_SIZE_C2) o_block_ptr = O_ptr + offs_m[:, None] * C2 + offs_c2[None, :] diff --git a/test/jit/xnnpack/test_xnnpack_delegate.py b/test/jit/xnnpack/test_xnnpack_delegate.py index f6c7832d5b28..b97765ed5bb0 100644 --- a/test/jit/xnnpack/test_xnnpack_delegate.py +++ b/test/jit/xnnpack/test_xnnpack_delegate.py @@ -32,7 +32,7 @@ class TestXNNPackBackend(unittest.TestCase): }, ) - for _ in range(20): + for _ in range(0, 20): sample_input = torch.randn(4, 4, 4) actual_output = scripted_module(sample_input) expected_output = lowered_module(sample_input) diff --git a/test/nn/test_convolution.py b/test/nn/test_convolution.py index 3c3b3f53e528..4cdcac707644 100644 --- a/test/nn/test_convolution.py +++ b/test/nn/test_convolution.py @@ -1292,7 +1292,7 @@ class TestConvolutionNN(NNTestCase): kernel_x = torch.zeros([3, 1, 1, radius * 2 + 1], device=image.device) image = torch.nn.functional.conv2d(image, kernel_x, groups=image.shape[-3]) - for i in range(128): + for i in range(0, 128): # This should not fail reproducer(radius=i) diff --git 
a/test/nn/test_embedding.py b/test/nn/test_embedding.py index f21184290fa1..fb9d842ce476 100644 --- a/test/nn/test_embedding.py +++ b/test/nn/test_embedding.py @@ -551,7 +551,7 @@ class TestEmbeddingNNDeviceType(NNTestCase): # Pull out the bag's indices from indices_1D, and fill any # remaining space with padding indices indices_in_bag = [] - for item_pos in range(max_indices_per_bag): + for item_pos in range(0, max_indices_per_bag): if (start + item_pos) < end: indices_in_bag.append(indices_1D[start + item_pos]) else: diff --git a/test/nn/test_multihead_attention.py b/test/nn/test_multihead_attention.py index 3dc6a586ced6..0c04e3b86b88 100644 --- a/test/nn/test_multihead_attention.py +++ b/test/nn/test_multihead_attention.py @@ -485,7 +485,7 @@ class TestMultiheadAttentionNN(NNTestCase): )[0] output_3d = output_3d.transpose(0, 1) # [N, T, D] - for i in range(batch_size): + for i in range(0, batch_size): output_2d = mta_model( query[i].unsqueeze(0).transpose(0, 1), key[i].unsqueeze(0).transpose(0, 1), diff --git a/test/nn/test_pooling.py b/test/nn/test_pooling.py index c3a7b829b2b1..d282a885f4ed 100644 --- a/test/nn/test_pooling.py +++ b/test/nn/test_pooling.py @@ -1135,7 +1135,7 @@ torch.cuda.synchronize() for size, kernel_size, stride, dilation, ceil_mode in itertools.product( sizes, kernel_sizes, strides, dilations, ceil_modes ): - padding = random.sample(range(math.floor(kernel_size / 2) + 1), 1) + padding = random.sample(range(0, math.floor(kernel_size / 2) + 1), 1) check( torch.randn(size, device=device, dtype=dtype), kernel_size, diff --git a/test/onnx/test_onnx_opset.py b/test/onnx/test_onnx_opset.py index 16ca93dbfe2c..75de1f3fab83 100644 --- a/test/onnx/test_onnx_opset.py +++ b/test/onnx/test_onnx_opset.py @@ -36,12 +36,12 @@ def check_onnx_opset_operator( # but the op's attributes can optionally be # specified as well assert len(ops) == len(graph.node) - for i in range(len(ops)): + for i in range(0, len(ops)): assert graph.node[i].op_type == ops[i]["op_name"] if "attributes" in ops[i]: attributes = ops[i]["attributes"] assert len(attributes) == len(graph.node[i].attribute) - for j in range(len(attributes)): + for j in range(0, len(attributes)): for attribute_field in attributes[j].keys(): assert attributes[j][attribute_field] == getattr( graph.node[i].attribute[j], attribute_field diff --git a/test/optim/test_lrscheduler.py b/test/optim/test_lrscheduler.py index 3e65720a45b6..cea85b07646f 100644 --- a/test/optim/test_lrscheduler.py +++ b/test/optim/test_lrscheduler.py @@ -1509,7 +1509,7 @@ class TestLRScheduler(TestCase): 14.0 / 3, 29.0 / 6, ] - deltas = [2 * i for i in range(2)] + deltas = [2 * i for i in range(0, 2)] base_lrs = [1 + delta for delta in deltas] max_lrs = [5 + delta for delta in deltas] lr_targets = [[x + delta for x in lr_base_target] for delta in deltas] diff --git a/test/profiler/test_profiler.py b/test/profiler/test_profiler.py index a9321da3fbd3..1461731a5998 100644 --- a/test/profiler/test_profiler.py +++ b/test/profiler/test_profiler.py @@ -1930,7 +1930,7 @@ assert KinetoStepTracker.current_step() == initial_step + 2 * niters event_list.table() def _check_all_gpu_present(self, gpu_dict, max_gpu_count): - for i in range(max_gpu_count): + for i in range(0, max_gpu_count): self.assertEqual(gpu_dict["GPU " + str(i)], 1) # Do json sanity testing. 
Checks that all events are between profiler start and end @@ -2139,8 +2139,8 @@ assert KinetoStepTracker.current_step() == initial_step + 2 * niters step_helper_funcs.append(event) self.assertEqual(len(prof_steps), 5) self.assertEqual(len(step_helper_funcs), 5) - for i in range(len(step_helper_funcs)): - for j in range(len(step_helper_funcs)): + for i in range(0, len(step_helper_funcs)): + for j in range(0, len(step_helper_funcs)): self.assertTrue( not self._partial_overlap(prof_steps[i], step_helper_funcs[j]) ) diff --git a/test/quantization/core/experimental/test_floatx.py b/test/quantization/core/experimental/test_floatx.py index c4cea4073a5c..ee7fe0a9d186 100644 --- a/test/quantization/core/experimental/test_floatx.py +++ b/test/quantization/core/experimental/test_floatx.py @@ -275,7 +275,7 @@ class TestFloat8Dtype(TestCase): IMO simpler to special case e8m0 here. """ - for biased_exponent in range(256): + for biased_exponent in range(0, 256): # iterate through all the possible options of guard, round, sticky bits # for the current exponent for grs in range(8): diff --git a/test/test_dataloader.py b/test/test_dataloader.py index b9000a2c68d3..da0c12082244 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -3494,7 +3494,7 @@ class TestIndividualWorkerQueue(TestCase): max_num_workers = 1 for batch_size in (8, 16, 32, 64): - for num_workers in range(min(6, max_num_workers)): + for num_workers in range(0, min(6, max_num_workers)): self._run_ind_worker_queue_test( batch_size=batch_size, num_workers=num_workers + 1 ) diff --git a/test/test_datapipe.py b/test/test_datapipe.py index 2790145665b1..e92fa2b0615d 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -520,7 +520,7 @@ class TestIterableDataPipeBasic(TestCase): self.assertEqual(list(range(9)), list(n)) # Functional Test: Uneven DataPipes - source_numbers = list(range(10)) + [10, 12] + source_numbers = list(range(0, 10)) + [10, 12] numbers_dp = dp.iter.IterableWrapper(source_numbers) n1, n2 = numbers_dp.demux(2, lambda x: x % 2) self.assertEqual([0, 2, 4, 6, 8, 10, 12], list(n1)) @@ -1257,7 +1257,7 @@ class TestFunctionalIterDataPipe(TestCase): ) output1, output2 = list(dp1), list(dp2) self.assertEqual(list(range(5, 10)), output1) - self.assertEqual(list(range(5)), output2) + self.assertEqual(list(range(0, 5)), output2) # Functional Test: values of the same classification are lumped together, and unlimited buffer with warnings.catch_warnings(record=True) as wa: @@ -1271,7 +1271,7 @@ class TestFunctionalIterDataPipe(TestCase): self.assertRegex(str(wa[-1].message), r"Unlimited buffer size is set") output1, output2 = list(dp1), list(dp2) self.assertEqual(list(range(5, 10)), output1) - self.assertEqual(list(range(5)), output2) + self.assertEqual(list(range(0, 5)), output2) # Functional Test: classifier returns a value outside of [0, num_instance - 1] dp0 = input_dp.demux(num_instances=1, classifier_fn=lambda x: x % 2) diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py index b8fa4ffbd421..fcc45521fbb1 100644 --- a/test/test_dynamic_shapes.py +++ b/test/test_dynamic_shapes.py @@ -1385,7 +1385,7 @@ class f(torch.nn.Module): self.assertEqual(x.storage_offset(), y.storage_offset()) def test_tensor_factory_with_symint(self): - args = list(range(3)) + args = list(range(0, 3)) expected = torch.tensor(args) shape_env = ShapeEnv() diff --git a/test/test_indexing.py b/test/test_indexing.py index 99d84a65abca..fa91b5903410 100644 --- a/test/test_indexing.py +++ b/test/test_indexing.py @@ -902,7 
+902,7 @@ class TestIndexing(TestCase): # Set window size W = 10 # Generate a list of lists, containing overlapping window indices - indices = [range(i, i + W) for i in range(N - W)] + indices = [range(i, i + W) for i in range(0, N - W)] for i in [len(indices), 100, 32]: windowed_data = t[indices[:i]] diff --git a/test/test_jit.py b/test/test_jit.py index 613903e9a116..6a3c968f86dd 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -3153,7 +3153,7 @@ class TestScript(JitTestCase): eplan = get_execution_plan(dstate) num_bailouts = eplan.code.num_bailouts() - for i in range(num_bailouts): + for i in range(0, num_bailouts): eplan.code.request_bailout(i) self.assertEqual(jitted(x), expected) @@ -5950,7 +5950,7 @@ a") # type: (int) -> int prev = 1 v = 1 - for i in range(x): + for i in range(0, x): save = v v = v + prev prev = save @@ -10938,7 +10938,7 @@ dedent """ # Test symbolic differentiation # Run Forward and Backward thrice to trigger autodiff graph - for i in range(3): + for i in range(0, 3): y = jit_module(x) y.backward(grad) x.grad.zero_() @@ -11802,7 +11802,7 @@ dedent """ def fn_zip_enumerate(x, y): # type: (List[int], List[int]) -> int sum = 0 - for (i, (j, v), k) in zip(x, enumerate(y), range(100)): + for (i, (j, v), k) in zip(x, enumerate(y), range(0, 100)): sum += i * j * v * k return sum diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index dba28f98cbf9..1bda41f7f8f1 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -243,7 +243,7 @@ class TestTEFuser(JitTestCase): return x2.sum() with texpr_reductions_enabled(): - a = torch.tensor(list(range(15)), dtype=torch.float, device="cpu") + a = torch.tensor(list(range(0, 15)), dtype=torch.float, device="cpu") a = a.reshape(5, 3) scripted = self.checkScript(func, (a,)) self.assertLastGraphAllFused() @@ -259,7 +259,7 @@ class TestTEFuser(JitTestCase): return x.sum((-2,)) * 2 with texpr_reductions_enabled(): - a = torch.tensor(list(range(15)), dtype=torch.float, device="cpu") + a = torch.tensor(list(range(0, 15)), dtype=torch.float, device="cpu") a = a.reshape(5, 3) scripted = self.checkScript(func, (a,)) self.assertLastGraphAllFused() @@ -271,7 +271,7 @@ class TestTEFuser(JitTestCase): return x.sum((0,), keepdim=True, dtype=torch.double) * 2 with texpr_reductions_enabled(): - a = torch.tensor(list(range(15)), dtype=torch.float, device="cpu") + a = torch.tensor(list(range(0, 15)), dtype=torch.float, device="cpu") a = a.reshape(5, 3) self.checkScript(func, (a,)) @@ -2234,7 +2234,7 @@ class TestTEFuser(JitTestCase): indices = [0, 1, 2, 3] sets = [] - for i in range(len(indices) + 1): + for i in range(0, len(indices) + 1): for subset in combinations(indices, i): sets.append(subset) # noqa: PERF402 diff --git a/test/test_matmul_cuda.py b/test/test_matmul_cuda.py index bf46ee0709fc..61f5642830dd 100644 --- a/test/test_matmul_cuda.py +++ b/test/test_matmul_cuda.py @@ -231,7 +231,7 @@ class TestMatmulCuda(InductorTestCase): def test_cublas_addmm_alignment(self, dtype): device = 'cuda' # perturb X, A, or B alignment - for idx in range(3): + for idx in range(0, 3): for offset in range(1, 3): offsets = [0, 0, 0] offsets[idx] = offset diff --git a/test/test_mps.py b/test/test_mps.py index e825fa77aa89..7346d1d26d44 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -1900,7 +1900,7 @@ class TestMPS(TestCaseMPS): res_cpu = torch.linalg.vector_norm(B_cpu, ord=3.5) self.assertEqual(res_mps, res_cpu) - for dim in range(B_mps.dim()): + for dim in range(0, B_mps.dim()): res_mps = 
torch.linalg.vector_norm(B_mps, ord=3.5, dim=dim) res_cpu = torch.linalg.vector_norm(B_cpu, ord=3.5, dim=dim) self.assertEqual(res_mps, res_cpu) @@ -2871,8 +2871,8 @@ class TestMPS(TestCaseMPS): def test_contiguous_slice_2d(self): def helper(shape): - for i in range(shape[0]): - for j in range(shape[1]): + for i in range(0, shape[0]): + for j in range(0, shape[1]): t_mps = torch.randn(shape, device="mps") t_cpu = t_mps.detach().clone().cpu() @@ -3432,12 +3432,12 @@ class TestMPS(TestCaseMPS): elems = torch.arange(n_tensors * n_tensor_elems, dtype=torch.float32) tensor_list = [] - for i in range(n_tensors - 1): + for i in range(0, n_tensors - 1): # create a list of contiguous view tensors (view tensor created by the slice op) t = elems[n_tensor_elems * i : n_tensor_elems * (i + 1)] tensor_list.append(t) - for i in range(n_tensors - 1): + for i in range(0, n_tensors - 1): t = tensor_list[i].view(1, n_tensor_elems) t_mps = t.to("mps") self.assertEqual(t, t_mps.cpu(), f"i={i}") @@ -4942,7 +4942,7 @@ class TestMPS(TestCaseMPS): x_mps = fn(torch.zeros(shape, device="mps"), dim=dim) self.assertEqual(x_cpu, x_mps.cpu()) for fn in [torch.any, torch.all]: - for dim in range(4): + for dim in range(0, 4): helper(fn, dim) # 6D tensor reductions @@ -9750,7 +9750,7 @@ class TestGatherScatter(TestCaseMPS): self.assertEqual(x_cpu, x_mps) def test_cast_gather_scatter(self): - for _ in range(50): + for _ in range(0, 50): input = np.random.randint(0, 255, size=(5, 5, 4), dtype=np.uint8) with torch.no_grad(): s = torch.tensor(input, dtype=torch.uint8, device="mps").unsqueeze(0) diff --git a/test/test_numa_binding.py b/test/test_numa_binding.py index c599587e281d..764156ff9b98 100644 --- a/test/test_numa_binding.py +++ b/test/test_numa_binding.py @@ -549,7 +549,7 @@ class NumaBindingTest(TestCase): bound_logical_cpu_indices_0, # Gets an extra physical core due to odd number of physical cores on numa node # 3 physical cores total, 2 GPUs: GPU 0 gets 2 physical cores (CPUs 0-3) - set(range(4)), + set(range(0, 4)), ) bound_logical_cpu_indices_1 = ( @@ -677,7 +677,7 @@ class NumaBindingTest(TestCase): # 1 numa node, 2 L3 caches, 1 physical core per L3 cache = 2 logical CPUs per cache # L3 cache 0: CPUs 0-1, L3 cache 1: CPUs 2-3 # Both have same number of CPUs, so prefer lower cache key (0) - set(range(2)), + set(range(0, 2)), ) def test_binds_to_node_0_if_node_stored_as_minus_one(self) -> None: @@ -709,7 +709,7 @@ class NumaBindingTest(TestCase): # GPU 0 has numa node stored as -1, which is treated as numa node 0 # Each numa node has 1 * 1 * 2 = 2 logical CPUs # Numa node 0 has CPUs 0-1 - set(range(2)), + set(range(0, 2)), ) def test_callable_entrypoint_basic(self) -> None: diff --git a/test/test_reductions.py b/test/test_reductions.py index 4a3235fbc50c..e4fa54491dd0 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -1710,7 +1710,7 @@ class TestReductions(TestCase): with_extremal=False, atol=None, rtol=None, exact_dtype=True, with_keepdim=False): # Test 0-d to 3-d tensors. 
- for ndims in range(4): + for ndims in range(0, 4): shape = _rand_shape(ndims, min_size=5, max_size=10) for n in range(ndims + 1): for c in combinations(list(range(ndims)), n): @@ -2623,7 +2623,7 @@ class TestReductions(TestCase): # Generate some random test cases ops = ['quantile', 'nanquantile'] inputs = [tuple(np.random.randint(2, 10, size=i)) for i in range(1, 4)] - quantiles = [tuple(np.random.rand(i)) for i in range(5)] + quantiles = [tuple(np.random.rand(i)) for i in range(0, 5)] keepdims = [True, False] # Add corner cases diff --git a/test/test_serialization.py b/test/test_serialization.py index a6e3ef23580d..7c4208b6a0d6 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -295,7 +295,7 @@ class SerializationMixin: 5, 6 ] - for i in range(100): + for i in range(0, 100): data.append(0) t = torch.tensor(data, dtype=torch.uint8) diff --git a/test/test_sparse.py b/test/test_sparse.py index 196506a8e13d..866f38a316d7 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -5300,7 +5300,7 @@ class TestSparseAny(TestCase): x_dense = torch.eye(dense_dim, dtype=dtype, device=device) for sparse_dim_in in range(1, dense_dim): x_sparse = x_dense.to_sparse(sparse_dim_in) - for sparse_dim_out in range(dense_dim): + for sparse_dim_out in range(0, dense_dim): if sparse_dim_out == sparse_dim_in: self.assertTrue(x_sparse.to_sparse(sparse_dim_out).sparse_dim() == sparse_dim_out) else: diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py index 45748c683621..65e800f6eba1 100644 --- a/test/test_sparse_csr.py +++ b/test/test_sparse_csr.py @@ -135,7 +135,7 @@ class TestSparseCSRSampler(TestCase): index_dtype = torch.int32 for n_rows in range(1, 10): for n_cols in range(1, 10): - for nnz in range(n_rows * n_cols + 1): + for nnz in range(0, n_rows * n_cols + 1): crow_indices = self._make_crow_indices( n_rows, n_cols, nnz, device=device, dtype=index_dtype) diff --git a/test/test_static_runtime.py b/test/test_static_runtime.py index df1e0c3e34fa..893aea8e3130 100644 --- a/test/test_static_runtime.py +++ b/test/test_static_runtime.py @@ -60,7 +60,7 @@ class MultiHeadAttentionLayer(nn.Module): # Taken from https://github.com/facebookresearch/dlrm/blob/master/dlrm_s_pytorch.py def create_mlp(ln, sigmoid_layer): layers = nn.ModuleList() - for i in range(len(ln) - 1): + for i in range(0, len(ln) - 1): n = ln[i] m = ln[i + 1] diff --git a/test/test_tensorboard.py b/test/test_tensorboard.py index 8ff6913887c8..cd527db88441 100644 --- a/test/test_tensorboard.py +++ b/test/test_tensorboard.py @@ -200,7 +200,7 @@ class TestTensorBoardPyTorchNumpy(BaseTestCase): bucket_counts=counts.tolist(), ) - ints = torch.tensor(range(100)).float() + ints = torch.tensor(range(0, 100)).float() nbins = 100 counts = torch.histc(ints, bins=nbins, min=0, max=99) limits = torch.tensor(range(nbins)) diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py index 57be409ab6b4..17d3a58535d6 100644 --- a/test/test_tensorexpr.py +++ b/test/test_tensorexpr.py @@ -1216,7 +1216,7 @@ class TestTensorExprFuser(BaseTestClass): @torch.jit.script def test(x: torch.Tensor, y: torch.Tensor, z: int) -> torch.Tensor: b = y - for i in range(z): + for i in range(0, z): a = x + y b = b + y return b diff --git a/test/test_torch.py b/test/test_torch.py index 9b28b801348a..05ea6ea61db1 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -8424,7 +8424,7 @@ tensor([[[1.+1.j, 1.+1.j, 1.+1.j, ..., 1.+1.j, 1.+1.j, 1.+1.j], def test_Size_iter(self): for sizes in [iter([1, 2, 3, 4, 5]), range(1, 6)]: x = 
torch.Size(sizes) - for i in range(5): + for i in range(0, 5): self.assertEqual(x[i], i + 1) def test_t_not_2d_error(self): diff --git a/test/test_view_ops.py b/test/test_view_ops.py index 174632b07988..5bec225787cc 100644 --- a/test/test_view_ops.py +++ b/test/test_view_ops.py @@ -1559,7 +1559,7 @@ class TestOldViewOps(TestCase): self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) def _test_atleast_dim(self, torch_fn, np_fn, device, dtype): - for ndims in range(5): + for ndims in range(0, 5): shape = _rand_shape(ndims, min_size=5, max_size=10) for _ in range(ndims + 1): for with_extremal in [False, True]: diff --git a/test/test_xnnpack_integration.py b/test/test_xnnpack_integration.py index 62e257790fd4..481bd3c76a50 100644 --- a/test/test_xnnpack_integration.py +++ b/test/test_xnnpack_integration.py @@ -1316,7 +1316,7 @@ class TestXNNPACKConv1dTransformPass(TestCase): groups_list = range(1, 3) kernel_list = range(1, 4) stride_list = range(1, 3) - padding_list = range(3) + padding_list = range(0, 3) dilation_list = range(1, 3) for hparams in itertools.product( @@ -1401,7 +1401,7 @@ class TestXNNPACKConv1dTransformPass(TestCase): groups_list = range(1, 3) kernel_list = range(1, 4) stride_list = range(1, 3) - padding_list = range(3) + padding_list = range(0, 3) dilation_list = range(1, 3) output_features_list = range(1, 3) diff --git a/torch/_decomp/decompositions_for_jvp.py b/torch/_decomp/decompositions_for_jvp.py index fb4a4d85faa2..e11540e0c2ba 100644 --- a/torch/_decomp/decompositions_for_jvp.py +++ b/torch/_decomp/decompositions_for_jvp.py @@ -147,7 +147,7 @@ def native_layer_norm_backward( inner_dims = input_shape[axis:] outer_dims = input_shape[:axis] inner_dim_indices = list(range(axis, input_ndim)) - outer_dim_indices = list(range(axis)) + outer_dim_indices = list(range(0, axis)) N = 1 for i in inner_dims: diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py index 451776ef25fd..036f1ba7d01a 100644 --- a/torch/_dynamo/eval_frame.py +++ b/torch/_dynamo/eval_frame.py @@ -1248,7 +1248,7 @@ def argument_names( # signature. Assign names as {varargs}_0, {varargs}_1, ... assert fullargspec.varargs is not None, "More arguments than expected" input_strs += [ - f"{fullargspec.varargs}_{i}" for i in range(len(args) - len(input_strs)) + f"{fullargspec.varargs}_{i}" for i in range(0, len(args) - len(input_strs)) ] elif len(args) < len(fullargspec.args): # 3. 
If there are fewer arguments in `args` than `fullargspec.args`, @@ -1538,7 +1538,7 @@ class FlattenInputOutputSignature(torch.fx.Transformer): } self.new_args = [] - for i in range(len(flat_args)): + for i in range(0, len(flat_args)): arg = super().placeholder(f"arg{i}", (), {}) if i in matched_input_elements_to_fake: arg.node.meta["val"] = matched_input_elements_to_fake[i] diff --git a/torch/_inductor/dependencies.py b/torch/_inductor/dependencies.py index b431972521da..0547b6b1db90 100644 --- a/torch/_inductor/dependencies.py +++ b/torch/_inductor/dependencies.py @@ -151,7 +151,7 @@ class MemoryDep(Dep): stride_to_index = {s: i for i, s in enumerate(self_strides)} order = [stride_to_index[s] for s in other_strides] - assert OrderedSet(order) == OrderedSet(range(self.num_vars)) + assert OrderedSet(order) == OrderedSet(range(0, self.num_vars)) return order def get_offset(self) -> sympy.Expr: diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py index 1ad443ff387e..e89be2299434 100644 --- a/torch/_meta_registrations.py +++ b/torch/_meta_registrations.py @@ -1787,7 +1787,7 @@ def _padding_check_valid_input(input, padding, *, dim): for d in range(1, input_dim): valid_batch_mode = valid_batch_mode and input.size(d) != 0 else: - for d in range(input_dim): + for d in range(0, input_dim): valid_non_batch_mode = valid_non_batch_mode and input.size(d) != 0 # allow empty batch size but not other dimensions. diff --git a/torch/_numpy/_funcs_impl.py b/torch/_numpy/_funcs_impl.py index f57e7fb001fb..4ab3b29d34b8 100644 --- a/torch/_numpy/_funcs_impl.py +++ b/torch/_numpy/_funcs_impl.py @@ -1449,7 +1449,7 @@ def rollaxis(a: ArrayLike, axis, start=0): # numpy returns a view, here we try returning the tensor itself # return tensor[...] return a - axes = list(range(n)) + axes = list(range(0, n)) axes.remove(axis) axes.insert(start, axis) return a.view(axes) diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py index 822f949d536f..13d6efd4ac67 100644 --- a/torch/_refs/__init__.py +++ b/torch/_refs/__init__.py @@ -4738,7 +4738,7 @@ def transpose(a: TensorLikeType, dim0: int, dim1: int) -> TensorLikeType: if a.ndim <= 1 or dim0 == dim1: return aten.alias.default(a) - _permutation = list(range(a.ndim)) + _permutation = list(range(0, a.ndim)) _permutation[_dim0] = _dim1 _permutation[_dim1] = _dim0 return torch.permute(a, _permutation) diff --git a/torch/_tensor_str.py b/torch/_tensor_str.py index 86a745f09b44..af4deb471db2 100644 --- a/torch/_tensor_str.py +++ b/torch/_tensor_str.py @@ -307,7 +307,7 @@ def _tensor_str_with_formatter(self, indent, summarize, formatter1, formatter2=N _tensor_str_with_formatter( self[i], indent + 1, summarize, formatter1, formatter2 ) - for i in range(PRINT_OPTS.edgeitems) + for i in range(0, PRINT_OPTS.edgeitems) ] + ["..."] + [ @@ -322,7 +322,7 @@ def _tensor_str_with_formatter(self, indent, summarize, formatter1, formatter2=N _tensor_str_with_formatter( self[i], indent + 1, summarize, formatter1, formatter2 ) - for i in range(self.size(0)) + for i in range(0, self.size(0)) ] tensor_str = ("," + "\n" * (dim - 1) + " " * (indent + 1)).join(slices) @@ -406,7 +406,7 @@ def get_summarized_data(self): if not PRINT_OPTS.edgeitems: return self.new_empty([0] * self.dim()) elif self.size(0) > 2 * PRINT_OPTS.edgeitems: - start = [self[i] for i in range(PRINT_OPTS.edgeitems)] + start = [self[i] for i in range(0, PRINT_OPTS.edgeitems)] end = [self[i] for i in range(len(self) - PRINT_OPTS.edgeitems, len(self))] return torch.stack([get_summarized_data(x) for x in 
(start + end)]) else: diff --git a/torch/ao/ns/fx/pattern_utils.py b/torch/ao/ns/fx/pattern_utils.py index 8339ce8f57c1..242d1740d91b 100644 --- a/torch/ao/ns/fx/pattern_utils.py +++ b/torch/ao/ns/fx/pattern_utils.py @@ -28,7 +28,7 @@ def get_type_a_related_to_b( for s in base_name_to_sets_of_related_ops.values(): s_list = list(s) # add every bidirectional pair - for idx_0 in range(len(s_list)): + for idx_0 in range(0, len(s_list)): for idx_1 in range(idx_0, len(s_list)): type_a_related_to_b.add((s_list[idx_0], s_list[idx_1])) type_a_related_to_b.add((s_list[idx_1], s_list[idx_0])) diff --git a/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py b/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py index 4330b0e24253..ef6a35686c7d 100644 --- a/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py +++ b/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py @@ -158,9 +158,9 @@ class ActivationSparsifier: # data should be a list [aggregated over each feature only] if data is None: out_data = [ - 0 for _ in range(len(features)) + 0 for _ in range(0, len(features)) ] # create one in case of 1st forward - self.state[name]["mask"] = [0 for _ in range(len(features))] + self.state[name]["mask"] = [0 for _ in range(0, len(features))] else: out_data = data # a list @@ -336,7 +336,7 @@ class ActivationSparsifier: return input_data * mask else: # apply per feature, feature_dim - for feature_idx in range(len(features)): + for feature_idx in range(0, len(features)): feature = ( torch.Tensor([features[feature_idx]]) .long() diff --git a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_disk_savings.py b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_disk_savings.py index 0e25f59cea64..8192b617139b 100644 --- a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_disk_savings.py +++ b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_disk_savings.py @@ -99,7 +99,7 @@ def sparsify_model(path_to_model, sparsified_model_dump_path): sparse_block_shapes (List of tuples) List of sparse block shapes to be sparsified on """ - sparsity_levels = [sl / 10 for sl in range(10)] + sparsity_levels = [sl / 10 for sl in range(0, 10)] sparsity_levels += [0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0] norms = ["L1", "L2"] diff --git a/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py b/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py index 5a36e13c7b46..442639be9b21 100644 --- a/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py +++ b/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py @@ -299,7 +299,7 @@ class TestTrainingAwareCallback(TestCase): self._check_on_train_start(pl_module, callback, sparsifier_args, scheduler_args) num_epochs = 5 - for _ in range(num_epochs): + for _ in range(0, num_epochs): self._check_on_train_epoch_start(pl_module, callback) self._simulate_update_param_model(pl_module) self._check_on_train_epoch_end(pl_module, callback) diff --git a/torch/ao/pruning/sparsifier/nearly_diagonal_sparsifier.py b/torch/ao/pruning/sparsifier/nearly_diagonal_sparsifier.py index 26fb3a98b8fb..a4d42ea80328 100644 --- a/torch/ao/pruning/sparsifier/nearly_diagonal_sparsifier.py +++ b/torch/ao/pruning/sparsifier/nearly_diagonal_sparsifier.py @@ -53,7 +53,7 @@ class NearlyDiagonalSparsifier(base_sparsifier.BaseSparsifier): 
"nearliness cannot be larger than the dimensions of tensor." ) - for row in range(height): + for row in range(0, height): # Bounds of entries that needs to be set to 1 low = max(0, row - dist_to_diagonal) high = min(width, row + dist_to_diagonal + 1) diff --git a/torch/ao/quantization/experimental/observer.py b/torch/ao/quantization/experimental/observer.py index e61fcb67c94a..7d9432ab27ec 100644 --- a/torch/ao/quantization/experimental/observer.py +++ b/torch/ao/quantization/experimental/observer.py @@ -68,10 +68,10 @@ class APoTObserver(ObserverBase): p_all = [] # create levels - for i in range(self.n): + for i in range(0, self.n): p_curr = torch.tensor([0]) - for j in range((2**self.k - 2) + 1): + for j in range(0, (2**self.k - 2) + 1): curr_ele = 2 ** (-(i + j * self.n)) p_append = torch.tensor([curr_ele]) p_curr = torch.cat((p_curr, p_append)) diff --git a/torch/ao/quantization/fx/_decomposed.py b/torch/ao/quantization/fx/_decomposed.py index b145cbfaeeba..160e9aa3afef 100644 --- a/torch/ao/quantization/fx/_decomposed.py +++ b/torch/ao/quantization/fx/_decomposed.py @@ -1159,7 +1159,7 @@ class FakeQuantPerChannel(torch.autograd.Function): f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}" ) assert axis < input.dim(), f"Expecting axis to be < {input.dim()}" - broadcast_dims = list(range(axis)) + list(range(axis + 1, input.ndim)) + broadcast_dims = list(range(0, axis)) + list(range(axis + 1, input.ndim)) unsqueeze_scales = _unsqueeze_multiple(scales, broadcast_dims) unsqueeze_zero_points = _unsqueeze_multiple(zero_points, broadcast_dims) temp = torch.round(input * (1.0 / unsqueeze_scales)) + unsqueeze_zero_points diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index cdab6259d85b..322d39f72202 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -1212,7 +1212,7 @@ class KinetoStepTracker: "Profiler step count has increased more than 1 - " f"current_step = {cls._current_step} step dict = {cls._step_dict}" ) - for _ in range(delta): + for _ in range(0, delta): _kineto_step() cls._current_step = new_step return cls._current_step diff --git a/torch/distributed/_pycute/layout.py b/torch/distributed/_pycute/layout.py index 04ae5d1fa5fd..be25cad2e953 100644 --- a/torch/distributed/_pycute/layout.py +++ b/torch/distributed/_pycute/layout.py @@ -162,7 +162,7 @@ def coalesce(layout: Layout, profile: LayoutProfile = None) -> Layout: assert len(layout) >= len(profile) return make_layout( chain( - (coalesce(layout[i], profile[i]) for i in range(len(profile))), # type: ignore[arg-type] + (coalesce(layout[i], profile[i]) for i in range(0, len(profile))), # type: ignore[arg-type] (layout[i] for i in range(len(profile), len(layout))), ) ) @@ -203,7 +203,7 @@ def filter(layout: Layout, profile: LayoutProfile = None) -> Layout: assert len(layout) >= len(profile) return make_layout( chain( - (filter(layout[i], profile[i]) for i in range(len(profile))), # type: ignore[arg-type] + (filter(layout[i], profile[i]) for i in range(0, len(profile))), # type: ignore[arg-type] (layout[i] for i in range(len(profile), len(layout))), ) ) @@ -233,7 +233,7 @@ def composition(layoutA: Layout, layoutB: LayoutInput) -> Layout: assert len(layoutA) >= len(layoutB) return make_layout( chain( - (composition(layoutA[i], layoutB[i]) for i in range(len(layoutB))), # type: ignore[arg-type] + (composition(layoutA[i], layoutB[i]) for i in range(0, len(layoutB))), # type: ignore[arg-type] (layoutA[i] for i in range(len(layoutB), len(layoutA))), ) ) @@ -371,7 +371,7 
@@ def logical_divide(layoutA: Layout, layoutB: LayoutInput) -> Layout: chain( ( logical_divide(layoutA[i], layoutB[i]) # type: ignore[arg-type] - for i in range(len(layoutB)) + for i in range(0, len(layoutB)) ), (layoutA[i] for i in range(len(layoutB), len(layoutA))), ) @@ -396,7 +396,7 @@ def logical_product(layoutA: Layout, layoutB: LayoutInput) -> Layout: chain( ( logical_product(layoutA[i], layoutB[i]) # type: ignore[arg-type] - for i in range(len(layoutB)) + for i in range(0, len(layoutB)) ), (layoutA[i] for i in range(len(layoutB), len(layoutA))), ) @@ -421,14 +421,14 @@ def hier_unzip( # A layout with shape ((A,a),(B,b),(C,c)) split = make_layout( hier_unzip(splitter, layoutA[i], layoutB[i]) # type: ignore[arg-type] - for i in range(len(layoutB)) + for i in range(0, len(layoutB)) ) # Gather to shape ((A,B,C,...),(a,b,c,...,y,z)) return make_layout( - make_layout(split[i][0] for i in range(len(layoutB))), # type: ignore[arg-type] + make_layout(split[i][0] for i in range(0, len(layoutB))), # type: ignore[arg-type] make_layout( chain( # type: ignore[arg-type] - (split[i][1] for i in range(len(layoutB))), + (split[i][1] for i in range(0, len(layoutB))), (layoutA[i] for i in range(len(layoutB), len(layoutA))), ) ), diff --git a/torch/distributed/_symmetric_memory/__init__.py b/torch/distributed/_symmetric_memory/__init__.py index 132a40977f85..1c576e886fe1 100644 --- a/torch/distributed/_symmetric_memory/__init__.py +++ b/torch/distributed/_symmetric_memory/__init__.py @@ -1671,7 +1671,7 @@ def _low_contention_all_gather( local_buf.copy_(tensor) # pull symm_mem.barrier() - for step in range(world_size): + for step in range(0, world_size): remote_rank = (rank - step) % world_size src_buf = symm_mem.get_buffer(remote_rank, tensor.shape, tensor.dtype) chunks[remote_rank].copy_(src_buf) @@ -1706,7 +1706,7 @@ def _low_contention_reduce_scatter_with_symm_mem_input( with _get_backend_stream(): # pull + offline reduction symm_mem.barrier() - for step in range(world_size): + for step in range(0, world_size): remote_rank = (rank - step) % world_size src_buf = symm_mem.get_buffer( remote_rank, @@ -1743,7 +1743,7 @@ def _low_contention_reduce_scatter_with_workspace( with _get_backend_stream(): # push + offline reduction workspace.barrier() - for step in range(world_size): + for step in range(0, world_size): remote_rank = (rank - step) % world_size dst_buf = workspace.get_buffer( remote_rank, chunks[0].shape, chunks[0].dtype, chunks[0].numel() * rank diff --git a/torch/distributed/elastic/multiprocessing/api.py b/torch/distributed/elastic/multiprocessing/api.py index 9bb580c5bf78..d91974548221 100644 --- a/torch/distributed/elastic/multiprocessing/api.py +++ b/torch/distributed/elastic/multiprocessing/api.py @@ -727,7 +727,7 @@ class MultiprocessContext(PContext): # pipe. 
Hence to prevent deadlocks on large return values, # we opportunistically try queue.get on each join call # See: https://docs.python.org/2/library/multiprocessing.html#all-platforms - for local_rank in range(self.nprocs): + for local_rank in range(0, self.nprocs): return_queue = self._ret_vals[local_rank] if not return_queue.empty(): # save the return values temporarily into a member var diff --git a/torch/distributed/elastic/timer/local_timer.py b/torch/distributed/elastic/timer/local_timer.py index 5e66ef3fae34..d55cc6ac6e37 100644 --- a/torch/distributed/elastic/timer/local_timer.py +++ b/torch/distributed/elastic/timer/local_timer.py @@ -59,7 +59,7 @@ class MultiprocessingRequestQueue(RequestQueue): def get(self, size, timeout: float) -> list[TimerRequest]: requests = [] wait = timeout - for _ in range(size): + for _ in range(0, size): start = time.time() try: diff --git a/torch/distributed/tensor/_dtensor_spec.py b/torch/distributed/tensor/_dtensor_spec.py index 42cb7fcd7c33..e12f41c4858b 100644 --- a/torch/distributed/tensor/_dtensor_spec.py +++ b/torch/distributed/tensor/_dtensor_spec.py @@ -107,7 +107,7 @@ class DTensorSpec: # follow default left-to-right device order if shard_order is not specified tensor_dim_to_mesh_dims: defaultdict[int, list[int]] = defaultdict(list) mesh_ndim = len(placements) - for mesh_dim in range(mesh_ndim): + for mesh_dim in range(0, mesh_ndim): # shard_order doesn't work with _StridedShard if isinstance(placements[mesh_dim], _StridedShard): return () diff --git a/torch/distributed/tensor/parallel/fsdp.py b/torch/distributed/tensor/parallel/fsdp.py index f5367397cc80..6cffbdb83d2f 100644 --- a/torch/distributed/tensor/parallel/fsdp.py +++ b/torch/distributed/tensor/parallel/fsdp.py @@ -306,7 +306,7 @@ def _all_gather_dtensor( placements = list(copy.deepcopy(tensor.placements)) # FSDP + TP: [Shard(0), tp_placement] -> [Replicate(), tp_placement] # HSDP + TP: [Replicate(), Shard(0), tp_placement] -> [Replicate(), Replicate(), tp_placement] - for i in range(len(placements) - 1): + for i in range(0, len(placements) - 1): placements[i] = Replicate() tensor = tensor.redistribute( device_mesh=tensor.device_mesh, diff --git a/torch/nested/_internal/ops.py b/torch/nested/_internal/ops.py index bdca74c13b1d..f52bfab2a8b3 100644 --- a/torch/nested/_internal/ops.py +++ b/torch/nested/_internal/ops.py @@ -1112,7 +1112,7 @@ def chunk_default(func, *args, **kwargs): # the input number; it can be counter-intuitive, but it matches dense behavior. 
return [ NestedTensor(values=chunk_values[i], **(nested_kwargs[i])) - for i in range(len(chunk_values)) + for i in range(0, len(chunk_values)) ] else: return [ diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py b/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py index 3f92f6418c89..bcd36a6ac41b 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py @@ -1005,7 +1005,7 @@ def _interpolate_size_to_scales(g: jit_utils.GraphContext, input, output_size, d if i < 2 else float(output_size[-(dim - i)]) / float(input.type().sizes()[-(dim - i)]) - for i in range(dim) + for i in range(0, dim) ] scales = g.op( "Constant", value_t=torch.tensor(scales_constant, dtype=torch.float32) diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_opset12.py b/torch/onnx/_internal/torchscript_exporter/symbolic_opset12.py index d4b887560f9b..822e14556768 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_opset12.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_opset12.py @@ -331,7 +331,7 @@ def unfold(g: jit_utils.GraphContext, input, dimension, size, step): ndim = symbolic_helper._get_tensor_rank(input) assert ndim is not None - perm = list(range(ndim)) + perm = list(range(0, ndim)) perm.append(perm.pop(dimension)) unsqueeze_list = [] diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py b/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py index 8ba8e6ee6622..bde072608088 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py @@ -116,7 +116,7 @@ def _interpolate(name, dim, interpolate_mode): if i < 2 else float(output_size[-(dim - i)]) / float(input.type().sizes()[-(dim - i)]) - for i in range(dim) + for i in range(0, dim) ] return g.op("Upsample", input, mode_s=interpolate_mode, scales_f=scales) diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py b/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py index 16e94b91f89f..9b7aba64ef31 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py @@ -840,7 +840,7 @@ def t(g: jit_utils.GraphContext, self): def numpy_T(g: jit_utils.GraphContext, input): ndim = symbolic_helper._get_tensor_rank(input) assert ndim is not None - perm = list(reversed(range(ndim))) + perm = list(reversed(range(0, ndim))) return g.op("Transpose", input, perm_i=perm) @@ -990,7 +990,7 @@ def transpose(g: jit_utils.GraphContext, self, dim0, dim1): @_onnx_symbolic("aten::permute") @symbolic_helper.parse_args("v", "is") def permute(g: jit_utils.GraphContext, self, dims): - if dims == list(range(len(dims))): + if dims == list(range(0, len(dims))): return self return g.op("Transpose", self, perm_i=dims) @@ -1368,7 +1368,7 @@ def get_pool_ceil_padding(input, kernel_size, stride, padding): ) ceiled_output_dim = [ math.ceil((dim[i] + 2 * padding[i] - kernel_size[i]) / float(stride[i])) + 1 - for i in range(len(padding)) + for i in range(0, len(padding)) ] # ensure last pooling starts inside ceiled_output_dim = [ @@ -1377,7 +1377,7 @@ def get_pool_ceil_padding(input, kernel_size, stride, padding): if (((ceiled_output_dim[i] - 1) * stride[i]) >= (dim[i] + padding[i])) else ceiled_output_dim[i] ) - for i in range(len(ceiled_output_dim)) + for i in range(0, len(ceiled_output_dim)) ] padding_ceil = [ ( @@ -1392,7 +1392,7 @@ def 
get_pool_ceil_padding(input, kernel_size, stride, padding): ) ) ) - for i in range(len(padding)) + for i in range(0, len(padding)) ] # ensure padding is not > kernel_size padding_ceil = [ @@ -1405,7 +1405,7 @@ def get_pool_ceil_padding(input, kernel_size, stride, padding): if ((padding_ceil[i] + 2 * padding[i]) >= (kernel_size[i])) else int(padding_ceil[i]) ) - for i in range(len(padding_ceil)) + for i in range(0, len(padding_ceil)) ] return padding_ceil @@ -1697,14 +1697,14 @@ def _adaptive_pool(name, type, tuple_fn, fn=None): name, "input size not accessible", input ) # verify if output size % input size = 0 for all dim - mod = [dim[i] % output_size[i] for i in range(len(dim))] + mod = [dim[i] % output_size[i] for i in range(0, len(dim))] if mod != [0] * len(mod): if output_size == [1] * len(output_size): return g.op("GlobalMaxPool", input), None return symbolic_helper._unimplemented( name, "output size that are not factor of input size", output_size_value ) - k = [int(dim[i] / output_size[i]) for i in range(len(dim))] + k = [int(dim[i] / output_size[i]) for i in range(0, len(dim))] # call max_poolxd_with_indices to get indices in the output if type == "MaxPool": # pyrefly: ignore # not-callable @@ -2906,7 +2906,7 @@ def unfold(g: jit_utils.GraphContext, input, dimension, size, step): for low, hi in zip(low_indices, hi_indices) ] ndim = len(sizes) - perm = list(range(ndim)) + perm = list(range(0, ndim)) perm.append(perm.pop(dimension)) unsqueeze = [ symbolic_helper._unsqueeze_helper( diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 0cecc762bce4..82e630519eb8 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -11615,7 +11615,7 @@ def reference_searchsorted(sorted_sequence, boundary, out_int32=False, right=Fal # numpy searchsorted only supports 1D inputs so we split up ND inputs orig_shape = boundary.shape num_splits = np.prod(sorted_sequence.shape[:-1]) - splits = range(num_splits) + splits = range(0, num_splits) sorted_sequence, boundary = sorted_sequence.reshape(num_splits, -1), boundary.reshape(num_splits, -1) if sorter is not None: sorter = sorter.reshape(num_splits, -1) @@ -16258,7 +16258,7 @@ op_db: list[OpInfo] = [ aten_backward_name='_prelu_kernel_backward', ref=lambda x, weight: np.maximum(0., x) + np.minimum(0., x) * - (weight if x.ndim == 1 else weight.reshape([weight.size if i == 1 else 1 for i in range(x.ndim)])), + (weight if x.ndim == 1 else weight.reshape([weight.size if i == 1 else 1 for i in range(0, x.ndim)])), dtypes=floating_types_and(torch.bfloat16, torch.float16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index 3153359326dc..68a35e8c40a1 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -2896,7 +2896,7 @@ def _multilabelmarginloss_reference(input, target): sum = 0 for target_index in targets: - for i in range(len(input)): + for i in range(0, len(input)): if i not in targets: sum += max(0, 1 - input[target_index] + input[i]) @@ -2914,7 +2914,7 @@ def multilabelmarginloss_reference(input, target, reduction='mean'): n = input.size(0) dim = input.size(1) output = input.new(n).zero_() - for i in range(n): + for i in range(0, n): output[i] = _multilabelmarginloss_reference(input[i], target[i]) if reduction == 'mean': @@ -2955,7 +2955,7 @@ def 
_multimarginloss_reference(input, target_idx, p, margin, weight): weight = input.new(len(input)).fill_(1) output = 0 - for i in range(len(input)): + for i in range(0, len(input)): if i != target_idx: output += weight[target_idx] * (max(0, (margin - input[target_idx] + input[i])) ** p) return output @@ -2972,7 +2972,7 @@ def multimarginloss_reference(input, target, p=1, margin=1, weight=None, reducti n = input.size(0) dim = input.size(1) output = input.new(n) - for x in range(n): + for x in range(0, n): output[x] = _multimarginloss_reference(input[x], target[x], p, margin, weight) if reduction == 'mean': @@ -2987,7 +2987,7 @@ def multimarginloss_reference(input, target, p=1, margin=1, weight=None, reducti def cosineembeddingloss_reference(input1, input2, target, margin=0, reduction='mean'): def _cos(a, b): cos = a.new(a.size(0)) - for i in range(a.size(0)): + for i in range(0, a.size(0)): cos[i] = (a[i] * b[i]).sum() / ((((a[i] * a[i]).sum() + 1e-12) * ((b[i] * b[i]).sum() + 1e-12)) ** 0.5) return cos diff --git a/torch/testing/_internal/distributed/_tensor/common_dtensor.py b/torch/testing/_internal/distributed/_tensor/common_dtensor.py index 22d6d8e7dede..a9beb0e60865 100644 --- a/torch/testing/_internal/distributed/_tensor/common_dtensor.py +++ b/torch/testing/_internal/distributed/_tensor/common_dtensor.py @@ -705,7 +705,7 @@ class LocalDTensorTestBase(DTensorTestBase): self.skipTest(msg) def _get_local_tensor_mode(self): - return LocalTensorMode(frozenset(range(self.world_size))) + return LocalTensorMode(frozenset(range(0, self.world_size))) def setUp(self) -> None: super().setUp() diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 499341b07951..c41602d43994 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -658,13 +658,13 @@ class DistributedTest: return (group, group_id, rank) def _init_full_group_test(self, **kwargs): - group = list(range(dist.get_world_size())) + group = list(range(0, dist.get_world_size())) group_id = dist.new_group(**kwargs) rank = dist.get_rank() return (group, group_id, rank) def _init_global_test(self): - group = list(range(dist.get_world_size())) + group = list(range(0, dist.get_world_size())) group_id = dist.group.WORLD rank = dist.get_rank() return (group, group_id, rank) @@ -1114,7 +1114,7 @@ class DistributedTest: averager = averagers.PeriodicModelAverager( period=period, warmup_steps=warmup_steps ) - for step in range(20): + for step in range(0, 20): # Reset the parameters at every step. param.data = copy.deepcopy(tensor) for params in model.parameters(): @@ -1143,7 +1143,7 @@ class DistributedTest: averager = averagers.PeriodicModelAverager( period=period, warmup_steps=warmup_steps ) - for step in range(20): + for step in range(0, 20): # Reset the parameters at every step. for param_group in opt.param_groups: for params in param_group["params"]: @@ -1203,7 +1203,7 @@ class DistributedTest: averager = averagers.PeriodicModelAverager( period=period, warmup_steps=warmup_steps ) - for step in range(20): + for step in range(0, 20): # Reset the parameters at every step. param.data = copy.deepcopy(tensor) for params in model.parameters(): @@ -1284,7 +1284,7 @@ class DistributedTest: expected_global_avg_tensor = ( torch.ones_like(param.data) * sum(range(world_size)) / world_size ) - for step in range(25): + for step in range(0, 25): # Reset the parameters at every step. 
param.data = copy.deepcopy(tensor) for params in model.parameters(): @@ -1390,7 +1390,7 @@ class DistributedTest: for val in ["1", "0"]: os.environ["TORCH_NCCL_BLOCKING_WAIT"] = val - for src in range(world_size): + for src in range(0, world_size): send_tensor = _build_tensor(rank + 1, device_id=device_id).fill_( src ) @@ -1409,7 +1409,7 @@ class DistributedTest: for req in reqs: req.wait() - for src in range(world_size): + for src in range(0, world_size): self.assertEqual(recv_tensors[src], expected_tensors[src]) self._barrier() @@ -1505,7 +1505,7 @@ class DistributedTest: rank = dist.get_rank() p2p_op_list = [] - for src in range(dist.get_world_size()): + for src in range(0, dist.get_world_size()): if src == rank: continue send_tensor = _build_tensor(rank + 1) @@ -1528,7 +1528,7 @@ class DistributedTest: rank = dist.get_rank() p2p_op_list = [] - for src in range(dist.get_world_size()): + for src in range(0, dist.get_world_size()): if src == rank: continue send_tensor = _build_tensor(rank + 1) @@ -1602,10 +1602,10 @@ class DistributedTest: tensor = _build_tensor(rank + 1, device_id=device_id) profiler_cls = profiler_ctx if profiler_ctx is not None else nullcontext() with profiler_cls as prof: - for src in range(world_size): + for src in range(0, world_size): if src == rank: # Send mode - for dst in range(world_size): + for dst in range(0, world_size): if dst == rank: continue dist.send(tensor, dst) @@ -1674,10 +1674,10 @@ class DistributedTest: tensor = _build_tensor(send_size) ctx = profiler_ctx if profiler_ctx is not None else nullcontext() with ctx as prof: - for src in range(dist.get_world_size()): + for src in range(0, dist.get_world_size()): if src == rank: # Send mode - for dst in range(dist.get_world_size()): + for dst in range(0, dist.get_world_size()): if dst == rank: continue dist.send(tensor, dst) @@ -1742,10 +1742,10 @@ class DistributedTest: ctx = profiler_ctx if profiler_ctx is not None else nullcontext() with ctx as prof: - for dst in range(dist.get_world_size()): + for dst in range(0, dist.get_world_size()): if dst == rank: # Recv mode - for dst in range(dist.get_world_size()): + for dst in range(0, dist.get_world_size()): if dst == rank: continue @@ -1846,10 +1846,10 @@ class DistributedTest: tensor = _build_tensor(send_recv_size, value=rank) ctx = profiler_ctx if profiler_ctx is not None else nullcontext() with ctx as prof: - for dst in range(world_size): + for dst in range(0, world_size): if dst == rank: # Recv mode - for src in range(world_size): + for src in range(0, world_size): if src == rank: continue output_tensor = _build_tensor(send_recv_size, value=-1) @@ -7480,7 +7480,7 @@ class DistributedTest: for baseline_iter in baseline_num_iters: for offset in iteration_offsets: mapping = dict.fromkeys( - range(num_early_join_ranks), baseline_iter + range(0, num_early_join_ranks), baseline_iter ) # if num_early_join_ranks > 1, ranks > 0 that will join early # iterate offset//2 more times than rank 0, to test nodes diff --git a/torch/testing/_internal/distributed/multi_threaded_pg.py b/torch/testing/_internal/distributed/multi_threaded_pg.py index 79aff05b3421..2cc22cb7c23a 100644 --- a/torch/testing/_internal/distributed/multi_threaded_pg.py +++ b/torch/testing/_internal/distributed/multi_threaded_pg.py @@ -166,7 +166,7 @@ class AllReduce: # collect all data to the list and make them # all on rank 0 device tensors = [ - data[src_rank][i].to(rank_0_device) for src_rank in range(len(data)) + data[src_rank][i].to(rank_0_device) for src_rank in range(0, len(data)) ] # now 
mimic reduce across all ranks diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py index 3c5c9101e43c..1d6c7500c5ad 100644 --- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py +++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py @@ -266,7 +266,7 @@ class CommonDistAutogradTest(RpcAgentTestFixture): grads = dist_autograd.get_gradients(context_id) nargs = len(args) ngrads = 0 - for i in range(nargs): + for i in range(0, nargs): if local_grads[i] is not None: self.assertIn(args[i], grads) self.assertEqual(local_grads[i], grads[args[i]]) @@ -1973,7 +1973,7 @@ class DistAutogradTest(CommonDistAutogradTest): DistAutogradTest._test_clean_context_backward_context_id = context_id # Send the context id to all nodes. - for i in range(self.world_size): + for i in range(0, self.world_size): if i != self.rank: rank_distance = (i - self.rank + self.world_size) % self.world_size rpc.rpc_sync( @@ -1988,7 +1988,7 @@ class DistAutogradTest(CommonDistAutogradTest): self.assertEqual(self.world_size - 1, len(known_context_ids)) t1 = torch.rand((3, 3), requires_grad=True) - for i in range(100): + for i in range(0, 100): dst = self._next_rank() t1 = rpc.rpc_sync(worker_name(dst), torch.add, args=(t1, t1)) diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index 03469e473921..4ec964092b39 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -1818,7 +1818,7 @@ class RpcTest(RpcAgentTestFixture, RpcTestCommon): # Spawn multiple threads that send RPCs to ensure keys are correctly # prefixed when there are multiple RPCs being created/in flight at the # same time. 
- dst_ranks = [rank for rank in range(self.world_size) if rank != self.rank] + dst_ranks = [rank for rank in range(0, self.world_size) if rank != self.rank] def rpc_with_profiling(dst_worker): with _profile() as prof: @@ -1884,7 +1884,7 @@ class RpcTest(RpcAgentTestFixture, RpcTestCommon): if self.rank != 1: return - dst_ranks = [rank for rank in range(self.world_size) if rank != self.rank] + dst_ranks = [rank for rank in range(0, self.world_size) if rank != self.rank] for dst in dst_ranks: dst_worker = worker_name(dst) with _profile() as prof: diff --git a/torch/testing/_internal/jit_utils.py b/torch/testing/_internal/jit_utils.py index ce8e68ae1e2c..e98d0e482683 100644 --- a/torch/testing/_internal/jit_utils.py +++ b/torch/testing/_internal/jit_utils.py @@ -439,7 +439,7 @@ class JitTestCase(JitCommonTestCase): state = model.get_debug_state() plan = get_execution_plan(state) num_bailouts = plan.code.num_bailouts() - for i in range(num_bailouts): + for i in range(0, num_bailouts): plan.code.request_bailout(i) bailout_outputs = model(*inputs) self.assertEqual(bailout_outputs, expected) diff --git a/torch/testing/_internal/triton_utils.py b/torch/testing/_internal/triton_utils.py index 0964c68ebb20..4edaf86dd1d7 100644 --- a/torch/testing/_internal/triton_utils.py +++ b/torch/testing/_internal/triton_utils.py @@ -912,7 +912,7 @@ if has_triton(): b_ptrs = b_ptr + (offs_k[:, None] + offs_bn[None, :]) accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - for k in range(tl.cdiv(K, BLOCK_SIZE_K)): + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) accumulator = tl.dot(a, b, accumulator) From 0bbdd6b8dbda2d63820ae46d05536bd1e9a111b9 Mon Sep 17 00:00:00 2001 From: "Nichols A. Romero" Date: Sat, 18 Oct 2025 07:23:37 +0000 Subject: [PATCH 106/123] [ROCm][inductor] heuristic improvements for pointwise kernels (#163197) Heuristic improvements for pointwise kernels for MI350. 
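As a rough illustration of the pattern used in the diff further down — optional ROCm-only tuning knobs (matrix_instr, waves_per_eu) threaded through the pointwise config builder and attached as extra launch kwargs — here is a minimal, self-contained sketch. The Config class and make_pointwise_config helper are stand-ins invented for this example, not Inductor's or Triton's real API; only the attach-as-kwargs pattern and the sample values (waves_per_eu=2, XBLOCK=8192, num_warps=8 with num_stages=2 and waves_per_eu=1) are taken from the patch itself.

```
# Sketch only: `Config` is a stand-in for triton.Config, and the helper name is
# hypothetical. The numbers mirror the ROCm configs added in the diff below.
from typing import Any, Optional


class Config:
    """Minimal stand-in: block sizes plus extra launch kwargs."""

    def __init__(self, block_sizes: dict[str, int], num_warps: int, num_stages: int):
        self.block_sizes = block_sizes
        self.num_warps = num_warps
        self.num_stages = num_stages
        self.kwargs: dict[str, Any] = {}


def make_pointwise_config(
    xblock: int,
    num_warps: int = 4,
    num_stages: int = 1,
    is_hip: bool = False,
    matrix_instr: Optional[int] = None,
    waves_per_eu: Optional[int] = None,
) -> Config:
    cfg = Config({"XBLOCK": xblock}, num_warps=num_warps, num_stages=num_stages)
    # ROCm-only knobs ride along as extra kwargs instead of positional fields,
    # so CUDA builds and existing call sites are untouched.
    if is_hip:
        if matrix_instr is not None:
            cfg.kwargs["matrix_instr_nonkdim"] = matrix_instr
        if waves_per_eu is not None:
            cfg.kwargs["waves_per_eu"] = waves_per_eu
    return cfg


# ROCm builds append extra candidate configs for autotuning.
rocm_extra_configs = [
    make_pointwise_config(8192, is_hip=True, waves_per_eu=2),
    make_pointwise_config(2048, num_warps=8, num_stages=2, is_hip=True, waves_per_eu=1),
]
print([c.kwargs for c in rocm_extra_configs])  # [{'waves_per_eu': 2}, {'waves_per_eu': 1}]
```
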
Contributions from several members of the AMD Inductor and Triton teams: @jataylo @AmdSampsa @iupaikov-amd @@xiaohuguo2023 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163197 Approved by: https://github.com/PaulZhang12, https://github.com/eellison, https://github.com/jansel Co-authored-by: AmdSampsa Co-authored-by: Jack Taylor <108682042+jataylo@users.noreply.github.com> --- torch/_inductor/runtime/hints.py | 3 +- torch/_inductor/runtime/triton_heuristics.py | 63 ++++++++++++++++++-- 2 files changed, 60 insertions(+), 6 deletions(-) diff --git a/torch/_inductor/runtime/hints.py b/torch/_inductor/runtime/hints.py index 1cff04d04079..10a5a9749a51 100644 --- a/torch/_inductor/runtime/hints.py +++ b/torch/_inductor/runtime/hints.py @@ -6,13 +6,14 @@ import functools import typing from enum import auto, Enum +import torch from torch.utils._triton import has_triton_package # The following maximums only apply to runtime autotuning, when using FixedTritonConfig one may see larger values # NOTE: if these fail asserts submit a PR to increase them TRITON_MAX_BLOCK = { - "X": 4096, + "X": 8192 if torch.version.hip else 4096, "Y": 1024, "Z": 1024, "R0_": 4096 * 16, # * 16 is multi-kernel only diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 2ae2880fb018..12dc07fe3b1f 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -2244,6 +2244,9 @@ def triton_config( num_stages=1, num_elements_per_warp=256, min_elem_per_thread=0, + num_warps=None, + matrix_instr=None, + waves_per_eu=None, ) -> Config: """ Construct a pointwise triton config with some adjustment heuristics @@ -2300,9 +2303,11 @@ def triton_config( ): z *= 2 - num_warps = _num_warps( - conditional_product(x, y, z) // num_elements_per_warp, min_num_warps=1 - ) + # Calculate num_warps if they are not hard passed to config + if num_warps is None: + num_warps = _num_warps( + conditional_product(x, y, z) // num_elements_per_warp, min_num_warps=1 + ) # we are going to arrive at 2 warps only if bs was too small due to # numel being too small. 
However to workaround some ptx bugs we still # want at least 4 warps if there's enough elements per thread @@ -2332,7 +2337,15 @@ def triton_config( cfg["ZBLOCK"] = z check_max_block(cfg) check_config(cfg, xnumel=xnumel, ynumel=ynumel, znumel=znumel) - return Config(cfg, num_warps=num_warps, num_stages=num_stages) + config = Config(cfg, num_warps=num_warps, num_stages=num_stages) + + if torch.version.hip: + if matrix_instr is not None: + config.kwargs["matrix_instr_nonkdim"] = matrix_instr + if waves_per_eu is not None: + config.kwargs["waves_per_eu"] = waves_per_eu + + return config def _get_nd_reduction_numels(r: int, size_hints: dict[str, int]) -> dict[str, int]: @@ -2578,10 +2591,32 @@ def pointwise( ), *hinted_configs, ] + # Additional configs appended for ROCm builds + if torch.version.hip: + configs.extend( + [ + triton_config_with_settings( + size_hints, TRITON_MAX_BLOCK["X"], waves_per_eu=2 + ), + triton_config_with_settings( + size_hints, + 4096, # wrt: better than the max_block for some kernel + ), + triton_config_with_settings( + size_hints, + 2048, + num_warps=8, + num_stages=2, + waves_per_eu=1, # 20% improvement + ), + ] + ) if len(size_hints) == 2: + # Only avoiding tuning on TileHint.SQUARE if not on ROCm builds + # ROCm has observed improvement by diverging here if ( not inductor_meta.get("autotune_pointwise", True) - or tile_hint == TileHint.SQUARE + or (torch.version.hip is None and tile_hint == TileHint.SQUARE) ) and not ( inductor_meta.get("max_autotune") or inductor_meta.get("max_autotune_pointwise") @@ -2597,6 +2632,24 @@ def pointwise( triton_config_with_settings(size_hints, 1, bs), *hinted_configs, ] + # Additional configs appended for ROCm builds + if torch.version.hip: + configs.extend( + [ + triton_config_with_settings( + size_hints, 64, 32 + ), # better for some kernels + triton_config_with_settings( + size_hints, 128, 16 + ), # +10% for some kernels + triton_config_with_settings( + size_hints, 128, 32 + ), # additional 10% more + triton_config_with_settings( + size_hints, 32, 512 + ), # +30% for some kernels + ] + ) if len(size_hints) == 3: if not inductor_meta.get("autotune_pointwise", True): configs = [triton_config_with_settings(size_hints, 16, 16, 16)] From a0948d4d232d4ae11e0e3c33c5dc252c98b9b40a Mon Sep 17 00:00:00 2001 From: "Nichols A. Romero" Date: Sat, 18 Oct 2025 07:33:21 +0000 Subject: [PATCH 107/123] [ROCm][inductor] autotune support for persistent reduction kernels (#163908) After the removal of want_no_x_dim for persistent reduction kernels, we can improve the autotuning setup for persistent reduction kernels. Currently even with tuning enable, filtering will only try a single config in many cases. Avoid filtering with autotune mode, and override MAX_BLOCK limit. Also we always include tiny_config when autotuning is enabled. 
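To make the selection change described above concrete, here is a simplified sketch of the control flow: with max-autotune off, the reduction hint still filters the candidate list down to a single config; with it on, that filtering is skipped and, on ROCm, a tiny config is guaranteed to be in the pool. The plain-int "configs", the is_hip flag, and the function name are illustrative stand-ins, not Inductor's real data structures.

```
# Sketch only: candidates are plain XBLOCK ints; the real code builds full
# triton configs and derives the tiny config from rnumel.
def select_persistent_reduction_configs(
    candidates: list[int],
    reduction_hint: str,
    max_autotune_enabled: bool,
    is_hip: bool,
) -> list[int]:
    tiny_config = 1  # stand-in for the real "tiny" config

    if not max_autotune_enabled:
        # Old behaviour: the hint collapses the list to a single config.
        if reduction_hint == "INNER":
            return candidates[:1]
        if reduction_hint == "OUTER":
            return candidates[-1:]
        if reduction_hint == "OUTER_TINY":
            return [tiny_config]
        return candidates

    # Autotuning: keep every candidate; on ROCm also make sure the tiny
    # config is in the pool so it can win when it really is best.
    if is_hip and tiny_config not in candidates:
        candidates = candidates + [tiny_config]
    return candidates


print(select_persistent_reduction_configs([1, 8, 32, 128], "OUTER", False, True))  # [128]
print(select_persistent_reduction_configs([8, 32, 128], "OUTER", True, True))      # [8, 32, 128, 1]
```
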
Contributions from several members of the AMD Inductor and Triton teams: @jataylo @iupaikov-amd @AmdSampsa @xiaohuguo2023 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163908 Approved by: https://github.com/jansel, https://github.com/PaulZhang12 --- torch/_inductor/runtime/triton_heuristics.py | 75 ++++++++++++-------- 1 file changed, 46 insertions(+), 29 deletions(-) diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 12dc07fe3b1f..b49b9ac54228 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -3222,6 +3222,15 @@ def _persistent_reduction_configs( else: raise NotImplementedError("native matmul only supports mm/bmm pattern") + max_autotune_enabled = inductor_meta.get("max_autotune") or inductor_meta.get( + "max_autotune_pointwise" + ) + + if torch.version.hip: + xblock_vals = [1, 4, 8, 16, 32, 64, 128, 256] + else: + xblock_vals = [1, 8, 32, 128] + if "y" not in size_hints: configs = [ triton_config_reduction( @@ -3231,7 +3240,7 @@ def _persistent_reduction_configs( register_intensive=True, reduction_hint=reduction_hint, ) - for xblock in (1, 8, 32, 128) + for xblock in xblock_vals if xblock == 1 or (rnumel * xblock <= MAX_PERSISTENT_BLOCK_NUMEL and xblock <= xnumel) ] @@ -3239,7 +3248,7 @@ def _persistent_reduction_configs( configs = [] assert "tiling_scores" in inductor_meta x_y_scores = {dim: inductor_meta["tiling_scores"][dim] for dim in ("x", "y")} - for target_block_size in (1, 8, 32, 64, 128): + for target_block_size in xblock_vals: if target_block_size * rnumel > MAX_PERSISTENT_BLOCK_NUMEL: continue @@ -3252,39 +3261,47 @@ def _persistent_reduction_configs( ) ) + tiny_configs = [ + triton_config_reduction( + size_hints, + 2 * (256 // rnumel) if rnumel <= 256 else 1, + rnumel, + ) + ] + # defer to more autotuning, initially if "y" in size_hints: pass # TODO(jansel): we should be able to improve these heuristics - elif reduction_hint == ReductionHint.INNER and rnumel >= 256: - if rnumel > 1024: - configs = configs[:1] - else: - x_block = 8 - if xnumel // x_block < 128 or loads_and_stores >= 5: - x_block = 1 + elif not max_autotune_enabled: # Do not filter configs when tuning + if reduction_hint == ReductionHint.INNER and rnumel >= 256: + if rnumel > 1024: + configs = configs[:1] + else: + x_block = 8 + if xnumel // x_block < 128 or loads_and_stores >= 5: + x_block = 1 - configs = [ - triton_config_reduction( - size_hints, - x_block, - rnumel, - register_intensive=True, - reduction_hint=reduction_hint, - ) - ] + configs = [ + triton_config_reduction( + size_hints, + x_block, + rnumel, + register_intensive=True, + ) + ] + + elif reduction_hint == ReductionHint.OUTER: + configs = configs[-1:] + elif reduction_hint == ReductionHint.OUTER_TINY: + configs = tiny_configs + else: + if torch.version.hip: + # If autotune is enabled append tiny configs + for conf in tiny_configs: + if conf not in configs: + configs.append(conf) - elif reduction_hint == ReductionHint.OUTER: - configs = configs[-1:] - elif reduction_hint == ReductionHint.OUTER_TINY: - configs = [ - triton_config_reduction( - size_hints, - 2 * (256 // rnumel) if rnumel <= 256 else 1, - rnumel, - reduction_hint=reduction_hint, - ) - ] for c in configs: # we don't need Rn_BLOCK for persistent reduction for prefix in size_hints: From fdab48a7c1c4f0f7416c3517cab7f353619a5091 Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Sat, 18 Oct 2025 07:36:18 +0000 Subject: [PATCH 108/123] Enable all PIE 
rules on ruff (#165814) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR enables all PIE rules on ruff, there are already some enabled rules from this family, the new added rules are ``` PIE796 Enum contains duplicate value: {value} PIE808 Unnecessary start argument in range ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165814 Approved by: https://github.com/ezyang --- benchmarks/gpt_fast/mixtral_moe_quantize.py | 2 +- caffe2/perfkernels/hp_emblookup_codegen.py | 8 ++-- pyproject.toml | 7 +--- .../ao/sparsity/test_activation_sparsifier.py | 4 +- test/ao/sparsity/test_data_scheduler.py | 2 +- test/ao/sparsity/test_data_sparsifier.py | 2 +- test/ao/sparsity/test_sparsifier.py | 4 +- .../quantization/test_quantization.py | 12 +++--- test/distributed/checkpoint/test_planner.py | 2 +- test/distributed/checkpoint/test_utils.py | 2 +- .../elastic/agent/server/test/api_test.py | 2 +- .../elastic/multiprocessing/api_test.py | 2 +- .../timer/file_based_local_timer_test.py | 2 +- .../elastic/timer/local_timer_example.py | 4 +- .../elastic/timer/local_timer_test.py | 2 +- .../utils/data/cycling_iterator_test.py | 4 +- .../fsdp/test_fsdp_hybrid_shard.py | 4 +- test/distributed/tensor/test_dtensor_ops.py | 4 +- test/distributed/test_device_mesh.py | 2 +- test/distributions/test_distributions.py | 34 ++++++++--------- test/dynamo/test_export.py | 8 ++-- test/dynamo/test_functions.py | 2 +- test/dynamo/test_modules.py | 2 +- test/dynamo/test_repros.py | 6 +-- test/functorch/test_ac.py | 4 +- test/inductor/test_codecache.py | 2 +- test/inductor/test_compiled_autograd.py | 2 +- test/inductor/test_max_autotune.py | 2 +- test/inductor/test_triton_kernels.py | 4 +- test/jit/xnnpack/test_xnnpack_delegate.py | 2 +- test/nn/test_convolution.py | 2 +- test/nn/test_embedding.py | 2 +- test/nn/test_multihead_attention.py | 2 +- test/nn/test_pooling.py | 2 +- test/onnx/test_onnx_opset.py | 4 +- test/optim/test_lrscheduler.py | 2 +- test/profiler/test_profiler.py | 6 +-- .../core/experimental/test_floatx.py | 2 +- test/test_dataloader.py | 2 +- test/test_datapipe.py | 6 +-- test/test_dynamic_shapes.py | 4 +- test/test_indexing.py | 2 +- test/test_jit.py | 8 ++-- test/test_jit_fuser_te.py | 8 ++-- test/test_matmul_cuda.py | 2 +- test/test_mps.py | 14 +++---- test/test_numa_binding.py | 6 +-- test/test_reductions.py | 4 +- test/test_serialization.py | 2 +- test/test_sparse.py | 2 +- test/test_sparse_csr.py | 2 +- test/test_static_runtime.py | 2 +- test/test_tensorboard.py | 2 +- test/test_tensorexpr.py | 2 +- test/test_torch.py | 2 +- test/test_view_ops.py | 2 +- test/test_xnnpack_integration.py | 4 +- torch/_decomp/decompositions_for_jvp.py | 2 +- torch/_dynamo/eval_frame.py | 4 +- torch/_inductor/dependencies.py | 2 +- torch/_meta_registrations.py | 2 +- torch/_numpy/_funcs_impl.py | 2 +- torch/_refs/__init__.py | 2 +- torch/_tensor_str.py | 6 +-- torch/ao/ns/fx/pattern_utils.py | 2 +- .../activation_sparsifier.py | 6 +-- .../benchmarks/evaluate_disk_savings.py | 2 +- .../lightning/tests/test_callbacks.py | 2 +- .../sparsifier/nearly_diagonal_sparsifier.py | 2 +- .../ao/quantization/experimental/observer.py | 4 +- torch/ao/quantization/fx/_decomposed.py | 2 +- torch/autograd/profiler.py | 2 +- torch/distributed/_pycute/layout.py | 16 ++++---- .../distributed/_symmetric_memory/__init__.py | 6 +-- .../elastic/multiprocessing/api.py | 2 +- .../distributed/elastic/timer/local_timer.py | 2 +- torch/distributed/tensor/_dtensor_spec.py | 2 +- 
torch/distributed/tensor/parallel/fsdp.py | 2 +- torch/nested/_internal/ops.py | 2 +- .../torchscript_exporter/symbolic_helper.py | 2 +- .../torchscript_exporter/symbolic_opset12.py | 2 +- .../torchscript_exporter/symbolic_opset8.py | 2 +- .../torchscript_exporter/symbolic_opset9.py | 18 ++++----- .../_internal/common_methods_invocations.py | 4 +- torch/testing/_internal/common_nn.py | 10 ++--- .../distributed/_tensor/common_dtensor.py | 2 +- .../_internal/distributed/distributed_test.py | 38 +++++++++---------- .../distributed/multi_threaded_pg.py | 2 +- .../distributed/rpc/dist_autograd_test.py | 6 +-- .../_internal/distributed/rpc/rpc_test.py | 4 +- torch/testing/_internal/jit_utils.py | 2 +- torch/testing/_internal/triton_utils.py | 2 +- 92 files changed, 200 insertions(+), 205 deletions(-) diff --git a/benchmarks/gpt_fast/mixtral_moe_quantize.py b/benchmarks/gpt_fast/mixtral_moe_quantize.py index 50ffd61bdb83..fd0342ce3d59 100644 --- a/benchmarks/gpt_fast/mixtral_moe_quantize.py +++ b/benchmarks/gpt_fast/mixtral_moe_quantize.py @@ -85,7 +85,7 @@ class WeightOnlyInt8QuantHandler: cur_state_dict[f"{fqn}.weight"] = int8_weight cur_state_dict[f"{fqn}.scales"] = scales.to(mod.weight.dtype) elif isinstance(mod, ConditionalFeedForward): - for weight_idx in range(0, 3): + for weight_idx in range(3): weight_name = f"w{weight_idx + 1}" scales_name = f"scales{weight_idx + 1}" weight = getattr(mod, weight_name) diff --git a/caffe2/perfkernels/hp_emblookup_codegen.py b/caffe2/perfkernels/hp_emblookup_codegen.py index 91f6ac238c0f..43254cddf26e 100644 --- a/caffe2/perfkernels/hp_emblookup_codegen.py +++ b/caffe2/perfkernels/hp_emblookup_codegen.py @@ -74,7 +74,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets) ) code.append(" " + OutType + "* op = &out[rangeIndex * block_size];") - for i in range(0, uf): + for i in range(uf): j = 8 * i code.append(" __m256 vop" + str(j) + " = _mm256_setzero_ps();") @@ -158,7 +158,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets) "&input[idx_pref_T0 * fused_block_size];" ) - for i in range(0, uf): + for i in range(uf): j = 8 * i cachelinesize = 64 byteoffset = sizeof[InType] * j @@ -170,7 +170,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets) code.append(" if (!normalize_by_lengths || length == 0) {") else: code.append(" if (!normalize_by_lengths || lengths[rangeIndex] == 0) {") - for i in range(0, uf): + for i in range(uf): j = 8 * i code.append(" _mm256_storeu_ps(&op[" + str(j) + "], vop" + str(j) + ");") code.append(" } else {") @@ -181,7 +181,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets) code.append( " __m256 vlen_inv = _mm256_set1_ps(1.0f / lengths[rangeIndex]);" ) - for i in range(0, uf): + for i in range(uf): j = 8 * i code.append( " _mm256_storeu_ps(&op[" diff --git a/pyproject.toml b/pyproject.toml index e42f08d296f3..f18368b90d8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -204,12 +204,7 @@ select = [ "NPY", "PERF", "PGH004", - "PIE790", - "PIE794", - "PIE800", - "PIE804", - "PIE807", - "PIE810", + "PIE", "PLC0131", # type bivariance "PLC0132", # type param mismatch "PLC1802", # len({expression}) used as condition without comparison diff --git a/test/ao/sparsity/test_activation_sparsifier.py b/test/ao/sparsity/test_activation_sparsifier.py index 0f3f36ecda9f..079f5e1941d2 100644 --- a/test/ao/sparsity/test_activation_sparsifier.py +++ b/test/ao/sparsity/test_activation_sparsifier.py @@ -190,7 +190,7 @@ 
class TestActivationSparsifier(TestCase): if features is None: assert torch.all(mask * input_data == output) else: - for feature_idx in range(0, len(features)): + for feature_idx in range(len(features)): feature = torch.Tensor( [features[feature_idx]], device=input_data.device ).long() @@ -378,7 +378,7 @@ class TestActivationSparsifier(TestCase): # some dummy data data_list = [] num_data_points = 5 - for _ in range(0, num_data_points): + for _ in range(num_data_points): rand_data = torch.randn(16, 1, 28, 28) activation_sparsifier.model(rand_data) data_list.append(rand_data) diff --git a/test/ao/sparsity/test_data_scheduler.py b/test/ao/sparsity/test_data_scheduler.py index de0a885f0153..47a85e1edda1 100644 --- a/test/ao/sparsity/test_data_scheduler.py +++ b/test/ao/sparsity/test_data_scheduler.py @@ -143,7 +143,7 @@ class TestBaseDataScheduler(TestCase): # checking step count step_cnt = 5 - for _ in range(0, step_cnt): + for _ in range(step_cnt): sparsifier.step() scheduler.step() diff --git a/test/ao/sparsity/test_data_sparsifier.py b/test/ao/sparsity/test_data_sparsifier.py index dce04292763f..fa08e8c90ac2 100644 --- a/test/ao/sparsity/test_data_sparsifier.py +++ b/test/ao/sparsity/test_data_sparsifier.py @@ -123,7 +123,7 @@ class _BaseDataSparsiferTestCase(TestCase): step_count = 3 - for _ in range(0, step_count): + for _ in range(step_count): sparsifier.step() for some_data in all_data: name, data, _ = self._get_name_data_config(some_data) diff --git a/test/ao/sparsity/test_sparsifier.py b/test/ao/sparsity/test_sparsifier.py index d5010b7abccd..a940a3e9feba 100644 --- a/test/ao/sparsity/test_sparsifier.py +++ b/test/ao/sparsity/test_sparsifier.py @@ -472,8 +472,8 @@ class TestNearlyDiagonalSparsifier(TestCase): else: height, width = mask.shape dist_to_diagonal = nearliness // 2 - for row in range(0, height): - for col in range(0, width): + for row in range(height): + for col in range(width): if abs(row - col) <= dist_to_diagonal: assert mask[row, col] == 1 else: diff --git a/test/distributed/algorithms/quantization/test_quantization.py b/test/distributed/algorithms/quantization/test_quantization.py index b65e0a747405..6044eac70b51 100644 --- a/test/distributed/algorithms/quantization/test_quantization.py +++ b/test/distributed/algorithms/quantization/test_quantization.py @@ -79,7 +79,7 @@ if BACKEND == "gloo" or BACKEND == "nccl": dist.init_process_group( store=store, rank=self.rank, world_size=self.world_size, backend="gloo" ) - group = list(range(0, self.world_size)) + group = list(range(self.world_size)) group_id = dist.group.WORLD self._test_all_gather( group, group_id, self.rank, dtype=torch.float32, qtype=DQuantType.FP16 @@ -94,7 +94,7 @@ if BACKEND == "gloo" or BACKEND == "nccl": dist.init_process_group( store=store, rank=self.rank, world_size=self.world_size, backend="gloo" ) - group = list(range(0, self.world_size)) + group = list(range(self.world_size)) group_id = dist.group.WORLD self._test_all_gather( group, group_id, self.rank, dtype=torch.float32, qtype=DQuantType.BFP16 @@ -111,7 +111,7 @@ if BACKEND == "gloo" or BACKEND == "nccl": dist.init_process_group( store=store, rank=self.rank, world_size=self.world_size, backend="nccl" ) - group = list(range(0, self.world_size)) + group = list(range(self.world_size)) group_id = dist.new_group(range(self.world_size)) rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND) self._test_all_to_all( @@ -135,7 +135,7 @@ if BACKEND == "gloo" or BACKEND == "nccl": dist.init_process_group( store=store, rank=self.rank, 
world_size=self.world_size, backend="nccl" ) - group = list(range(0, self.world_size)) + group = list(range(self.world_size)) group_id = dist.new_group(range(self.world_size)) rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND) self._test_all_to_all( @@ -158,7 +158,7 @@ if BACKEND == "gloo" or BACKEND == "nccl": dist.init_process_group( store=store, rank=self.rank, world_size=self.world_size, backend="nccl" ) - group = list(range(0, self.world_size)) + group = list(range(self.world_size)) group_id = dist.new_group(range(self.world_size)) rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND) self._test_all_to_all_single( @@ -181,7 +181,7 @@ if BACKEND == "gloo" or BACKEND == "nccl": dist.init_process_group( store=store, rank=self.rank, world_size=self.world_size, backend="nccl" ) - group = list(range(0, self.world_size)) + group = list(range(self.world_size)) group_id = dist.new_group(range(self.world_size)) rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND) self._test_all_to_all_single( diff --git a/test/distributed/checkpoint/test_planner.py b/test/distributed/checkpoint/test_planner.py index edf043301ed2..86bed29de998 100644 --- a/test/distributed/checkpoint/test_planner.py +++ b/test/distributed/checkpoint/test_planner.py @@ -66,7 +66,7 @@ if TEST_WITH_DEV_DBG_ASAN: def create_sharded_tensor(rank, world_size, shards_per_rank, shard_size=8): shards_metadata = [] local_shards = [] - for idx in range(0, world_size * shards_per_rank): + for idx in range(world_size * shards_per_rank): shard_rank = idx // shards_per_rank shard_md = ShardMetadata( shard_offsets=[idx * shard_size], diff --git a/test/distributed/checkpoint/test_utils.py b/test/distributed/checkpoint/test_utils.py index 722670c95f18..79dbe741822c 100644 --- a/test/distributed/checkpoint/test_utils.py +++ b/test/distributed/checkpoint/test_utils.py @@ -45,7 +45,7 @@ if TEST_WITH_DEV_DBG_ASAN: def create_sharded_tensor(rank, world_size, shards_per_rank): shards_metadata = [] local_shards = [] - for idx in range(0, world_size * shards_per_rank): + for idx in range(world_size * shards_per_rank): shard_rank = idx // shards_per_rank shard_md = ShardMetadata( shard_offsets=[idx * 8], shard_sizes=[8], placement=f"rank:{shard_rank}/cpu" diff --git a/test/distributed/elastic/agent/server/test/api_test.py b/test/distributed/elastic/agent/server/test/api_test.py index 11776324ed7f..dd96f9b6dfb0 100644 --- a/test/distributed/elastic/agent/server/test/api_test.py +++ b/test/distributed/elastic/agent/server/test/api_test.py @@ -633,7 +633,7 @@ class SimpleElasticAgentTest(unittest.TestCase): worker_group = agent.get_worker_group() num_restarts = 3 - for _ in range(0, num_restarts): + for _ in range(num_restarts): agent._restart_workers(worker_group) self.assertEqual(WorkerState.HEALTHY, worker_group.state) diff --git a/test/distributed/elastic/multiprocessing/api_test.py b/test/distributed/elastic/multiprocessing/api_test.py index 4ac0dcacb4b8..19d941e0d9c6 100644 --- a/test/distributed/elastic/multiprocessing/api_test.py +++ b/test/distributed/elastic/multiprocessing/api_test.py @@ -146,7 +146,7 @@ def echo_large(size: int) -> dict[int, str]: returns a large output ({0: test0", 1: "test1", ..., (size-1):f"test{size-1}"}) """ out = {} - for idx in range(0, size): + for idx in range(size): out[idx] = f"test{idx}" return out diff --git a/test/distributed/elastic/timer/file_based_local_timer_test.py b/test/distributed/elastic/timer/file_based_local_timer_test.py index cf597eb6a37a..0125ce5cd25a 100644 --- 
a/test/distributed/elastic/timer/file_based_local_timer_test.py +++ b/test/distributed/elastic/timer/file_based_local_timer_test.py @@ -191,7 +191,7 @@ if not (IS_WINDOWS or IS_MACOS or IS_ARM64): """ client = timer.FileTimerClient(file_path) sem.release() - for _ in range(0, n): + for _ in range(n): client.acquire("test_scope", 0) time.sleep(interval) diff --git a/test/distributed/elastic/timer/local_timer_example.py b/test/distributed/elastic/timer/local_timer_example.py index 09421f4b38f5..6d438f2536d6 100644 --- a/test/distributed/elastic/timer/local_timer_example.py +++ b/test/distributed/elastic/timer/local_timer_example.py @@ -102,7 +102,7 @@ if not (IS_WINDOWS or IS_MACOS or IS_ARM64): world_size = 8 processes = [] - for i in range(0, world_size): + for i in range(world_size): if i % 2 == 0: p = spawn_ctx.Process(target=_stuck_function, args=(i, mp_queue)) else: @@ -110,7 +110,7 @@ if not (IS_WINDOWS or IS_MACOS or IS_ARM64): p.start() processes.append(p) - for i in range(0, world_size): + for i in range(world_size): p = processes[i] p.join() if i % 2 == 0: diff --git a/test/distributed/elastic/timer/local_timer_test.py b/test/distributed/elastic/timer/local_timer_test.py index b65b202d5ec6..8818b1788c62 100644 --- a/test/distributed/elastic/timer/local_timer_test.py +++ b/test/distributed/elastic/timer/local_timer_test.py @@ -127,7 +127,7 @@ if not INVALID_PLATFORMS: interval seconds. Releases the given semaphore once before going to work. """ sem.release() - for i in range(0, n): + for i in range(n): mp_queue.put(TimerRequest(i, "test_scope", 0)) time.sleep(interval) diff --git a/test/distributed/elastic/utils/data/cycling_iterator_test.py b/test/distributed/elastic/utils/data/cycling_iterator_test.py index c9cb055a2c22..835ed6ebbd01 100644 --- a/test/distributed/elastic/utils/data/cycling_iterator_test.py +++ b/test/distributed/elastic/utils/data/cycling_iterator_test.py @@ -15,7 +15,7 @@ class CyclingIteratorTest(unittest.TestCase): def generator(self, epoch, stride, max_epochs): # generate an continuously incrementing list each epoch # e.g. [0,1,2] [3,4,5] [6,7,8] ... 
- return iter([stride * epoch + i for i in range(0, stride)]) + return iter([stride * epoch + i for i in range(stride)]) def test_cycling_iterator(self): stride = 3 @@ -25,7 +25,7 @@ class CyclingIteratorTest(unittest.TestCase): return self.generator(epoch, stride, max_epochs) it = CyclingIterator(n=max_epochs, generator_fn=generator_fn) - for i in range(0, stride * max_epochs): + for i in range(stride * max_epochs): self.assertEqual(i, next(it)) with self.assertRaises(StopIteration): diff --git a/test/distributed/fsdp/test_fsdp_hybrid_shard.py b/test/distributed/fsdp/test_fsdp_hybrid_shard.py index 26a05bbc4171..e2ea4c5fc9af 100644 --- a/test/distributed/fsdp/test_fsdp_hybrid_shard.py +++ b/test/distributed/fsdp/test_fsdp_hybrid_shard.py @@ -124,7 +124,7 @@ class TestFSDPHybridShard(FSDPTest): model = MyModel().to(device_type) num_node_devices = torch.accelerator.device_count() shard_rank_lists = ( - list(range(0, num_node_devices // 2)), + list(range(num_node_devices // 2)), list(range(num_node_devices // 2, num_node_devices)), ) shard_groups = ( @@ -175,7 +175,7 @@ class TestFSDPHybridShard(FSDPTest): model = MyModel().to(device_type) num_node_devices = torch.accelerator.device_count() shard_rank_lists = ( - list(range(0, num_node_devices // 2)), + list(range(num_node_devices // 2)), list(range(num_node_devices // 2, num_node_devices)), ) shard_groups = ( diff --git a/test/distributed/tensor/test_dtensor_ops.py b/test/distributed/tensor/test_dtensor_ops.py index c4373773d662..df51152a9030 100644 --- a/test/distributed/tensor/test_dtensor_ops.py +++ b/test/distributed/tensor/test_dtensor_ops.py @@ -802,7 +802,7 @@ class TestLocalDTensorOps(TestDTensorOps): self.run_opinfo_test(dtype, op) def test_mean(self): - with LocalTensorMode(frozenset(range(0, self.world_size))): + with LocalTensorMode(frozenset(range(self.world_size))): self.run_mean() def test_one_hot(self): @@ -811,7 +811,7 @@ class TestLocalDTensorOps(TestDTensorOps): def run_opinfo_test( self, dtype, op, requires_grad=True, sample_inputs_filter=lambda s: True ): - with LocalTensorMode(frozenset(range(0, self.world_size))): + with LocalTensorMode(frozenset(range(self.world_size))): super().run_opinfo_test(dtype, op, requires_grad, sample_inputs_filter) def assertEqualOnRank(self, x, y, msg=None, *, rank=0): diff --git a/test/distributed/test_device_mesh.py b/test/distributed/test_device_mesh.py index 0ed4651d3ec5..2db674a458ed 100644 --- a/test/distributed/test_device_mesh.py +++ b/test/distributed/test_device_mesh.py @@ -536,7 +536,7 @@ class DeviceMeshTestNDim(DTensorTestBase): # Create shard groups (e.g. 
(0, 1, 2, 3), (4, 5, 6, 7)) # and assign the correct shard group to each rank shard_rank_lists = ( - list(range(0, self.world_size // 2)), + list(range(self.world_size // 2)), list(range(self.world_size // 2, self.world_size)), ) shard_groups = ( diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py index b588589d81ba..550589002003 100644 --- a/test/distributions/test_distributions.py +++ b/test/distributions/test_distributions.py @@ -5722,11 +5722,11 @@ class TestKL(DistributionsTestCase): def test_kl_multivariate_normal(self): set_rng_seed(0) # see Note [Randomized statistical tests] n = 5 # Number of tests for multivariate_normal - for i in range(0, n): - loc = [torch.randn(4) for _ in range(0, 2)] + for i in range(n): + loc = [torch.randn(4) for _ in range(2)] scale_tril = [ transform_to(constraints.lower_cholesky)(torch.randn(4, 4)) - for _ in range(0, 2) + for _ in range(2) ] p = MultivariateNormal(loc=loc[0], scale_tril=scale_tril[0]) q = MultivariateNormal(loc=loc[1], scale_tril=scale_tril[1]) @@ -5755,10 +5755,10 @@ class TestKL(DistributionsTestCase): def test_kl_multivariate_normal_batched(self): b = 7 # Number of batches - loc = [torch.randn(b, 3) for _ in range(0, 2)] + loc = [torch.randn(b, 3) for _ in range(2)] scale_tril = [ transform_to(constraints.lower_cholesky)(torch.randn(b, 3, 3)) - for _ in range(0, 2) + for _ in range(2) ] expected_kl = torch.stack( [ @@ -5766,7 +5766,7 @@ class TestKL(DistributionsTestCase): MultivariateNormal(loc[0][i], scale_tril=scale_tril[0][i]), MultivariateNormal(loc[1][i], scale_tril=scale_tril[1][i]), ) - for i in range(0, b) + for i in range(b) ] ) actual_kl = kl_divergence( @@ -5777,7 +5777,7 @@ class TestKL(DistributionsTestCase): def test_kl_multivariate_normal_batched_broadcasted(self): b = 7 # Number of batches - loc = [torch.randn(b, 3) for _ in range(0, 2)] + loc = [torch.randn(b, 3) for _ in range(2)] scale_tril = [ transform_to(constraints.lower_cholesky)(torch.randn(b, 3, 3)), transform_to(constraints.lower_cholesky)(torch.randn(3, 3)), @@ -5788,7 +5788,7 @@ class TestKL(DistributionsTestCase): MultivariateNormal(loc[0][i], scale_tril=scale_tril[0][i]), MultivariateNormal(loc[1][i], scale_tril=scale_tril[1]), ) - for i in range(0, b) + for i in range(b) ] ) actual_kl = kl_divergence( @@ -5800,15 +5800,15 @@ class TestKL(DistributionsTestCase): def test_kl_lowrank_multivariate_normal(self): set_rng_seed(0) # see Note [Randomized statistical tests] n = 5 # Number of tests for lowrank_multivariate_normal - for i in range(0, n): - loc = [torch.randn(4) for _ in range(0, 2)] - cov_factor = [torch.randn(4, 3) for _ in range(0, 2)] + for i in range(n): + loc = [torch.randn(4) for _ in range(2)] + cov_factor = [torch.randn(4, 3) for _ in range(2)] cov_diag = [ - transform_to(constraints.positive)(torch.randn(4)) for _ in range(0, 2) + transform_to(constraints.positive)(torch.randn(4)) for _ in range(2) ] covariance_matrix = [ cov_factor[i].matmul(cov_factor[i].t()) + cov_diag[i].diag() - for i in range(0, 2) + for i in range(2) ] p = LowRankMultivariateNormal(loc[0], cov_factor[0], cov_diag[0]) q = LowRankMultivariateNormal(loc[1], cov_factor[1], cov_diag[1]) @@ -5861,10 +5861,10 @@ class TestKL(DistributionsTestCase): def test_kl_lowrank_multivariate_normal_batched(self): b = 7 # Number of batches - loc = [torch.randn(b, 3) for _ in range(0, 2)] - cov_factor = [torch.randn(b, 3, 2) for _ in range(0, 2)] + loc = [torch.randn(b, 3) for _ in range(2)] + cov_factor = [torch.randn(b, 3, 2) for _ in 
range(2)] cov_diag = [ - transform_to(constraints.positive)(torch.randn(b, 3)) for _ in range(0, 2) + transform_to(constraints.positive)(torch.randn(b, 3)) for _ in range(2) ] expected_kl = torch.stack( [ @@ -5876,7 +5876,7 @@ class TestKL(DistributionsTestCase): loc[1][i], cov_factor[1][i], cov_diag[1][i] ), ) - for i in range(0, b) + for i in range(b) ] ) actual_kl = kl_divergence( diff --git a/test/dynamo/test_export.py b/test/dynamo/test_export.py index 112da727ec61..f3f438d241af 100644 --- a/test/dynamo/test_export.py +++ b/test/dynamo/test_export.py @@ -49,9 +49,9 @@ class ExportTests(torch._dynamo.test_case.TestCase): lc_key = state[0] lc_val = state[1] bar = [] - for _ in range(0, 4): + for _ in range(4): bar2 = [] - for _ in range(0, 3): + for _ in range(3): bar2.append( lc_key + lc_val + torch.tensor([0.1, 0.25, 0.4, 0.5, 0.1]) ) @@ -665,9 +665,9 @@ def forward(self, x, y): lc_key = state[0] lc_val = state[1] bar = [] - for _ in range(0, 4): + for _ in range(4): bar2 = [] - for _ in range(0, 3): + for _ in range(3): bar2.append( lc_key + lc_val + torch.tensor([0.1, 0.25, 0.4, 0.5, 0.1]) ) diff --git a/test/dynamo/test_functions.py b/test/dynamo/test_functions.py index d16676cda8ee..647033e63e4c 100644 --- a/test/dynamo/test_functions.py +++ b/test/dynamo/test_functions.py @@ -3627,7 +3627,7 @@ class GraphModule(torch.nn.Module): ) test(range(10), slice(1, 10, 2), expected=range(1, 10, 2)) - test(range(10), slice(None, 10, None), expected=range(0, 10)) + test(range(10), slice(None, 10, None), expected=range(10)) test(range(10), slice(-1, 7, None), expected=range(9, 7)) test(range(10), slice(-1, 7, 2), expected=range(9, 7, 2)) test(range(1, 10, 2), slice(3, 7, 2), expected=range(7, 11, 4)) diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py index 7cac7eca7239..c251ce28bac4 100644 --- a/test/dynamo/test_modules.py +++ b/test/dynamo/test_modules.py @@ -3047,7 +3047,7 @@ class OptimizedModuleTest(torch._dynamo.test_case.TestCase): def generate(x, c): return mod(x) + c - for _ in range(0, 10): + for _ in range(10): generate(torch.randn(10, 10), 0) generate(torch.randn(10, 10), 1) self.assertEqual(cnt.frame_count, 2) diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py index 362a541918c3..ac0515ac6ba8 100644 --- a/test/dynamo/test_repros.py +++ b/test/dynamo/test_repros.py @@ -4471,7 +4471,7 @@ class ReproTests(torch._dynamo.test_case.TestCase): compiled_fn = torch.compile(func, backend=cnt, fullgraph=True) requires_grad = func is not func1 - for _ in range(0, 5): + for _ in range(5): # Inputs eager_a = torch.ones([6], requires_grad=requires_grad) compiled_a = torch.ones([6], requires_grad=requires_grad) @@ -4623,7 +4623,7 @@ class ReproTests(torch._dynamo.test_case.TestCase): x = torch.rand([2, 2]) self.assertEqual(opt_fn(x, counter), fn(x, counter)) self.assertEqual(counter[0], 2) - for _ in range(0, 10): + for _ in range(10): opt_fn(x, counter) self.assertEqual(counter[0], 12) if torch._dynamo.config.assume_static_by_default: @@ -4784,7 +4784,7 @@ class ReproTests(torch._dynamo.test_case.TestCase): def test_contains_range_constprop(self): def fn(x): # dynamo should const prop to False - if 3 in range(0, 10): + if 3 in range(10): return x + 1 else: return x + 2 diff --git a/test/functorch/test_ac.py b/test/functorch/test_ac.py index fde84b6683ed..d0611f19cf2a 100644 --- a/test/functorch/test_ac.py +++ b/test/functorch/test_ac.py @@ -106,7 +106,7 @@ class MemoryBudgetTest(TestCase): return f(x, ws) _, eager_flops = get_mem_and_flops(call) - for budget 
in range(0, 11): + for budget in range(11): mem, flops = get_mem_and_flops(call, memory_budget=budget / 10) if budget <= 5: # We start saving the matmuls @@ -251,7 +251,7 @@ class MemoryBudgetTest(TestCase): return f(x, ws) expected = call() - for budget in range(0, 11): + for budget in range(11): memory_budget = budget / 10 torch._dynamo.reset() with config.patch(activation_memory_budget=memory_budget): diff --git a/test/inductor/test_codecache.py b/test/inductor/test_codecache.py index 78c2dd3de852..ca2e9007109d 100644 --- a/test/inductor/test_codecache.py +++ b/test/inductor/test_codecache.py @@ -1146,7 +1146,7 @@ class TestFxGraphCache(TestCase): raise unittest.SkipTest(f"requires {GPU_TYPE}") def fn1(x): - return x + torch.tensor(list(range(0, 12)), device=device) + return x + torch.tensor(list(range(12)), device=device) def fn2(x): return x + torch.tensor(list(range(1, 13)), device=device) diff --git a/test/inductor/test_compiled_autograd.py b/test/inductor/test_compiled_autograd.py index 2612af01f6ff..716d3bfafee2 100644 --- a/test/inductor/test_compiled_autograd.py +++ b/test/inductor/test_compiled_autograd.py @@ -1599,7 +1599,7 @@ main() eager_check() - for i in range(0, 5): + for i in range(5): with compiled_autograd._enable(compiler_fn): eager_check() diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py index 6645f17fb9ee..85405283e4bd 100644 --- a/test/inductor/test_max_autotune.py +++ b/test/inductor/test_max_autotune.py @@ -2095,7 +2095,7 @@ class TestMaxAutotune(TestCase): # Test loop. def test_func2(x): - for i in range(0, 10): + for i in range(10): x = torch.matmul(x, x) return x diff --git a/test/inductor/test_triton_kernels.py b/test/inductor/test_triton_kernels.py index 9a21220ce4d9..4739d00f1f4a 100644 --- a/test/inductor/test_triton_kernels.py +++ b/test/inductor/test_triton_kernels.py @@ -3005,7 +3005,7 @@ class MutationTests(torch._inductor.test_case.TestCase): mask = offsets < n_elements x = tl.load(in_ptr0 + offsets, mask=mask) y = tl.load(in_ptr1 + offsets, mask=mask) - for i in range(0, BLOCK_SIZE): + for i in range(BLOCK_SIZE): i = tl.multiple_of(i, 1) output = x + y tl.store(out_ptr + offsets, output, mask=mask) @@ -3160,7 +3160,7 @@ class MutationTests(torch._inductor.test_case.TestCase): x = tl.load(x_block_ptr) # Compute gating - for c2 in range(0, tl.cdiv(C2, BLOCK_SIZE_C2)): + for c2 in range(tl.cdiv(C2, BLOCK_SIZE_C2)): # Compute block pointers offs_c2 = c2 * BLOCK_SIZE_C2 + tl.arange(0, BLOCK_SIZE_C2) o_block_ptr = O_ptr + offs_m[:, None] * C2 + offs_c2[None, :] diff --git a/test/jit/xnnpack/test_xnnpack_delegate.py b/test/jit/xnnpack/test_xnnpack_delegate.py index b97765ed5bb0..f6c7832d5b28 100644 --- a/test/jit/xnnpack/test_xnnpack_delegate.py +++ b/test/jit/xnnpack/test_xnnpack_delegate.py @@ -32,7 +32,7 @@ class TestXNNPackBackend(unittest.TestCase): }, ) - for _ in range(0, 20): + for _ in range(20): sample_input = torch.randn(4, 4, 4) actual_output = scripted_module(sample_input) expected_output = lowered_module(sample_input) diff --git a/test/nn/test_convolution.py b/test/nn/test_convolution.py index 4cdcac707644..3c3b3f53e528 100644 --- a/test/nn/test_convolution.py +++ b/test/nn/test_convolution.py @@ -1292,7 +1292,7 @@ class TestConvolutionNN(NNTestCase): kernel_x = torch.zeros([3, 1, 1, radius * 2 + 1], device=image.device) image = torch.nn.functional.conv2d(image, kernel_x, groups=image.shape[-3]) - for i in range(0, 128): + for i in range(128): # This should not fail reproducer(radius=i) diff --git 
a/test/nn/test_embedding.py b/test/nn/test_embedding.py index fb9d842ce476..f21184290fa1 100644 --- a/test/nn/test_embedding.py +++ b/test/nn/test_embedding.py @@ -551,7 +551,7 @@ class TestEmbeddingNNDeviceType(NNTestCase): # Pull out the bag's indices from indices_1D, and fill any # remaining space with padding indices indices_in_bag = [] - for item_pos in range(0, max_indices_per_bag): + for item_pos in range(max_indices_per_bag): if (start + item_pos) < end: indices_in_bag.append(indices_1D[start + item_pos]) else: diff --git a/test/nn/test_multihead_attention.py b/test/nn/test_multihead_attention.py index 0c04e3b86b88..3dc6a586ced6 100644 --- a/test/nn/test_multihead_attention.py +++ b/test/nn/test_multihead_attention.py @@ -485,7 +485,7 @@ class TestMultiheadAttentionNN(NNTestCase): )[0] output_3d = output_3d.transpose(0, 1) # [N, T, D] - for i in range(0, batch_size): + for i in range(batch_size): output_2d = mta_model( query[i].unsqueeze(0).transpose(0, 1), key[i].unsqueeze(0).transpose(0, 1), diff --git a/test/nn/test_pooling.py b/test/nn/test_pooling.py index d282a885f4ed..c3a7b829b2b1 100644 --- a/test/nn/test_pooling.py +++ b/test/nn/test_pooling.py @@ -1135,7 +1135,7 @@ torch.cuda.synchronize() for size, kernel_size, stride, dilation, ceil_mode in itertools.product( sizes, kernel_sizes, strides, dilations, ceil_modes ): - padding = random.sample(range(0, math.floor(kernel_size / 2) + 1), 1) + padding = random.sample(range(math.floor(kernel_size / 2) + 1), 1) check( torch.randn(size, device=device, dtype=dtype), kernel_size, diff --git a/test/onnx/test_onnx_opset.py b/test/onnx/test_onnx_opset.py index 75de1f3fab83..16ca93dbfe2c 100644 --- a/test/onnx/test_onnx_opset.py +++ b/test/onnx/test_onnx_opset.py @@ -36,12 +36,12 @@ def check_onnx_opset_operator( # but the op's attributes can optionally be # specified as well assert len(ops) == len(graph.node) - for i in range(0, len(ops)): + for i in range(len(ops)): assert graph.node[i].op_type == ops[i]["op_name"] if "attributes" in ops[i]: attributes = ops[i]["attributes"] assert len(attributes) == len(graph.node[i].attribute) - for j in range(0, len(attributes)): + for j in range(len(attributes)): for attribute_field in attributes[j].keys(): assert attributes[j][attribute_field] == getattr( graph.node[i].attribute[j], attribute_field diff --git a/test/optim/test_lrscheduler.py b/test/optim/test_lrscheduler.py index cea85b07646f..3e65720a45b6 100644 --- a/test/optim/test_lrscheduler.py +++ b/test/optim/test_lrscheduler.py @@ -1509,7 +1509,7 @@ class TestLRScheduler(TestCase): 14.0 / 3, 29.0 / 6, ] - deltas = [2 * i for i in range(0, 2)] + deltas = [2 * i for i in range(2)] base_lrs = [1 + delta for delta in deltas] max_lrs = [5 + delta for delta in deltas] lr_targets = [[x + delta for x in lr_base_target] for delta in deltas] diff --git a/test/profiler/test_profiler.py b/test/profiler/test_profiler.py index 1461731a5998..a9321da3fbd3 100644 --- a/test/profiler/test_profiler.py +++ b/test/profiler/test_profiler.py @@ -1930,7 +1930,7 @@ assert KinetoStepTracker.current_step() == initial_step + 2 * niters event_list.table() def _check_all_gpu_present(self, gpu_dict, max_gpu_count): - for i in range(0, max_gpu_count): + for i in range(max_gpu_count): self.assertEqual(gpu_dict["GPU " + str(i)], 1) # Do json sanity testing. 
Checks that all events are between profiler start and end @@ -2139,8 +2139,8 @@ assert KinetoStepTracker.current_step() == initial_step + 2 * niters step_helper_funcs.append(event) self.assertEqual(len(prof_steps), 5) self.assertEqual(len(step_helper_funcs), 5) - for i in range(0, len(step_helper_funcs)): - for j in range(0, len(step_helper_funcs)): + for i in range(len(step_helper_funcs)): + for j in range(len(step_helper_funcs)): self.assertTrue( not self._partial_overlap(prof_steps[i], step_helper_funcs[j]) ) diff --git a/test/quantization/core/experimental/test_floatx.py b/test/quantization/core/experimental/test_floatx.py index ee7fe0a9d186..c4cea4073a5c 100644 --- a/test/quantization/core/experimental/test_floatx.py +++ b/test/quantization/core/experimental/test_floatx.py @@ -275,7 +275,7 @@ class TestFloat8Dtype(TestCase): IMO simpler to special case e8m0 here. """ - for biased_exponent in range(0, 256): + for biased_exponent in range(256): # iterate through all the possible options of guard, round, sticky bits # for the current exponent for grs in range(8): diff --git a/test/test_dataloader.py b/test/test_dataloader.py index da0c12082244..b9000a2c68d3 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -3494,7 +3494,7 @@ class TestIndividualWorkerQueue(TestCase): max_num_workers = 1 for batch_size in (8, 16, 32, 64): - for num_workers in range(0, min(6, max_num_workers)): + for num_workers in range(min(6, max_num_workers)): self._run_ind_worker_queue_test( batch_size=batch_size, num_workers=num_workers + 1 ) diff --git a/test/test_datapipe.py b/test/test_datapipe.py index e92fa2b0615d..2790145665b1 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -520,7 +520,7 @@ class TestIterableDataPipeBasic(TestCase): self.assertEqual(list(range(9)), list(n)) # Functional Test: Uneven DataPipes - source_numbers = list(range(0, 10)) + [10, 12] + source_numbers = list(range(10)) + [10, 12] numbers_dp = dp.iter.IterableWrapper(source_numbers) n1, n2 = numbers_dp.demux(2, lambda x: x % 2) self.assertEqual([0, 2, 4, 6, 8, 10, 12], list(n1)) @@ -1257,7 +1257,7 @@ class TestFunctionalIterDataPipe(TestCase): ) output1, output2 = list(dp1), list(dp2) self.assertEqual(list(range(5, 10)), output1) - self.assertEqual(list(range(0, 5)), output2) + self.assertEqual(list(range(5)), output2) # Functional Test: values of the same classification are lumped together, and unlimited buffer with warnings.catch_warnings(record=True) as wa: @@ -1271,7 +1271,7 @@ class TestFunctionalIterDataPipe(TestCase): self.assertRegex(str(wa[-1].message), r"Unlimited buffer size is set") output1, output2 = list(dp1), list(dp2) self.assertEqual(list(range(5, 10)), output1) - self.assertEqual(list(range(0, 5)), output2) + self.assertEqual(list(range(5)), output2) # Functional Test: classifier returns a value outside of [0, num_instance - 1] dp0 = input_dp.demux(num_instances=1, classifier_fn=lambda x: x % 2) diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py index fcc45521fbb1..9a6575cf184d 100644 --- a/test/test_dynamic_shapes.py +++ b/test/test_dynamic_shapes.py @@ -1385,7 +1385,7 @@ class f(torch.nn.Module): self.assertEqual(x.storage_offset(), y.storage_offset()) def test_tensor_factory_with_symint(self): - args = list(range(0, 3)) + args = list(range(3)) expected = torch.tensor(args) shape_env = ShapeEnv() @@ -4291,7 +4291,7 @@ def forward(self, arg0_1: "i64[1][1]cpu", arg1_1: "Sym(u1)", arg2_1: "i64[u1][1] start = start.item() N = 3 result = X0[start] - for i in range(0, 
N): + for i in range(N): result += X0[start + 1 + i] return result diff --git a/test/test_indexing.py b/test/test_indexing.py index fa91b5903410..99d84a65abca 100644 --- a/test/test_indexing.py +++ b/test/test_indexing.py @@ -902,7 +902,7 @@ class TestIndexing(TestCase): # Set window size W = 10 # Generate a list of lists, containing overlapping window indices - indices = [range(i, i + W) for i in range(0, N - W)] + indices = [range(i, i + W) for i in range(N - W)] for i in [len(indices), 100, 32]: windowed_data = t[indices[:i]] diff --git a/test/test_jit.py b/test/test_jit.py index 6a3c968f86dd..613903e9a116 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -3153,7 +3153,7 @@ class TestScript(JitTestCase): eplan = get_execution_plan(dstate) num_bailouts = eplan.code.num_bailouts() - for i in range(0, num_bailouts): + for i in range(num_bailouts): eplan.code.request_bailout(i) self.assertEqual(jitted(x), expected) @@ -5950,7 +5950,7 @@ a") # type: (int) -> int prev = 1 v = 1 - for i in range(0, x): + for i in range(x): save = v v = v + prev prev = save @@ -10938,7 +10938,7 @@ dedent """ # Test symbolic differentiation # Run Forward and Backward thrice to trigger autodiff graph - for i in range(0, 3): + for i in range(3): y = jit_module(x) y.backward(grad) x.grad.zero_() @@ -11802,7 +11802,7 @@ dedent """ def fn_zip_enumerate(x, y): # type: (List[int], List[int]) -> int sum = 0 - for (i, (j, v), k) in zip(x, enumerate(y), range(0, 100)): + for (i, (j, v), k) in zip(x, enumerate(y), range(100)): sum += i * j * v * k return sum diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index 1bda41f7f8f1..dba28f98cbf9 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -243,7 +243,7 @@ class TestTEFuser(JitTestCase): return x2.sum() with texpr_reductions_enabled(): - a = torch.tensor(list(range(0, 15)), dtype=torch.float, device="cpu") + a = torch.tensor(list(range(15)), dtype=torch.float, device="cpu") a = a.reshape(5, 3) scripted = self.checkScript(func, (a,)) self.assertLastGraphAllFused() @@ -259,7 +259,7 @@ class TestTEFuser(JitTestCase): return x.sum((-2,)) * 2 with texpr_reductions_enabled(): - a = torch.tensor(list(range(0, 15)), dtype=torch.float, device="cpu") + a = torch.tensor(list(range(15)), dtype=torch.float, device="cpu") a = a.reshape(5, 3) scripted = self.checkScript(func, (a,)) self.assertLastGraphAllFused() @@ -271,7 +271,7 @@ class TestTEFuser(JitTestCase): return x.sum((0,), keepdim=True, dtype=torch.double) * 2 with texpr_reductions_enabled(): - a = torch.tensor(list(range(0, 15)), dtype=torch.float, device="cpu") + a = torch.tensor(list(range(15)), dtype=torch.float, device="cpu") a = a.reshape(5, 3) self.checkScript(func, (a,)) @@ -2234,7 +2234,7 @@ class TestTEFuser(JitTestCase): indices = [0, 1, 2, 3] sets = [] - for i in range(0, len(indices) + 1): + for i in range(len(indices) + 1): for subset in combinations(indices, i): sets.append(subset) # noqa: PERF402 diff --git a/test/test_matmul_cuda.py b/test/test_matmul_cuda.py index 61f5642830dd..bf46ee0709fc 100644 --- a/test/test_matmul_cuda.py +++ b/test/test_matmul_cuda.py @@ -231,7 +231,7 @@ class TestMatmulCuda(InductorTestCase): def test_cublas_addmm_alignment(self, dtype): device = 'cuda' # perturb X, A, or B alignment - for idx in range(0, 3): + for idx in range(3): for offset in range(1, 3): offsets = [0, 0, 0] offsets[idx] = offset diff --git a/test/test_mps.py b/test/test_mps.py index 7346d1d26d44..e825fa77aa89 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ 
-1900,7 +1900,7 @@ class TestMPS(TestCaseMPS): res_cpu = torch.linalg.vector_norm(B_cpu, ord=3.5) self.assertEqual(res_mps, res_cpu) - for dim in range(0, B_mps.dim()): + for dim in range(B_mps.dim()): res_mps = torch.linalg.vector_norm(B_mps, ord=3.5, dim=dim) res_cpu = torch.linalg.vector_norm(B_cpu, ord=3.5, dim=dim) self.assertEqual(res_mps, res_cpu) @@ -2871,8 +2871,8 @@ class TestMPS(TestCaseMPS): def test_contiguous_slice_2d(self): def helper(shape): - for i in range(0, shape[0]): - for j in range(0, shape[1]): + for i in range(shape[0]): + for j in range(shape[1]): t_mps = torch.randn(shape, device="mps") t_cpu = t_mps.detach().clone().cpu() @@ -3432,12 +3432,12 @@ class TestMPS(TestCaseMPS): elems = torch.arange(n_tensors * n_tensor_elems, dtype=torch.float32) tensor_list = [] - for i in range(0, n_tensors - 1): + for i in range(n_tensors - 1): # create a list of contiguous view tensors (view tensor created by the slice op) t = elems[n_tensor_elems * i : n_tensor_elems * (i + 1)] tensor_list.append(t) - for i in range(0, n_tensors - 1): + for i in range(n_tensors - 1): t = tensor_list[i].view(1, n_tensor_elems) t_mps = t.to("mps") self.assertEqual(t, t_mps.cpu(), f"i={i}") @@ -4942,7 +4942,7 @@ class TestMPS(TestCaseMPS): x_mps = fn(torch.zeros(shape, device="mps"), dim=dim) self.assertEqual(x_cpu, x_mps.cpu()) for fn in [torch.any, torch.all]: - for dim in range(0, 4): + for dim in range(4): helper(fn, dim) # 6D tensor reductions @@ -9750,7 +9750,7 @@ class TestGatherScatter(TestCaseMPS): self.assertEqual(x_cpu, x_mps) def test_cast_gather_scatter(self): - for _ in range(0, 50): + for _ in range(50): input = np.random.randint(0, 255, size=(5, 5, 4), dtype=np.uint8) with torch.no_grad(): s = torch.tensor(input, dtype=torch.uint8, device="mps").unsqueeze(0) diff --git a/test/test_numa_binding.py b/test/test_numa_binding.py index 764156ff9b98..c599587e281d 100644 --- a/test/test_numa_binding.py +++ b/test/test_numa_binding.py @@ -549,7 +549,7 @@ class NumaBindingTest(TestCase): bound_logical_cpu_indices_0, # Gets an extra physical core due to odd number of physical cores on numa node # 3 physical cores total, 2 GPUs: GPU 0 gets 2 physical cores (CPUs 0-3) - set(range(0, 4)), + set(range(4)), ) bound_logical_cpu_indices_1 = ( @@ -677,7 +677,7 @@ class NumaBindingTest(TestCase): # 1 numa node, 2 L3 caches, 1 physical core per L3 cache = 2 logical CPUs per cache # L3 cache 0: CPUs 0-1, L3 cache 1: CPUs 2-3 # Both have same number of CPUs, so prefer lower cache key (0) - set(range(0, 2)), + set(range(2)), ) def test_binds_to_node_0_if_node_stored_as_minus_one(self) -> None: @@ -709,7 +709,7 @@ class NumaBindingTest(TestCase): # GPU 0 has numa node stored as -1, which is treated as numa node 0 # Each numa node has 1 * 1 * 2 = 2 logical CPUs # Numa node 0 has CPUs 0-1 - set(range(0, 2)), + set(range(2)), ) def test_callable_entrypoint_basic(self) -> None: diff --git a/test/test_reductions.py b/test/test_reductions.py index e4fa54491dd0..4a3235fbc50c 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -1710,7 +1710,7 @@ class TestReductions(TestCase): with_extremal=False, atol=None, rtol=None, exact_dtype=True, with_keepdim=False): # Test 0-d to 3-d tensors. 
- for ndims in range(0, 4): + for ndims in range(4): shape = _rand_shape(ndims, min_size=5, max_size=10) for n in range(ndims + 1): for c in combinations(list(range(ndims)), n): @@ -2623,7 +2623,7 @@ class TestReductions(TestCase): # Generate some random test cases ops = ['quantile', 'nanquantile'] inputs = [tuple(np.random.randint(2, 10, size=i)) for i in range(1, 4)] - quantiles = [tuple(np.random.rand(i)) for i in range(0, 5)] + quantiles = [tuple(np.random.rand(i)) for i in range(5)] keepdims = [True, False] # Add corner cases diff --git a/test/test_serialization.py b/test/test_serialization.py index 7c4208b6a0d6..a6e3ef23580d 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -295,7 +295,7 @@ class SerializationMixin: 5, 6 ] - for i in range(0, 100): + for i in range(100): data.append(0) t = torch.tensor(data, dtype=torch.uint8) diff --git a/test/test_sparse.py b/test/test_sparse.py index 866f38a316d7..196506a8e13d 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -5300,7 +5300,7 @@ class TestSparseAny(TestCase): x_dense = torch.eye(dense_dim, dtype=dtype, device=device) for sparse_dim_in in range(1, dense_dim): x_sparse = x_dense.to_sparse(sparse_dim_in) - for sparse_dim_out in range(0, dense_dim): + for sparse_dim_out in range(dense_dim): if sparse_dim_out == sparse_dim_in: self.assertTrue(x_sparse.to_sparse(sparse_dim_out).sparse_dim() == sparse_dim_out) else: diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py index 65e800f6eba1..45748c683621 100644 --- a/test/test_sparse_csr.py +++ b/test/test_sparse_csr.py @@ -135,7 +135,7 @@ class TestSparseCSRSampler(TestCase): index_dtype = torch.int32 for n_rows in range(1, 10): for n_cols in range(1, 10): - for nnz in range(0, n_rows * n_cols + 1): + for nnz in range(n_rows * n_cols + 1): crow_indices = self._make_crow_indices( n_rows, n_cols, nnz, device=device, dtype=index_dtype) diff --git a/test/test_static_runtime.py b/test/test_static_runtime.py index 893aea8e3130..df1e0c3e34fa 100644 --- a/test/test_static_runtime.py +++ b/test/test_static_runtime.py @@ -60,7 +60,7 @@ class MultiHeadAttentionLayer(nn.Module): # Taken from https://github.com/facebookresearch/dlrm/blob/master/dlrm_s_pytorch.py def create_mlp(ln, sigmoid_layer): layers = nn.ModuleList() - for i in range(0, len(ln) - 1): + for i in range(len(ln) - 1): n = ln[i] m = ln[i + 1] diff --git a/test/test_tensorboard.py b/test/test_tensorboard.py index cd527db88441..8ff6913887c8 100644 --- a/test/test_tensorboard.py +++ b/test/test_tensorboard.py @@ -200,7 +200,7 @@ class TestTensorBoardPyTorchNumpy(BaseTestCase): bucket_counts=counts.tolist(), ) - ints = torch.tensor(range(0, 100)).float() + ints = torch.tensor(range(100)).float() nbins = 100 counts = torch.histc(ints, bins=nbins, min=0, max=99) limits = torch.tensor(range(nbins)) diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py index 17d3a58535d6..57be409ab6b4 100644 --- a/test/test_tensorexpr.py +++ b/test/test_tensorexpr.py @@ -1216,7 +1216,7 @@ class TestTensorExprFuser(BaseTestClass): @torch.jit.script def test(x: torch.Tensor, y: torch.Tensor, z: int) -> torch.Tensor: b = y - for i in range(0, z): + for i in range(z): a = x + y b = b + y return b diff --git a/test/test_torch.py b/test/test_torch.py index 05ea6ea61db1..9b28b801348a 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -8424,7 +8424,7 @@ tensor([[[1.+1.j, 1.+1.j, 1.+1.j, ..., 1.+1.j, 1.+1.j, 1.+1.j], def test_Size_iter(self): for sizes in [iter([1, 2, 3, 4, 5]), range(1, 6)]: x = 
torch.Size(sizes) - for i in range(0, 5): + for i in range(5): self.assertEqual(x[i], i + 1) def test_t_not_2d_error(self): diff --git a/test/test_view_ops.py b/test/test_view_ops.py index 5bec225787cc..174632b07988 100644 --- a/test/test_view_ops.py +++ b/test/test_view_ops.py @@ -1559,7 +1559,7 @@ class TestOldViewOps(TestCase): self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) def _test_atleast_dim(self, torch_fn, np_fn, device, dtype): - for ndims in range(0, 5): + for ndims in range(5): shape = _rand_shape(ndims, min_size=5, max_size=10) for _ in range(ndims + 1): for with_extremal in [False, True]: diff --git a/test/test_xnnpack_integration.py b/test/test_xnnpack_integration.py index 481bd3c76a50..62e257790fd4 100644 --- a/test/test_xnnpack_integration.py +++ b/test/test_xnnpack_integration.py @@ -1316,7 +1316,7 @@ class TestXNNPACKConv1dTransformPass(TestCase): groups_list = range(1, 3) kernel_list = range(1, 4) stride_list = range(1, 3) - padding_list = range(0, 3) + padding_list = range(3) dilation_list = range(1, 3) for hparams in itertools.product( @@ -1401,7 +1401,7 @@ class TestXNNPACKConv1dTransformPass(TestCase): groups_list = range(1, 3) kernel_list = range(1, 4) stride_list = range(1, 3) - padding_list = range(0, 3) + padding_list = range(3) dilation_list = range(1, 3) output_features_list = range(1, 3) diff --git a/torch/_decomp/decompositions_for_jvp.py b/torch/_decomp/decompositions_for_jvp.py index e11540e0c2ba..fb4a4d85faa2 100644 --- a/torch/_decomp/decompositions_for_jvp.py +++ b/torch/_decomp/decompositions_for_jvp.py @@ -147,7 +147,7 @@ def native_layer_norm_backward( inner_dims = input_shape[axis:] outer_dims = input_shape[:axis] inner_dim_indices = list(range(axis, input_ndim)) - outer_dim_indices = list(range(0, axis)) + outer_dim_indices = list(range(axis)) N = 1 for i in inner_dims: diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py index 036f1ba7d01a..451776ef25fd 100644 --- a/torch/_dynamo/eval_frame.py +++ b/torch/_dynamo/eval_frame.py @@ -1248,7 +1248,7 @@ def argument_names( # signature. Assign names as {varargs}_0, {varargs}_1, ... assert fullargspec.varargs is not None, "More arguments than expected" input_strs += [ - f"{fullargspec.varargs}_{i}" for i in range(0, len(args) - len(input_strs)) + f"{fullargspec.varargs}_{i}" for i in range(len(args) - len(input_strs)) ] elif len(args) < len(fullargspec.args): # 3. 
If there are fewer arguments in `args` than `fullargspec.args`, @@ -1538,7 +1538,7 @@ class FlattenInputOutputSignature(torch.fx.Transformer): } self.new_args = [] - for i in range(0, len(flat_args)): + for i in range(len(flat_args)): arg = super().placeholder(f"arg{i}", (), {}) if i in matched_input_elements_to_fake: arg.node.meta["val"] = matched_input_elements_to_fake[i] diff --git a/torch/_inductor/dependencies.py b/torch/_inductor/dependencies.py index 0547b6b1db90..b431972521da 100644 --- a/torch/_inductor/dependencies.py +++ b/torch/_inductor/dependencies.py @@ -151,7 +151,7 @@ class MemoryDep(Dep): stride_to_index = {s: i for i, s in enumerate(self_strides)} order = [stride_to_index[s] for s in other_strides] - assert OrderedSet(order) == OrderedSet(range(0, self.num_vars)) + assert OrderedSet(order) == OrderedSet(range(self.num_vars)) return order def get_offset(self) -> sympy.Expr: diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py index e89be2299434..1ad443ff387e 100644 --- a/torch/_meta_registrations.py +++ b/torch/_meta_registrations.py @@ -1787,7 +1787,7 @@ def _padding_check_valid_input(input, padding, *, dim): for d in range(1, input_dim): valid_batch_mode = valid_batch_mode and input.size(d) != 0 else: - for d in range(0, input_dim): + for d in range(input_dim): valid_non_batch_mode = valid_non_batch_mode and input.size(d) != 0 # allow empty batch size but not other dimensions. diff --git a/torch/_numpy/_funcs_impl.py b/torch/_numpy/_funcs_impl.py index 4ab3b29d34b8..f57e7fb001fb 100644 --- a/torch/_numpy/_funcs_impl.py +++ b/torch/_numpy/_funcs_impl.py @@ -1449,7 +1449,7 @@ def rollaxis(a: ArrayLike, axis, start=0): # numpy returns a view, here we try returning the tensor itself # return tensor[...] return a - axes = list(range(0, n)) + axes = list(range(n)) axes.remove(axis) axes.insert(start, axis) return a.view(axes) diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py index 13d6efd4ac67..822f949d536f 100644 --- a/torch/_refs/__init__.py +++ b/torch/_refs/__init__.py @@ -4738,7 +4738,7 @@ def transpose(a: TensorLikeType, dim0: int, dim1: int) -> TensorLikeType: if a.ndim <= 1 or dim0 == dim1: return aten.alias.default(a) - _permutation = list(range(0, a.ndim)) + _permutation = list(range(a.ndim)) _permutation[_dim0] = _dim1 _permutation[_dim1] = _dim0 return torch.permute(a, _permutation) diff --git a/torch/_tensor_str.py b/torch/_tensor_str.py index af4deb471db2..86a745f09b44 100644 --- a/torch/_tensor_str.py +++ b/torch/_tensor_str.py @@ -307,7 +307,7 @@ def _tensor_str_with_formatter(self, indent, summarize, formatter1, formatter2=N _tensor_str_with_formatter( self[i], indent + 1, summarize, formatter1, formatter2 ) - for i in range(0, PRINT_OPTS.edgeitems) + for i in range(PRINT_OPTS.edgeitems) ] + ["..."] + [ @@ -322,7 +322,7 @@ def _tensor_str_with_formatter(self, indent, summarize, formatter1, formatter2=N _tensor_str_with_formatter( self[i], indent + 1, summarize, formatter1, formatter2 ) - for i in range(0, self.size(0)) + for i in range(self.size(0)) ] tensor_str = ("," + "\n" * (dim - 1) + " " * (indent + 1)).join(slices) @@ -406,7 +406,7 @@ def get_summarized_data(self): if not PRINT_OPTS.edgeitems: return self.new_empty([0] * self.dim()) elif self.size(0) > 2 * PRINT_OPTS.edgeitems: - start = [self[i] for i in range(0, PRINT_OPTS.edgeitems)] + start = [self[i] for i in range(PRINT_OPTS.edgeitems)] end = [self[i] for i in range(len(self) - PRINT_OPTS.edgeitems, len(self))] return torch.stack([get_summarized_data(x) for x in 
(start + end)]) else: diff --git a/torch/ao/ns/fx/pattern_utils.py b/torch/ao/ns/fx/pattern_utils.py index 242d1740d91b..8339ce8f57c1 100644 --- a/torch/ao/ns/fx/pattern_utils.py +++ b/torch/ao/ns/fx/pattern_utils.py @@ -28,7 +28,7 @@ def get_type_a_related_to_b( for s in base_name_to_sets_of_related_ops.values(): s_list = list(s) # add every bidirectional pair - for idx_0 in range(0, len(s_list)): + for idx_0 in range(len(s_list)): for idx_1 in range(idx_0, len(s_list)): type_a_related_to_b.add((s_list[idx_0], s_list[idx_1])) type_a_related_to_b.add((s_list[idx_1], s_list[idx_0])) diff --git a/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py b/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py index ef6a35686c7d..4330b0e24253 100644 --- a/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py +++ b/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py @@ -158,9 +158,9 @@ class ActivationSparsifier: # data should be a list [aggregated over each feature only] if data is None: out_data = [ - 0 for _ in range(0, len(features)) + 0 for _ in range(len(features)) ] # create one in case of 1st forward - self.state[name]["mask"] = [0 for _ in range(0, len(features))] + self.state[name]["mask"] = [0 for _ in range(len(features))] else: out_data = data # a list @@ -336,7 +336,7 @@ class ActivationSparsifier: return input_data * mask else: # apply per feature, feature_dim - for feature_idx in range(0, len(features)): + for feature_idx in range(len(features)): feature = ( torch.Tensor([features[feature_idx]]) .long() diff --git a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_disk_savings.py b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_disk_savings.py index 8192b617139b..0e25f59cea64 100644 --- a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_disk_savings.py +++ b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_disk_savings.py @@ -99,7 +99,7 @@ def sparsify_model(path_to_model, sparsified_model_dump_path): sparse_block_shapes (List of tuples) List of sparse block shapes to be sparsified on """ - sparsity_levels = [sl / 10 for sl in range(0, 10)] + sparsity_levels = [sl / 10 for sl in range(10)] sparsity_levels += [0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0] norms = ["L1", "L2"] diff --git a/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py b/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py index 442639be9b21..5a36e13c7b46 100644 --- a/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py +++ b/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py @@ -299,7 +299,7 @@ class TestTrainingAwareCallback(TestCase): self._check_on_train_start(pl_module, callback, sparsifier_args, scheduler_args) num_epochs = 5 - for _ in range(0, num_epochs): + for _ in range(num_epochs): self._check_on_train_epoch_start(pl_module, callback) self._simulate_update_param_model(pl_module) self._check_on_train_epoch_end(pl_module, callback) diff --git a/torch/ao/pruning/sparsifier/nearly_diagonal_sparsifier.py b/torch/ao/pruning/sparsifier/nearly_diagonal_sparsifier.py index a4d42ea80328..26fb3a98b8fb 100644 --- a/torch/ao/pruning/sparsifier/nearly_diagonal_sparsifier.py +++ b/torch/ao/pruning/sparsifier/nearly_diagonal_sparsifier.py @@ -53,7 +53,7 @@ class NearlyDiagonalSparsifier(base_sparsifier.BaseSparsifier): 
"nearliness cannot be larger than the dimensions of tensor." ) - for row in range(0, height): + for row in range(height): # Bounds of entries that needs to be set to 1 low = max(0, row - dist_to_diagonal) high = min(width, row + dist_to_diagonal + 1) diff --git a/torch/ao/quantization/experimental/observer.py b/torch/ao/quantization/experimental/observer.py index 7d9432ab27ec..e61fcb67c94a 100644 --- a/torch/ao/quantization/experimental/observer.py +++ b/torch/ao/quantization/experimental/observer.py @@ -68,10 +68,10 @@ class APoTObserver(ObserverBase): p_all = [] # create levels - for i in range(0, self.n): + for i in range(self.n): p_curr = torch.tensor([0]) - for j in range(0, (2**self.k - 2) + 1): + for j in range((2**self.k - 2) + 1): curr_ele = 2 ** (-(i + j * self.n)) p_append = torch.tensor([curr_ele]) p_curr = torch.cat((p_curr, p_append)) diff --git a/torch/ao/quantization/fx/_decomposed.py b/torch/ao/quantization/fx/_decomposed.py index 160e9aa3afef..b145cbfaeeba 100644 --- a/torch/ao/quantization/fx/_decomposed.py +++ b/torch/ao/quantization/fx/_decomposed.py @@ -1159,7 +1159,7 @@ class FakeQuantPerChannel(torch.autograd.Function): f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}" ) assert axis < input.dim(), f"Expecting axis to be < {input.dim()}" - broadcast_dims = list(range(0, axis)) + list(range(axis + 1, input.ndim)) + broadcast_dims = list(range(axis)) + list(range(axis + 1, input.ndim)) unsqueeze_scales = _unsqueeze_multiple(scales, broadcast_dims) unsqueeze_zero_points = _unsqueeze_multiple(zero_points, broadcast_dims) temp = torch.round(input * (1.0 / unsqueeze_scales)) + unsqueeze_zero_points diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 322d39f72202..cdab6259d85b 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -1212,7 +1212,7 @@ class KinetoStepTracker: "Profiler step count has increased more than 1 - " f"current_step = {cls._current_step} step dict = {cls._step_dict}" ) - for _ in range(0, delta): + for _ in range(delta): _kineto_step() cls._current_step = new_step return cls._current_step diff --git a/torch/distributed/_pycute/layout.py b/torch/distributed/_pycute/layout.py index be25cad2e953..04ae5d1fa5fd 100644 --- a/torch/distributed/_pycute/layout.py +++ b/torch/distributed/_pycute/layout.py @@ -162,7 +162,7 @@ def coalesce(layout: Layout, profile: LayoutProfile = None) -> Layout: assert len(layout) >= len(profile) return make_layout( chain( - (coalesce(layout[i], profile[i]) for i in range(0, len(profile))), # type: ignore[arg-type] + (coalesce(layout[i], profile[i]) for i in range(len(profile))), # type: ignore[arg-type] (layout[i] for i in range(len(profile), len(layout))), ) ) @@ -203,7 +203,7 @@ def filter(layout: Layout, profile: LayoutProfile = None) -> Layout: assert len(layout) >= len(profile) return make_layout( chain( - (filter(layout[i], profile[i]) for i in range(0, len(profile))), # type: ignore[arg-type] + (filter(layout[i], profile[i]) for i in range(len(profile))), # type: ignore[arg-type] (layout[i] for i in range(len(profile), len(layout))), ) ) @@ -233,7 +233,7 @@ def composition(layoutA: Layout, layoutB: LayoutInput) -> Layout: assert len(layoutA) >= len(layoutB) return make_layout( chain( - (composition(layoutA[i], layoutB[i]) for i in range(0, len(layoutB))), # type: ignore[arg-type] + (composition(layoutA[i], layoutB[i]) for i in range(len(layoutB))), # type: ignore[arg-type] (layoutA[i] for i in range(len(layoutB), len(layoutA))), ) ) @@ -371,7 +371,7 
@@ def logical_divide(layoutA: Layout, layoutB: LayoutInput) -> Layout: chain( ( logical_divide(layoutA[i], layoutB[i]) # type: ignore[arg-type] - for i in range(0, len(layoutB)) + for i in range(len(layoutB)) ), (layoutA[i] for i in range(len(layoutB), len(layoutA))), ) @@ -396,7 +396,7 @@ def logical_product(layoutA: Layout, layoutB: LayoutInput) -> Layout: chain( ( logical_product(layoutA[i], layoutB[i]) # type: ignore[arg-type] - for i in range(0, len(layoutB)) + for i in range(len(layoutB)) ), (layoutA[i] for i in range(len(layoutB), len(layoutA))), ) @@ -421,14 +421,14 @@ def hier_unzip( # A layout with shape ((A,a),(B,b),(C,c)) split = make_layout( hier_unzip(splitter, layoutA[i], layoutB[i]) # type: ignore[arg-type] - for i in range(0, len(layoutB)) + for i in range(len(layoutB)) ) # Gather to shape ((A,B,C,...),(a,b,c,...,y,z)) return make_layout( - make_layout(split[i][0] for i in range(0, len(layoutB))), # type: ignore[arg-type] + make_layout(split[i][0] for i in range(len(layoutB))), # type: ignore[arg-type] make_layout( chain( # type: ignore[arg-type] - (split[i][1] for i in range(0, len(layoutB))), + (split[i][1] for i in range(len(layoutB))), (layoutA[i] for i in range(len(layoutB), len(layoutA))), ) ), diff --git a/torch/distributed/_symmetric_memory/__init__.py b/torch/distributed/_symmetric_memory/__init__.py index 1c576e886fe1..132a40977f85 100644 --- a/torch/distributed/_symmetric_memory/__init__.py +++ b/torch/distributed/_symmetric_memory/__init__.py @@ -1671,7 +1671,7 @@ def _low_contention_all_gather( local_buf.copy_(tensor) # pull symm_mem.barrier() - for step in range(0, world_size): + for step in range(world_size): remote_rank = (rank - step) % world_size src_buf = symm_mem.get_buffer(remote_rank, tensor.shape, tensor.dtype) chunks[remote_rank].copy_(src_buf) @@ -1706,7 +1706,7 @@ def _low_contention_reduce_scatter_with_symm_mem_input( with _get_backend_stream(): # pull + offline reduction symm_mem.barrier() - for step in range(0, world_size): + for step in range(world_size): remote_rank = (rank - step) % world_size src_buf = symm_mem.get_buffer( remote_rank, @@ -1743,7 +1743,7 @@ def _low_contention_reduce_scatter_with_workspace( with _get_backend_stream(): # push + offline reduction workspace.barrier() - for step in range(0, world_size): + for step in range(world_size): remote_rank = (rank - step) % world_size dst_buf = workspace.get_buffer( remote_rank, chunks[0].shape, chunks[0].dtype, chunks[0].numel() * rank diff --git a/torch/distributed/elastic/multiprocessing/api.py b/torch/distributed/elastic/multiprocessing/api.py index d91974548221..9bb580c5bf78 100644 --- a/torch/distributed/elastic/multiprocessing/api.py +++ b/torch/distributed/elastic/multiprocessing/api.py @@ -727,7 +727,7 @@ class MultiprocessContext(PContext): # pipe. 
Hence to prevent deadlocks on large return values, # we opportunistically try queue.get on each join call # See: https://docs.python.org/2/library/multiprocessing.html#all-platforms - for local_rank in range(0, self.nprocs): + for local_rank in range(self.nprocs): return_queue = self._ret_vals[local_rank] if not return_queue.empty(): # save the return values temporarily into a member var diff --git a/torch/distributed/elastic/timer/local_timer.py b/torch/distributed/elastic/timer/local_timer.py index d55cc6ac6e37..5e66ef3fae34 100644 --- a/torch/distributed/elastic/timer/local_timer.py +++ b/torch/distributed/elastic/timer/local_timer.py @@ -59,7 +59,7 @@ class MultiprocessingRequestQueue(RequestQueue): def get(self, size, timeout: float) -> list[TimerRequest]: requests = [] wait = timeout - for _ in range(0, size): + for _ in range(size): start = time.time() try: diff --git a/torch/distributed/tensor/_dtensor_spec.py b/torch/distributed/tensor/_dtensor_spec.py index e12f41c4858b..42cb7fcd7c33 100644 --- a/torch/distributed/tensor/_dtensor_spec.py +++ b/torch/distributed/tensor/_dtensor_spec.py @@ -107,7 +107,7 @@ class DTensorSpec: # follow default left-to-right device order if shard_order is not specified tensor_dim_to_mesh_dims: defaultdict[int, list[int]] = defaultdict(list) mesh_ndim = len(placements) - for mesh_dim in range(0, mesh_ndim): + for mesh_dim in range(mesh_ndim): # shard_order doesn't work with _StridedShard if isinstance(placements[mesh_dim], _StridedShard): return () diff --git a/torch/distributed/tensor/parallel/fsdp.py b/torch/distributed/tensor/parallel/fsdp.py index 6cffbdb83d2f..f5367397cc80 100644 --- a/torch/distributed/tensor/parallel/fsdp.py +++ b/torch/distributed/tensor/parallel/fsdp.py @@ -306,7 +306,7 @@ def _all_gather_dtensor( placements = list(copy.deepcopy(tensor.placements)) # FSDP + TP: [Shard(0), tp_placement] -> [Replicate(), tp_placement] # HSDP + TP: [Replicate(), Shard(0), tp_placement] -> [Replicate(), Replicate(), tp_placement] - for i in range(0, len(placements) - 1): + for i in range(len(placements) - 1): placements[i] = Replicate() tensor = tensor.redistribute( device_mesh=tensor.device_mesh, diff --git a/torch/nested/_internal/ops.py b/torch/nested/_internal/ops.py index f52bfab2a8b3..bdca74c13b1d 100644 --- a/torch/nested/_internal/ops.py +++ b/torch/nested/_internal/ops.py @@ -1112,7 +1112,7 @@ def chunk_default(func, *args, **kwargs): # the input number; it can be counter-intuitive, but it matches dense behavior. 
return [ NestedTensor(values=chunk_values[i], **(nested_kwargs[i])) - for i in range(0, len(chunk_values)) + for i in range(len(chunk_values)) ] else: return [ diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py b/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py index bcd36a6ac41b..3f92f6418c89 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py @@ -1005,7 +1005,7 @@ def _interpolate_size_to_scales(g: jit_utils.GraphContext, input, output_size, d if i < 2 else float(output_size[-(dim - i)]) / float(input.type().sizes()[-(dim - i)]) - for i in range(0, dim) + for i in range(dim) ] scales = g.op( "Constant", value_t=torch.tensor(scales_constant, dtype=torch.float32) diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_opset12.py b/torch/onnx/_internal/torchscript_exporter/symbolic_opset12.py index 822e14556768..d4b887560f9b 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_opset12.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_opset12.py @@ -331,7 +331,7 @@ def unfold(g: jit_utils.GraphContext, input, dimension, size, step): ndim = symbolic_helper._get_tensor_rank(input) assert ndim is not None - perm = list(range(0, ndim)) + perm = list(range(ndim)) perm.append(perm.pop(dimension)) unsqueeze_list = [] diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py b/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py index bde072608088..8ba8e6ee6622 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py @@ -116,7 +116,7 @@ def _interpolate(name, dim, interpolate_mode): if i < 2 else float(output_size[-(dim - i)]) / float(input.type().sizes()[-(dim - i)]) - for i in range(0, dim) + for i in range(dim) ] return g.op("Upsample", input, mode_s=interpolate_mode, scales_f=scales) diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py b/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py index 9b7aba64ef31..16e94b91f89f 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py @@ -840,7 +840,7 @@ def t(g: jit_utils.GraphContext, self): def numpy_T(g: jit_utils.GraphContext, input): ndim = symbolic_helper._get_tensor_rank(input) assert ndim is not None - perm = list(reversed(range(0, ndim))) + perm = list(reversed(range(ndim))) return g.op("Transpose", input, perm_i=perm) @@ -990,7 +990,7 @@ def transpose(g: jit_utils.GraphContext, self, dim0, dim1): @_onnx_symbolic("aten::permute") @symbolic_helper.parse_args("v", "is") def permute(g: jit_utils.GraphContext, self, dims): - if dims == list(range(0, len(dims))): + if dims == list(range(len(dims))): return self return g.op("Transpose", self, perm_i=dims) @@ -1368,7 +1368,7 @@ def get_pool_ceil_padding(input, kernel_size, stride, padding): ) ceiled_output_dim = [ math.ceil((dim[i] + 2 * padding[i] - kernel_size[i]) / float(stride[i])) + 1 - for i in range(0, len(padding)) + for i in range(len(padding)) ] # ensure last pooling starts inside ceiled_output_dim = [ @@ -1377,7 +1377,7 @@ def get_pool_ceil_padding(input, kernel_size, stride, padding): if (((ceiled_output_dim[i] - 1) * stride[i]) >= (dim[i] + padding[i])) else ceiled_output_dim[i] ) - for i in range(0, len(ceiled_output_dim)) + for i in range(len(ceiled_output_dim)) ] padding_ceil = [ ( @@ -1392,7 +1392,7 @@ def 
get_pool_ceil_padding(input, kernel_size, stride, padding): ) ) ) - for i in range(0, len(padding)) + for i in range(len(padding)) ] # ensure padding is not > kernel_size padding_ceil = [ @@ -1405,7 +1405,7 @@ def get_pool_ceil_padding(input, kernel_size, stride, padding): if ((padding_ceil[i] + 2 * padding[i]) >= (kernel_size[i])) else int(padding_ceil[i]) ) - for i in range(0, len(padding_ceil)) + for i in range(len(padding_ceil)) ] return padding_ceil @@ -1697,14 +1697,14 @@ def _adaptive_pool(name, type, tuple_fn, fn=None): name, "input size not accessible", input ) # verify if output size % input size = 0 for all dim - mod = [dim[i] % output_size[i] for i in range(0, len(dim))] + mod = [dim[i] % output_size[i] for i in range(len(dim))] if mod != [0] * len(mod): if output_size == [1] * len(output_size): return g.op("GlobalMaxPool", input), None return symbolic_helper._unimplemented( name, "output size that are not factor of input size", output_size_value ) - k = [int(dim[i] / output_size[i]) for i in range(0, len(dim))] + k = [int(dim[i] / output_size[i]) for i in range(len(dim))] # call max_poolxd_with_indices to get indices in the output if type == "MaxPool": # pyrefly: ignore # not-callable @@ -2906,7 +2906,7 @@ def unfold(g: jit_utils.GraphContext, input, dimension, size, step): for low, hi in zip(low_indices, hi_indices) ] ndim = len(sizes) - perm = list(range(0, ndim)) + perm = list(range(ndim)) perm.append(perm.pop(dimension)) unsqueeze = [ symbolic_helper._unsqueeze_helper( diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 82e630519eb8..0cecc762bce4 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -11615,7 +11615,7 @@ def reference_searchsorted(sorted_sequence, boundary, out_int32=False, right=Fal # numpy searchsorted only supports 1D inputs so we split up ND inputs orig_shape = boundary.shape num_splits = np.prod(sorted_sequence.shape[:-1]) - splits = range(0, num_splits) + splits = range(num_splits) sorted_sequence, boundary = sorted_sequence.reshape(num_splits, -1), boundary.reshape(num_splits, -1) if sorter is not None: sorter = sorter.reshape(num_splits, -1) @@ -16258,7 +16258,7 @@ op_db: list[OpInfo] = [ aten_backward_name='_prelu_kernel_backward', ref=lambda x, weight: np.maximum(0., x) + np.minimum(0., x) * - (weight if x.ndim == 1 else weight.reshape([weight.size if i == 1 else 1 for i in range(0, x.ndim)])), + (weight if x.ndim == 1 else weight.reshape([weight.size if i == 1 else 1 for i in range(x.ndim)])), dtypes=floating_types_and(torch.bfloat16, torch.float16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index 68a35e8c40a1..3153359326dc 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -2896,7 +2896,7 @@ def _multilabelmarginloss_reference(input, target): sum = 0 for target_index in targets: - for i in range(0, len(input)): + for i in range(len(input)): if i not in targets: sum += max(0, 1 - input[target_index] + input[i]) @@ -2914,7 +2914,7 @@ def multilabelmarginloss_reference(input, target, reduction='mean'): n = input.size(0) dim = input.size(1) output = input.new(n).zero_() - for i in range(0, n): + for i in range(n): output[i] = _multilabelmarginloss_reference(input[i], target[i]) if reduction == 'mean': @@ -2955,7 +2955,7 @@ def 
_multimarginloss_reference(input, target_idx, p, margin, weight): weight = input.new(len(input)).fill_(1) output = 0 - for i in range(0, len(input)): + for i in range(len(input)): if i != target_idx: output += weight[target_idx] * (max(0, (margin - input[target_idx] + input[i])) ** p) return output @@ -2972,7 +2972,7 @@ def multimarginloss_reference(input, target, p=1, margin=1, weight=None, reducti n = input.size(0) dim = input.size(1) output = input.new(n) - for x in range(0, n): + for x in range(n): output[x] = _multimarginloss_reference(input[x], target[x], p, margin, weight) if reduction == 'mean': @@ -2987,7 +2987,7 @@ def multimarginloss_reference(input, target, p=1, margin=1, weight=None, reducti def cosineembeddingloss_reference(input1, input2, target, margin=0, reduction='mean'): def _cos(a, b): cos = a.new(a.size(0)) - for i in range(0, a.size(0)): + for i in range(a.size(0)): cos[i] = (a[i] * b[i]).sum() / ((((a[i] * a[i]).sum() + 1e-12) * ((b[i] * b[i]).sum() + 1e-12)) ** 0.5) return cos diff --git a/torch/testing/_internal/distributed/_tensor/common_dtensor.py b/torch/testing/_internal/distributed/_tensor/common_dtensor.py index a9beb0e60865..22d6d8e7dede 100644 --- a/torch/testing/_internal/distributed/_tensor/common_dtensor.py +++ b/torch/testing/_internal/distributed/_tensor/common_dtensor.py @@ -705,7 +705,7 @@ class LocalDTensorTestBase(DTensorTestBase): self.skipTest(msg) def _get_local_tensor_mode(self): - return LocalTensorMode(frozenset(range(0, self.world_size))) + return LocalTensorMode(frozenset(range(self.world_size))) def setUp(self) -> None: super().setUp() diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index c41602d43994..499341b07951 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -658,13 +658,13 @@ class DistributedTest: return (group, group_id, rank) def _init_full_group_test(self, **kwargs): - group = list(range(0, dist.get_world_size())) + group = list(range(dist.get_world_size())) group_id = dist.new_group(**kwargs) rank = dist.get_rank() return (group, group_id, rank) def _init_global_test(self): - group = list(range(0, dist.get_world_size())) + group = list(range(dist.get_world_size())) group_id = dist.group.WORLD rank = dist.get_rank() return (group, group_id, rank) @@ -1114,7 +1114,7 @@ class DistributedTest: averager = averagers.PeriodicModelAverager( period=period, warmup_steps=warmup_steps ) - for step in range(0, 20): + for step in range(20): # Reset the parameters at every step. param.data = copy.deepcopy(tensor) for params in model.parameters(): @@ -1143,7 +1143,7 @@ class DistributedTest: averager = averagers.PeriodicModelAverager( period=period, warmup_steps=warmup_steps ) - for step in range(0, 20): + for step in range(20): # Reset the parameters at every step. for param_group in opt.param_groups: for params in param_group["params"]: @@ -1203,7 +1203,7 @@ class DistributedTest: averager = averagers.PeriodicModelAverager( period=period, warmup_steps=warmup_steps ) - for step in range(0, 20): + for step in range(20): # Reset the parameters at every step. param.data = copy.deepcopy(tensor) for params in model.parameters(): @@ -1284,7 +1284,7 @@ class DistributedTest: expected_global_avg_tensor = ( torch.ones_like(param.data) * sum(range(world_size)) / world_size ) - for step in range(0, 25): + for step in range(25): # Reset the parameters at every step. 
param.data = copy.deepcopy(tensor) for params in model.parameters(): @@ -1390,7 +1390,7 @@ class DistributedTest: for val in ["1", "0"]: os.environ["TORCH_NCCL_BLOCKING_WAIT"] = val - for src in range(0, world_size): + for src in range(world_size): send_tensor = _build_tensor(rank + 1, device_id=device_id).fill_( src ) @@ -1409,7 +1409,7 @@ class DistributedTest: for req in reqs: req.wait() - for src in range(0, world_size): + for src in range(world_size): self.assertEqual(recv_tensors[src], expected_tensors[src]) self._barrier() @@ -1505,7 +1505,7 @@ class DistributedTest: rank = dist.get_rank() p2p_op_list = [] - for src in range(0, dist.get_world_size()): + for src in range(dist.get_world_size()): if src == rank: continue send_tensor = _build_tensor(rank + 1) @@ -1528,7 +1528,7 @@ class DistributedTest: rank = dist.get_rank() p2p_op_list = [] - for src in range(0, dist.get_world_size()): + for src in range(dist.get_world_size()): if src == rank: continue send_tensor = _build_tensor(rank + 1) @@ -1602,10 +1602,10 @@ class DistributedTest: tensor = _build_tensor(rank + 1, device_id=device_id) profiler_cls = profiler_ctx if profiler_ctx is not None else nullcontext() with profiler_cls as prof: - for src in range(0, world_size): + for src in range(world_size): if src == rank: # Send mode - for dst in range(0, world_size): + for dst in range(world_size): if dst == rank: continue dist.send(tensor, dst) @@ -1674,10 +1674,10 @@ class DistributedTest: tensor = _build_tensor(send_size) ctx = profiler_ctx if profiler_ctx is not None else nullcontext() with ctx as prof: - for src in range(0, dist.get_world_size()): + for src in range(dist.get_world_size()): if src == rank: # Send mode - for dst in range(0, dist.get_world_size()): + for dst in range(dist.get_world_size()): if dst == rank: continue dist.send(tensor, dst) @@ -1742,10 +1742,10 @@ class DistributedTest: ctx = profiler_ctx if profiler_ctx is not None else nullcontext() with ctx as prof: - for dst in range(0, dist.get_world_size()): + for dst in range(dist.get_world_size()): if dst == rank: # Recv mode - for dst in range(0, dist.get_world_size()): + for dst in range(dist.get_world_size()): if dst == rank: continue @@ -1846,10 +1846,10 @@ class DistributedTest: tensor = _build_tensor(send_recv_size, value=rank) ctx = profiler_ctx if profiler_ctx is not None else nullcontext() with ctx as prof: - for dst in range(0, world_size): + for dst in range(world_size): if dst == rank: # Recv mode - for src in range(0, world_size): + for src in range(world_size): if src == rank: continue output_tensor = _build_tensor(send_recv_size, value=-1) @@ -7480,7 +7480,7 @@ class DistributedTest: for baseline_iter in baseline_num_iters: for offset in iteration_offsets: mapping = dict.fromkeys( - range(0, num_early_join_ranks), baseline_iter + range(num_early_join_ranks), baseline_iter ) # if num_early_join_ranks > 1, ranks > 0 that will join early # iterate offset//2 more times than rank 0, to test nodes diff --git a/torch/testing/_internal/distributed/multi_threaded_pg.py b/torch/testing/_internal/distributed/multi_threaded_pg.py index 2cc22cb7c23a..79aff05b3421 100644 --- a/torch/testing/_internal/distributed/multi_threaded_pg.py +++ b/torch/testing/_internal/distributed/multi_threaded_pg.py @@ -166,7 +166,7 @@ class AllReduce: # collect all data to the list and make them # all on rank 0 device tensors = [ - data[src_rank][i].to(rank_0_device) for src_rank in range(0, len(data)) + data[src_rank][i].to(rank_0_device) for src_rank in range(len(data)) ] # now 
mimic reduce across all ranks diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py index 1d6c7500c5ad..3c5c9101e43c 100644 --- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py +++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py @@ -266,7 +266,7 @@ class CommonDistAutogradTest(RpcAgentTestFixture): grads = dist_autograd.get_gradients(context_id) nargs = len(args) ngrads = 0 - for i in range(0, nargs): + for i in range(nargs): if local_grads[i] is not None: self.assertIn(args[i], grads) self.assertEqual(local_grads[i], grads[args[i]]) @@ -1973,7 +1973,7 @@ class DistAutogradTest(CommonDistAutogradTest): DistAutogradTest._test_clean_context_backward_context_id = context_id # Send the context id to all nodes. - for i in range(0, self.world_size): + for i in range(self.world_size): if i != self.rank: rank_distance = (i - self.rank + self.world_size) % self.world_size rpc.rpc_sync( @@ -1988,7 +1988,7 @@ class DistAutogradTest(CommonDistAutogradTest): self.assertEqual(self.world_size - 1, len(known_context_ids)) t1 = torch.rand((3, 3), requires_grad=True) - for i in range(0, 100): + for i in range(100): dst = self._next_rank() t1 = rpc.rpc_sync(worker_name(dst), torch.add, args=(t1, t1)) diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index 4ec964092b39..03469e473921 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -1818,7 +1818,7 @@ class RpcTest(RpcAgentTestFixture, RpcTestCommon): # Spawn multiple threads that send RPCs to ensure keys are correctly # prefixed when there are multiple RPCs being created/in flight at the # same time. 
- dst_ranks = [rank for rank in range(0, self.world_size) if rank != self.rank] + dst_ranks = [rank for rank in range(self.world_size) if rank != self.rank] def rpc_with_profiling(dst_worker): with _profile() as prof: @@ -1884,7 +1884,7 @@ class RpcTest(RpcAgentTestFixture, RpcTestCommon): if self.rank != 1: return - dst_ranks = [rank for rank in range(0, self.world_size) if rank != self.rank] + dst_ranks = [rank for rank in range(self.world_size) if rank != self.rank] for dst in dst_ranks: dst_worker = worker_name(dst) with _profile() as prof: diff --git a/torch/testing/_internal/jit_utils.py b/torch/testing/_internal/jit_utils.py index e98d0e482683..ce8e68ae1e2c 100644 --- a/torch/testing/_internal/jit_utils.py +++ b/torch/testing/_internal/jit_utils.py @@ -439,7 +439,7 @@ class JitTestCase(JitCommonTestCase): state = model.get_debug_state() plan = get_execution_plan(state) num_bailouts = plan.code.num_bailouts() - for i in range(0, num_bailouts): + for i in range(num_bailouts): plan.code.request_bailout(i) bailout_outputs = model(*inputs) self.assertEqual(bailout_outputs, expected) diff --git a/torch/testing/_internal/triton_utils.py b/torch/testing/_internal/triton_utils.py index 4edaf86dd1d7..0964c68ebb20 100644 --- a/torch/testing/_internal/triton_utils.py +++ b/torch/testing/_internal/triton_utils.py @@ -912,7 +912,7 @@ if has_triton(): b_ptrs = b_ptr + (offs_k[:, None] + offs_bn[None, :]) accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + for k in range(tl.cdiv(K, BLOCK_SIZE_K)): a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) accumulator = tl.dot(a, b, accumulator) From ad67170c8b9c7ddd3442e32369cb9a0be7631d91 Mon Sep 17 00:00:00 2001 From: Isalia20 Date: Sat, 18 Oct 2025 09:04:42 +0000 Subject: [PATCH 109/123] [MPS] sparse matmuls (#165232) Implements matmuls for sparse tensors. With this commit most of the core sparse operations should be implemented. 
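For reference, a minimal usage sketch of the operations this change enables on MPS (`torch.sparse.mm`, `torch.addmm` and `torch.bmm` with a sparse COO operand). Shapes and values are illustrative only and assume an MPS-enabled build with this patch applied:

```python
import torch

# 2D sparse COO (2x3) times dense (3x4)
i = torch.tensor([[0, 1, 1],   # row indices
                  [2, 0, 2]])  # col indices
v = torch.tensor([3.0, 4.0, 5.0])
s = torch.sparse_coo_tensor(i, v, (2, 3), device="mps").coalesce()
d = torch.randn(3, 4, device="mps")

out_mm = torch.sparse.mm(s, d)  # sparse @ dense
bias = torch.randn(2, 4, device="mps")
out_addmm = torch.addmm(bias, s, d, beta=1.0, alpha=2.0)

# Batched: sparse (B=2, I=2, J=3) times dense (B=2, J=3, K=4)
idx = torch.tensor([[0, 0, 1],   # batch indices
                    [0, 1, 1],   # row indices
                    [1, 2, 0]])  # col indices
val = torch.tensor([1.0, 2.0, 3.0])
sb = torch.sparse_coo_tensor(idx, val, (2, 2, 3), device="mps").coalesce()
db = torch.randn(2, 3, 4, device="mps")
out_bmm = torch.bmm(sb, db)
```

Under the hood, `bmm` groups the coalesced COO entries per (batch, row) using the `batch_ptr`/`row_ptr` helpers added in `SparseMPSTensorMath.mm`, so each threadgroup in `spmm_bmm_coo_rows_grouped` only walks the non-zeros of its own row.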
Fixes: https://github.com/pytorch/pytorch/issues/156540 https://github.com/pytorch/pytorch/issues/129842 Should be merged after: https://github.com/pytorch/pytorch/pull/165102 To compare MPS and CPU, you can use this script: ```python import torch import time import matplotlib.pyplot as plt B, I, J, K = 8, 20000, 20000, 20000 num_iterations = 500 nnz_values = [10, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000, 100000] speedups = [] for nnz in nnz_values: indices = torch.stack([ torch.randint(0, B, (nnz,)), torch.randint(0, I, (nnz,)), torch.randint(0, J, (nnz,)), ]) values = torch.rand(nnz) sparse = torch.sparse_coo_tensor(indices, values, size=(B, I, J), device="mps").coalesce() dense = torch.randn(B, J, 200, device="mps") t1 = time.time() for _ in range(num_iterations): result = torch.bmm(sparse, dense) torch.mps.synchronize() t2 = time.time() mps_time = (t2 - t1) / num_iterations sparse_cpu = sparse.cpu() dense_cpu = dense.cpu() t1 = time.time() for _ in range(num_iterations): result_cpu = torch.bmm(sparse_cpu, dense_cpu) t2 = time.time() cpu_time = (t2 - t1) / num_iterations speedup = cpu_time / mps_time speedups.append(speedup) print(f"nnz={nnz}: MPS={mps_time:.6f}s, CPU={cpu_time:.6f}s, Speedup={speedup:.2f}x") plt.figure(figsize=(10, 6)) plt.plot(nnz_values, speedups, marker='o', linewidth=2, markersize=8) plt.xlabel('Number of Non-Zero Elements (nnz)', fontsize=12) plt.ylabel('Speedup (CPU time / MPS time)', fontsize=12) plt.title('MPS vs CPU Speedup for Sparse-Dense BMM', fontsize=14) plt.grid(True, alpha=0.3) plt.axhline(y=1, color='r', linestyle='--', alpha=0.5) plt.xscale('log') plt.tight_layout() plt.show() ``` ## Tested on M1 Pro Figure_1 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165232 Approved by: https://github.com/malfet --- aten/src/ATen/native/native_functions.yaml | 6 +- .../native/sparse/mps/SparseMPSTensorMath.mm | 302 ++++++++++++++++++ .../ATen/native/sparse/mps/kernels/Mul.metal | 214 ++++++++++++- c10/metal/utils.h | 16 + test/test_sparse.py | 3 - 5 files changed, 527 insertions(+), 14 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index f04d93562357..b5ace440e64d 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1370,6 +1370,7 @@ dispatch: SparseCPU: bmm_sparse_cpu SparseCUDA: bmm_sparse_cuda + SparseMPS: bmm_sparse_mps NestedTensorCPU: bmm_nested NestedTensorCUDA: bmm_nested_cuda tags: core @@ -1385,6 +1386,7 @@ MTIA: bmm_out_mtia SparseCPU: bmm_out_sparse_cpu SparseCUDA: bmm_out_sparse_cuda + SparseMPS: bmm_out_sparse_mps SparseCsrCUDA: bmm_out_sparse_csr_cuda - func: bmm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor @@ -4173,7 +4175,7 @@ structured_delegate: mm.out variants: function, method dispatch: - SparseCPU, SparseCUDA: _sparse_mm + SparseCPU, SparseCUDA, SparseMPS: _sparse_mm SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm tags: core @@ -7112,6 +7114,7 @@ MTIA: addmm_out_mtia SparseCPU: addmm_out_sparse_dense_cpu SparseCUDA: addmm_out_sparse_dense_cuda + SparseMPS: addmm_out_sparse_dense_mps SparseCsrCPU: addmm_out_sparse_compressed_cpu SparseCsrCUDA: addmm_out_sparse_compressed_cuda @@ -7121,6 +7124,7 @@ dispatch: SparseCPU: addmm_sparse_dense_cpu SparseCUDA: addmm_sparse_dense_cuda + SparseMPS: addmm_sparse_dense_mps SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: addmm_sparse_compressed_dense tags: core diff --git a/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm 
b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm index 1a17d01ee6d8..9f33f5b1106f 100644 --- a/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm +++ b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm @@ -1,5 +1,6 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include #include #include @@ -18,6 +19,8 @@ #include #include #include +#include +#include #include #include #endif @@ -33,6 +36,305 @@ static auto& lib = MetalShaderLibrary::getBundledLibrary(); #include #endif +static Tensor& s_addmm_out_sparse_dense_mps( + Tensor& r, + const Tensor& t, + const SparseTensor& sparse_, + const Tensor& dense, + const Scalar& beta, + const Scalar& alpha) { + TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: sparse_dim must be 2, got ", sparse_.sparse_dim()); + TORCH_CHECK(sparse_.dense_dim() == 0, "addmm: sparse values must be 0-dense-dim, got ", sparse_.dense_dim()); + TORCH_CHECK(dense.dim() == 2, "addmm: 'dense' must be 2D, got ", dense.dim()); + TORCH_CHECK(t.dim() == 2, "addmm: 't' must be 2D, got ", t.dim()); + + const int64_t I = sparse_.size(0); + const int64_t J = sparse_.size(1); + const int64_t K = dense.size(1); + + TORCH_CHECK(dense.size(0) == J, + "addmm: dense (mat2) dim0 must be ", J, ", got ", dense.size(0)); + TORCH_CHECK(t.size(0) == I && t.size(1) == K, + "addmm: 't' shape must be (", I, ", ", K, "), got (", t.size(0), ", ", t.size(1), ")"); + + r.resize_({I, K}); + + auto sparse = sparse_.coalesce(); + const int64_t nnz = sparse._nnz(); + + if (nnz == 0 || I == 0 || K == 0) { + at::mul_out(r, t, beta); + return r; + } + + const auto v_dtype = sparse._values().scalar_type(); + const auto d_dtype = dense.scalar_type(); + const auto t_dtype = t.scalar_type(); + auto compute_dtype = c10::promoteTypes(c10::promoteTypes(v_dtype, d_dtype), t_dtype); + + TORCH_CHECK(canCast(compute_dtype, r.scalar_type()), + "Can't convert computed type ", compute_dtype, " to output ", r.scalar_type()); + + auto indices2d = sparse._indices().contiguous(); + auto values = sparse._values().to(compute_dtype); + auto dense_c = dense.to(compute_dtype).contiguous(); + auto t_c = t.to(compute_dtype).contiguous(); + + const bool out_needs_cast = (r.scalar_type() != compute_dtype) || !r.is_contiguous(); + Tensor out_buf = out_needs_cast + ? 
at::empty({I, K}, r.options().dtype(compute_dtype)) + : r; + auto out_contig = out_buf.contiguous(); + + auto device = r.device(); + auto stream = getCurrentMPSStream(); + + const float alpha_f = alpha.to(); + const float beta_f = beta.to(); + + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + const std::string func = "spmm_addmm_coo_" + mps::scalarToMetalTypeString(values); + auto pso = lib.getPipelineStateForFunc(func); + auto enc = stream->commandEncoder(); + [enc setComputePipelineState:pso]; + + const uint32_t tew = pso.threadExecutionWidth; + const uint32_t gridX = static_cast(K); + const uint32_t gridZ = static_cast(I); + const uint32_t tgW = std::min(gridX, tew); + + MTLSize grid = MTLSizeMake(gridX, 1, gridZ); + MTLSize tgs = MTLSizeMake(tgW, 1, 1); + + mtl_setArgs(enc, + indices2d, + values, + dense_c, + t_c, + out_contig, + std::array{static_cast(I), + static_cast(J), + static_cast(K)}, + std::array{alpha_f, beta_f}, + static_cast(nnz)); + [enc dispatchThreads:grid threadsPerThreadgroup:tgs]; + } + }); + + if (out_needs_cast) { + r.copy_(out_contig.to(r.scalar_type())); + } + + return r; +} + + +static void build_batch_ptr_mps( + const Tensor& indices_dim0, + int64_t B, + Tensor& batch_ptr +) { + // Builds an array of pointers which point to each batches elements. Example: + // idx_b = [0, 0, 0, 1, 1, 2, 2, 2, 2] // 9 non-zero elements + // └─────┘ └──┘ └─────────┘ + // batch 0 batch 1 batch 2 + // batch_ptr = [0, 3, 5, 9] + // │ │ │ └─ end of batch 2 (total nnz) + // │ │ └──── batch 2 starts at index 5 + // │ └─────── batch 1 starts at index 3 + // └────────── batch 0 starts at index 0 + TORCH_CHECK(indices_dim0.is_mps() && batch_ptr.is_mps(), "MPS device expected"); + auto device = indices_dim0.device(); + auto stream = getCurrentMPSStream(); + + const int64_t nnz = indices_dim0.numel(); + + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pso = lib.getPipelineStateForFunc("build_batch_ptr_from_sorted_batches"); + auto enc = stream->commandEncoder(); + [enc setComputePipelineState:pso]; + + const uint32_t tew = pso.threadExecutionWidth; + const uint32_t Q = static_cast(B + 1); + const uint32_t tgW = std::min(Q, tew); + MTLSize grid = MTLSizeMake(Q, 1, 1); + MTLSize tgs = MTLSizeMake(tgW, 1, 1); + + mtl_setArgs(enc, + indices_dim0, + batch_ptr, + std::array{static_cast(nnz), + static_cast(B)}); + [enc dispatchThreads:grid threadsPerThreadgroup:tgs]; + } + }); +} + +static void build_row_ptr_per_batch_mps( + const Tensor& rows, + const Tensor& batch_ptr, + int64_t B, + int64_t I, + Tensor& row_ptr +) { + // Build per-batch CSR-style row pointer arrays from row indices sorted by batch + // Given: + // rows: 1-D array of length nnz with row ids in [0, I), sorted within each batch + // batch_ptr: length B+1, where [batch_ptr[b], batch_ptr[b+1]) is the subrange for batch b + // Produces: + // - row_ptr: shape [B, I+1] + // + // Example (B = 2, I = 4): + // rows = [0, 0, 1, 3, 0, 2, 2] // 7 non-zero elements + // └─── batch 0 ──┘ └─ batch 1 ─┘ + // batch_ptr = [0, 4, 7] + // │ │ └─ end of batch 1 (total nnz) + // │ └──── end of batch 0/start of batch 1 + // └─────── start of batch 0 + // + // per-batch row pointers (I+1 entries each): + // row_ptr[0] = [0, 2, 3, 3, 4] + // row_ptr[1] = [0, 1, 1, 3, 3] + // laid out in memory: [0, 2, 3, 3, 4, 0, 1, 1, 3, 3] + TORCH_CHECK(rows.is_mps() && batch_ptr.is_mps() && row_ptr.is_mps(), "MPS device expected"); + auto stream = getCurrentMPSStream(); + + 
dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pso = lib.getPipelineStateForFunc("build_row_ptr_from_sorted_rows_by_batch"); + auto enc = stream->commandEncoder(); + [enc setComputePipelineState:pso]; + + const uint32_t tew = pso.threadExecutionWidth; + const uint32_t Qx = static_cast(I + 1); + const uint32_t Qy = static_cast(B); + const uint32_t tgW = std::min(Qx, tew); + + MTLSize grid = MTLSizeMake(Qx, Qy, 1); + MTLSize tgs = MTLSizeMake(tgW, 1, 1); + + mtl_setArgs(enc, + rows, + batch_ptr, + row_ptr, + std::array{static_cast(I), + static_cast(B)}); + [enc dispatchThreads:grid threadsPerThreadgroup:tgs]; + } + }); +} + +Tensor& bmm_out_sparse_mps(const SparseTensor& self_, const Tensor& mat2_, Tensor& result_) { + TORCH_CHECK(result_.is_mps(), "bmm_sparse: expected 'out' to be MPS, got ", result_.device()); + TORCH_CHECK(self_.is_mps(), "bmm_sparse: expected 'self' to be MPS, got ", self_.device()); + TORCH_CHECK(mat2_.is_mps(), "bmm_sparse: expected 'mat2' to be MPS, got ", mat2_.device()); + + TORCH_CHECK(self_.dense_dim() == 0, "bmm_sparse: Tensor 'self' must have 0 dense dims, but has ", self_.dense_dim()); + TORCH_CHECK(self_.sparse_dim() == 3, "bmm_sparse: Tensor 'self' must have 3 sparse dims, but has ", self_.sparse_dim()); + TORCH_CHECK(mat2_.dim() == 3, "bmm_sparse: Tensor 'mat2' must have 3 dims, but has ", mat2_.dim()); + + TORCH_CHECK(self_.size(0) == mat2_.size(0), "bmm_sparse: 'self.size(0)' and 'mat2.size(0)' must match"); + TORCH_CHECK(self_.size(2) == mat2_.size(1), "bmm_sparse: 'self.size(2)' and 'mat2.size(1)' must match"); + + const int64_t B = self_.size(0); + const int64_t I = self_.size(1); + const int64_t J = self_.size(2); + const int64_t K = mat2_.size(2); + + auto self = self_.coalesce(); + const int64_t nnz = self._nnz(); + if (nnz == 0) { + return result_.zero_(); + } + + const auto computeDtype = at::kFloat; + + auto indices = self._indices(); + auto values = self._values(); + + auto values_c = values.scalar_type() == computeDtype ? values : values.to(computeDtype); + auto mat2_c = mat2_.scalar_type() == computeDtype ? mat2_ : mat2_.to(computeDtype); + auto mat2_contig = mat2_c.contiguous(); + + auto idx_b = indices.select(0, 0).contiguous(); + auto idx_i = indices.select(0, 1).contiguous(); + auto idx_j = indices.select(0, 2).contiguous(); + + // builds an array of pointers of where the batch_idx's pointer starts and ends + // look in function for better explanation + auto batch_ptr = at::empty({B + 1}, at::device(result_.device()).dtype(kLong)); + build_batch_ptr_mps(idx_b, B, batch_ptr); + // build row_ptr per batch: for each (b, i) get [start, end) into rows/cols/vals + auto row_ptr = at::empty({B * (I + 1)}, at::device(result_.device()).dtype(kLong)); + build_row_ptr_per_batch_mps(idx_i, batch_ptr, B, I, row_ptr); + + const bool out_needs_cast = (result_.scalar_type() != computeDtype) || !result_.is_contiguous(); + Tensor out_buf = out_needs_cast + ? 
at::empty({B, I, K}, result_.options().dtype(computeDtype)) + : result_; + auto out_contig = out_buf.contiguous(); + + auto stream = getCurrentMPSStream(); + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pso = lib.getPipelineStateForFunc("spmm_bmm_coo_rows_grouped_" + mps::scalarToMetalTypeString(values)); + auto enc = stream->commandEncoder(); + [enc setComputePipelineState:pso]; + + const uint32_t tew = pso.threadExecutionWidth; + const uint32_t tgW = std::min((uint32_t)K, tew); + + // One threadgroup per (row i, batch b), lanes cover K + MTLSize grid = MTLSizeMake(tgW, (uint32_t)I, (uint32_t)B); + MTLSize tgs = MTLSizeMake(tgW, 1, 1); + + mtl_setArgs(enc, + idx_i, + idx_j, + values_c, + mat2_contig, + out_contig, + row_ptr, + std::array{(uint32_t)B, (uint32_t)I, (uint32_t)J, (uint32_t)K}); + [enc dispatchThreads:grid threadsPerThreadgroup:tgs]; + } + }); + if (out_needs_cast) { + result_.copy_(out_contig.to(result_.scalar_type())); + } + return result_; +} + +Tensor bmm_sparse_mps(const Tensor& self, const Tensor& mat2) { + Tensor result = at::zeros({self.size(0), self.size(1), mat2.size(2)}, mat2.options()); + return bmm_out_sparse_mps(self, mat2, result); +} + +Tensor& addmm_out_sparse_dense_mps( + const Tensor& self, + const SparseTensor& mat1, + const Tensor& mat2, + const Scalar& beta, + const Scalar& alpha, + Tensor& result) { + c10::MaybeOwned b_self = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm_out"); + return s_addmm_out_sparse_dense_mps(result, *b_self, mat1, mat2, beta, alpha); +} + +Tensor addmm_sparse_dense_mps( + const Tensor& self, + const SparseTensor& mat1, + const Tensor& mat2, + const Scalar& beta, + const Scalar& alpha +) { + c10::MaybeOwned b_self = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm_out"); + Tensor result = at::empty({0}, self.options()); + return s_addmm_out_sparse_dense_mps(result, *b_self, mat1, mat2, beta, alpha); +} + static SparseTensor& mul_out_dense_sparse_mps( const Tensor& dense, const Tensor& sparse, diff --git a/aten/src/ATen/native/sparse/mps/kernels/Mul.metal b/aten/src/ATen/native/sparse/mps/kernels/Mul.metal index 27a660836df6..a5a53e82a3fd 100644 --- a/aten/src/ATen/native/sparse/mps/kernels/Mul.metal +++ b/aten/src/ATen/native/sparse/mps/kernels/Mul.metal @@ -1,10 +1,105 @@ -#include #include +#include +using namespace c10::metal; using namespace metal; +inline uint lower_bound_i64(device const long* arr, uint lo, uint hi, long key) { + uint l = lo, r = hi; + while (l < r) { + uint m = (l + r) >> 1; + long v = arr[m]; + if (v < key) { + l = m + 1; + } else { + r = m; + } + } + return l; +} -template struct MulAccum { using type = float; }; -template <> struct MulAccum { using type = float2; }; +inline uint upper_bound_i64(device const long* arr, uint lo, uint hi, long key) { + uint l = lo, r = hi; + while (l < r) { + uint m = (l + r) >> 1; + long v = arr[m]; + if (v <= key) { + l = m + 1; + } else { + r = m; + } + } + return l; +} + +kernel void build_row_ptr_from_sorted_rows_by_batch( + device const long* rows [[buffer(0)]], + device const long* batch_ptr [[buffer(1)]], + device long* row_ptr [[buffer(2)]], + constant uint2& dims [[buffer(3)]], + uint3 tid [[thread_position_in_grid]]) +{ + const uint I = dims.x; + const uint B = dims.y; + + const uint i = tid.x; + const uint b = tid.y; + + if (b >= B || i > I) return; + + const uint base = (uint)batch_ptr[b]; + const uint lim = (uint)batch_ptr[b + 1]; + + const ulong out_base = (ulong)b * (ulong)(I + 1); + + if (i == I) { + 
row_ptr[out_base + (ulong)I] = (long)lim; + } else { + const long key = (long)i; + const uint pos = lower_bound_i64(rows, base, lim, key); + row_ptr[out_base + (ulong)i] = (long)pos; + } +} + +template +kernel void spmm_bmm_coo_rows_grouped( + device const long* rows [[buffer(0)]], + device const long* cols [[buffer(1)]], + device const T* vals [[buffer(2)]], + device const T* dense [[buffer(3)]], + device T* out [[buffer(4)]], + device const long* row_ptr [[buffer(5)]], + constant uint4& dims [[buffer(6)]], + uint3 tid [[thread_position_in_grid]], + uint3 ltid [[thread_position_in_threadgroup]], + uint3 tptg [[threads_per_threadgroup]]) +{ + const uint B = dims.x; + const uint I = dims.y; + const uint J = dims.z; + const uint K = dims.w; + + const uint b = tid.z; + const uint i = tid.y; + const uint lane = ltid.x; + const uint tgW = tptg.x; + + const ulong rp_base = (ulong)b * (ulong)(I + 1); + const uint start = (uint)row_ptr[rp_base + (ulong)i]; + const uint end = (uint)row_ptr[rp_base + (ulong)i + 1]; + + for (uint k = lane; k < K; k += tgW) { + auto acc = static_cast>(T(0)); + for (uint p = start; p < end; ++p) { + const uint c = (uint)cols[p]; + const auto v = static_cast>(vals[p]); + const uint d_off = ((b * J) + c) * K + k; + const auto d = static_cast>(dense[d_off]); + acc += mul(v, d); + } + const uint y_off = ((b * I) + i) * K + k; + out[y_off] = static_cast(acc); + } +} template kernel void dense_sparse_mul_kernel( @@ -32,10 +127,9 @@ kernel void dense_sparse_mul_kernel( ulong dense_idx = (ulong)key * (ulong)view_cols + (ulong)col; ulong val_idx = (ulong)i * (ulong)view_cols + (ulong)col; - using accum_t = typename MulAccum::type; - const accum_t a = static_cast(values[val_idx]); - const accum_t b = static_cast(dense[dense_idx]); - out_values[val_idx] = static_cast(a * b); + const auto a = static_cast>(values[val_idx]); + const auto b = static_cast>(dense[dense_idx]); + out_values[val_idx] = static_cast(mul(a, b)); } kernel void intersect_binary_search( @@ -120,6 +214,76 @@ kernel void fused_gather_mul_kernel( } } + +kernel void build_batch_ptr_from_sorted_batches( + device const long* batches [[buffer(0)]], + device long* batch_ptr [[buffer(1)]], + constant uint2& nnz_B [[buffer(2)]], + uint3 tid [[thread_position_in_grid]]) +{ + uint b = tid.x; + uint nnz = nnz_B.x; + uint batch = nnz_B.y; + + if (b == batch) { + batch_ptr[b] = (long)nnz; + return; + } + + uint lo = 0; + uint hi = nnz; + long key = (long)b; + while (lo < hi) { + uint mid = (lo + hi) >> 1; + long v = batches[mid]; + if (v < key) lo = mid + 1; + else hi = mid; + } + batch_ptr[b] = (long)lo; +} + +template +kernel void spmm_addmm_coo( + device const long* indices2d [[buffer(0)]], + device const T* vals [[buffer(1)]], + device const T* dense [[buffer(2)]], + device const T* t_in [[buffer(3)]], + device T* out [[buffer(4)]], + constant uint3& dims [[buffer(5)]], + constant float2& alpha_beta [[buffer(6)]], + constant uint& nnz [[buffer(7)]], + uint3 tid [[thread_position_in_grid]]) +{ + const uint K = dims.z; + const uint k = tid.x; + const uint i = tid.z; + const float alpha = alpha_beta.x; + const float beta = alpha_beta.y; + + device const long* rows = indices2d; + device const long* cols = indices2d + nnz; + + const uint start = lower_bound_i64(rows, 0u, nnz, (long)i); + const uint end = upper_bound_i64(rows, 0u, nnz, (long)i); + + // accumulator is float for scalar/half/bfloat and float2 for float2 + auto acc = static_cast>(T(0)); + + for (uint p = start; p < end; ++p) { + const uint c = (uint)cols[p]; + 
const auto v = static_cast>(vals[p]); + const uint dense_off = c * K + k; + const auto d = static_cast>(dense[dense_off]); + acc += mul(v, d); + } + + const uint off = i * K + k; + const auto base = (beta != 0.0f) ? (static_cast>(t_in[off]) * beta) : static_cast>(T(0)); + const auto y = base + alpha * acc; + out[off] = static_cast(y); +} + + #define INSTANTIATE_DENSE_SPARSE_MUL(DTYPE) \ template [[host_name("dense_sparse_mul_kernel_" #DTYPE)]] kernel void \ dense_sparse_mul_kernel( \ @@ -151,6 +315,36 @@ INSTANTIATE_DENSE_SPARSE_MUL(float2); constant uint2& dims_output [[buffer(8)]], \ uint3 gid [[thread_position_in_grid]]); -INSTANTIATE_FUSED_GATHER_MUL(float); -INSTANTIATE_FUSED_GATHER_MUL(half); -INSTANTIATE_FUSED_GATHER_MUL(bfloat); \ No newline at end of file +INSTANTIATE_FOR_FLOAT_TYPES(INSTANTIATE_FUSED_GATHER_MUL); + + +#define INSTANTIATE_SPMM_BMM_COO_ROWS_GROUPED(DTYPE) \ + template [[host_name("spmm_bmm_coo_rows_grouped_" #DTYPE)]] kernel void \ + spmm_bmm_coo_rows_grouped( \ + device const long* rows [[buffer(0)]], \ + device const long* cols [[buffer(1)]], \ + device const DTYPE* vals [[buffer(2)]], \ + device const DTYPE* dense [[buffer(3)]], \ + device DTYPE* out [[buffer(4)]], \ + device const long* row_ptr [[buffer(5)]], \ + constant uint4& dims [[buffer(6)]], \ + uint3 tid [[thread_position_in_grid]], \ + uint3 ltid [[thread_position_in_threadgroup]], \ + uint3 tptg [[threads_per_threadgroup]]); + +INSTANTIATE_FOR_ALL_TYPES(INSTANTIATE_SPMM_BMM_COO_ROWS_GROUPED); + +#define INSTANTIATE_SPMM_ADDMM_COO(DTYPE) \ + template [[host_name("spmm_addmm_coo_" #DTYPE)]] kernel void \ + spmm_addmm_coo( \ + device const long* indices2d [[buffer(0)]], \ + device const DTYPE* vals [[buffer(1)]], \ + device const DTYPE* dense [[buffer(2)]], \ + device const DTYPE* t_in [[buffer(3)]], \ + device DTYPE* out [[buffer(4)]], \ + constant uint3& dims [[buffer(5)]], \ + constant float2& alpha_beta [[buffer(6)]], \ + constant uint& nnz [[buffer(7)]], \ + uint3 tid [[thread_position_in_grid]]); + +INSTANTIATE_FOR_ALL_TYPES(INSTANTIATE_SPMM_ADDMM_COO); diff --git a/c10/metal/utils.h b/c10/metal/utils.h index aaa0e1741240..14c4b2b2cbae 100644 --- a/c10/metal/utils.h +++ b/c10/metal/utils.h @@ -328,5 +328,21 @@ struct pair { T2 second; }; +#define INSTANTIATE_FOR_ALL_TYPES(MACRO) \ + MACRO(float); \ + MACRO(half); \ + MACRO(bfloat); \ + MACRO(float2); \ + MACRO(long); \ + MACRO(char); \ + MACRO(uchar); \ + MACRO(short); \ + MACRO(int); + +#define INSTANTIATE_FOR_FLOAT_TYPES(MACRO) \ + MACRO(float); \ + MACRO(half); \ + MACRO(bfloat); + } // namespace metal } // namespace c10 diff --git a/test/test_sparse.py b/test/test_sparse.py index 196506a8e13d..2026ffeae528 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -1421,7 +1421,6 @@ class TestSparse(TestSparseBase): "bmm sparse-dense CUDA is not yet supported in Windows, at least up to CUDA 10.1" ) @coalescedonoff - @expectedFailureMPS @dtypes(torch.double) @dtypesIfMPS(torch.float32) def test_bmm(self, device, dtype, coalesced): @@ -1633,7 +1632,6 @@ class TestSparse(TestSparseBase): self.assertEqual(self.safeToDense(res), self.safeToDense(true_result)) @coalescedonoff - @expectedFailureMPS @precisionOverride({torch.bfloat16: 5e-2, torch.float16: 5e-2}) @dtypes(torch.double, torch.cdouble, torch.bfloat16, torch.float16) @dtypesIfMPS(torch.float32, torch.complex64, torch.bfloat16, torch.float16) @@ -1724,7 +1722,6 @@ class TestSparse(TestSparseBase): # test_shape(2, 3, [2, 2, 0]) @coalescedonoff - @expectedFailureMPS @dtypes(torch.double) 
@dtypesIfMPS(torch.float32) def test_dsmm(self, device, dtype, coalesced): From 4740ce77879e2a7a721f5f67eac731349dfaa868 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Fri, 17 Oct 2025 23:41:05 -0700 Subject: [PATCH 110/123] [CP] Fix load balancer incorrectly assuming batch dimension exists (#165792) https://github.com/pytorch/pytorch/pull/163617 removes the if/else statement to check if the input buffers have the batch dimension. This PR fixes the issue and also adds a test. In the future, we should explicitly ask users to unsqueeze the batch dimension. This is a BC of the existing contract but implicitly infers the batch dimension existence is not safe. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165792 Approved by: https://github.com/XilunWu --- test/distributed/tensor/test_attention.py | 35 +++++++++++++++++++ .../tensor/experimental/_attention.py | 32 +++++++++++------ 2 files changed, 56 insertions(+), 11 deletions(-) diff --git a/test/distributed/tensor/test_attention.py b/test/distributed/tensor/test_attention.py index 4806c1b71d0d..66d80f604551 100644 --- a/test/distributed/tensor/test_attention.py +++ b/test/distributed/tensor/test_attention.py @@ -771,5 +771,40 @@ class TestCPCustomOps(DTensorTestBase): torch.library.opcheck(flex_cp_allgather, example) +class TestSharding(DTensorTestBase): + @property + def world_size(self) -> int: + return 2 + + @skip_if_lt_x_gpu(2) + @with_comms + def test_context_parallel_shard(self) -> None: + B = 4 + seq_len = 32 + + device_mesh = init_device_mesh( + mesh_shape=(2,), mesh_dim_names=("cp",), device_type=self.device_type + ) + freqs_cis = torch.arange(0, seq_len, device=self.device_type) + q = torch.ones(B * seq_len, device=self.device_type).reshape(B, seq_len) + k = torch.ones(B * seq_len, device=self.device_type).reshape(B, seq_len) + v = torch.ones(B * seq_len, device=self.device_type).reshape(B, seq_len) + + load_balancer = _HeadTailLoadBalancer( + seq_len, self.world_size, torch.device(self.device_type) + ) + freqs_cis_shard, q_shard, k_shard, v_shard = _context_parallel_shard( + device_mesh, [freqs_cis, q, k, v], [0, 1, 1, 1], load_balancer=load_balancer + ) + self.assertEqual(freqs_cis_shard.size(), (seq_len // 2,)) + chunks = freqs_cis.chunk(self.world_size * 2) + self.assertEqual( + freqs_cis_shard, + torch.cat( + [chunks[self.rank], chunks[self.world_size * 2 - self.rank - 1]], dim=0 + ), + ) + + if __name__ == "__main__": run_tests() diff --git a/torch/distributed/tensor/experimental/_attention.py b/torch/distributed/tensor/experimental/_attention.py index 8d0a07bbd97f..9b89563a0ef9 100644 --- a/torch/distributed/tensor/experimental/_attention.py +++ b/torch/distributed/tensor/experimental/_attention.py @@ -1068,10 +1068,16 @@ def _context_parallel_buffers( for buffer, seq_dim in zip(buffers, buffer_seq_dims): if isinstance(buffer, torch.Tensor): # TODO: the load balance doesn't perform error handling. + + # NOTE: assuming batch dim is 0 + if load_balance_indices is not None: - # NOTE: assuming batch dim is 0 + # TODO: we should expclitly ask users to unsqueeze the batch dim. + # But this is a BC breaking ask. + # However, what we have done today is also not very safe. idx_batch_size = load_balance_indices.size(0) - data_batch_size = buffer.size(0) + data_batch_size = buffer.size(0) if seq_dim > 0 else 1 + if idx_batch_size != 1 and idx_batch_size != data_batch_size: raise ValueError( "Cannot rearrange buffer: " @@ -1079,16 +1085,20 @@ def _context_parallel_buffers( f"but buffer has shape {buffer.shape}." 
) - for i in range(data_batch_size): - index = ( - load_balance_indices[0] # identical load-balance in batch - if idx_batch_size == 1 - else load_balance_indices[i] + if seq_dim == 0: + buffer = torch.index_select( + buffer, dim=0, index=load_balance_indices[0] ) - buffer_batch_i = torch.index_select( - buffer[i], dim=seq_dim - 1, index=index - ) - buffer[i] = buffer_batch_i + else: + indices = load_balance_indices + if idx_batch_size == 1: + size = [data_batch_size] + list(indices.size())[1:] + indices = indices.expand(*size) + + for i in range(data_batch_size): + buffer[i] = torch.index_select( + buffer[i], dim=seq_dim - 1, index=indices[i] + ) # use DTensor to shard the buffer on sequence dimension, retain the local tensor sharded_buffer = distribute_tensor( From beb6b62e8c94d7e8683795dda6d3247eb2d30a9b Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Sat, 18 Oct 2025 09:15:49 +0000 Subject: [PATCH 111/123] Revert "Enable more DTensor tests in local tensor mode and fix more integration issues (#165716)" This reverts commit 1b397420f22b22f90a1093233ecd9167656e50cb. Reverted https://github.com/pytorch/pytorch/pull/165716 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/165716#issuecomment-3418083391)) --- test/distributed/tensor/test_tensor_ops.py | 15 +--- torch/distributed/_local_tensor/__init__.py | 78 ++----------------- .../distributed/tensor/_ops/_embedding_ops.py | 31 +++----- torch/distributed/tensor/_sharding_prop.py | 3 - torch/distributed/tensor/debug/__init__.py | 11 --- torch/distributed/tensor/placement_types.py | 18 +---- torch/testing/_internal/common_distributed.py | 16 +--- .../distributed/_tensor/common_dtensor.py | 3 - 8 files changed, 25 insertions(+), 150 deletions(-) diff --git a/test/distributed/tensor/test_tensor_ops.py b/test/distributed/tensor/test_tensor_ops.py index 8368befabfec..eaa1969068c1 100644 --- a/test/distributed/tensor/test_tensor_ops.py +++ b/test/distributed/tensor/test_tensor_ops.py @@ -17,7 +17,6 @@ from torch.distributed.tensor.debug import CommDebugMode from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from torch.testing._internal.common_utils import run_tests, skipIfRocm from torch.testing._internal.distributed._tensor.common_dtensor import ( - create_local_tensor_test_class, DTensorConverter, DTensorTestBase, with_comms, @@ -705,12 +704,6 @@ class DistTensorOpsTest(DTensorTestBase): @with_comms def test_dtensor_dtype_conversion(self): - from torch.distributed.tensor.debug import ( - _clear_sharding_prop_cache, - _get_sharding_prop_cache_info, - ) - - _clear_sharding_prop_cache() device_mesh = self.build_device_mesh() shard_spec = [Shard(0)] # by default we start from bf16 dtype @@ -729,6 +722,8 @@ class DistTensorOpsTest(DTensorTestBase): self.assertEqual(bf16_sharded_dtensor1.dtype, torch.bfloat16) self.assertEqual(bf16_sharded_dtensor1.to_local().dtype, torch.bfloat16) + from torch.distributed.tensor.debug import _get_sharding_prop_cache_info + # by this point we only have cache misses hits, misses, _, _ = _get_sharding_prop_cache_info() self.assertEqual(hits, 0) @@ -780,7 +775,7 @@ class DistTensorOpsTest(DTensorTestBase): ) def _test_split_on_partial(self, reduce_op: str, split_size: int, split_dim: int): - self.init_manual_seed_for_rank() + torch.manual_seed(self.rank) mesh = self.build_device_mesh() partial_tensor = torch.randn(8, 8, 
device=self.device_type) @@ -827,9 +822,5 @@ class DistTensorOpsTest(DTensorTestBase): self.assertEqual(x.full_tensor(), y) -DistTensorOpsTestWithLocalTensor = create_local_tensor_test_class( - DistTensorOpsTest, -) - if __name__ == "__main__": run_tests() diff --git a/torch/distributed/_local_tensor/__init__.py b/torch/distributed/_local_tensor/__init__.py index 8121b367790a..d9eb7b47e9a3 100644 --- a/torch/distributed/_local_tensor/__init__.py +++ b/torch/distributed/_local_tensor/__init__.py @@ -104,62 +104,6 @@ def _map_to_rank_local_val(val: Any, rank: int) -> Any: return val -def collect_cuda_rng_states() -> list[torch.Tensor]: - """ - Collects RNG state from all available CUDA devices. - - Returns: - List of RNG state tensors, one for each CUDA device. - Returns empty list if CUDA is not available. - """ - if not torch.cuda.is_available(): - return [] - - num_devices = torch.cuda.device_count() - rng_states = [] - - for device_idx in range(num_devices): - with torch.cuda.device(device_idx): - rng_state = torch.cuda.get_rng_state() - rng_states.append(rng_state) - - return rng_states - - -def set_cuda_rng_states(rng_states: list[torch.Tensor]) -> None: - """ - Sets RNG state for all CUDA devices from a list of states. - - Args: - rng_states: List of RNG state tensors to restore. - """ - if not torch.cuda.is_available(): - return - - num_devices = min(len(rng_states), torch.cuda.device_count()) - - for device_idx in range(num_devices): - with torch.cuda.device(device_idx): - torch.cuda.set_rng_state(rng_states[device_idx]) - - -def _get_rng_state() -> tuple[torch.Tensor, list[torch.Tensor]]: - """ - Gets CPU and CUDA rng states from all devices. - """ - return (torch.get_rng_state(), collect_cuda_rng_states()) - - -def _set_rng_state(cpu_state: torch.Tensor, cuda_states: list[torch.Tensor]) -> None: - """ - Sets CPU and CUDA rng states for all devices. If the list of cuda states - is shorter than the number of devices only the first len(cuda_states) devices - will get their rng state set. - """ - torch.set_rng_state(cpu_state) - set_cuda_rng_states(cuda_states) - - def _for_each_rank_run_func( func: Callable[..., Any], ranks: frozenset[int], @@ -173,15 +117,14 @@ def _for_each_rank_run_func( a.wait() if isinstance(a, AsyncCollectiveTensor) else a for a in flat_args ] - # NB: Before invoking an op we are collecting rng states from CPU and - # CUDA devices such that we can reset to the same before invoking op - # for each rank. This is not very efficient and will likely be revisited - # to support per rank rng state. - rng_state = _get_rng_state() + cpu_state = torch.get_rng_state() + devices, states = get_device_states((args, kwargs)) + flat_rank_rets = {} for r in sorted(ranks): - _set_rng_state(*rng_state) + torch.set_rng_state(cpu_state) + set_device_states(devices, states) rank_flat_args = [_map_to_rank_local_val(a, r) for a in flat_args] rank_args, rank_kwargs = pytree.tree_unflatten(rank_flat_args, args_spec) rank_ret = func(*rank_args, **rank_kwargs) @@ -761,11 +704,6 @@ class _LocalDeviceMesh: @staticmethod def get_coordinate(self: DeviceMesh) -> Optional[list[int] | None]: - # NB: In order to support submeshes the code below recreates for each - # rank submesh with the same mesh dimensions as current mesh. We are - # doing this because when submesh is created it is created for a particular - # rank (therefore below we are patching get_rank method). We are trying to - # limit the invasiveness of local tensor. 
lm = local_tensor_mode() assert lm is not None, "Unexpectedly not in LocalTensorMode" @@ -778,9 +716,7 @@ class _LocalDeviceMesh: coords[d][r] = c out = [torch.SymInt(LocalIntNode(c)) for c in coords] - # The output contains coordinates for each of the ranks with respect to - # their meshes formed from root mesh and selecting the same dimensions - # as the current mesh. + return out # type: ignore[return-value] @@ -858,6 +794,8 @@ def maybe_run_for_local_tensor(func: Callable[..., Any]) -> Callable[..., Any]: with lm.disable(): ret = _for_each_rank_run_func(func, lm.ranks, args, kwargs, alias=False) + lm = local_tensor_mode() + assert lm is not None return ret return wrapper diff --git a/torch/distributed/tensor/_ops/_embedding_ops.py b/torch/distributed/tensor/_ops/_embedding_ops.py index 283cffb78efd..445b1830defe 100644 --- a/torch/distributed/tensor/_ops/_embedding_ops.py +++ b/torch/distributed/tensor/_ops/_embedding_ops.py @@ -6,7 +6,6 @@ from typing import cast, Optional import torch import torch.distributed._functional_collectives as funcol -from torch.distributed._local_tensor import maybe_run_for_local_tensor from torch.distributed.device_mesh import DeviceMesh from torch.distributed.tensor._op_schema import ( OpSchema, @@ -84,27 +83,9 @@ class _MaskPartial(Partial): offset_shape: Optional[torch.Size] = None offset_dim: int = 0 - @staticmethod - @maybe_run_for_local_tensor - def _mask_tensor( - tensor: torch.Tensor, local_offset_on_dim: int, local_shard_size: int - ) -> tuple[torch.Tensor, torch.Tensor]: - # Build the input mask and save it for the current partial placement - # this is so that the output of embedding op can reuse the same partial - # placement saved mask to perform mask + reduction - mask = (tensor < local_offset_on_dim) | ( - tensor >= local_offset_on_dim + local_shard_size - ) - # mask the input tensor - masked_tensor = tensor.clone() - local_offset_on_dim - masked_tensor[mask] = 0 - return mask, masked_tensor - def _partition_value( self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int ) -> torch.Tensor: - my_coordinate = mesh.get_coordinate() - assert my_coordinate is not None, "my_coordinate should not be None" # override parent logic to perform partial mask for embedding num_chunks = mesh.size(mesh_dim) # get local shard size and offset on the embedding_dim @@ -114,11 +95,17 @@ class _MaskPartial(Partial): local_shard_size, local_offset_on_dim = Shard.local_shard_size_and_offset( self.offset_shape[self.offset_dim], num_chunks, - my_coordinate[mesh_dim], + mesh.get_local_rank(mesh_dim), ) - mask, masked_tensor = _MaskPartial._mask_tensor( - tensor, local_offset_on_dim, local_shard_size + # Build the input mask and save it for the current partial placement + # this is so that the output of embedding op can reuse the same partial + # placement saved mask to perform mask + reduction + mask = (tensor < local_offset_on_dim) | ( + tensor >= local_offset_on_dim + local_shard_size ) + # mask the input tensor + masked_tensor = tensor.clone() - local_offset_on_dim + masked_tensor[mask] = 0 # materialize the mask buffer to be used for reduction self.mask_buffer.materialize_mask(mask) return masked_tensor diff --git a/torch/distributed/tensor/_sharding_prop.py b/torch/distributed/tensor/_sharding_prop.py index c1af2c131717..4af72b4d3d8f 100644 --- a/torch/distributed/tensor/_sharding_prop.py +++ b/torch/distributed/tensor/_sharding_prop.py @@ -48,9 +48,6 @@ class LocalLRUCache(threading.local): def cache_info(self): return self.cache.cache_info() - def 
cache_clear(self): - return self.cache.cache_clear() - class ShardingPropagator: def __init__(self) -> None: diff --git a/torch/distributed/tensor/debug/__init__.py b/torch/distributed/tensor/debug/__init__.py index a74f1449ad12..e5bf3b833fe4 100644 --- a/torch/distributed/tensor/debug/__init__.py +++ b/torch/distributed/tensor/debug/__init__.py @@ -19,17 +19,6 @@ def _get_sharding_prop_cache_info(): ) -def _clear_sharding_prop_cache(): - """ - Clears the cache for the sharding propagation cache, used for debugging purpose only. - """ - from torch.distributed.tensor._api import DTensor - - return ( - DTensor._op_dispatcher.sharding_propagator.propagate_op_sharding.cache_clear() # type:ignore[attr-defined] - ) - - # Set namespace for exposed private names CommDebugMode.__module__ = "torch.distributed.tensor.debug" visualize_sharding.__module__ = "torch.distributed.tensor.debug" diff --git a/torch/distributed/tensor/placement_types.py b/torch/distributed/tensor/placement_types.py index 8930d3b1b29c..5f68ff03ee22 100644 --- a/torch/distributed/tensor/placement_types.py +++ b/torch/distributed/tensor/placement_types.py @@ -359,16 +359,6 @@ class Shard(Placement): return Shard._select_shard(shards, shard_index) - @staticmethod - @maybe_run_for_local_tensor - def _get_shard_pad_size( - full_size: int, local_tensor: torch.Tensor, dim: int - ) -> int: - """ - Get the padding size of the local tensor on the shard dimension. - """ - return full_size - local_tensor.size(dim) - def _to_new_shard_dim( self, local_tensor: torch.Tensor, @@ -397,16 +387,14 @@ class Shard(Placement): old_dim_full_chunk_size = ( old_dim_logical_size + num_chunks - 1 ) // num_chunks - old_dim_pad_size = Shard._get_shard_pad_size( - old_dim_full_chunk_size, local_tensor, self.dim - ) + old_dim_pad_size = old_dim_full_chunk_size - local_tensor.size(self.dim) local_tensor = pad_tensor(local_tensor, self.dim, old_dim_pad_size) if new_dim_padding: new_dim_full_chunk_size = ( new_dim_logical_size + num_chunks - 1 ) // num_chunks - new_dim_pad_size = Shard._get_shard_pad_size( - new_dim_full_chunk_size * num_chunks, local_tensor, new_shard_dim + new_dim_pad_size = new_dim_full_chunk_size * num_chunks - local_tensor.size( + new_shard_dim ) local_tensor = pad_tensor(local_tensor, new_shard_dim, new_dim_pad_size) diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index 719713e7c9f6..89408b62c9aa 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -211,14 +211,6 @@ def at_least_x_gpu(x): return False -def _maybe_handle_skip_if_lt_x_gpu(args, msg) -> bool: - _handle_test_skip = getattr(args[0], "_handle_test_skip", None) - if len(args) == 0 or _handle_test_skip is None: - return False - _handle_test_skip(msg) - return True - - def skip_if_lt_x_gpu(x): def decorator(func): @wraps(func) @@ -229,9 +221,7 @@ def skip_if_lt_x_gpu(x): return func(*args, **kwargs) if TEST_XPU and torch.xpu.device_count() >= x: return func(*args, **kwargs) - test_skip = TEST_SKIPS[f"multi-gpu-{x}"] - if _maybe_handle_skip_if_lt_x_gpu(args, test_skip.message): - sys.exit(test_skip.exit_code) + sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code) return wrapper @@ -247,9 +237,7 @@ def nccl_skip_if_lt_x_gpu(backend, x): return func(*args, **kwargs) if torch.cuda.is_available() and torch.cuda.device_count() >= x: return func(*args, **kwargs) - test_skip = TEST_SKIPS[f"multi-gpu-{x}"] - if _maybe_handle_skip_if_lt_x_gpu(args, test_skip.message): - 
sys.exit(test_skip.exit_code) + sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code) return wrapper diff --git a/torch/testing/_internal/distributed/_tensor/common_dtensor.py b/torch/testing/_internal/distributed/_tensor/common_dtensor.py index 22d6d8e7dede..1f982aa42074 100644 --- a/torch/testing/_internal/distributed/_tensor/common_dtensor.py +++ b/torch/testing/_internal/distributed/_tensor/common_dtensor.py @@ -701,9 +701,6 @@ class DTensorConverter: class LocalDTensorTestBase(DTensorTestBase): - def _handle_test_skip(self, msg: str) -> None: - self.skipTest(msg) - def _get_local_tensor_mode(self): return LocalTensorMode(frozenset(range(self.world_size))) From f510d0dbc0108a90c4b0275eb761bf189ff7a7d2 Mon Sep 17 00:00:00 2001 From: arkadip-maitra Date: Sat, 18 Oct 2025 11:53:48 +0000 Subject: [PATCH 112/123] =?UTF-8?q?Clarrifying=20input=20output=20angle=20?= =?UTF-8?q?unit=20in=20the=20docs=20for=20trigonometric=20fun=E2=80=A6=20(?= =?UTF-8?q?#161248)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ctions Fixes #[160995](https://github.com/pytorch/pytorch/issues/160995) Modified the docs to clarify that input tensor values for torch.sin, torch.cos and torch.tan should be in radians and the output tensor values for torch.acos, torch.asin and torch.atan is in radians. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161248 Approved by: https://github.com/isuruf Co-authored-by: Isuru Fernando --- torch/_torch_docs.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 681025f5d283..3a8c2083afac 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -253,7 +253,7 @@ add_docstr( r""" acos(input: Tensor, *, out: Optional[Tensor]) -> Tensor -Computes the inverse cosine of each element in :attr:`input`. +Returns a new tensor with the arccosine (in radians) of each element in :attr:`input`. .. math:: \text{out}_{i} = \cos^{-1}(\text{input}_{i}) @@ -1047,7 +1047,7 @@ add_docstr( r""" asin(input: Tensor, *, out: Optional[Tensor]) -> Tensor -Returns a new tensor with the arcsine of the elements of :attr:`input`. +Returns a new tensor with the arcsine of the elements (in radians) in the :attr:`input` tensor. .. math:: \text{out}_{i} = \sin^{-1}(\text{input}_{i}) @@ -1119,7 +1119,7 @@ add_docstr( r""" atan(input: Tensor, *, out: Optional[Tensor]) -> Tensor -Returns a new tensor with the arctangent of the elements of :attr:`input`. +Returns a new tensor with the arctangent of the elements (in radians) in the :attr:`input` tensor. .. math:: \text{out}_{i} = \tan^{-1}(\text{input}_{i}) @@ -3135,7 +3135,7 @@ add_docstr( r""" cos(input, *, out=None) -> Tensor -Returns a new tensor with the cosine of the elements of :attr:`input`. +Returns a new tensor with the cosine of the elements of :attr:`input` given in radians. .. math:: \text{out}_{i} = \cos(\text{input}_{i}) @@ -9940,7 +9940,8 @@ add_docstr( r""" sin(input, *, out=None) -> Tensor -Returns a new tensor with the sine of the elements of :attr:`input`. +Returns a new tensor with the sine of the elements in the :attr:`input` tensor, +where each value in this input tensor is in radians. .. math:: \text{out}_{i} = \sin(\text{input}_{i}) @@ -11357,7 +11358,8 @@ add_docstr( r""" tan(input, *, out=None) -> Tensor -Returns a new tensor with the tangent of the elements of :attr:`input`. 
+Returns a new tensor with the tangent of the elements in the :attr:`input` tensor, +where each value in this input tensor is in radians. .. math:: \text{out}_{i} = \tan(\text{input}_{i}) From d14cbb44760e69b3f2871a1fc428a03ae16a9056 Mon Sep 17 00:00:00 2001 From: Simon Layton Date: Fri, 17 Oct 2025 23:29:10 +0000 Subject: [PATCH 113/123] Add NVFP4 two-level scaling to scaled_mm (#165774) Summary: * Add second-level scaling dispatch to scaled_mm, tying into optional `alpha` passing * Add two-level tests Test Plan: ``` pytest -svv -k "nvfp4_global_scale" test/test_scaled_matmul_cuda.py ``` Reviewers: Subscribers: Tasks: Tags: Signed-off-by: Simon Layton Pull Request resolved: https://github.com/pytorch/pytorch/pull/165774 Approved by: https://github.com/drisspg --- aten/src/ATen/native/cuda/Blas.cpp | 24 ++++++-- test/test_scaled_matmul_cuda.py | 89 ++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 4ee35013ab77..68a9582a09c1 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -2322,12 +2322,23 @@ _scaled_nvfp4_nvfp4( const Tensor& scale_b, const SwizzleType swizzle_b, const std::optional& bias, const c10::ScalarType out_dtype, - const bool single_scale, - Tensor& out) { + Tensor& out, + const std::optional& global_scale_a = std::nullopt, + const std::optional& global_scale_b = std::nullopt) { #ifdef USE_ROCM TORCH_CHECK_NOT_IMPLEMENTED(false, "NVFP4 scaling not supported on ROCM"); #endif - TORCH_CHECK_VALUE(single_scale, "Only single-scaled NVFP4 currently supported"); + std::optional alpha = std::nullopt; + // Note: "Or" here means that if only one scale is passed, we check for the other. Otherwise, + // if this is "And" we would silently do nothing in the case where one global scale is + // passed and not the other. 
+ if (global_scale_a.has_value() || global_scale_b.has_value()) { + TORCH_CHECK_VALUE(global_scale_a.has_value(), + "For two-level-scaled NVFP4, global_scale_a must have a value"); + TORCH_CHECK_VALUE(global_scale_b.has_value(), + "For two-level-scaled NVFP4, global_scale_b must have a value"); + alpha = global_scale_a.value().mul(global_scale_b.value()); + } // Restrictions: // A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32 // Scales must be swizzled @@ -2349,7 +2360,7 @@ _scaled_nvfp4_nvfp4( auto scaling_choice_a = ScalingType::BlockWise1x16; auto scaling_choice_b = ScalingType::BlockWise1x16; - return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out); + return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out, alpha); } @@ -2555,9 +2566,10 @@ _scaled_mm_cuda_v2_out( } else if (gemm_impl == ScaledGemmImplementation::MXFP8_MXFP8) { return _scaled_mxfp8_mxfp8(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out); } else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4) { - TORCH_CHECK_NOT_IMPLEMENTED(false, "Only single-scale NVFP4 currently supported"); + return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out, + scale_a[1], scale_b[1]); } else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4_SINGLE_SCALE) { - return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, true /* single_scale */, out); + return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out); } else if (gemm_impl == ScaledGemmImplementation::MXFP4_MXFP4) { return _scaled_mxfp4_mxfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out); } else { diff --git a/test/test_scaled_matmul_cuda.py b/test/test_scaled_matmul_cuda.py index d57b1535d02f..7dd6f10d3a82 100644 --- a/test/test_scaled_matmul_cuda.py +++ b/test/test_scaled_matmul_cuda.py @@ -413,6 +413,42 @@ def data_to_nvfp4_scale(x, block_size): return scale +def data_to_nvfp4_with_global_scale(x, block_size): + # Simple (slow) reference implementation of NVFP4 two-level-scaling + orig_shape = x.shape + x = x.reshape(-1, block_size) + + # Per-block-amax + block_max = torch.amax(torch.abs(x), 1) + 1e-12 + + # Per-tensor max + global_max = x.abs().max() + + # Contants + # Global encoding scale for block-scales + S_enc = FP4_MAX_VAL * F8E4M3_MAX_VAL / global_max + S_dec = 1. 
/ S_enc + + # Per-block decode-scale + S_dec_b = block_max / FP4_MAX_VAL + + # Stored scaled-e4m3 per-block decode scales + S_dec_b_e4m3 = (S_dec_b * S_enc).to(torch.float8_e4m3fn) + + # Actual per-block encoding scale + S_enc_b = S_enc / S_dec_b_e4m3.float() + + # scale & reshape input, reshape scales + x = (S_enc_b.unsqueeze(1) * x).bfloat16().reshape(orig_shape) + S_dec_b_e4m3 = S_dec_b_e4m3.reshape(orig_shape[0], -1) + + # cast input + x_fp4 = _bfloat16_to_float4_e2m1fn_x2(x) + + # fp4x2, fp8_e4m3, float respectively + return x_fp4, S_dec_b_e4m3, S_dec.float() + + def down_size(size): assert size[-1] % 2 == 0, f"{size} last dim not divisible by two" return (*size[:-1], size[-1] // 2) @@ -1254,6 +1290,59 @@ class TestFP8Matmul(TestCase): lp_data_expected = torch.tensor([0b10110010], dtype=torch.uint8) torch.testing.assert_close(lp_data_actual, lp_data_expected, atol=0, rtol=0) + + @onlyCUDA + @unittest.skipIf(not PLATFORM_SUPPORTS_MX_GEMM, mx_skip_msg) + @parametrize("mkn", [ + # Nice shapes + (128, 128, 128), + (256, 256, 256), + (128, 256, 512), + (256, 512, 128), + (512, 128, 256), + + # Very unbalanced + (1023, 64, 48), + (31, 1024, 64), + (45, 96, 1024), + + # Mixed large and small + (2, 1024, 128), + (127, 96, 1024), + (1025, 128, 96) + ], name_fn=lambda mkn: f"{mkn[0]}_{mkn[1]}_{mkn[2]}") + def test_blockwise_nvfp4_with_global_scale(self, mkn) -> None: + device = 'cuda' + M, K, N = mkn + BLOCK_SIZE = 16 + # Note: SQNR target from `test_blockwise_mxfp8_nvfp4_mxfp4_numerics` test + approx_match_sqnr_target = 15.8 + + A_ref = torch.randn((M, K), device=device, dtype=torch.bfloat16) * 1000 + B_ref = torch.randn((N, K), device=device, dtype=torch.bfloat16) * 1000 + + A, A_scale, A_global_scale = data_to_nvfp4_with_global_scale(A_ref, BLOCK_SIZE) + B, B_scale, B_global_scale = data_to_nvfp4_with_global_scale(B_ref, BLOCK_SIZE) + A_scale = to_blocked(A_scale) + B_scale = to_blocked(B_scale) + + C_ref = A_ref @ B_ref.t() + + C = scaled_mm( + A, + B.t(), + scale_a=[A_scale, A_global_scale], + scale_recipe_a=[ScalingType.BlockWise1x16, ScalingType.TensorWise], + scale_b=[B_scale, B_global_scale], + scale_recipe_b=[ScalingType.BlockWise1x16, ScalingType.TensorWise], + swizzle_a=[SwizzleType.SWIZZLE_32_4_4, SwizzleType.NO_SWIZZLE], + swizzle_b=[SwizzleType.SWIZZLE_32_4_4, SwizzleType.NO_SWIZZLE], + output_dtype=torch.bfloat16, + ) + + sqnr = compute_error(C_ref, C) + assert sqnr.item() > approx_match_sqnr_target + @unittest.skipIf(not PLATFORM_SUPPORTS_MX_GEMM, mx_skip_msg) @parametrize("test_case_name", [ "a_eye_b_eye", From 032bed95cd06a18a971273c7cfb07b8321e70d74 Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Sat, 18 Oct 2025 17:59:23 +0000 Subject: [PATCH 114/123] Various C++ code fixes in LSAN integration (#165818) This PR extracts the C++ code fixes from #154584, which are fixes in enabling LSAN. 
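The variable.h change moves, rather than copies, the freshly created TensorImpl handle into the returned Variable, so the intrusive refcount is not bumped and immediately dropped on every call. A minimal standalone sketch of that pattern (hypothetical VariableStub/TensorImplStub types, with std::shared_ptr standing in for c10::intrusive_ptr; not the actual ATen code):

```cpp
#include <memory>
#include <utility>

struct TensorImplStub {
  int autograd_meta = 0;  // stand-in for the real autograd metadata slot
};

struct VariableStub {
  explicit VariableStub(std::shared_ptr<TensorImplStub> impl)
      : impl_(std::move(impl)) {}
  std::shared_ptr<TensorImplStub> impl_;
};

VariableStub make_variable_sketch() {
  auto impl_copy = std::make_shared<TensorImplStub>();
  impl_copy->autograd_meta = 1;
  // Before: VariableStub(impl_copy) copies the handle, incrementing and then
  // immediately decrementing the reference count when impl_copy is destroyed.
  // After: moving transfers ownership without touching the count at all.
  return VariableStub(std::move(impl_copy));
}
```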
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165818 Approved by: https://github.com/ezyang --- torch/csrc/Module.cpp | 2 +- torch/csrc/autograd/variable.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index 4a864daa8c12..772fe1d141be 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -241,7 +241,7 @@ static PyObject* THPModule_initExtension( END_HANDLE_TH_ERRORS } -// The idea behind these two functions is to make it easy to test if we are +// The idea behind these functions is to make it easy to test if we are // built with ASAN: they're designed not to crash if ASAN is not enabled, but // to trigger ASAN if it is enabled. This lets us run a "canary" tests which // checks if our build environment is misconfigured. diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index d0fd3d7ee66e..a297a9f5ef42 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -876,7 +876,7 @@ inline Variable make_variable_non_differentiable_view( /*version_counter=*/impl::version_counter(base), /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); data_impl_copy->set_autograd_meta(nullptr); - return Variable(data_impl_copy); + return Variable(std::move(data_impl_copy)); } return Variable(); } @@ -935,7 +935,7 @@ inline Variable make_variable( /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); data_impl_copy->set_autograd_meta(std::make_unique( data_impl_copy.get(), false, std::move(gradient_edge))); - return Variable(data_impl_copy); + return Variable(std::move(data_impl_copy)); } return Variable(); } From 1f43d17ce672ff1fca2f5eab033cb03c27132385 Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Sat, 18 Oct 2025 18:51:49 +0000 Subject: [PATCH 115/123] Fix self assignment (#165816) This PR removes assignments of the form `var=var`. 
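The pattern in question is a doubled assignment that binds the same value twice; a tiny illustrative snippet mirroring the vmap helper touched below:

```python
total_elems, chunk_size = 10, 3

# Before: the duplicated target is legal Python but pure noise.
n_chunks = n_chunks = total_elems // chunk_size

# After: a single assignment is equivalent and clearer.
n_chunks = total_elems // chunk_size
assert n_chunks == 3
```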
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165816 Approved by: https://github.com/jansel --- torch/_functorch/vmap.py | 2 +- torch/_inductor/fx_passes/efficient_conv_bn_eval.py | 8 ++------ torch/_inductor/tiling_utils.py | 1 - torch/_inductor/utils.py | 1 - torch/_numpy/_dtypes.py | 2 -- torch/_prims/__init__.py | 6 +----- torch/nn/utils/stateless.py | 3 --- torch/testing/_internal/distributed/rpc/jit/rpc_test.py | 2 +- 8 files changed, 5 insertions(+), 20 deletions(-) diff --git a/torch/_functorch/vmap.py b/torch/_functorch/vmap.py index 25ffe9c525f3..465be67e41fa 100644 --- a/torch/_functorch/vmap.py +++ b/torch/_functorch/vmap.py @@ -293,7 +293,7 @@ def vmap_impl(func, in_dims, out_dims, randomness, chunk_size, *args, **kwargs): def get_chunk_sizes(total_elems, chunk_size): - n_chunks = n_chunks = total_elems // chunk_size + n_chunks = total_elems // chunk_size chunk_sizes = [chunk_size] * n_chunks # remainder chunk remainder = total_elems % chunk_size diff --git a/torch/_inductor/fx_passes/efficient_conv_bn_eval.py b/torch/_inductor/fx_passes/efficient_conv_bn_eval.py index 78cd317284d2..b6db1367de6e 100644 --- a/torch/_inductor/fx_passes/efficient_conv_bn_eval.py +++ b/torch/_inductor/fx_passes/efficient_conv_bn_eval.py @@ -108,14 +108,10 @@ def efficient_conv_bn_eval_decomposed( else: bias_on_the_fly = torch.zeros_like(bn_running_var) - if bn_weight is not None: - bn_weight = bn_weight - else: + if bn_weight is None: bn_weight = torch.ones_like(bn_running_var) - if bn_bias is not None: - bn_bias = bn_bias - else: + if bn_bias is None: bn_bias = torch.zeros_like(bn_running_var) # shape of [C_out, 1, 1, 1] in Conv2d diff --git a/torch/_inductor/tiling_utils.py b/torch/_inductor/tiling_utils.py index 3142f97f8c40..30efae2293c8 100644 --- a/torch/_inductor/tiling_utils.py +++ b/torch/_inductor/tiling_utils.py @@ -477,7 +477,6 @@ def extract_normalized_read_writes( (norm_pw_vars, norm_red_vars), ranges = index_vars_no_squeeze( pw_splits, red_splits, prefix="n" ) - node = node for n in list(node.get_nodes()): if not isinstance(n, torch._inductor.scheduler.SchedulerNode): diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py index f1c7f23cf719..b7c347fd7acc 100644 --- a/torch/_inductor/utils.py +++ b/torch/_inductor/utils.py @@ -760,7 +760,6 @@ def get_fused_kernel_name( ] else: raise NotImplementedError - sources = sources return "_".join(["fused"] + sources) diff --git a/torch/_numpy/_dtypes.py b/torch/_numpy/_dtypes.py index e955a47060ff..a429d28f30cc 100644 --- a/torch/_numpy/_dtypes.py +++ b/torch/_numpy/_dtypes.py @@ -408,8 +408,6 @@ def set_default_dtype(fp_dtype="numpy", int_dtype="numpy"): if int_dtype in ["numpy", "pytorch"]: int_dtype = torch.int64 - else: - int_dtype = int_dtype new_defaults = _dtypes_impl.DefaultDTypes( float_dtype=float_dtype, complex_dtype=complex_dtype, int_dtype=int_dtype diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py index f3fd27e59139..7827aa244a2e 100644 --- a/torch/_prims/__init__.py +++ b/torch/_prims/__init__.py @@ -447,9 +447,7 @@ def _prim_elementwise_meta( # (but getting it wrong will cause too many casts to be inserted in traces!) 
if device is not None: assert dtype is not None - if type_promotion == ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT: - dtype = dtype - elif type_promotion == ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL: + if type_promotion == ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL: dtype = torch.bool elif type_promotion == ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.INT_TO_FLOAT: if utils.is_integer_dtype(dtype) or utils.is_boolean_dtype(dtype): @@ -457,8 +455,6 @@ def _prim_elementwise_meta( elif type_promotion == ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT: if utils.is_complex_dtype(dtype): dtype = utils.corresponding_real_dtype(dtype) - else: - dtype = dtype assert shape is not None return torch.empty_permuted(shape, l2p_perm, device=device, dtype=dtype) # type: ignore[return-value] diff --git a/torch/nn/utils/stateless.py b/torch/nn/utils/stateless.py index ce55641faab4..148052740922 100644 --- a/torch/nn/utils/stateless.py +++ b/torch/nn/utils/stateless.py @@ -103,9 +103,6 @@ def _reparametrize_module( strict: bool = False, stack_weights: bool = False, ): - parameters_and_buffers = parameters_and_buffers - stack_weights = stack_weights - if tie_weights: untied_parameters_and_buffers = _untie_named_tensors_map( module, parameters_and_buffers diff --git a/torch/testing/_internal/distributed/rpc/jit/rpc_test.py b/torch/testing/_internal/distributed/rpc/jit/rpc_test.py index ec2f2b949907..76c089f45800 100644 --- a/torch/testing/_internal/distributed/rpc/jit/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/jit/rpc_test.py @@ -85,7 +85,7 @@ class RRefAPITest: ): rref_local_value(rref) - ret = ret = rpc.rpc_sync(dst_worker_name, rref_local_value, (rref,)) + ret = rpc.rpc_sync(dst_worker_name, rref_local_value, (rref,)) self.assertEqual(ret, torch.add(torch.ones(2, 2), 1)) @dist_init From 35e51893bd2ee2966503ed5f426e2323328a9a0b Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Sat, 18 Oct 2025 20:05:50 +0000 Subject: [PATCH 116/123] Remove CUDA 11 workarounds for CUB_SUPPORTS_SCAN_BY_KEY and CUB_SUPPORTS_UNIQUE_BY_KEY (#164637) `CUB_SUPPORTS_SCAN_BY_KEY` and `CUB_SUPPORTS_UNIQUE_BY_KEY` are true since CUDA 12. This PR removes the old branches and source files. 
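For context, the shape of the cleanup in each .cu file is the same: the version gate and its thrust fallback collapse into the unconditional cub path. A simplified, self-contained sketch (illustrative macro value and function name, not the actual ATen sources):

```cpp
#include <cstdio>

#define CUB_VERSION 200200  // any CUDA 12 toolkit ships a new-enough cub

// Old gate from cub_definitions.cuh: scan-by-key needs cub >= 1.15.
#if CUB_VERSION >= 101500
#define CUB_SUPPORTS_SCAN_BY_KEY() 1
#else
#define CUB_SUPPORTS_SCAN_BY_KEY() 0
#endif

void count_frequencies_sketch() {
#if CUB_SUPPORTS_SCAN_BY_KEY()
  std::puts("cub inclusive_sum_by_key path (always taken with CUDA >= 12)");
#else
  std::puts("thrust fallback (dead code, removed by this PR)");
#endif
}
```

With CUDA 12 as the minimum supported toolkit, the #else branches and the thrust-based helpers in LegacyThrustHelpers.cu are unreachable, so both the macros and the fallback sources can be deleted outright.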
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164637 Approved by: https://github.com/ezyang --- aten/src/ATen/cuda/cub.cuh | 4 - aten/src/ATen/cuda/cub_definitions.cuh | 16 ---- aten/src/ATen/native/cuda/Embedding.cu | 12 --- .../native/cuda/EmbeddingBackwardKernel.cu | 19 ---- aten/src/ATen/native/cuda/EmbeddingBag.cu | 12 --- .../ATen/native/cuda/LegacyThrustHelpers.cu | 90 ------------------- aten/src/ATen/native/cuda/TensorTopK.cpp | 12 +-- aten/src/ATen/native/cuda/TensorTopK.cu | 45 ---------- 8 files changed, 1 insertion(+), 209 deletions(-) delete mode 100644 aten/src/ATen/native/cuda/LegacyThrustHelpers.cu diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh index 23a3ff8c8958..7828c3917fc4 100644 --- a/aten/src/ATen/cuda/cub.cuh +++ b/aten/src/ATen/cuda/cub.cuh @@ -177,7 +177,6 @@ inline void segmented_sort_pairs( } } -#if CUB_SUPPORTS_UNIQUE_BY_KEY() template inline void unique_by_key( KeysInputIteratorT keys_in, ValuesInputIteratorT values_in, @@ -193,7 +192,6 @@ inline void unique_by_key( CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSelect::UniqueByKey, keys_in, values_in, keys_out_, values_out, num_selected, num_input_items, c10::cuda::getCurrentCUDAStream()); } -#endif namespace impl { @@ -579,7 +577,6 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT #endif } -#if CUB_SUPPORTS_SCAN_BY_KEY() template inline void inclusive_sum_by_key(KeysInputIteratorT keys, ValuesInputIteratorT input, ValuesOutputIteratorT output, int64_t num_items) { @@ -607,7 +604,6 @@ inline void inclusive_scan_by_key(KeysInputIteratorT keys, ValuesInputIteratorT #endif } -#endif template void unique(InputIteratorT input, OutputIteratorT output, diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh index b80951269209..0d76ae6e8dcf 100644 --- a/aten/src/ATen/cuda/cub_definitions.cuh +++ b/aten/src/ATen/cuda/cub_definitions.cuh @@ -28,22 +28,6 @@ #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false #endif -// cub support for UniqueByKey is added to cub 1.16 in: -// https://github.com/NVIDIA/cub/pull/405 -#if CUB_VERSION >= 101600 -#define CUB_SUPPORTS_UNIQUE_BY_KEY() true -#else -#define CUB_SUPPORTS_UNIQUE_BY_KEY() false -#endif - -// cub support for scan by key is added to cub 1.15 -// in https://github.com/NVIDIA/cub/pull/376 -#if CUB_VERSION >= 101500 -#define CUB_SUPPORTS_SCAN_BY_KEY() 1 -#else -#define CUB_SUPPORTS_SCAN_BY_KEY() 0 -#endif - // cub support for cub::FutureValue is added to cub 1.15 in: // https://github.com/NVIDIA/cub/pull/305 #if CUB_VERSION >= 101500 diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index adc300a5a9ef..65b0e1441de7 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -15,9 +15,7 @@ #include #include -#if CUB_SUPPORTS_SCAN_BY_KEY() #include -#endif #ifndef AT_PER_OPERATOR_HEADERS #include @@ -240,10 +238,6 @@ __global__ void renorm_kernel( } // anonymous namespace -#if !CUB_SUPPORTS_SCAN_BY_KEY() -template -void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count); -#endif Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indices_, int64_t num_weights, int64_t padding_idx, @@ -306,7 +300,6 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice if (scale_grad_by_freq) { count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT); -#if CUB_SUPPORTS_SCAN_BY_KEY() 
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () { cudaStream_t stream = at::cuda::getCurrentCUDAStream(); @@ -333,11 +326,6 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice num_indices ); }); -#else - AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () { - embedding_dense_backward_cuda_scan(sorted_indices, count); - }); -#endif } return embedding_backward_cuda_kernel(grad, orig_indices, diff --git a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu index 4f67696bd022..6ce419137345 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu @@ -10,9 +10,7 @@ #include -#if CUB_SUPPORTS_UNIQUE_BY_KEY() #include -#endif #ifndef AT_PER_OPERATOR_HEADERS #include @@ -196,18 +194,9 @@ __global__ void compute_num_of_partial_segments(const index_t *partials_per_segm partials_per_segment_offset[num_of_segments-1]; } -#if !CUB_SUPPORTS_UNIQUE_BY_KEY() -__global__ void write_num_of_segments_for_legacy_thrust_path(int64_t *num_of_segments_ptr, int64_t num_of_segments) { - *num_of_segments_ptr = num_of_segments; -} -#endif } // anon namespace -#if !CUB_SUPPORTS_UNIQUE_BY_KEY() -template -int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets); -#endif Tensor embedding_backward_cuda_kernel( const Tensor &grad, @@ -234,20 +223,12 @@ Tensor embedding_backward_cuda_kernel( auto segment_offsets = at::empty({numel}, orig_indices.options()); auto num_of_segments_tensor = at::empty({}, grad.options().dtype(kLong)); int64_t *num_of_segments_ptr = num_of_segments_tensor.mutable_data_ptr(); -#if !CUB_SUPPORTS_UNIQUE_BY_KEY() - AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () { - int64_t num_of_segments = embedding_backward_cuda_kernel_unique_by_key(sorted_indices, segment_offsets); - write_num_of_segments_for_legacy_thrust_path<<<1, 1, 0, c10::cuda::getCurrentCUDAStream()>>>(num_of_segments_ptr, num_of_segments); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); -#else AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () { cuda::cub::unique_by_key( sorted_indices.const_data_ptr(), thrust::make_counting_iterator(0), segment_offsets.mutable_data_ptr(), num_of_segments_ptr, sorted_indices.numel()); }); -#endif int64_t max_segments = std::min(numel, num_weights); diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index fb92c7488a15..ab3747df031e 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -31,16 +31,10 @@ #include -#if CUB_SUPPORTS_SCAN_BY_KEY() #include -#endif namespace at::native { -#if !CUB_SUPPORTS_SCAN_BY_KEY() -template -void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count); -#endif namespace { @@ -199,7 +193,6 @@ Tensor embedding_bag_backward_cuda_sum_avg( if (scale_grad_by_freq) { count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT); -#if CUB_SUPPORTS_SCAN_BY_KEY() AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () { cudaStream_t stream = at::cuda::getCurrentCUDAStream(); @@ -226,11 +219,6 @@ Tensor embedding_bag_backward_cuda_sum_avg( num_indices ); }); -#else - AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () { - 
embedding_dense_backward_cuda_scan(sorted_indices, count); - }); -#endif } return embedding_backward_cuda_kernel(grad, orig_indices, sorted_indices, count, num_weights, padding_idx, mode == EmbeddingBagMode::MEAN, offset2bag, diff --git a/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu b/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu deleted file mode 100644 index 6a549ac3d62c..000000000000 --- a/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu +++ /dev/null @@ -1,90 +0,0 @@ -#define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include -#include -#include - -#ifndef AT_PER_OPERATOR_HEADERS -#include -#else -#include -#endif - -#include -#include -#include -#include -#include -#include -#include - -namespace at::native { - -#if !CUB_SUPPORTS_SCAN_BY_KEY() - -template -void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count) { - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - at::cuda::ThrustAllocator allocator; - auto policy = thrust::cuda::par(allocator).on(stream); - - auto num_indices = count.numel(); - - // Compute an increasing sequence per unique item in sortedIndices: - // sorted: 2 5 5 5 7 7 8 9 9 - // count: 1 1 2 3 1 2 1 1 2 - auto sorted_data = thrust::device_ptr(sorted_indices.const_data_ptr()); - auto count_data = thrust::device_ptr(count.mutable_data_ptr()); - thrust::inclusive_scan_by_key( - policy, - sorted_data, - sorted_data + num_indices, - thrust::make_constant_iterator(1), - count_data - ); - - // Take the maximum of each count per unique key in reverse: - // sorted: 2 5 5 5 7 7 8 9 9 - // count: 1 3 3 3 2 2 1 2 2 - thrust::inclusive_scan_by_key( - policy, - thrust::make_reverse_iterator(sorted_data + num_indices), - thrust::make_reverse_iterator(sorted_data), - thrust::make_reverse_iterator(count_data + num_indices), - thrust::make_reverse_iterator(count_data + num_indices), - thrust::equal_to(), - thrust::maximum() - ); -} - -template -void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count); -template -void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count); - -#endif - -template -int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets) { - auto stream = at::cuda::getCurrentCUDAStream(); - at::cuda::ThrustAllocator allocator; - auto policy = thrust::cuda::par(allocator).on(stream); - const ptrdiff_t numel = sorted_indices.numel(); - auto sorted_indices_dev = thrust::device_ptr(sorted_indices.const_data_ptr()); - auto dummy = at::empty_like(sorted_indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - auto dummy_dev = thrust::device_ptr(dummy.mutable_data_ptr()); - auto ends = thrust::unique_by_key_copy( - policy, - sorted_indices_dev, - sorted_indices_dev + numel, - thrust::make_counting_iterator(0), - dummy_dev, - thrust::device_ptr(segment_offsets.mutable_data_ptr())); - return thrust::get<0>(ends) - dummy_dev; -} - -template -int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets); -template -int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets); - -} // namespace at::native diff --git a/aten/src/ATen/native/cuda/TensorTopK.cpp b/aten/src/ATen/native/cuda/TensorTopK.cpp index f47e7a887ebe..bc609f829a26 100644 --- a/aten/src/ATen/native/cuda/TensorTopK.cpp +++ b/aten/src/ATen/native/cuda/TensorTopK.cpp @@ -19,7 +19,6 @@ namespace at::native { -// TODO: remove this when CUDA <11.6 is no longer supported void topk_out_with_sort( const Tensor& self, 
int64_t k, int64_t dim, bool largest, @@ -31,21 +30,12 @@ void topk_out_with_sort( indices.copy_(sorted_indices.narrow(dim, 0, k)); } -// TODO: remove this when CUDA <11.6 is no longer supported -bool disable_sort_for_topk(); bool should_use_sort(const Tensor& self, int64_t dim) { #if defined(USE_ROCM) if (self.dtype() == kBool) return false; // Bool sort not supported in ROCm: https://github.com/pytorch/pytorch/issues/139972 return (self.numel() >= 10000 && self.numel() == self.size(dim)); // based on the experiments in https://github.com/pytorch/pytorch/pull/146387 #else - if (disable_sort_for_topk()) return false; - // This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/68632 - if (self.dim() == 0) return false; - if (self.dtype() == kBool) return false; // Bool is not support by topk - int64_t slice_size = self.size(dim); - if (slice_size == 0) return false; - int64_t num_slices = self.numel() / slice_size; - return num_slices <= 10 && slice_size >= 100000; + return false; #endif } diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu index 3f57281ebf56..d95d85bf0237 100644 --- a/aten/src/ATen/native/cuda/TensorTopK.cu +++ b/aten/src/ATen/native/cuda/TensorTopK.cu @@ -21,11 +21,6 @@ using namespace at::native; namespace at::native { -// TODO: remove this when CUDA <11.6 is no longer supported -bool disable_sort_for_topk() { - return CUB_SUPPORTS_SCAN_BY_KEY(); -} - namespace sbtopk { // single_block_topk template @@ -418,10 +413,6 @@ __global__ void computeBlockwiseWithinKCounts( } __syncthreads(); -#if !CUB_SUPPORTS_SCAN_BY_KEY() - return; -#endif - Bitwise desired_digit = at::cuda::Bitfield::getBitfield(desired, current_bit, RADIX_BITS); // if largest, then only threads that has tidx > desired_digit are active @@ -477,7 +468,6 @@ __global__ void computeBlockwiseWithinKCounts( } } -#if CUB_SUPPORTS_SCAN_BY_KEY() // Assumption: slice_size can not be larger than UINT32_MAX template __global__ void computeBlockwiseKthCounts( @@ -609,7 +599,6 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo inpu } } } -#endif int get_items_per_thread(uint64_t num_slices, uint64_t slice_size) { // occupancy of this kernel is limited by registers per threads @@ -687,16 +676,12 @@ void launch( uint32_t* digit_cum_sum = reinterpret_cast(digit_cum_sum_buffer.get()); AT_CUDA_CHECK(cudaMemsetAsync(digit_cum_sum, 0, numInputSlices * RADIX_DIGITS * sizeof(uint32_t), stream)); -#if CUB_SUPPORTS_SCAN_BY_KEY() auto withinKCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t)); uint32_t* withinKCounts = reinterpret_cast(withinKCounts_buffer.get()); AT_CUDA_CHECK(cudaMemsetAsync(withinKCounts, 0, num_blocks * sizeof(uint32_t), stream)); auto kthCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t)); uint32_t* kthCounts = reinterpret_cast(kthCounts_buffer.get()); -#else - uint32_t* withinKCounts = nullptr; -#endif Bitwise desiredMask = 0; dim3 grid; @@ -743,7 +728,6 @@ void launch( } desired = desired_in; -#if CUB_SUPPORTS_SCAN_BY_KEY() computeBlockwiseKthCounts<<>>( desired, counts, num_blocks, blocks_per_slice, kthCounts); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -759,28 +743,6 @@ void launch( topK, topKWithinSliceStride, indices, indicesWithinSliceStride, items_per_thread, blocks_per_slice, kthValues, withinKCounts, kthCounts, num_blocks); C10_CUDA_KERNEL_LAUNCH_CHECK(); -#else - // Find topk values based on kth values - { - dim3 grid; - TORCH_INTERNAL_ASSERT(getGridFromTiles(numInputSlices, grid), "Too many slices 
for topk"); - int warp_size = at::cuda::warp_size(); - dim3 block(std::min(at::ceil_div((int64_t)inputSliceSize, (int64_t)warp_size) * (int64_t)warp_size, (int64_t)1024)); - sbtopk::gatherTopK<<>>( - input, - inputSliceSize, - outputSliceSize, - largest, - numInputSlices, - inputWithinSliceStride, - topK, - topKWithinSliceStride, - indices, - indicesWithinSliceStride, - kthValues); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } -#endif } } // namespace mbtopk @@ -788,7 +750,6 @@ void launch( bool should_use_multiblock(int64_t num_slices, int64_t slice_size) { if (num_slices > std::numeric_limits::max() || slice_size > std::numeric_limits::max()) return false; -#if CUB_SUPPORTS_SCAN_BY_KEY() // This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/74267 return (num_slices <= 20 && slice_size >= 20000) || (num_slices > 20 && num_slices <= 40 && slice_size >= 10000) || @@ -797,12 +758,6 @@ bool should_use_multiblock(int64_t num_slices, int64_t slice_size) { (num_slices >= 200 && num_slices < 800 && slice_size >= 3000) || (num_slices >= 800 && num_slices <= 4000 && slice_size >= 800) || (num_slices > 4000 && slice_size >= 400); -#else - // This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/71081 - return (num_slices <= 400 && slice_size >= 5000) || - (num_slices > 400 && num_slices < 4000 && slice_size >= 1000) || - (num_slices >= 4000 && slice_size >= 300); -#endif } void launch_gather_topk_kernel( From f18041cca8542bf8c7d92d69966038fa2130a06e Mon Sep 17 00:00:00 2001 From: andreh7 Date: Sat, 18 Oct 2025 22:09:18 +0000 Subject: [PATCH 117/123] Fix missing closing quote in __init__.py documentation (#165827) Title says it all. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165827 Approved by: https://github.com/Skylion007 --- torch/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/__init__.py b/torch/__init__.py index 40838191707b..39555a8360e8 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -2503,7 +2503,7 @@ def compile( to compile it and cache the compiled result on the code object for future use. A single frame may be compiled multiple times if previous compiled results are not applicable for subsequent calls (this is called a "guard - failure), you can use TORCH_LOGS=guards to debug these situations. + failure"), you can use TORCH_LOGS=guards to debug these situations. Multiple compiled results can be associated with a frame up to ``torch._dynamo.config.recompile_limit``, which defaults to 8; at which point we will fall back to eager. Note that compile caches are per From c4f6619330bdac5bf4addb9070ecb42994202e1f Mon Sep 17 00:00:00 2001 From: Dzmitry Huba Date: Sat, 18 Oct 2025 12:54:20 -0700 Subject: [PATCH 118/123] Enable more DTensor tests in local tensor mode and fix more integration issues (#165716) - During op dispatch local tensor is supposed to collect rng state from CPU and CUDA devices so that it can be reset before execution of the op for each such that ops with randomness produces the same result for all ranks (note that we are planning a separate change to add support of per rank rng state). Previously we relied on op input arguments to deduce which devices to get rng state from. Which doesn't work for factory functions such torch.randn. Hence this changes switches to uncondionally collecting rng state from all devices. - Fixing per rank specific computations in _MaskedPartial and Shard placements discovered during test enablement. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165716 Approved by: https://github.com/ezyang --- test/distributed/tensor/test_tensor_ops.py | 15 +++- test/distributed/test_dist2.py | 18 ++++- torch/distributed/_local_tensor/__init__.py | 78 +++++++++++++++++-- .../distributed/tensor/_ops/_embedding_ops.py | 41 ++++++---- torch/distributed/tensor/_sharding_prop.py | 3 + torch/distributed/tensor/debug/__init__.py | 11 +++ torch/distributed/tensor/placement_types.py | 18 ++++- torch/testing/_internal/common_distributed.py | 16 +++- .../distributed/_tensor/common_dtensor.py | 3 + 9 files changed, 169 insertions(+), 34 deletions(-) diff --git a/test/distributed/tensor/test_tensor_ops.py b/test/distributed/tensor/test_tensor_ops.py index eaa1969068c1..8368befabfec 100644 --- a/test/distributed/tensor/test_tensor_ops.py +++ b/test/distributed/tensor/test_tensor_ops.py @@ -17,6 +17,7 @@ from torch.distributed.tensor.debug import CommDebugMode from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from torch.testing._internal.common_utils import run_tests, skipIfRocm from torch.testing._internal.distributed._tensor.common_dtensor import ( + create_local_tensor_test_class, DTensorConverter, DTensorTestBase, with_comms, @@ -704,6 +705,12 @@ class DistTensorOpsTest(DTensorTestBase): @with_comms def test_dtensor_dtype_conversion(self): + from torch.distributed.tensor.debug import ( + _clear_sharding_prop_cache, + _get_sharding_prop_cache_info, + ) + + _clear_sharding_prop_cache() device_mesh = self.build_device_mesh() shard_spec = [Shard(0)] # by default we start from bf16 dtype @@ -722,8 +729,6 @@ class DistTensorOpsTest(DTensorTestBase): self.assertEqual(bf16_sharded_dtensor1.dtype, torch.bfloat16) self.assertEqual(bf16_sharded_dtensor1.to_local().dtype, torch.bfloat16) - from torch.distributed.tensor.debug import _get_sharding_prop_cache_info - # by this point we only have cache misses hits, misses, _, _ = _get_sharding_prop_cache_info() self.assertEqual(hits, 0) @@ -775,7 +780,7 @@ class DistTensorOpsTest(DTensorTestBase): ) def _test_split_on_partial(self, reduce_op: str, split_size: int, split_dim: int): - torch.manual_seed(self.rank) + self.init_manual_seed_for_rank() mesh = self.build_device_mesh() partial_tensor = torch.randn(8, 8, device=self.device_type) @@ -822,5 +827,9 @@ class DistTensorOpsTest(DTensorTestBase): self.assertEqual(x.full_tensor(), y) +DistTensorOpsTestWithLocalTensor = create_local_tensor_test_class( + DistTensorOpsTest, +) + if __name__ == "__main__": run_tests() diff --git a/test/distributed/test_dist2.py b/test/distributed/test_dist2.py index b335eff1c216..2c444fbfe567 100644 --- a/test/distributed/test_dist2.py +++ b/test/distributed/test_dist2.py @@ -53,7 +53,13 @@ class ProcessGroupTest(TestCase): class Dist2MultiProcessTestCase(MultiProcessTestCase): - device: torch.device + @property + def device(self) -> torch.device: + raise NotImplementedError + + # @device.setter + # def device(self, value: torch.device) -> None: + # self._device = value @property def world_size(self) -> int: @@ -257,7 +263,9 @@ class Dist2MultiProcessTestCase(MultiProcessTestCase): class ProcessGroupGlooTest(Dist2MultiProcessTestCase): - device = torch.device("cpu") + @property + def device(self) -> torch.device: + return torch.device("cpu") @requires_gloo() def new_group(self) -> torch.distributed.ProcessGroup: @@ -274,6 +282,10 @@ class ProcessGroupGlooTest(Dist2MultiProcessTestCase): class ProcessGroupNCCLTest(Dist2MultiProcessTestCase): + @property + def 
device(self) -> torch.device: + return torch.device("cuda", self.rank) + @requires_nccl() @skip_if_lt_x_gpu(2) def new_group(self) -> torch.distributed.ProcessGroup: @@ -282,8 +294,6 @@ class ProcessGroupNCCLTest(Dist2MultiProcessTestCase): os.environ["MASTER_ADDR"] = "127.0.0.1" os.environ["MASTER_PORT"] = "29501" - self.device = torch.device("cuda", self.rank) - return dist2.new_group( backend="nccl", timeout=timedelta(seconds=60), diff --git a/torch/distributed/_local_tensor/__init__.py b/torch/distributed/_local_tensor/__init__.py index d9eb7b47e9a3..8121b367790a 100644 --- a/torch/distributed/_local_tensor/__init__.py +++ b/torch/distributed/_local_tensor/__init__.py @@ -104,6 +104,62 @@ def _map_to_rank_local_val(val: Any, rank: int) -> Any: return val +def collect_cuda_rng_states() -> list[torch.Tensor]: + """ + Collects RNG state from all available CUDA devices. + + Returns: + List of RNG state tensors, one for each CUDA device. + Returns empty list if CUDA is not available. + """ + if not torch.cuda.is_available(): + return [] + + num_devices = torch.cuda.device_count() + rng_states = [] + + for device_idx in range(num_devices): + with torch.cuda.device(device_idx): + rng_state = torch.cuda.get_rng_state() + rng_states.append(rng_state) + + return rng_states + + +def set_cuda_rng_states(rng_states: list[torch.Tensor]) -> None: + """ + Sets RNG state for all CUDA devices from a list of states. + + Args: + rng_states: List of RNG state tensors to restore. + """ + if not torch.cuda.is_available(): + return + + num_devices = min(len(rng_states), torch.cuda.device_count()) + + for device_idx in range(num_devices): + with torch.cuda.device(device_idx): + torch.cuda.set_rng_state(rng_states[device_idx]) + + +def _get_rng_state() -> tuple[torch.Tensor, list[torch.Tensor]]: + """ + Gets CPU and CUDA rng states from all devices. + """ + return (torch.get_rng_state(), collect_cuda_rng_states()) + + +def _set_rng_state(cpu_state: torch.Tensor, cuda_states: list[torch.Tensor]) -> None: + """ + Sets CPU and CUDA rng states for all devices. If the list of cuda states + is shorter than the number of devices only the first len(cuda_states) devices + will get their rng state set. + """ + torch.set_rng_state(cpu_state) + set_cuda_rng_states(cuda_states) + + def _for_each_rank_run_func( func: Callable[..., Any], ranks: frozenset[int], @@ -117,14 +173,15 @@ def _for_each_rank_run_func( a.wait() if isinstance(a, AsyncCollectiveTensor) else a for a in flat_args ] - cpu_state = torch.get_rng_state() - devices, states = get_device_states((args, kwargs)) - + # NB: Before invoking an op we are collecting rng states from CPU and + # CUDA devices such that we can reset to the same before invoking op + # for each rank. This is not very efficient and will likely be revisited + # to support per rank rng state. + rng_state = _get_rng_state() flat_rank_rets = {} for r in sorted(ranks): - torch.set_rng_state(cpu_state) - set_device_states(devices, states) + _set_rng_state(*rng_state) rank_flat_args = [_map_to_rank_local_val(a, r) for a in flat_args] rank_args, rank_kwargs = pytree.tree_unflatten(rank_flat_args, args_spec) rank_ret = func(*rank_args, **rank_kwargs) @@ -704,6 +761,11 @@ class _LocalDeviceMesh: @staticmethod def get_coordinate(self: DeviceMesh) -> Optional[list[int] | None]: + # NB: In order to support submeshes the code below recreates for each + # rank submesh with the same mesh dimensions as current mesh. 
We are + # doing this because when submesh is created it is created for a particular + # rank (therefore below we are patching get_rank method). We are trying to + # limit the invasiveness of local tensor. lm = local_tensor_mode() assert lm is not None, "Unexpectedly not in LocalTensorMode" @@ -716,7 +778,9 @@ class _LocalDeviceMesh: coords[d][r] = c out = [torch.SymInt(LocalIntNode(c)) for c in coords] - + # The output contains coordinates for each of the ranks with respect to + # their meshes formed from root mesh and selecting the same dimensions + # as the current mesh. return out # type: ignore[return-value] @@ -794,8 +858,6 @@ def maybe_run_for_local_tensor(func: Callable[..., Any]) -> Callable[..., Any]: with lm.disable(): ret = _for_each_rank_run_func(func, lm.ranks, args, kwargs, alias=False) - lm = local_tensor_mode() - assert lm is not None return ret return wrapper diff --git a/torch/distributed/tensor/_ops/_embedding_ops.py b/torch/distributed/tensor/_ops/_embedding_ops.py index 445b1830defe..283cffb78efd 100644 --- a/torch/distributed/tensor/_ops/_embedding_ops.py +++ b/torch/distributed/tensor/_ops/_embedding_ops.py @@ -6,6 +6,7 @@ from typing import cast, Optional import torch import torch.distributed._functional_collectives as funcol +from torch.distributed._local_tensor import maybe_run_for_local_tensor from torch.distributed.device_mesh import DeviceMesh from torch.distributed.tensor._op_schema import ( OpSchema, @@ -83,20 +84,11 @@ class _MaskPartial(Partial): offset_shape: Optional[torch.Size] = None offset_dim: int = 0 - def _partition_value( - self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int - ) -> torch.Tensor: - # override parent logic to perform partial mask for embedding - num_chunks = mesh.size(mesh_dim) - # get local shard size and offset on the embedding_dim - assert self.offset_shape is not None, ( - "offset_shape needs to be set for _MaskPartial" - ) - local_shard_size, local_offset_on_dim = Shard.local_shard_size_and_offset( - self.offset_shape[self.offset_dim], - num_chunks, - mesh.get_local_rank(mesh_dim), - ) + @staticmethod + @maybe_run_for_local_tensor + def _mask_tensor( + tensor: torch.Tensor, local_offset_on_dim: int, local_shard_size: int + ) -> tuple[torch.Tensor, torch.Tensor]: # Build the input mask and save it for the current partial placement # this is so that the output of embedding op can reuse the same partial # placement saved mask to perform mask + reduction @@ -106,6 +98,27 @@ class _MaskPartial(Partial): # mask the input tensor masked_tensor = tensor.clone() - local_offset_on_dim masked_tensor[mask] = 0 + return mask, masked_tensor + + def _partition_value( + self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int + ) -> torch.Tensor: + my_coordinate = mesh.get_coordinate() + assert my_coordinate is not None, "my_coordinate should not be None" + # override parent logic to perform partial mask for embedding + num_chunks = mesh.size(mesh_dim) + # get local shard size and offset on the embedding_dim + assert self.offset_shape is not None, ( + "offset_shape needs to be set for _MaskPartial" + ) + local_shard_size, local_offset_on_dim = Shard.local_shard_size_and_offset( + self.offset_shape[self.offset_dim], + num_chunks, + my_coordinate[mesh_dim], + ) + mask, masked_tensor = _MaskPartial._mask_tensor( + tensor, local_offset_on_dim, local_shard_size + ) # materialize the mask buffer to be used for reduction self.mask_buffer.materialize_mask(mask) return masked_tensor diff --git a/torch/distributed/tensor/_sharding_prop.py 
b/torch/distributed/tensor/_sharding_prop.py index 4af72b4d3d8f..c1af2c131717 100644 --- a/torch/distributed/tensor/_sharding_prop.py +++ b/torch/distributed/tensor/_sharding_prop.py @@ -48,6 +48,9 @@ class LocalLRUCache(threading.local): def cache_info(self): return self.cache.cache_info() + def cache_clear(self): + return self.cache.cache_clear() + class ShardingPropagator: def __init__(self) -> None: diff --git a/torch/distributed/tensor/debug/__init__.py b/torch/distributed/tensor/debug/__init__.py index e5bf3b833fe4..a74f1449ad12 100644 --- a/torch/distributed/tensor/debug/__init__.py +++ b/torch/distributed/tensor/debug/__init__.py @@ -19,6 +19,17 @@ def _get_sharding_prop_cache_info(): ) +def _clear_sharding_prop_cache(): + """ + Clears the cache for the sharding propagation cache, used for debugging purpose only. + """ + from torch.distributed.tensor._api import DTensor + + return ( + DTensor._op_dispatcher.sharding_propagator.propagate_op_sharding.cache_clear() # type:ignore[attr-defined] + ) + + # Set namespace for exposed private names CommDebugMode.__module__ = "torch.distributed.tensor.debug" visualize_sharding.__module__ = "torch.distributed.tensor.debug" diff --git a/torch/distributed/tensor/placement_types.py b/torch/distributed/tensor/placement_types.py index 5f68ff03ee22..8930d3b1b29c 100644 --- a/torch/distributed/tensor/placement_types.py +++ b/torch/distributed/tensor/placement_types.py @@ -359,6 +359,16 @@ class Shard(Placement): return Shard._select_shard(shards, shard_index) + @staticmethod + @maybe_run_for_local_tensor + def _get_shard_pad_size( + full_size: int, local_tensor: torch.Tensor, dim: int + ) -> int: + """ + Get the padding size of the local tensor on the shard dimension. + """ + return full_size - local_tensor.size(dim) + def _to_new_shard_dim( self, local_tensor: torch.Tensor, @@ -387,14 +397,16 @@ class Shard(Placement): old_dim_full_chunk_size = ( old_dim_logical_size + num_chunks - 1 ) // num_chunks - old_dim_pad_size = old_dim_full_chunk_size - local_tensor.size(self.dim) + old_dim_pad_size = Shard._get_shard_pad_size( + old_dim_full_chunk_size, local_tensor, self.dim + ) local_tensor = pad_tensor(local_tensor, self.dim, old_dim_pad_size) if new_dim_padding: new_dim_full_chunk_size = ( new_dim_logical_size + num_chunks - 1 ) // num_chunks - new_dim_pad_size = new_dim_full_chunk_size * num_chunks - local_tensor.size( - new_shard_dim + new_dim_pad_size = Shard._get_shard_pad_size( + new_dim_full_chunk_size * num_chunks, local_tensor, new_shard_dim ) local_tensor = pad_tensor(local_tensor, new_shard_dim, new_dim_pad_size) diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index 89408b62c9aa..6cd372a8596c 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -211,6 +211,14 @@ def at_least_x_gpu(x): return False +def _maybe_handle_skip_if_lt_x_gpu(args, msg) -> bool: + _handle_test_skip = getattr(args[0], "_handle_test_skip", None) + if len(args) == 0 or _handle_test_skip is None: + return False + _handle_test_skip(msg) + return True + + def skip_if_lt_x_gpu(x): def decorator(func): @wraps(func) @@ -221,7 +229,9 @@ def skip_if_lt_x_gpu(x): return func(*args, **kwargs) if TEST_XPU and torch.xpu.device_count() >= x: return func(*args, **kwargs) - sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code) + test_skip = TEST_SKIPS[f"multi-gpu-{x}"] + if not _maybe_handle_skip_if_lt_x_gpu(args, test_skip.message): + sys.exit(test_skip.exit_code) return 
wrapper @@ -237,7 +247,9 @@ def nccl_skip_if_lt_x_gpu(backend, x): return func(*args, **kwargs) if torch.cuda.is_available() and torch.cuda.device_count() >= x: return func(*args, **kwargs) - sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code) + test_skip = TEST_SKIPS[f"multi-gpu-{x}"] + if not _maybe_handle_skip_if_lt_x_gpu(args, test_skip.message): + sys.exit(test_skip.exit_code) return wrapper diff --git a/torch/testing/_internal/distributed/_tensor/common_dtensor.py b/torch/testing/_internal/distributed/_tensor/common_dtensor.py index 1f982aa42074..22d6d8e7dede 100644 --- a/torch/testing/_internal/distributed/_tensor/common_dtensor.py +++ b/torch/testing/_internal/distributed/_tensor/common_dtensor.py @@ -701,6 +701,9 @@ class DTensorConverter: class LocalDTensorTestBase(DTensorTestBase): + def _handle_test_skip(self, msg: str) -> None: + self.skipTest(msg) + def _get_local_tensor_mode(self): return LocalTensorMode(frozenset(range(self.world_size))) From 3255e7872bc94d95c63db844f4279d50884741d7 Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Sun, 19 Oct 2025 00:59:28 +0000 Subject: [PATCH 119/123] Enable all flake8-logging-format rules (#164655) These rules are enabled by removing existing suppressions. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164655 Approved by: https://github.com/janeyx99, https://github.com/mlazos --- .ci/lumen_cli/cli/lib/common/git_helper.py | 4 ++-- .flake8 | 2 -- benchmarks/dynamo/common.py | 4 ++-- .../microbenchmarks/operator_inp_utils.py | 4 ++-- pyproject.toml | 2 -- test/test_quantization.py | 24 +++++++++---------- tools/linter/adapters/clangformat_linter.py | 2 +- tools/linter/adapters/flake8_linter.py | 2 +- tools/linter/adapters/ruff_linter.py | 2 +- tools/linter/adapters/s3_init.py | 4 ++-- tools/packaging/build_wheel.py | 2 +- torch/_dynamo/convert_frame.py | 2 +- torch/_dynamo/eval_frame.py | 6 +++-- torch/_dynamo/exc.py | 4 ++-- torch/_dynamo/graph_region_tracker.py | 2 +- torch/_dynamo/package.py | 4 ++-- torch/_dynamo/precompile_context.py | 2 +- torch/_dynamo/variables/builtin.py | 6 ++--- torch/_dynamo/variables/higher_order_ops.py | 2 +- .../_aot_autograd/autograd_cache.py | 8 +++---- torch/_inductor/codecache.py | 2 +- torch/_inductor/codegen/common.py | 2 +- torch/_inductor/codegen/cuda/cuda_env.py | 8 +++---- torch/_inductor/codegen/cuda/cutlass_cache.py | 6 ++--- torch/_inductor/codegen/cuda/cutlass_utils.py | 8 +++---- .../codegen/cutedsl/cutedsl_template.py | 4 ++-- .../rocm/ck_universal_gemm_template.py | 2 +- torch/_inductor/codegen/triton.py | 2 +- torch/_inductor/comm_analysis.py | 2 +- torch/_inductor/compile_fx_ext.py | 2 +- .../_inductor/compile_worker/subproc_pool.py | 4 ++-- torch/_inductor/fx_passes/numeric_utils.py | 2 +- torch/_inductor/memory.py | 8 +++---- .../runtime/coordinate_descent_tuner.py | 2 +- torch/_inductor/runtime/triton_heuristics.py | 8 +++---- torch/_inductor/scheduler.py | 8 +++---- torch/_inductor/select_algorithm.py | 15 ++++++------ torch/_inductor/triton_bundler.py | 6 ++--- torch/_library/fake_class_registry.py | 2 +- torch/_subclasses/fake_tensor.py | 2 +- .../_experimental/checkpoint_process.py | 12 ++++------ torch/distributed/distributed_c10d.py | 2 +- torch/distributed/elastic/agent/server/api.py | 2 +- .../elastic/multiprocessing/api.py | 12 ++++++---- .../elastic/multiprocessing/tail_log.py | 5 ++-- .../elastic/rendezvous/etcd_rendezvous.py | 6 ++--- .../elastic/rendezvous/etcd_server.py | 2 +- torch/distributed/pipelining/schedules.py | 2 +- torch/distributed/rpc/api.py | 8 
+++---- torch/export/__init__.py | 4 ++-- torch/export/pt2_archive/_package.py | 4 ++-- torch/fx/experimental/symbolic_shapes.py | 4 ++-- .../onnx/_internal/exporter/_registration.py | 2 +- .../onnx/_internal/exporter/_verification.py | 7 ++---- torch/testing/_internal/common_distributed.py | 16 ++++++------- 55 files changed, 131 insertions(+), 140 deletions(-) diff --git a/.ci/lumen_cli/cli/lib/common/git_helper.py b/.ci/lumen_cli/cli/lib/common/git_helper.py index 9833caca956c..c4d6f8a0b6f5 100644 --- a/.ci/lumen_cli/cli/lib/common/git_helper.py +++ b/.ci/lumen_cli/cli/lib/common/git_helper.py @@ -57,8 +57,8 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules logger.info("Successfully cloned %s", target) return r, commit - except GitCommandError as e: - logger.error("Git operation failed: %s", e) + except GitCommandError: + logger.exception("Git operation failed") raise diff --git a/.flake8 b/.flake8 index 2be8eab0dc83..aff8849fa6d4 100644 --- a/.flake8 +++ b/.flake8 @@ -13,8 +13,6 @@ ignore = EXE001, # these ignores are from flake8-bugbear; please fix! B007,B008,B017,B019,B023,B028,B903,B905,B906,B907,B908,B910 - # these ignores are from flake8-logging-format; please fix! - G100,G101,G200 # these ignores are from flake8-simplify. please fix or ignore with commented reason SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12, # SIM104 is already covered by pyupgrade ruff diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index f3b75e9f72ea..54900de1ed91 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -1751,8 +1751,8 @@ def maybe_snapshot_memory(should_snapshot_memory, suffix): f"{output_filename.rstrip('.csv')}_{suffix}.pickle", ) ) - except Exception as e: - log.error("Failed to save memory snapshot, %s", e) + except Exception: + log.exception("Failed to save memory snapshot") torch.cuda.memory._record_memory_history(enabled=None) diff --git a/benchmarks/dynamo/microbenchmarks/operator_inp_utils.py b/benchmarks/dynamo/microbenchmarks/operator_inp_utils.py index f1f9ea9b30ba..8a6978dd448b 100644 --- a/benchmarks/dynamo/microbenchmarks/operator_inp_utils.py +++ b/benchmarks/dynamo/microbenchmarks/operator_inp_utils.py @@ -296,8 +296,8 @@ class OperatorInputsLoader: for key in self.operator_db.keys(): try: op = eval(key) - except AttributeError as ae: - log.warning("Evaluating an op name into an OpOverload: %s", ae) + except AttributeError: + log.warning("Evaluating an op name into an OpOverload", exc_info=True) continue yield op diff --git a/pyproject.toml b/pyproject.toml index f18368b90d8d..5bb7f301b8a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -159,8 +159,6 @@ ignore = [ "EXE001", "F405", "FURB122", # writelines - # these ignores are from flake8-logging-format; please fix! - "G101", # these ignores are from ruff NPY; please fix! "NPY002", # these ignores are from ruff PERF; please fix! diff --git a/test/test_quantization.py b/test/test_quantization.py index 6d72da3279e1..01006e3f6e22 100644 --- a/test/test_quantization.py +++ b/test/test_quantization.py @@ -72,7 +72,7 @@ try: except ImportError as e: # In FBCode we separate FX out into a separate target for the sake of dev # velocity. These are covered by a separate test target `quantization_fx` - log.warning(e) + log.warning(e) # noqa:G200 # PyTorch 2 Export Quantization try: @@ -94,7 +94,7 @@ try: except ImportError as e: # In FBCode we separate PT2 out into a separate target for the sake of dev # velocity. 
These are covered by a separate test target `quantization_pt2e` - log.warning(e) + log.warning(e) # noqa:G200 try: from quantization.fx.test_numeric_suite_fx import TestFXGraphMatcher # noqa: F401 @@ -103,7 +103,7 @@ try: from quantization.fx.test_numeric_suite_fx import TestFXNumericSuiteNShadows # noqa: F401 from quantization.fx.test_numeric_suite_fx import TestFXNumericSuiteCoreAPIsModels # noqa: F401 except ImportError as e: - log.warning(e) + log.warning(e) # noqa:G200 # Test the model report module try: @@ -115,19 +115,19 @@ try: from quantization.fx.test_model_report_fx import TestFxDetectOutliers # noqa: F401 from quantization.fx.test_model_report_fx import TestFxModelReportVisualizer # noqa: F401 except ImportError as e: - log.warning(e) + log.warning(e) # noqa:G200 # Equalization for FX mode try: from quantization.fx.test_equalize_fx import TestEqualizeFx # noqa: F401 except ImportError as e: - log.warning(e) + log.warning(e) # noqa:G200 # Backward Compatibility. Tests serialization and BC for quantized modules. try: from quantization.bc.test_backward_compatibility import TestSerialization # noqa: F401 except ImportError as e: - log.warning(e) + log.warning(e) # noqa:G200 # JIT Graph Mode Quantization from quantization.jit.test_quantize_jit import TestQuantizeJit # noqa: F401 @@ -146,29 +146,29 @@ from quantization.ao_migration.test_ao_migration import TestAOMigrationNNIntrins try: from quantization.ao_migration.test_quantization_fx import TestAOMigrationQuantizationFx # noqa: F401 except ImportError as e: - log.warning(e) + log.warning(e) # noqa:G200 # Experimental functionality try: from quantization.core.experimental.test_bits import TestBitsCPU # noqa: F401 except ImportError as e: - log.warning(e) + log.warning(e) # noqa:G200 try: from quantization.core.experimental.test_bits import TestBitsCUDA # noqa: F401 except ImportError as e: - log.warning(e) + log.warning(e) # noqa:G200 try: from quantization.core.experimental.test_floatx import TestFloat8DtypeCPU # noqa: F401 except ImportError as e: - log.warning(e) + log.warning(e) # noqa:G200 try: from quantization.core.experimental.test_floatx import TestFloat8DtypeCUDA # noqa: F401 except ImportError as e: - log.warning(e) + log.warning(e) # noqa:G200 try: from quantization.core.experimental.test_floatx import TestFloat8DtypeCPUOnlyCPU # noqa: F401 except ImportError as e: - log.warning(e) + log.warning(e) # noqa:G200 if __name__ == '__main__': run_tests() diff --git a/tools/linter/adapters/clangformat_linter.py b/tools/linter/adapters/clangformat_linter.py index 9289dcd6375f..0d82ddd939b1 100644 --- a/tools/linter/adapters/clangformat_linter.py +++ b/tools/linter/adapters/clangformat_linter.py @@ -73,7 +73,7 @@ def run_command( if remaining_retries == 0: raise err remaining_retries -= 1 - logging.warning( + logging.warning( # noqa: G200 "(%s/%s) Retrying because command failed with: %r", retries - remaining_retries, retries, diff --git a/tools/linter/adapters/flake8_linter.py b/tools/linter/adapters/flake8_linter.py index 0bc522821cab..d51ef09fec75 100644 --- a/tools/linter/adapters/flake8_linter.py +++ b/tools/linter/adapters/flake8_linter.py @@ -172,7 +172,7 @@ def run_command( ): raise err remaining_retries -= 1 - logging.warning( + logging.warning( # noqa: G200 "(%s/%s) Retrying because command failed with: %r", retries - remaining_retries, retries, diff --git a/tools/linter/adapters/ruff_linter.py b/tools/linter/adapters/ruff_linter.py index d8120461b13b..28feae002f36 100644 --- a/tools/linter/adapters/ruff_linter.py +++ 
b/tools/linter/adapters/ruff_linter.py @@ -112,7 +112,7 @@ def run_command( if remaining_retries == 0: raise err remaining_retries -= 1 - logging.warning( + logging.warning( # noqa: G200 "(%s/%s) Retrying because command failed with: %r", retries - remaining_retries, retries, diff --git a/tools/linter/adapters/s3_init.py b/tools/linter/adapters/s3_init.py index b33497d2ce6a..154e3d56ad26 100644 --- a/tools/linter/adapters/s3_init.py +++ b/tools/linter/adapters/s3_init.py @@ -95,8 +95,8 @@ Deleting %s just to be safe. try: binary_path.unlink() - except OSError as e: - logging.critical("Failed to delete binary: %s", e) + except OSError: + logging.critical("Failed to delete binary", exc_info=True) logging.critical( "Delete this binary as soon as possible and do not execute it!" ) diff --git a/tools/packaging/build_wheel.py b/tools/packaging/build_wheel.py index dad2d8084967..5f6f262ab820 100644 --- a/tools/packaging/build_wheel.py +++ b/tools/packaging/build_wheel.py @@ -114,7 +114,7 @@ def _find_manylinux_interpreters() -> list[str]: ) except subprocess.CalledProcessError as e: - logger.debug("Failed to get version for %s: %s", python_path, e) + logger.debug("Failed to get version for %s: %s", python_path, e) # noqa:G200 continue return interpreters diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py index 6f87d1cd445e..e1b4e051672e 100644 --- a/torch/_dynamo/convert_frame.py +++ b/torch/_dynamo/convert_frame.py @@ -1215,7 +1215,7 @@ def compile_frame( # type: ignore[return] except exc.SkipFrame as e: if not isinstance(e, exc.TensorifyScalarRestartAnalysis): TensorifyState.clear() - log.debug( + log.debug( # noqa: G200 "Skipping frame %s %s \ %s %s", e, diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py index 451776ef25fd..f0b32976e5be 100644 --- a/torch/_dynamo/eval_frame.py +++ b/torch/_dynamo/eval_frame.py @@ -753,8 +753,10 @@ class _TorchDynamoContext: fn, result.dynamo, ignore_inlined_sources=False ) self._package.install(result.backends) - except RuntimeError as e: - log.warning("Failed to load entry from dynamo cache: %s", e) + except RuntimeError: + log.warning( + "Failed to load entry from dynamo cache", exc_info=True + ) self._package.initialize(fn, None, ignore_inlined_sources=False) fn = innermost_fn(fn) diff --git a/torch/_dynamo/exc.py b/torch/_dynamo/exc.py index 2667bee7aacb..295fed5618ea 100644 --- a/torch/_dynamo/exc.py +++ b/torch/_dynamo/exc.py @@ -532,8 +532,8 @@ def _load_gb_type_to_gb_id_map() -> dict[str, Any]: ) with open(registry_path) as f: registry = json.load(f) - except Exception as e: - log.error("Error accessing the registry file: %s", e) + except Exception: + log.exception("Error accessing the registry file") registry = {} mapping = {} diff --git a/torch/_dynamo/graph_region_tracker.py b/torch/_dynamo/graph_region_tracker.py index 19211bd4491b..5fcf4e83cacb 100644 --- a/torch/_dynamo/graph_region_tracker.py +++ b/torch/_dynamo/graph_region_tracker.py @@ -269,7 +269,7 @@ class GraphRegionTracker: duplicates.append(node) self.node_to_duplicates[node] = duplicates except NodeHashException as e: - log.debug("Unable to hash node %s with exception %s", node, e) + log.debug("Unable to hash node %s with exception %s", node, e) # noqa: G200 def track_node_mutations( self, diff --git a/torch/_dynamo/package.py b/torch/_dynamo/package.py index 9c5dec0a98f9..b61728d03f05 100644 --- a/torch/_dynamo/package.py +++ b/torch/_dynamo/package.py @@ -1122,9 +1122,9 @@ class DiskDynamoCache(DiskDynamoStore): result = 
super().load_cache_entry(key) counters["dynamo_cache"]["dynamo_cache_hit"] += 1 return result - except Exception as e: + except Exception: counters["dynamo_cache"]["dynamo_cache_error"] += 1 - logger.warning("Failed to load package from path %s: %s", path, str(e)) + logger.warning("Failed to load package from path %s", exc_info=True) return None logger.info("No package found for %s", key) counters["dynamo_cache"]["dynamo_cache_miss"] += 1 diff --git a/torch/_dynamo/precompile_context.py b/torch/_dynamo/precompile_context.py index d3b2c7df1f47..65ceab92262c 100644 --- a/torch/_dynamo/precompile_context.py +++ b/torch/_dynamo/precompile_context.py @@ -203,7 +203,7 @@ class PrecompileContext: if result is not None: precompile_cache_entries[key] = result except Exception as e: - logger.warning("Failed to create cache entry %s: %s", key, str(e)) + logger.warning("Failed to create cache entry %s", key, exc_info=True) error = e data = json.dumps( diff --git a/torch/_dynamo/variables/builtin.py b/torch/_dynamo/variables/builtin.py index 09bdb81150e6..24136b5ddad6 100644 --- a/torch/_dynamo/variables/builtin.py +++ b/torch/_dynamo/variables/builtin.py @@ -1041,7 +1041,7 @@ class BuiltinVariable(VariableTracker): except TypeError as e: has_constant_handler = obj.has_constant_handler(args, kwargs) if not has_constant_handler: - log.warning( + log.warning( # noqa: G200 "incorrect arg count %s %s and no constant handler", self_handler, e, @@ -1560,9 +1560,9 @@ class BuiltinVariable(VariableTracker): try: # Only supports certain function types user_func_variable = variables.UserFunctionVariable(bound_method) - except AssertionError as e: + except AssertionError: # Won't be able to do inline the str method, return to avoid graph break - log.warning("Failed to create UserFunctionVariable: %s", e) + log.warning("Failed to create UserFunctionVariable", exc_info=True) return # Inline the user function diff --git a/torch/_dynamo/variables/higher_order_ops.py b/torch/_dynamo/variables/higher_order_ops.py index 956eb4676018..753b0a5414f0 100644 --- a/torch/_dynamo/variables/higher_order_ops.py +++ b/torch/_dynamo/variables/higher_order_ops.py @@ -1183,7 +1183,7 @@ def speculate_subgraph( f"fall back to eager-mode PyTorch, which could lead to a slowdown." 
) log.info(msg) - log.info(ex) + log.info(ex) # noqa: G200 raise ex diff --git a/torch/_functorch/_aot_autograd/autograd_cache.py b/torch/_functorch/_aot_autograd/autograd_cache.py index 0ac2407269ac..47506aff1ef2 100644 --- a/torch/_functorch/_aot_autograd/autograd_cache.py +++ b/torch/_functorch/_aot_autograd/autograd_cache.py @@ -1221,7 +1221,7 @@ class AOTAutogradCache(GuardedCache[GenericAOTAutogradCacheEntry]): except Exception as e: cache_key = None counters["aot_autograd"]["autograd_cache_bypass"] += 1 - log.info("Bypassing autograd cache due to: %s", e) + log.info("Bypassing autograd cache due to: %s", e) # noqa: G200 cache_state = "bypass" cache_event_time = time.time_ns() cache_info["cache_bypass_reason"] = str(e) @@ -1368,7 +1368,7 @@ class AOTAutogradCache(GuardedCache[GenericAOTAutogradCacheEntry]): ), ) except Exception as e: - log.info("AOTAutograd cache unable to load compiled graph: %s", e) + log.info("AOTAutograd cache unable to load compiled graph: %s", e) # noqa: G200 if config.strict_autograd_cache: raise e if entry is not None: @@ -1414,12 +1414,12 @@ class AOTAutogradCache(GuardedCache[GenericAOTAutogradCacheEntry]): counters["aot_autograd"]["autograd_cache_saved"] += 1 except BypassAOTAutogradCache as e: counters["aot_autograd"]["autograd_cache_bypass"] += 1 - log.info("Bypassing autograd cache due to: %s", e) + log.info("Bypassing autograd cache due to: %s", e) # noqa: G200 if remote: log_cache_bypass("bypass_aot_autograd", str(e)) return None except Exception as e: - log.info("AOTAutograd cache unable to serialize compiled graph: %s", e) + log.info("AOTAutograd cache unable to serialize compiled graph: %s", e) # noqa: G200 if remote: log_cache_bypass( "bypass_aot_autograd", "Unable to serialize: " + str(e) diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py index 5cc178db2fc3..3ead901e1a36 100644 --- a/torch/_inductor/codecache.py +++ b/torch/_inductor/codecache.py @@ -1516,7 +1516,7 @@ class FxGraphCache(GuardedCache[CompiledFxGraph]): ) except BypassFxGraphCache as e: counters["inductor"]["fxgraph_cache_bypass"] += 1 - log.info("Bypassing FX Graph Cache because '%s'", e) + log.info("Bypassing FX Graph Cache because '%s'", e) # noqa: G200 if remote: log_cache_bypass("bypass_fx_graph", str(e)) cache_info = { diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py index 743baec01dfa..5a953f80a1a2 100644 --- a/torch/_inductor/codegen/common.py +++ b/torch/_inductor/codegen/common.py @@ -2493,7 +2493,7 @@ class KernelTemplate: choices.append(self.generate(**kwargs)) return None except NotImplementedError as e: - log.info( + log.info( # noqa: G200 "Cannot Append Choice: %s. 
KernelTemplate type is %s", e, type(self), diff --git a/torch/_inductor/codegen/cuda/cuda_env.py b/torch/_inductor/codegen/cuda/cuda_env.py index 3eb65273285e..9ca3afbd9ca5 100644 --- a/torch/_inductor/codegen/cuda/cuda_env.py +++ b/torch/_inductor/codegen/cuda/cuda_env.py @@ -22,8 +22,8 @@ def get_cuda_arch() -> Optional[str]: major, minor = torch.cuda.get_device_capability(0) return str(major * 10 + minor) return str(cuda_arch) - except Exception as e: - log.error("Error getting cuda arch: %s", e) + except Exception: + log.exception("Error getting cuda arch") return None @@ -45,8 +45,8 @@ def get_cuda_version() -> Optional[str]: if cuda_version is None: cuda_version = torch.version.cuda return cuda_version - except Exception as e: - log.error("Error getting cuda version: %s", e) + except Exception: + log.exception("Error getting cuda version") return None diff --git a/torch/_inductor/codegen/cuda/cutlass_cache.py b/torch/_inductor/codegen/cuda/cutlass_cache.py index 519125888c16..66db98867b41 100644 --- a/torch/_inductor/codegen/cuda/cutlass_cache.py +++ b/torch/_inductor/codegen/cuda/cutlass_cache.py @@ -94,11 +94,11 @@ def maybe_fetch_ops() -> Optional[list[Any]]: assert isinstance(serialized_ops, list), ( f"Expected serialized ops is a list, got {type(serialized_ops)}" ) - except Exception as e: + except Exception: log.warning( - "Failed to load CUTLASS config %s from local cache: %s", + "Failed to load CUTLASS config %s from local cache", filename, - e, + exc_info=True, ) serialized_ops = None elif config.is_fbcode(): diff --git a/torch/_inductor/codegen/cuda/cutlass_utils.py b/torch/_inductor/codegen/cuda/cutlass_utils.py index 2f673e92e24b..be812347188b 100644 --- a/torch/_inductor/codegen/cuda/cutlass_utils.py +++ b/torch/_inductor/codegen/cuda/cutlass_utils.py @@ -53,8 +53,8 @@ def move_cutlass_compiled_cache() -> None: filename = os.path.basename(cutlass_cppgen.CACHE_FILE) shutil.move(cutlass_cppgen.CACHE_FILE, os.path.join(cache_dir(), filename)) log.debug("Moved CUTLASS compiled cache file to %s", cache_dir()) - except OSError as e: - log.warning("Failed to move CUTLASS compiled cache file: %s", e) + except OSError: + log.warning("Failed to move CUTLASS compiled cache file", exc_info=True) def _rename_cutlass_import(content: str, cutlass_modules: list[str]) -> str: @@ -79,7 +79,7 @@ def try_import_cutlass() -> bool: import cutlass_cppgen # type: ignore[import-not-found] # noqa: F401 import cutlass_library # type: ignore[import-not-found] except ImportError as e: - log.warning( + log.warning( # noqa: G200 "Failed to import CUTLASS packages in fbcode: %s, ignoring the CUTLASS backend.", str(e), ) @@ -164,7 +164,7 @@ def try_import_cutlass() -> bool: return True except ImportError as e: - log.debug( + log.debug( # noqa: G200 "Failed to import CUTLASS packages: %s, ignoring the CUTLASS backend.", str(e), ) diff --git a/torch/_inductor/codegen/cutedsl/cutedsl_template.py b/torch/_inductor/codegen/cutedsl/cutedsl_template.py index 016edb63a352..31ff7e43afc5 100644 --- a/torch/_inductor/codegen/cutedsl/cutedsl_template.py +++ b/torch/_inductor/codegen/cutedsl/cutedsl_template.py @@ -58,10 +58,10 @@ class CuteDSLTemplate(KernelTemplate): choices.append(self.generate(**kwargs)) return None except NotImplementedError as e: - log.debug("CuteDSL template choice generation failed: %s", e) + log.debug("CuteDSL template choice generation failed: %s", e) # noqa: G200 return e except Exception as e: - log.debug("CuteDSL template choice generation error: %s", e) + log.debug("CuteDSL template 
choice generation error: %s", e) # noqa: G200 return NotImplementedError(f"CuteDSL template failed: {e}") def generate(self, **kwargs: Any) -> ChoiceCaller: diff --git a/torch/_inductor/codegen/rocm/ck_universal_gemm_template.py b/torch/_inductor/codegen/rocm/ck_universal_gemm_template.py index db2bd69b1d09..8357e9fba774 100644 --- a/torch/_inductor/codegen/rocm/ck_universal_gemm_template.py +++ b/torch/_inductor/codegen/rocm/ck_universal_gemm_template.py @@ -510,7 +510,7 @@ class CKGemmTemplate(CKTemplate): torch.cuda.get_device_properties(X_meta.device).warp_size, ) except Exception as e: - log.debug( + log.debug( # noqa: G200 "Failed to prefetch_stages for %s with exception %s", op.name, e ) # be conservative here and disable the op diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index e8d7996460fe..cc938de0ca22 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -5638,7 +5638,7 @@ class TritonScheduling(SIMDScheduling): except Exception as e: if config.triton.disallow_failing_autotune_kernels_TESTING_ONLY: raise - log.debug( + log.debug( # noqa: G200 "Exception (%s) in compiling fused nodes %s", e, node_names, diff --git a/torch/_inductor/comm_analysis.py b/torch/_inductor/comm_analysis.py index 2bf9ff39f81f..51c5472c7fe3 100644 --- a/torch/_inductor/comm_analysis.py +++ b/torch/_inductor/comm_analysis.py @@ -204,7 +204,7 @@ def estimate_nccl_collective_runtime_nccl_estimator(snode) -> Optional[float]: torch.ops._c10d_functional.wait_tensor.default(w) except Exception as e: # NCCL estimator can fail - log.info(e) + log.info(e) # noqa: G200 return None est_time_us = time_estimator.estimated_time diff --git a/torch/_inductor/compile_fx_ext.py b/torch/_inductor/compile_fx_ext.py index 743819af7e67..113a7c92606d 100644 --- a/torch/_inductor/compile_fx_ext.py +++ b/torch/_inductor/compile_fx_ext.py @@ -445,7 +445,7 @@ class _SerializedFxCompile(FxCompile): # we can't cache (or serialize) FxGraphCache._check_for_hop(gm) except BypassFxGraphCache as e: - log.debug("Skipping %s compile: %s", type(self), e) + log.debug("Skipping %s compile: %s", type(self), e) # noqa: G200 return None context = torch._guards.TracingContext.try_get() diff --git a/torch/_inductor/compile_worker/subproc_pool.py b/torch/_inductor/compile_worker/subproc_pool.py index 474cd86eb362..c6b094cc52c6 100644 --- a/torch/_inductor/compile_worker/subproc_pool.py +++ b/torch/_inductor/compile_worker/subproc_pool.py @@ -284,8 +284,8 @@ class SubprocPool: self.process.wait(300) if self.log_file: self.log_file.close() - except OSError as e: - log.warning("Ignored OSError in pool shutdown: %s", e) + except OSError: + log.warning("Ignored OSError in pool shutdown", exc_info=True) finally: with self.futures_lock: for future in self.pending_futures.values(): diff --git a/torch/_inductor/fx_passes/numeric_utils.py b/torch/_inductor/fx_passes/numeric_utils.py index d5b140b49d20..b50859448f07 100644 --- a/torch/_inductor/fx_passes/numeric_utils.py +++ b/torch/_inductor/fx_passes/numeric_utils.py @@ -207,7 +207,7 @@ def numeric_check_if_enabled( precision=precision, ) except Exception as e: - logger.warning( + logger.warning( # noqa: G200 "Runtime numeric check failed in pre grad fx passes with error: %s", e ) traceback.print_exc() diff --git a/torch/_inductor/memory.py b/torch/_inductor/memory.py index 1a02dbb1e6af..a8df2fe55987 100644 --- a/torch/_inductor/memory.py +++ b/torch/_inductor/memory.py @@ -913,8 +913,8 @@ def reorder_for_peak_memory( try: 
validate_graph_acyclic(nodes) validate_unique_buffer_names(nodes, name_to_buf, name_to_freeable_input_buf) - except RuntimeError as e: - torch_log.error("Memory planning validation failed: %s", e) + except RuntimeError: + torch_log.exception("Memory planning validation failed") if not is_fbcode(): # TODO: remove after ensuring OSS side is safe raise @@ -942,8 +942,8 @@ def reorder_for_peak_memory( PeakMemoryResult(order, peak_memory, method.__name__) ) torch_log.info("%s peak memory: %d", method.__name__, peak_memory) - except Exception as e: - torch_log.error("Failed to reorder for %s: %s", method.__name__, e) + except Exception: + torch_log.exception("Failed to reorder for %s", method.__name__) if not is_fbcode(): # TODO: remove after ensuring OSS side is safe raise diff --git a/torch/_inductor/runtime/coordinate_descent_tuner.py b/torch/_inductor/runtime/coordinate_descent_tuner.py index faa2b06bcaf1..68db68ca11c7 100644 --- a/torch/_inductor/runtime/coordinate_descent_tuner.py +++ b/torch/_inductor/runtime/coordinate_descent_tuner.py @@ -238,7 +238,7 @@ class CoordescTuner: try: candidate_timing = self.call_func(func, candidate_config) except Exception as e: - log.debug("Got exception %s", e) + log.debug("Got exception %s", e) # noqa: G200 return False, float("inf") if self.has_improvement(best_timing, candidate_timing): diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index b49b9ac54228..f809d9f7d50a 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -1618,7 +1618,7 @@ class StaticTritonCompileResult(CompileResult[StaticallyLaunchedCudaKernel]): result = check_can_launch() return result except CannotStaticallyLaunchKernel as e: - log.info("Bypassing StaticallyLaunchedCudaKernel due to %s", str(e)) + log.info("Bypassing StaticallyLaunchedCudaKernel due to %s", str(e)) # noqa: G200 if torch._inductor.config.strict_static_cuda_launcher: raise e return None @@ -1997,11 +1997,11 @@ def end_graph(output_file): ) file.write(bw_info_str + "\n") file.write(f"{summary_str}\n\n") - except Exception as e: + except Exception: log.warning( - "failed to write profile bandwidth result into %s: %s", + "failed to write profile bandwidth result into %s", output_file, - e, + exc_info=True, ) diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py index d76036d3859b..d68ce41251f9 100644 --- a/torch/_inductor/scheduler.py +++ b/torch/_inductor/scheduler.py @@ -896,11 +896,11 @@ class BaseSchedulerNode: except ValueError as e: # We don't know how to estimate runtime for this collective, # falling back to 0 - log.info(e) + log.info(e) # noqa: G200 return 0 except TypeError as e: # this happens when the collective is not of type ir._CollectiveKernel - log.info(e) + log.info(e) # noqa: G200 return 0 elif is_wait(self.node): @@ -3366,7 +3366,7 @@ class Scheduler: future.result() except Exception as e: if fusion_log.isEnabledFor(logging.DEBUG): - fusion_log.debug( + fusion_log.debug( # noqa: G200 "Exception in compiling %s: %s", "prologue" if not epilogue_fusion else "epilogue", str(e), @@ -3442,7 +3442,7 @@ class Scheduler: # triton will unpredictably error with valid prologue fusions except Exception as e: if fusion_log.isEnabledFor(logging.DEBUG): - fusion_log.debug( + fusion_log.debug( # noqa: G200 "Exception in compiling %s: %s", "prologue" if not epilogue_fusion else "epilogue", str(e), diff --git a/torch/_inductor/select_algorithm.py b/torch/_inductor/select_algorithm.py 
index b0e81444ad84..24fd3ccbfe10 100644 --- a/torch/_inductor/select_algorithm.py +++ b/torch/_inductor/select_algorithm.py @@ -1702,7 +1702,7 @@ class TritonTemplate(KernelTemplate): choices.append(choice) return None except NotImplementedError as e: - log.info( + log.info( # noqa: G200 "Cannot Append Choice: %s. KernelTemplate type is %s", e, type(self), @@ -3223,17 +3223,16 @@ class AlgorithmSelectorCache(PersistentCache): for choice in choices: try: timing = cls.benchmark_choice(choice, autotune_args) - except CUDACompileError as e: + except CUDACompileError: from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller if not isinstance(choice, CUDATemplateCaller): - log.error( - "CUDA compilation error during autotuning: \n%s. \nIgnoring this choice.", - e, + log.exception( + "CUDA compilation error during autotuning: \n%s. \nIgnoring this choice." ) timing = float("inf") - except NotImplementedError as e: - log.warning("Not yet implemented: %s", e) + except NotImplementedError: + log.warning("Not yet implemented", exc_info=True) timing = float("inf") except RuntimeError as e: from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller @@ -3266,7 +3265,7 @@ class AlgorithmSelectorCache(PersistentCache): from triton.runtime.autotuner import OutOfResources if isinstance(e, OutOfResources): - log.warning(e) + log.warning(e) # noqa: G200 timing = float("inf") else: raise e diff --git a/torch/_inductor/triton_bundler.py b/torch/_inductor/triton_bundler.py index b210dbff5c84..5bf5210a2cf4 100644 --- a/torch/_inductor/triton_bundler.py +++ b/torch/_inductor/triton_bundler.py @@ -224,11 +224,11 @@ class TritonBundler: # Make sure the cubin path exists and is valid for compile_result in result.kernel.compile_results: compile_result.reload_cubin_path() - except RuntimeError as e: + except RuntimeError: log.warning( - "Failed to reload cubin file statically launchable autotuner %s: %s", + "Failed to reload cubin file statically launchable autotuner %s", result.kernel_name, - e, + exc_info=True, ) continue # We make a future instead of returning the kernel here so that diff --git a/torch/_library/fake_class_registry.py b/torch/_library/fake_class_registry.py index 1902eafc0a48..b98949b388a9 100644 --- a/torch/_library/fake_class_registry.py +++ b/torch/_library/fake_class_registry.py @@ -21,7 +21,7 @@ class FakeScriptObject: with _disable_current_modes(): self.real_obj = copy.deepcopy(x) except RuntimeError as e: - log.warning( + log.warning( # noqa: G200 "Unable to deepcopy the custom object %s due to %s. " "Defaulting to the user given object. This might be " "dangerous as side effects may be directly applied " diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py index 31d129a3c861..3c2d609b7367 100644 --- a/torch/_subclasses/fake_tensor.py +++ b/torch/_subclasses/fake_tensor.py @@ -2568,7 +2568,7 @@ class FakeTensorMode(TorchDispatchMode): # we shouldn't broadly catch all errors here; # some come from real-kernel mutation/aliasing checks we want to run. # add more exception types as needed. 
- log.debug( + log.debug( # noqa: G200 "real-tensor fallback failed for %s: %s; silently ignoring", func, exc, diff --git a/torch/distributed/checkpoint/_experimental/checkpoint_process.py b/torch/distributed/checkpoint/_experimental/checkpoint_process.py index 4e1c8e7f8253..5fde55053eed 100644 --- a/torch/distributed/checkpoint/_experimental/checkpoint_process.py +++ b/torch/distributed/checkpoint/_experimental/checkpoint_process.py @@ -224,7 +224,7 @@ class CheckpointProcess: ) ) parent_pipe.close() - logger.error("Subprocess terminated due to exception: %s", e) + logger.exception("Subprocess terminated due to exception") def _send(self, request_type: RequestType, payload: dict[str, Any]) -> None: try: @@ -238,8 +238,8 @@ class CheckpointProcess: ) except OSError as e: error_msg = "Child process terminated unexpectedly" - logger.error( - "Communication failed during %s request: %s", request_type.value, e + logger.exception( + "Communication failed during %s request", request_type.value ) raise RuntimeError(error_msg) from e @@ -354,10 +354,8 @@ class CheckpointProcess: ) self.process.processes[0].kill() logger.info("Subprocess killed forcefully") - except ProcessExitedException as e: - logger.error( - "ProcessExitedException during subprocess termination: %s", e - ) + except ProcessExitedException: + logger.exception("ProcessExitedException during subprocess termination") raise logger.debug("CheckpointProcess closed successfully") diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 2419e5aecca3..c39847176517 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -972,7 +972,7 @@ def _store_based_barrier( except RuntimeError as e: worker_count = store.add(store_key, 0) # Print status periodically to keep track. 
-            logger.debug(
+            logger.debug(  # noqa: G200
                 "Waiting in store based barrier to initialize process group for %s seconds"
                 "rank: %s, key: %s (world_size=%s, num_workers_joined=%s, timeout=%s error=%s)",
                 time.time() - start,
diff --git a/torch/distributed/elastic/agent/server/api.py b/torch/distributed/elastic/agent/server/api.py
index f0fc50dd70b9..b02095304391 100644
--- a/torch/distributed/elastic/agent/server/api.py
+++ b/torch/distributed/elastic/agent/server/api.py
@@ -721,7 +721,7 @@ class SimpleElasticAgent(ElasticAgent):
             self._record_worker_events(result)
             return result
         except RendezvousGracefulExitError as e:
-            logger.info("Rendezvous gracefully exited: %s", e)
+            logger.info("Rendezvous gracefully exited: %s", e)  # noqa: G200
         except SignalException as e:
             logger.warning("Received %s death signal, shutting down workers", e.sigval)
             self._shutdown(e.sigval)
diff --git a/torch/distributed/elastic/multiprocessing/api.py b/torch/distributed/elastic/multiprocessing/api.py
index 9bb580c5bf78..ede23f8b801c 100644
--- a/torch/distributed/elastic/multiprocessing/api.py
+++ b/torch/distributed/elastic/multiprocessing/api.py
@@ -489,11 +489,13 @@ class PContext(abc.ABC):
                     sig = getattr(signal, sig_name.strip())
                     signal.signal(sig, _terminate_process_handler)
                     logger.info("Registered signal handler for %s", sig_name)
-                except (AttributeError, ValueError) as e:
+                except (AttributeError, ValueError):
                     logger.warning(
-                        "Failed to register signal handler for %s: %s", sig_name, e
+                        "Failed to register signal handler for %s",
+                        sig_name,
+                        exc_info=True,
                     )
-                except RuntimeError as e:
+                except RuntimeError:
                     if IS_WINDOWS and sig_name.strip() in [
                         "SIGHUP",
                         "SIGQUIT",
@@ -505,7 +507,9 @@
                         )
                     else:
                         logger.warning(
-                            "Failed to register signal handler for %s: %s", sig_name, e
+                            "Failed to register signal handler for %s",
+                            sig_name,
+                            exc_info=True,
                         )
         else:
             logger.warning(
diff --git a/torch/distributed/elastic/multiprocessing/tail_log.py b/torch/distributed/elastic/multiprocessing/tail_log.py
index 9ff628157361..2aa73dc19dd6 100644
--- a/torch/distributed/elastic/multiprocessing/tail_log.py
+++ b/torch/distributed/elastic/multiprocessing/tail_log.py
@@ -142,12 +142,11 @@ class TailLog:
             try:
                 f.result()
             except Exception as e:
-                logger.error(
-                    "error in log tailor for %s%s. %s: %s",
+                logger.exception(
+                    "error in log tailor for %s%s. %s",
                     self._name,
                     local_rank,
                     e.__class__.__qualname__,
-                    e,
                 )

         if self._threadpool:
diff --git a/torch/distributed/elastic/rendezvous/etcd_rendezvous.py b/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
index 0e4da86d4621..300399414d9c 100644
--- a/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
+++ b/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
@@ -208,8 +208,8 @@ class EtcdRendezvousHandler(RendezvousHandler):
         try:
             self.set_closed()
             return True
-        except BaseException as e:  # noqa: B036
-            logger.warning("Shutdown failed. Error occurred: %s", str(e))
+        except BaseException:  # noqa: B036
+            logger.warning("Shutdown failed", exc_info=True)
             return False


@@ -333,7 +333,7 @@ class EtcdRendezvous:
                 # to avoid spamming etcd
                 # FIXME: there are a few things that fall under this like
                 # etcd.EtcdKeyNotFound, etc, which could be handled more explicitly.
-                logger.info("Rendezvous attempt failed, will retry. Reason: %s", e)
+                logger.info("Rendezvous attempt failed, will retry. Reason: %s", e)  # noqa: G200
                 time.sleep(1)

     def init_phase(self):
diff --git a/torch/distributed/elastic/rendezvous/etcd_server.py b/torch/distributed/elastic/rendezvous/etcd_server.py
index 8af8c01c028a..7e54fdd9839a 100644
--- a/torch/distributed/elastic/rendezvous/etcd_server.py
+++ b/torch/distributed/elastic/rendezvous/etcd_server.py
@@ -176,7 +176,7 @@ class EtcdServer:
             except Exception as e:
                 curr_retries += 1
                 stop_etcd(self._etcd_proc)
-                logger.warning(
+                logger.warning(  # noqa: G200
                     "Failed to start etcd server, got error: %s, retrying", str(e)
                 )
                 if curr_retries >= num_retries:
diff --git a/torch/distributed/pipelining/schedules.py b/torch/distributed/pipelining/schedules.py
index 067a9351d823..d265bd295009 100644
--- a/torch/distributed/pipelining/schedules.py
+++ b/torch/distributed/pipelining/schedules.py
@@ -1734,7 +1734,7 @@ class PipelineScheduleMulti(_PipelineSchedule):
                     # do the communication
                     _wait_batch_p2p(_batch_p2p(ops))
                 except Exception as e:
-                    logger.error(
+                    logger.error(  # noqa: G200
                         "[Rank %s] pipeline schedule %s caught the following exception '%s' \
 at time_step %s when running action %s",
                         self.rank,
diff --git a/torch/distributed/rpc/api.py b/torch/distributed/rpc/api.py
index dc552a7482ed..883b6b324f9b 100644
--- a/torch/distributed/rpc/api.py
+++ b/torch/distributed/rpc/api.py
@@ -295,8 +295,8 @@ def _barrier(worker_names):
     """
     try:
         _all_gather(None, set(worker_names))
-    except RuntimeError as ex:
-        logger.error("Failed to complete barrier, got error %s", ex)
+    except RuntimeError:
+        logger.exception("Failed to complete barrier")


 @_require_initialized
@@ -311,9 +311,7 @@ def _wait_all_workers(timeout=DEFAULT_SHUTDOWN_TIMEOUT):
     try:
         _all_gather(None, timeout=timeout)
     except RuntimeError as ex:
-        logger.error(
-            "Failed to respond to 'Shutdown Proceed' in time, got error %s", ex
-        )
+        logger.exception("Failed to respond to 'Shutdown Proceed' in time")
         raise ex

diff --git a/torch/export/__init__.py b/torch/export/__init__.py
index aeadf3e0e3a9..83b6b87fe4d8 100644
--- a/torch/export/__init__.py
+++ b/torch/export/__init__.py
@@ -448,8 +448,8 @@ def load(
                 f,
                 expected_opset_version=expected_opset_version,
             )
-        except RuntimeError as e:
-            log.warning("Ran into the following error when deserializing: %s", e)
+        except RuntimeError:
+            log.warning("Ran into the following error when deserializing", exc_info=True)
             pt2_contents = PT2ArchiveContents({}, {}, {})

         if len(pt2_contents.exported_programs) > 0 or len(pt2_contents.extra_files) > 0:
diff --git a/torch/export/pt2_archive/_package.py b/torch/export/pt2_archive/_package.py
index 7d9c0991721b..1a2e74b84e32 100644
--- a/torch/export/pt2_archive/_package.py
+++ b/torch/export/pt2_archive/_package.py
@@ -83,8 +83,8 @@ def is_pt2_package(serialized_model: Union[bytes, str]) -> bool:
             archive_format_path = f"{root_folder}/{ARCHIVE_FORMAT_PATH}"
             if archive_format_path in zip_reader.namelist():
                 return zip_reader.read(archive_format_path) == b"pt2"
-    except Exception as ex:
-        logger.info("Model is not a PT2 package: %s", str(ex))
+    except Exception:
+        logger.info("Model is not a PT2 package")

     return False

diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index 771e75272018..67f8c0f66574 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -3209,8 +3209,8 @@ class DimConstraints:
                     self._dynamic_results.add(self._dcp.doprint(arg))
                 else:
                     self._dynamic_results.add(self._dcp.doprint(solution))
-        except (NotImplementedError, AssertionError) as e:
-            log.warning("Failed to reduce inequalities: %s", e)
+        except (NotImplementedError, AssertionError):
+            log.warning("Failed to reduce inequalities", exc_info=True)
             for expr2 in exprs:
                 self._dynamic_results.add(self._dcp.doprint(expr2))

diff --git a/torch/onnx/_internal/exporter/_registration.py b/torch/onnx/_internal/exporter/_registration.py
index f4c7cfbf5127..38d9f31afab6 100644
--- a/torch/onnx/_internal/exporter/_registration.py
+++ b/torch/onnx/_internal/exporter/_registration.py
@@ -83,7 +83,7 @@ class OnnxDecompMeta:
             # When the function is targeting an HOP, for example, it will accept
             # functions as arguments and fail to generate an ONNX signature.
             # In this case we set signature to None and dispatch to this function always.
-            logger.warning(
+            logger.warning(  # noqa: G200
                 "Failed to infer the signature for function '%s' because '%s'"
                 "All nodes targeting `%s` will be dispatched to this function",
                 self.onnx_function,
diff --git a/torch/onnx/_internal/exporter/_verification.py b/torch/onnx/_internal/exporter/_verification.py
index a475908b5825..9741ae81bfff 100644
--- a/torch/onnx/_internal/exporter/_verification.py
+++ b/torch/onnx/_internal/exporter/_verification.py
@@ -317,12 +317,9 @@ class _VerificationInterpreter(torch.fx.Interpreter):
             return result
         try:
             (onnx_result,) = self._onnx_program.compute_values([node_name], self._args)
-        except Exception as e:
+        except Exception:
             logger.warning(
-                "Failed to compute value for node %s: %s",
-                node_name,
-                e,
-                exc_info=True,
+                "Failed to compute value for node %s", node_name, exc_info=True
             )
             return result
         info = VerificationInfo.from_tensors(
diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index 6cd372a8596c..18384b311b93 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -875,7 +875,7 @@ class MultiProcessTestCase(TestCase):
         try:
             getattr(self, test_name)()
         except unittest.SkipTest as se:
-            logger.info(
+            logger.info(  # noqa: G200
                 "Process %s skipping test %s for following reason: %s",
                 self.rank,
                 test_name,
@@ -917,11 +917,10 @@
             try:
                 pipe.send(MultiProcessTestCase.Event.GET_TRACEBACK)
                 pipes.append((i, pipe))
-            except ConnectionError as e:
-                logger.error(
-                    "Encountered error while trying to get traceback for process %s: %s",
+            except ConnectionError:
+                logger.exception(
+                    "Encountered error while trying to get traceback for process %s",
                     i,
-                    e,
                 )

         # Wait for results.
@@ -944,11 +943,10 @@
                     logger.error(
                         "Could not retrieve traceback for timed out process: %s", rank
                     )
-            except ConnectionError as e:
-                logger.error(
-                    "Encountered error while trying to get traceback for process %s: %s",
+            except ConnectionError:
+                logger.exception(
+                    "Encountered error while trying to get traceback for process %s",
                     rank,
-                    e,
                 )

     def _join_processes(self, fn) -> None:

From e939651972c150014e16d02efb5aff973288dd0b Mon Sep 17 00:00:00 2001
From: PyTorch UpdateBot
Date: Sun, 19 Oct 2025 04:45:18 +0000
Subject: [PATCH 120/123] [audio hash update] update the pinned audio hash (#165807)

This PR is auto-generated nightly by [this
action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165807
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/audio.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt
index c464a6a3d61f..8af554d56ee5 100644
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@@ -1 +1 @@
-1b013f5b5a87a1882eb143c26d79d091150d6a37
+69bbe7363897764f9e758d851cd0340147d27f94

From 33adb276fef9d2050c0c36a87ef3ed644cc3d531 Mon Sep 17 00:00:00 2001
From: Aaron Gokaslan
Date: Sun, 19 Oct 2025 08:00:06 +0000
Subject: [PATCH 121/123] [BE][Ez]: Update Eigen to 5.0.0. C++14 support and more! (#165840)

Update the Eigen pin to 5.0.0. Tons of new features and perf improvements.
Most importantly, it raises the minimum language standard from C++03 to
C++14, giving a ton of performance optimizations such as properly
implemented move operators and simplified code. It also improves
vectorization, particularly on ARM. We really only use this library as a
fallback for sparse operators, but it is still useful to keep it up to date.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165840
Approved by: https://github.com/albanD
---
 third_party/eigen_pin.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/eigen_pin.txt b/third_party/eigen_pin.txt
index 18091983f59d..0062ac971805 100644
--- a/third_party/eigen_pin.txt
+++ b/third_party/eigen_pin.txt
@@ -1 +1 @@
-3.4.0
+5.0.0

From ceb11a584d6b3fdc600358577d9bf2644f88def9 Mon Sep 17 00:00:00 2001
From: Aaron Gokaslan
Date: Sun, 19 Oct 2025 08:25:00 +0000
Subject: [PATCH 122/123] [BE]: Update kleidiai submodule to v1.15.0 (#165842)

This mostly adds a few new kernels and fixes some IMAs and performance
issues in previous kernels. It also improves compiler support.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165842
Approved by: https://github.com/albanD
---
 third_party/kleidiai | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/kleidiai b/third_party/kleidiai
index cca02c2f69dd..d7770c896323 160000
--- a/third_party/kleidiai
+++ b/third_party/kleidiai
@@ -1 +1 @@
-Subproject commit cca02c2f69dd18e1f12647c1c0bdc8cf90e680c7
+Subproject commit d7770c89632329a9914ef1a90289917597639cbe

From 57ba5752423249dd659e76e4d5a3d7b893edc85a Mon Sep 17 00:00:00 2001
From: Aaron Gokaslan
Date: Sun, 19 Oct 2025 09:24:08 +0000
Subject: [PATCH 123/123] [BE][Ez]: Update torch.is_tensor documentation (#165841)

TypeIs propagates the isinstance check through the typing system, so
torch.is_tensor and isinstance(obj, torch.Tensor) are now equivalent for
type checkers as well.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165841
Approved by: https://github.com/albanD
---
 torch/__init__.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/torch/__init__.py b/torch/__init__.py
index 39555a8360e8..f7fd0210d81f 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -1120,11 +1120,6 @@ def typename(obj: _Any, /) -> str:
 def is_tensor(obj: _Any, /) -> _TypeIs["torch.Tensor"]:
     r"""Returns True if `obj` is a PyTorch tensor.
 
-    Note that this function is simply doing ``isinstance(obj, Tensor)``.
-    Using that ``isinstance`` check is better for type checking with mypy,
-    and more explicit - so it's recommended to use that instead of
-    ``is_tensor``.
-
     Args:
         obj (object): Object to test
 
     Example::
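
As a quick illustration of the TypeIs point in the patch above (a minimal
sketch, not part of the patch): because is_tensor() is annotated with
_TypeIs["torch.Tensor"], a static type checker narrows the argument inside
the check, just as it would for a plain isinstance() check. The helper below
is made up for illustration only.

import torch

def double_if_tensor(obj: object) -> torch.Tensor:
    # torch.is_tensor() narrows `obj` to torch.Tensor for type checkers,
    # exactly like isinstance(obj, torch.Tensor) would.
    if torch.is_tensor(obj):
        return obj * 2  # checker treats obj as a Tensor here
    raise TypeError(f"expected a Tensor, got {type(obj).__name__}")

print(double_if_tensor(torch.ones(3)))  # tensor([2., 2., 2.])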
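
For reference, the logging changes earlier in this section follow one
pattern: let the logging module attach the traceback (logger.exception() or
exc_info=True) instead of interpolating the exception into the message, and
leave a `# noqa: G200` comment only where the exception object is
intentionally kept in the message. A minimal sketch, outside the patch, with
a made-up flaky() helper:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def flaky():
    raise RuntimeError("boom")

try:
    flaky()
except RuntimeError:
    # Instead of logger.error("flaky failed: %s", e), logger.exception()
    # logs at ERROR level and appends the full traceback automatically.
    logger.exception("flaky failed")

try:
    flaky()
except RuntimeError:
    # Same idea at WARNING level: pass exc_info=True rather than the
    # exception object itself.
    logger.warning("flaky failed, will retry", exc_info=True)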