[BE] Clarify defaulting behavior in optimizer (#135384)

Fixes #135340

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135384
Approved by: https://github.com/drisspg, https://github.com/jainapurva
Author: Jane Xu
Date: 2024-09-06 11:55:40 -07:00
Committed by: PyTorch MergeBot
Parent: dc0e818738
Commit: b1612569f6

@@ -245,17 +245,13 @@ _fused_doc = r"""fused (bool, optional): whether the fused implementation is use
 are supported. (default: None)
 .. note:: The foreach and fused implementations are typically faster than the for-loop,
-single-tensor implementation. Thus, if the user has not specified BOTH flags
-(i.e., when foreach = fused = None), we will attempt defaulting to the foreach
-implementation when the tensors are all on CUDA. For example, if the user specifies
-True for fused but nothing for foreach, we will run the fused implementation. If
-the user specifies False for foreach but nothing for fused (or False for fused but
-nothing for foreach), we will run the for-loop implementation. If the user specifies
-True for both foreach and fused, we will prioritize fused over foreach, as it is
-typically faster. We attempt to use the fastest, so the hierarchy goes fused ->
-foreach -> for-loop. HOWEVER, since the fused implementation is relatively new,
-we want to give it sufficient bake-in time, so we default to foreach and NOT
-fused when the user has not specified either flag."""
+single-tensor implementation, with fused being theoretically fastest with both
+vertical and horizontal fusion. As such, if the user has not specified either
+flag (i.e., when foreach = fused = None), we will attempt defaulting to the foreach
+implementation when the tensors are all on CUDA. Why not fused? Since the fused
+implementation is relatively new, we want to give it sufficient bake-in time.
+To specify fused, pass True for fused. To force running the for-loop
+implementation, pass False for either foreach or fused. """
 _capturable_doc = r"""capturable (bool, optional): whether this instance is safe to
 capture in a CUDA graph. Passing True can impair ungraphed performance,
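
For context (not part of the commit), a minimal sketch of how the defaulting behavior described by the new docstring is exercised from user code. It assumes a CUDA device is available and uses torch.optim.Adam as one of the optimizers that accepts the foreach and fused keyword arguments:

```python
import torch

# The foreach default described above only kicks in when all parameters
# live on CUDA; this sketch assumes a CUDA device is available.
model = torch.nn.Linear(16, 16).cuda()

# foreach = fused = None (the default): PyTorch attempts the foreach implementation.
opt_default = torch.optim.Adam(model.parameters(), lr=1e-3)

# Opt in to the fused implementation explicitly.
opt_fused = torch.optim.Adam(model.parameters(), lr=1e-3, fused=True)

# Force the single-tensor for-loop implementation.
opt_forloop = torch.optim.Adam(model.parameters(), lr=1e-3, foreach=False)

loss = model(torch.randn(4, 16, device="cuda")).sum()
loss.backward()
opt_fused.step()
```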