Compare commits

...

7 Commits

Author SHA1 Message Date
a8d6afb511 Disabling amp context when invoking compiler (#138659)
Disabling amp context when invoking compiler (#138624)

Fix for https://github.com/pytorch/pytorch/issues/133974
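
Condensed from the regression test added further down in this compare (a sketch, assuming a CPU build with bfloat16 SDPA support): a module compiled and run under autocast should match eager, but before this fix the backend compiler could re-apply the amp context to a graph that already had autocast traced into it.

```python
import torch
import torch.nn.functional as F

def attn(q, k, v):
    return F.scaled_dot_product_attention(q, k, v)

q = k = v = torch.randn(4, 12, 512, 64, dtype=torch.bfloat16) / 10.0
compiled = torch.compile(attn)

with torch.cpu.amp.autocast():
    eager = attn(q, k, v)
    # With this fix, compilation happens with autocast disabled, so the
    # compiled output matches eager.
    torch.testing.assert_close(compiled(q, k, v), eager)
```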

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138624
Approved by: https://github.com/bdhirsh, https://github.com/drisspg

(cherry picked from commit 5942b2985000e0c69ec955b6c88dee8b5d7e67fd)

Co-authored-by: eellison <elias.ellison@gmail.com>
2024-10-22 18:14:52 -07:00
f31b8bbc5b [MPS] Fix sliced cast (#138535)
[MPS] Fix sliced cast (#138314)

This fixes an internal crash caused by invalid buffer size computation when the sliced API is used.

Not sure what the purpose of the following was:
```c++
IntArrayRef baseShape;
if (src.is_view()) {
  baseShape = src._base().sizes();
} else {
  baseShape = getIMPSAllocator()->getBufferShape(src.storage().data());
}
int flattenedShaped = 1;
for (const auto i : c10::irange(baseShape.size())) {
  flattenedShaped *= baseShape[i];
}
```
since `flattenedShaped` can be computed much more easily as `[srcBuf length] / src.element_size()`, and this is safe even if `srcBuf` is padded.

When a buffer was allocated to hold, say, uint8 and then view-casted to float16, the attempt to compute `baseShape` returned the size of the original tensor in its original data type, rather than its size in the new dtype.

Fixes https://github.com/pytorch/pytorch/issues/137800
Pull Request resolved: https://github.com/pytorch/pytorch/pull/138314
Approved by: https://github.com/albanD, https://github.com/DenisVieriu97

(cherry picked from commit de16159e565e7a08294347e31e97ca08a3468227)

Co-authored-by: Nikita Shulga <nikita.shulga@gmail.com>
2024-10-22 16:25:25 -07:00
848e7ac42a [SDPA-CUDNN] Make CuDNN Attention Opt in (#138587)
[SDPA-CUDNN] Make CuDNN Attention Opt in (#138522)

# Summary
Currently we have a `cudnn_order` that says that, on H100 with a new enough CuDNN backend (we ship version 9.1 in OSS), we should try to run CuDNN attention first. We have already encountered a few bugs with the release of 2.5:

1. https://github.com/pytorch/pytorch/issues/138529
2. https://github.com/huggingface/diffusers/issues/9704
3. https://github.com/pytorch/pytorch/pull/138354

In light of the above, we are making the CuDNN backend opt-in by default.

Opting in can be done easily with the context manager for choosing backends, i.e.:
```python
import torch.nn.functional as F
from torch.nn.attention import sdpa_kernel, SDPBackend

# q, k, v: query/key/value tensors on a CUDA device
with sdpa_kernel(SDPBackend.CUDNN_ATTENTION):
    out = F.scaled_dot_product_attention(q, k, v)
```

This PR gives the CuDNN backend the lowest precedence in the backend list, meaning that the Math backend will always be chosen ahead of it unless CuDNN is explicitly selected (which is done via the context manager above).
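
To verify which fused backend the dispatcher now picks, the updated tests further down use the internal `torch._fused_sdp_choice` helper; a condensed sketch, assuming an H100-class GPU with cuDNN attention available:

```python
import torch
from torch.nn.attention import sdpa_kernel, SDPBackend

q = k = v = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)

# With the new priority order, the default choice is FlashAttention ...
assert torch._fused_sdp_choice(q, k, v) == SDPBackend.FLASH_ATTENTION.value

# ... and cuDNN attention is only selected when explicitly opted into.
with sdpa_kernel(SDPBackend.CUDNN_ATTENTION):
    assert torch._fused_sdp_choice(q, k, v) == SDPBackend.CUDNN_ATTENTION.value
```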

Cc @atalman

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138522
Approved by: https://github.com/ngimel, https://github.com/eqy, https://github.com/malfet

(cherry picked from commit 9a9a0abc2818d40d06eda6c0b6fdbc949474f12e)

Co-authored-by: drisspg <drisspguessous@gmail.com>
2024-10-22 15:51:29 -07:00
885c823759 Update doc copyrights to 2024 (#138650)
Update copyrights to 2024 (#138638)

Spiritual successor of https://github.com/pytorch/pytorch/pull/119413, plus the same copyright update for the C++ docs.
Fixes https://github.com/pytorch/pytorch/issues/138630

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138638
Approved by: https://github.com/atalman

(cherry picked from commit d1be61ce4eb31640d1bdce07c8e6b17d03cbdca6)

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2024-10-22 15:47:37 -07:00
8c3ed97baa Update cpuinfo submodule (#138600)
Spiritual cherry-pick of https://github.com/pytorch/pytorch/pull/138351 that picks https://github.com/pytorch/cpuinfo/pull/258 into the branch

Fixes https://github.com/pytorch/pytorch/issues/138333


Test Plan: `python -c "import torch"` finishes without any output on the screen
2024-10-22 15:06:53 -07:00
70cf2bbc0b Add link to torch.compile the missing manual in troubleshooting (#137369)
Add link to torch.compile the missing manual in troubleshooting (#137301)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/137301
Approved by: https://github.com/svekars

Co-authored-by: Svetlana Karslioglu <svekars@meta.com>
(cherry picked from commit 22e19bd2d70409c2908edf0b0d00abb9209e3aaa)

Co-authored-by: Michael Lazos <mlazos@meta.com>
2024-10-22 12:32:56 -07:00
cde6b382ff Don't try to load cufile (#138539)
Don't try to load cufile (#138501)

Trying to load it caused a big issue with the 2.5.0 release - https://github.com/pytorch/pytorch/issues/138324

cufile is not actually used currently by default, see #133489

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138501
Approved by: https://github.com/atalman, https://github.com/mikaylagawarecki, https://github.com/malfet

(cherry picked from commit 012ff2a0aaf81bce25b7eda8c0021f5a784c11a6)

Co-authored-by: Sergii Dymchenko <sdym@meta.com>
2024-10-22 10:45:53 -07:00
11 changed files with 89 additions and 49 deletions

View File

@@ -542,18 +542,9 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,
MPSShape* mpsShape = getMPSShape(_tensor);
MPSShape* mpsStrides = getMPSShape(_tensor.strides());
IntArrayRef baseShape;
if (src.is_view()) {
baseShape = src._base().sizes();
} else {
baseShape = getIMPSAllocator()->getBufferShape(src.storage().data());
}
int flattenedShaped = 1;
for (const auto i : c10::irange(baseShape.size())) {
flattenedShaped *= baseShape[i];
}
MPSShape* mpsBaseShape = @[ @(flattenedShaped) ];
MPSNDArrayDescriptor* srcTensorDesc = [MPSNDArrayDescriptor descriptorWithDataType:dataType shape:mpsBaseShape];
auto storage_numel = src.storage().nbytes() / src.element_size();
MPSNDArrayDescriptor* srcTensorDesc = [MPSNDArrayDescriptor descriptorWithDataType:dataType
shape:@[ @(storage_numel) ]];
srcTensorDesc.preferPackedRows = YES;
MPSNDArray* srcNDArray = [[[MPSNDArray alloc] initWithBuffer:srcBuf
offset:src.storage_offset() * src.element_size()
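
The replacement derives the buffer's element count from its byte size, which stays correct no matter how the source tensor is shaped, sliced, or view-casted. The same arithmetic, sketched in Python using the public `untyped_storage()` accessor:

```python
import torch

x = torch.rand(16, 16, dtype=torch.float16)  # 256 elements * 2 bytes = 512 bytes of storage
y = x[:, 0:2].view(torch.float32)            # slice, then view-cast to a wider dtype

# Element count of the backing buffer in the *new* dtype, independent of the view.
storage_numel = y.untyped_storage().nbytes() // y.element_size()
print(storage_numel)  # 128 == 512 bytes / 4 bytes per float32
```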

View File

@@ -68,16 +68,11 @@ bool check_prefer_cudnn_attention() {
std::array<SDPBackend, num_backends> priority_order(sdp_params const& params) {
constexpr std::array<SDPBackend, num_backends> default_order{
SDPBackend::flash_attention,
SDPBackend::cudnn_attention,
SDPBackend::efficient_attention,
SDPBackend::math};
constexpr std::array<SDPBackend, num_backends> cudnn_order{
SDPBackend::math,
SDPBackend::cudnn_attention,
SDPBackend::flash_attention,
SDPBackend::efficient_attention,
SDPBackend::math};
static const bool prefer_cudnn = check_prefer_cudnn_attention();
return prefer_cudnn ? cudnn_order : default_order;
};
return default_order;
}
bool use_tensor_cores(sdp_params const& params, cudaDeviceProp* dprops, bool is_half) {

View File

@@ -123,7 +123,7 @@ master_doc = "index"
# General information about the project.
project = "PyTorch"
copyright = "2022, PyTorch Contributors"
copyright = "2024, PyTorch Contributors"
author = "PyTorch Contributors"
# The version info for the project you're documenting, acts as replacement for

View File

@@ -3352,7 +3352,7 @@ master_doc = "index"
# General information about the project.
project = "PyTorch"
copyright = "2023, PyTorch Contributors"
copyright = "2024, PyTorch Contributors"
author = "PyTorch Contributors"
torch_version = str(torch.__version__)

View File

@@ -3,6 +3,12 @@ PyTorch 2.0 Troubleshooting
**Author**: `Michael Lazos <https://github.com/mlazos>`_
.. note:: This document is currently outdated and requires revision. For the interim period, please refer to
the `comprehensive manual for torch.compile <https://docs.google.com/document/d/1y5CRfMLdwEoF1nTk9q8qEu1mgMUuUtvhklPKJ2emLU8/edit#heading=h.ivdr7fmrbeab>`__
as the primary resource for troubleshooting guidance.
We are actively developing debug tools, profilers, and improving our
error and warning messages. Below is a table of the available
tools and their typical usage. For additional help see

View File

@@ -3941,6 +3941,47 @@ class CPUReproTests(TestCase):
x = torch.randn(1, 4, 2, 2)
self.common(fn, (x,))
@parametrize("is_inference", (True, False))
def test_disabled_amp(self, is_inference):
class M(torch.nn.Module):
def __init__(self):
super().__init__()
self.all_head_size = 12 * 64
self.dense = nn.Linear(self.all_head_size, self.all_head_size)
def forward(self, q, k, v):
context_layer = F.scaled_dot_product_attention(
q, k, v, attn_mask=None, dropout_p=0.2
)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (
self.all_head_size,
)
context_layer = context_layer.view(new_context_layer_shape)
return self.dense(context_layer)
mod = M().to(torch.bfloat16).eval()
q = torch.randn((4, 12, 512, 64), dtype=torch.bfloat16) / 10.0
k = torch.randn((4, 12, 512, 64), dtype=torch.bfloat16) / 10.0
v = torch.randn((4, 12, 512, 64), dtype=torch.bfloat16) / 10.0
inputs = (
q,
k,
v,
)
compiler_mode = torch.compile(mod)
from torch.nn.attention import sdpa_kernel, SDPBackend
context = contextlib.nullcontext if not is_inference else torch.no_grad
with config.patch(
{"fallback_random": True}
), torch.cpu.amp.autocast(), context(), sdpa_kernel(SDPBackend.MATH):
torch.manual_seed(0)
eager = mod(*inputs)
torch.manual_seed(0)
self.assertEqual(compiler_mode(*inputs), eager)
@requires_vectorization
def test_vec_indirect_load_cse_cache(self):
# https://github.com/pytorch/pytorch/issues/123502

View File

@@ -10964,6 +10964,12 @@ class TestAdvancedIndexing(TestCaseMPS):
t1.start()
t2.start()
def test_sliced_view_cast(self):
# This used to crash on MacOS Sequoia
# See https://github.com/pytorch/pytorch/issues/137800
x = torch.rand(16, 16, device='mps', dtype=torch.float16)
y = x[:, 0:2].view(torch.float32) + 1
def test_masked_select(self):
x = torch.randn(3, 4)
x_mps = x.to("mps")

View File

@@ -2809,8 +2809,12 @@ class TestSDPACudaOnly(NNTestCase):
value = value.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
key = key.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
# TODO we are currently disabling this by default, lets assert that this returns
# FlashAttention, we need to change when we make remove opt-in for cudnn
if type != "nested" and PLATFORM_SUPPORTS_CUDNN_ATTENTION and SM90OrLater:
self.assertEqual(torch._fused_sdp_choice(query, key, value), SDPBackend.CUDNN_ATTENTION.value)
self.assertEqual(torch._fused_sdp_choice(query, key, value), SDPBackend.FLASH_ATTENTION.value)
with sdpa_kernel(backends=[SDPBackend.CUDNN_ATTENTION]):
self.assertEqual(torch._fused_sdp_choice(query, key, value), SDPBackend.CUDNN_ATTENTION.value)
elif PLATFORM_SUPPORTS_FLASH_ATTENTION:
self.assertEqual(torch._fused_sdp_choice(query, key, value), SDPBackend.FLASH_ATTENTION.value)
elif type != "nested" and PLATFORM_SUPPORTS_CUDNN_ATTENTION: # e.g., we're on Windows

View File

@@ -308,7 +308,6 @@ def _load_global_deps() -> None:
"cuda_runtime": "libcudart.so.*[0-9]",
"cuda_cupti": "libcupti.so.*[0-9]",
"cufft": "libcufft.so.*[0-9]",
"cufile": "libcufile.so.*[0-9]",
"curand": "libcurand.so.*[0-9]",
"nvjitlink": "libnvJitLink.so.*[0-9]",
"cusparse": "libcusparse.so.*[0-9]",

View File

@@ -555,7 +555,9 @@ def aot_dispatch_autograd(
),
)
with track_graph_compiling(aot_config, "forward"):
# AMP is already traced out in joint graph. we do not wish to reapply it accidentally
# in the compiler.
with track_graph_compiling(aot_config, "forward"), torch._C._DisableAutocast():
# flat_args at this point might still be subclasses-
# make sure to pass the unwrapped fake tensors into the compiler!
adjusted_flat_args = joint_inputs[0]
@@ -620,7 +622,7 @@ def aot_dispatch_autograd(
# NB: It's important to compile backwards ahead of time, as this may
# add extra guards which we need to apply to the Dynamo cache at
# forwards
with track_graph_compiling(aot_config, "backward"):
with track_graph_compiling(aot_config, "backward"), torch._C._DisableAutocast():
placeholder_list = fx_placeholder_vals(bw_module)
forward_saved_for_backwards_strides = None
@@ -672,28 +674,24 @@ aot_dispatch_autograd(
compiled_bw_func = None
if num_symints_saved_for_bw > 0:
context = torch._C._DisableAutocast if disable_amp else nullcontext
with context():
try:
compiled_bw_func = aot_config.bw_compiler(
bw_module, placeholder_list
)
except Exception as e:
exc = e
trace_structured(
"artifact",
metadata_fn=lambda: {
"name": "eager_compile_backwards_failure",
"encoding": "string",
},
payload_fn=lambda: "\n".join(
traceback.format_exception(exc)
),
)
log.warning(
"failed to eagerly compile backwards for dynamic, suppressing in case backwards not needed",
exc_info=True,
)
try:
compiled_bw_func = aot_config.bw_compiler(
bw_module, placeholder_list
)
except Exception as e:
exc = e
trace_structured(
"artifact",
metadata_fn=lambda: {
"name": "eager_compile_backwards_failure",
"encoding": "string",
},
payload_fn=lambda: "\n".join(traceback.format_exception(exc)),
)
log.warning(
"failed to eagerly compile backwards for dynamic, suppressing in case backwards not needed",
exc_info=True,
)
# Compiled autograd will run the bw_module in the backward pass,
# so recompilation need happen anyway if the backward pass is ever
# called.
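
The hunks above condense to the following pattern (a sketch; `fw_compiler`, `graph_module`, and `example_inputs` are stand-ins for the backend compiler callable and its arguments): autocast has already been traced into the joint graph, so it is disabled around the compiler invocation to avoid applying it a second time.

```python
import torch

def invoke_compiler_without_autocast(fw_compiler, graph_module, example_inputs):
    # AMP is already baked into the traced joint graph; re-entering autocast
    # inside the backend compiler would apply it twice, so disable it here.
    with torch._C._DisableAutocast():
        return fw_compiler(graph_module, example_inputs)
```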