Compare commits

...

17 Commits

Author SHA1 Message Date
a8e7c98cb9 Revert "Require less alignment for attn bias (#114173) (#114837)"
This reverts commit 59656491f3b1da809312942872cce010337504b0.
2023-12-12 08:41:07 -08:00
448700d18e Fix NULL dereference in binary CPU ops (#115241)
* Fix NULL dereference in binary CPU ops (#115183)

Targeted fix for https://github.com/pytorch/pytorch/issues/113037

A more fundamental fix, in which those functions are not even called for
empty tensors, is coming later

Pull Request resolved: https://github.com/pytorch/pytorch/pull/115183
Approved by: https://github.com/drisspg, https://github.com/atalman, https://github.com/huydhn

* Fix build after conflict resolution

* Also include https://github.com/pytorch/pytorch/pull/113262 to pass the test

---------

Co-authored-by: Nikita Shulga <nshulga@meta.com>
2023-12-06 01:20:06 -08:00
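
A minimal repro sketch of the empty-tensor case above, assuming a CPU build; it mirrors the regression test for issue #113037 added in one of the diffs below:

```
import torch

# 0-dim scalar tensor and an empty tensor whose data pointer is NULL
x = torch.rand((), dtype=torch.float16)
y = torch.empty_strided((0,), (0,), dtype=torch.float16)

# Before the fix, the reduced-float scalar path dereferenced the NULL pointer;
# afterwards these simply return empty results.
print(torch.div(x, y, rounding_mode='floor').shape)  # torch.Size([0])
print(torch.mul(x, y).shape)                         # torch.Size([0])
```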
59656491f3 Require less alignment for attn bias (#114173) (#114837)
Improved Fix for Attention Mask Alignment Issue (#112577)

This PR addresses Issue #112577 by refining the previously implemented fix, which was found to be incorrect and caused unneeded memory regressions. The update simplifies the approach to handling the alignment of the attention mask for memory-efficient attention.

Alignment Check and Padding: Initially, the alignment of the attention mask is checked. If misalignment is detected, padding is applied, followed by slicing. During this process, a warning is raised to alert users.

Should this be warn_once?

We only call expand once, on the aligned mask.

Reference
https://github.com/facebookresearch/xformers/blob/main/xformers/ops/fmha/cutlass.py#L115

@albanD, @mruberry, @jbschlosser, @walterddr, and @mikaylagawarecki.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/114173
Approved by: https://github.com/danthe3rd
2023-12-05 14:50:58 -05:00
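
For illustration, a hedged sketch of the pad-then-slice idea described above, assuming a CUDA device with the memory-efficient SDPA backend; the 8-element alignment and all shapes here are illustrative assumptions, not the kernel's actual constants:

```
import torch
import torch.nn.functional as F

def align_last_dim(bias, alignment=8):
    # Pad the last dim of a misaligned bias, then slice back to the original
    # width so the underlying storage is allocated with aligned rows.
    last = bias.size(-1)
    if last % alignment == 0:
        return bias
    pad = alignment - last % alignment
    return F.pad(bias, (0, pad))[..., :last]

q = torch.randn(2, 4, 16, 32, dtype=torch.float16, device="cuda")
k = torch.randn(2, 4, 17, 32, dtype=torch.float16, device="cuda")
v = torch.randn(2, 4, 17, 32, dtype=torch.float16, device="cuda")
mask = torch.randn(2, 4, 16, 17, dtype=torch.float16, device="cuda")  # last dim 17: misaligned

out = F.scaled_dot_product_attention(q, k, v, attn_mask=align_last_dim(mask))
```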
41210eaedc [MPS] Fix out-of-bounds fill to sliced tensor (#114958)
This fixes a regression introduced by https://github.com/pytorch/pytorch/pull/81951 that caused out-of-bounds access when a sliced tensor is filled with zeros

Remove bogus `TORCH_INTERNAL_ASSERT(length >= offset)` as [NSMakeRange](https://developer.apple.com/documentation/foundation/1417188-nsmakerange?language=objc) arguments are location and length rather than start and end offset.

In `fill_mps_tensor_`:
- Pass the `value` argument to `MPSStream::fill`
- Pass `self.nbytes()` rather than `self.storage().nbytes()` as the length of the buffer to fill, as the latter always results in an out-of-bounds write when the offset within the storage is non-zero

Add regression test

Fixes https://github.com/pytorch/pytorch/issues/114692

Cherry pick of https://github.com/pytorch/pytorch/pull/114838 into release/2.1 branch

Co-authored-by: Nikita Shulga <nshulga@meta.com>
2023-12-01 10:58:57 -08:00
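
A hedged repro sketch of the sliced fill above, assuming a machine with the MPS backend available; it mirrors the regression test added in the diff below:

```
import torch

t_mps = torch.ones(1, 10, device="mps")
t_cpu = torch.ones(1, 10, device="cpu")

# Filling a slice with a non-zero storage offset used to write past the slice on MPS.
t_mps[:, 5].fill_(torch.tensor(0.0, device="mps"))
t_cpu[:, 5].fill_(torch.tensor(0.0, device="cpu"))

assert torch.equal(t_mps.cpu(), t_cpu)  # only column 5 should be zeroed
```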
3183bcd417 Fix mkldnn_matmul error on AArch64 (#114851)
Fixes https://github.com/pytorch/pytorch/issues/110149

Cherry-pick of https://github.com/pytorch/pytorch/pull/110150. This is a bug fix against the 2.1 release
2023-11-30 08:11:08 -08:00
b5a89bbc5f Fix broadcasting cosine_similarity (#114795)
* Fix broadcasting cosine_similarity (#109363)

Fixes https://github.com/pytorch/pytorch/issues/109333
Pull Request resolved: https://github.com/pytorch/pytorch/pull/109363
Approved by: https://github.com/peterbell10

* The PR incidentally fixes the test below by switching from sizes to sym_sizes:

test_make_fx_symbolic_exhaustive_masked_scatter_cpu_float32

---------

Co-authored-by: lezcano <lezcano-93@hotmail.com>
2023-11-30 00:23:40 -08:00
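
A short sketch of the broadcasting case fixed here; the shapes come from issue #109333 and match the regression test added in one of the diffs below:

```
import torch
import torch.nn.functional as F

a = torch.ones(2, 3)
b = torch.ones(1, 1)  # broadcasts against `a` along the default dim=1

out = F.cosine_similarity(a, b)
print(out)  # tensor([1., 1.]) once broadcasting is handled via expand_outplace
```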
3f662b6255 Package pybind11/eigen/ (#113055) (#114756)
The pybind11/eigen/ directory was added in the pybind11 2.11 release; see https://github.com/pybind/pybind11/tree/v2.11.0/include/pybind11/eigen

Fixes https://github.com/pytorch/pytorch/issues/112841

Cherry-pick of  https://github.com/pytorch/pytorch/pull/113055 into release/2.1 branch

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2023-11-29 07:04:29 -08:00
614af50378 [release only] Pin disabled-test-condensed and slow-tests json (#114514)
* [release only] Pin disabled-test-condensed json

* pin slow tests json
2023-11-27 13:30:27 -05:00
b3b22d7390 [BE] Handle errors in set_num_threads (#114420)
and `set_num_interop_threads`

Before this change, calling `torch.set_num_threads(2**65)` resulted in a segmentation fault; afterwards it becomes a good old runtime error:
```
% python -c "import torch;torch.set_num_threads(2**65)"
Traceback (most recent call last):
  File "<string>", line 1, in <module>
RuntimeError: Overflow when unpacking long
```

Similar to https://github.com/pytorch/pytorch/pull/60073

Cherry pick of https://github.com/pytorch/pytorch/pull/113684 into release/2.1

(cherry picked from commit 78f3937ee84e71475942598f4b51dce7c8a70783)
2023-11-23 14:04:26 -05:00
7405d70c30 [MPS] Fix crashes during Conv backward pass (#114419)
By adding the weights tensor to the MPSGraph cache key.
Add a regression test to validate that the collision no longer happens

Fixes https://github.com/pytorch/pytorch/issues/112998

Cherry pick of https://github.com/pytorch/pytorch/pull/113398 into release/2.1

(cherry picked from commit 265d6aac0b71b917d6e36c5dd65c22f61644b715)
2023-11-23 14:02:46 -05:00
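
A hedged repro sketch of the cache-key collision above, assuming the MPS backend is available; it follows the regression test added in the diff below:

```
import torch
import torch.nn as nn

x = torch.rand(1, 1, 10, 10, device="mps", requires_grad=True)
m1 = nn.Conv2d(1, 1, 3, stride=2, padding=1).to("mps")
m2 = nn.Conv2d(1, 1, 4, stride=2, padding=1).to("mps")  # same output shape, different weight shape

y1, y2 = m1(x), m2(x)
y1.sum().backward()
y2.sum().backward()  # previously reused m1's cached backward graph and crashed
```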
d62c757533 [Caffe2] Handle cpuinfo_initialize() failure (#114418)
It can fail on ARM platforms if the `/sys` folder is not accessible.
In that case, call `std::thread::hardware_concurrency()`, which is
aligned with the thread-pool initialization logic of `c10::TaskThreadPoolBase::defaultNumThreads()`

Further addresses issue raised in https://github.com/pytorch/pytorch/issues/113568
This is a cherry-pick of https://github.com/pytorch/pytorch/pull/114011 into release/2.1 branch

(cherry picked from commit 310e3060b7e4d0c76149aadad4519c7abed8c2a7)
2023-11-23 14:01:16 -05:00
7833889a44 Fix chrome trace entry format (#113763) (#114416)
Fix regression introduced by https://github.com/pytorch/pytorch/pull/107519

`'"args": {{}}}}, '` was part of a format string, where curly braces are doubled so they print a single time, but the ruff change kept the doubled braces even though the string is no longer a format string

Fixes https://github.com/pytorch/pytorch/issues/113756

Pull Request resolved: https://github.com/pytorch/pytorch/pull/113763
Approved by: https://github.com/Skylion007, https://github.com/aaronenyeshi

(cherry picked from commit e100ff42fd087d7a1696cb52c216507d45b8fb85)
2023-11-23 13:57:43 -05:00
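
To make the brace-escaping bug concrete, a small hedged illustration in plain Python (no profiler needed): in an f-string every literal brace must be doubled, but once the prefix is dropped the doubled braces are emitted verbatim.

```
as_fstring = f'"args": {{}}}}, '  # renders as '"args": {}}, '  (the intended entry)
as_plain   = '"args": {{}}}}, '   # plain string keeps the doubled braces (the regression)
fixed      = '"args": {}}, '      # what the trace entry should literally contain

assert as_fstring == fixed
assert as_plain != fixed
```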
4c55dc5035 remove _shard_tensor() call (#111687)
Co-authored-by: Andrey Talman <atalman@fb.com>
2023-11-08 07:49:29 -05:00
f58669bc5f c10::DriverAPI Try opening libcuda.so.1 (#113096)
As `libcuda.so` is only installed in dev environments (i.e. when the CUDA Toolkit is installed), while `libcuda.so.1` is part of the NVIDIA driver.
Also, this will keep it aligned with a5cb8f75a7/aten/src/ATen/cuda/detail/LazyNVRTC.cpp (L16)
Better errors in `c10::DriverAPI` on `dlopen`/`dlsym` failures
    
Cherry-pick of  following PR into release/2.1 branch
- Better errors in `c10::DriverAPI` on `dl` failure (#112995)
- `c10::DriverAPI` Try opening libcuda.so.1 (#112996)

(cherry picked from commit 3be0e1cd587ece8fa54a3a4da8ae68225b9cbb9b)
(cherry picked from commit d0a80f8af19625cbd0b3eb74a1970ac5b7c5439a)
2023-11-07 11:47:24 -08:00
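
A hedged Python sketch of the loading-order rationale above: the versioned `libcuda.so.1` ships with the NVIDIA driver, while the unversioned `libcuda.so` symlink typically exists only where the CUDA Toolkit is installed, and a failed open should report the underlying dlopen error rather than a bare assertion.

```
import ctypes

def open_libcuda():
    try:
        return ctypes.CDLL("libcuda.so.1")  # driver-provided soname
    except OSError as err:
        # Mirror the improved diagnostics: surface the dlopen error message.
        raise RuntimeError(f"Can't open libcuda.so.1: {err}") from err
```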
33106b706e [DCP] Add test for planner option for load_sharded_optimizer_state_dict (#112930)
Add a test for a user-submitted PR: https://github.com/pytorch/pytorch/pull/112259
Cherry-pick of https://github.com/pytorch/pytorch/pull/112891 into `release/2.1` branch
2023-11-07 11:38:50 -08:00
4b4c012a60 Enable planner to be used for loading sharded optimizer state dict (#112520)
Cherry-pick [#112259](https://github.com/pytorch/pytorch/pull/112259)

Requested by MosaicML

Comments from users:
> without this, we can't do training resumption because the model gets loaded without the optimizer

---------------------------------------------------------------------------------------------------------------------
This creates a more consistent interface for saving and loading sharded state dicts. A planner is able to be specified when saving a sharded optimizer state dict, but there is currently no planner support for loading one. This change does not affect the default behavior of the function.

Co-authored-by: Brian <23239305+b-chu@users.noreply.github.com>
2023-11-07 11:35:20 -08:00
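
A hedged usage sketch of the new `planner` argument; `state_dict` and `CHECKPOINT_DIR` are assumed to come from the surrounding FSDP save/load flow, as in the test below:

```
import torch.distributed.checkpoint as dist_cp
from torch.distributed.checkpoint.default_planner import DefaultLoadPlanner
from torch.distributed.checkpoint.optimizer import load_sharded_optimizer_state_dict

optim_state = load_sharded_optimizer_state_dict(
    model_state_dict=state_dict["model"],
    optimizer_key="optim",
    storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
    planner=DefaultLoadPlanner(),  # optional; None preserves the previous behavior
)
```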
47ac50248a [DCP][test] Make dim_0 size of params scale with world_size in torch/distributed/checkpoint/test_fsdp_optim_state.py (#112825) (#112894)
Make the dim_0 size of params scale with world_size so it can be used to test the impact on performance when scaling up. More context on the performance improvement is in: https://github.com/pytorch/pytorch/pull/111687

For this cherry-pick pair, we remove the `_shard_tensor()` call in `load_sharded_optimizer_state_dict()` in optimizer.py, which is reported to scale poorly with the number of GPUs. The reason is that `_shard_tensor()` calls into `dist.all_gather_object()`, which is extremely expensive in communication when world_size becomes large.

main: https://github.com/pytorch/pytorch/pull/111096
cherry-pick: https://github.com/pytorch/pytorch/pull/111687

Pull Request resolved: https://github.com/pytorch/pytorch/pull/112825
Approved by: https://github.com/fegin
2023-11-06 16:14:14 -05:00
22 changed files with 254 additions and 131 deletions

View File

@ -187,17 +187,18 @@ expand_inplace(
// See NOTE [ ExpandUtils Borrowing ] above for `MaybeOwned` explanation.
inline std::tuple<c10::MaybeOwned<Tensor>, c10::MaybeOwned<Tensor>>
expand_outplace(const Tensor& to_expand1, const Tensor& to_expand2) {
if (to_expand1.sizes().equals(to_expand2.sizes())) {
auto s1 = to_expand1.sym_sizes();
auto s2 = to_expand2.sym_sizes();
if (s1.equals(s2)) {
return std::make_tuple(
c10::MaybeOwned<Tensor>::borrowed(to_expand1),
c10::MaybeOwned<Tensor>::borrowed(to_expand2));
}
auto expanded_size =
infer_size_dimvector(to_expand1.sizes(), to_expand2.sizes());
auto expanded_size = infer_size_symdimvector(s1, s2);
return std::make_tuple(
c10::MaybeOwned<Tensor>::owned(to_expand1.expand(expanded_size)),
c10::MaybeOwned<Tensor>::owned(to_expand2.expand(expanded_size)));
c10::MaybeOwned<Tensor>::owned(to_expand1.expand_symint(expanded_size)),
c10::MaybeOwned<Tensor>::owned(to_expand2.expand_symint(expanded_size)));
}
inline std::tuple<c10::MaybeOwned<Tensor>, c10::MaybeOwned<Tensor>>

View File

@ -147,9 +147,9 @@ void MPSStream::addCompletedHandler(MTLCommandBufferHandler block) {
}
void MPSStream::fill(id<MTLBuffer> buffer, uint8_t value, size_t length, size_t offset, SyncType syncType) {
TORCH_INTERNAL_ASSERT(length >= offset);
if (length == 0)
if (length == 0) {
return;
}
dispatch_sync(_serialQueue, ^() {
@autoreleasepool {
endKernelCoalescing();

View File

@ -308,16 +308,18 @@ Tensor cosine_similarity(const Tensor& x1_, const Tensor& x2_, int64_t dim, doub
// We accept integral types (and bools lol) but vector_norm does not
auto x1_is_int = c10::isIntegralType(x1_.scalar_type(), /*includeBool=*/true);
auto x2_is_int = c10::isIntegralType(x2_.scalar_type(), /*includeBool=*/true);
auto x1 = x1_is_int ? x1_.to(commonDtype) : x1_;
auto x2 = x2_is_int ? x2_.to(commonDtype) : x2_;
auto x1_t = x1_is_int ? x1_.to(commonDtype) : x1_;
auto x2_t = x2_is_int ? x2_.to(commonDtype) : x2_;
c10::MaybeOwned<Tensor> x1, x2;
std::tie(x1, x2) = expand_outplace(x1_t, x2_t);
// We want to divide each tensor by its norm first, as it's more numerically stable.
// This keeps the result between -1.0 and 1.0
// We clone them, as we're going to modify them in-place
// This allows the gradients to propagate properly all the way to x1 and x2
auto x1_norm = at::linalg_vector_norm(x1, 2, /*dim=*/dim, /*keepdim=*/true).clone();
auto x2_norm = at::linalg_vector_norm(x2, 2, /*dim=*/dim, /*keepdim=*/true).clone();
auto x1_norm = at::linalg_vector_norm(*x1, 2, /*dim=*/dim, /*keepdim=*/true).clone();
auto x2_norm = at::linalg_vector_norm(*x2, 2, /*dim=*/dim, /*keepdim=*/true).clone();
{
at::NoGradGuard guard;
@ -325,7 +327,7 @@ Tensor cosine_similarity(const Tensor& x1_, const Tensor& x2_, int64_t dim, doub
x2_norm.clamp_min_(eps);
}
return ((x1 / x1_norm) * (x2 / x2_norm)).sum(dim);
return ((*x1 / x1_norm) * (*x2 / x2_norm)).sum(dim);
}
}} // namespace at::native

View File

@ -1483,12 +1483,14 @@ static void addmm_impl_cpu_(
// it is faster to call oneDNN matrix multiplication primitive with RHS*LHS
// that will call then into Arm® Compute Library (ACL) GEMM kernel and also
// additionally have support for running kernel with BF16 instructions
bool apply_heur = apply_mkldnn_matmul_heur(b.sizes()[0], b.sizes()[1], a.sizes()[1]);
if (apply_heur && transpose_a && !transpose_b && result.scalar_type() == at::ScalarType::Float) {
mkldnn_matmul(b, a, c, beta.to<float>(), alpha.to<float>());
// We have dispatched to ACL GEMM for single precision float
// so do not need to dispatch to BLAS GEMM below
dispatched = true;
if (transpose_c) {
bool apply_heur = apply_mkldnn_matmul_heur(b.sizes()[0], b.sizes()[1], a.sizes()[1]);
if (apply_heur && transpose_a && !transpose_b && result.scalar_type() == at::ScalarType::Float) {
mkldnn_matmul(b, a, c, beta.to<float>(), alpha.to<float>());
// We have dispatched to ACL GEMM for single precision float
// so do not need to dispatch to BLAS GEMM below
dispatched = true;
}
}
#endif

View File

@ -101,7 +101,7 @@ void mul_kernel(TensorIteratorBase& iter) {
using comp_t = c10::complex<float>;
return comp_t{a} * comp_t{b};
});
} else if (iter.is_scalar(2) && at::isReducedFloatingType(dtype)) {
} else if (iter.is_scalar(2) && iter.data_ptr(2) != nullptr && at::isReducedFloatingType(dtype)) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(dtype, "mul_cpu_reduced_float", [&]() {
using opmath_t = at::opmath_type<scalar_t>;
opmath_t b = iter.original_scalar_value<opmath_t>(2);
@ -125,7 +125,7 @@ void mul_kernel(TensorIteratorBase& iter) {
void div_true_kernel(TensorIteratorBase& iter) {
const auto dtype = iter.common_dtype();
if (iter.is_scalar(2) && at::isReducedFloatingType(dtype)) {
if (iter.is_scalar(2) && iter.data_ptr(2) != nullptr && at::isReducedFloatingType(dtype)) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(dtype, "div_cpu_reduced_float", [&]() {
using opmath_t = at::opmath_type<scalar_t>;
opmath_t b = iter.original_scalar_value<opmath_t>(2);
@ -162,19 +162,28 @@ void div_trunc_kernel(TensorIteratorBase& iter) {
return a / b;
});
});
} else if (iter.is_scalar(2) && at::isReducedFloatingType(dtype)) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(dtype, "div_trunc_cpu_reduced_float", [&]() {
using opmath_t = at::opmath_type<scalar_t>;
opmath_t b = iter.original_scalar_value<opmath_t>(2);
iter.remove_operand(2);
cpu_kernel_vec(iter,
[=](scalar_t a) __ubsan_ignore_float_divide_by_zero__ -> scalar_t {
return std::trunc(static_cast<opmath_t>(a) / b);
},
[=](Vectorized<scalar_t> a) {
return binary_op_scalar(a, b, [](const Vectorized<opmath_t>& x, const Vectorized<opmath_t>& y) { return (x / y).trunc(); });
} else if (iter.is_scalar(2) && iter.data_ptr(2) != nullptr && at::isReducedFloatingType(dtype)) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(
dtype, "div_trunc_cpu_reduced_float", [&]() {
using opmath_t = at::opmath_type<scalar_t>;
opmath_t b = iter.original_scalar_value<opmath_t>(2);
iter.remove_operand(2);
cpu_kernel_vec(
iter,
[=](scalar_t a)
__ubsan_ignore_float_divide_by_zero__ -> scalar_t {
return std::trunc(static_cast<opmath_t>(a) / b);
},
[=](Vectorized<scalar_t> a) {
return binary_op_scalar(
a,
b,
[](const Vectorized<opmath_t>& x,
const Vectorized<opmath_t>& y) {
return (x / y).trunc();
});
});
});
});
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, dtype, "div_trunc_cpu", [&]() {
cpu_kernel_vec(iter,
@ -223,20 +232,25 @@ void div_floor_kernel(TensorIteratorBase& iter) {
});
} else {
// See NOTE: [Floor Division in Python]
if (iter.is_scalar(2) && at::isReducedFloatingType(dtype)) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(dtype, "div_floor_cpu_reduced_float", [&]() {
using opmath_t = at::opmath_type<scalar_t>;
opmath_t b = iter.original_scalar_value<opmath_t>(2);
iter.remove_operand(2);
using vec_t = Vectorized<opmath_t>;
cpu_kernel_vec(iter,
[=](scalar_t a) -> scalar_t {
return div_floor_floating(static_cast<opmath_t>(a), b);
},
[=](Vectorized<scalar_t> a) {
return binary_op_scalar(a, b, [](const vec_t& x, const vec_t& y) { return div_floor_floating_vec(x, y); });
if (iter.is_scalar(2) && iter.data_ptr(2) != nullptr && at::isReducedFloatingType(dtype)) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(
dtype, "div_floor_cpu_reduced_float", [&]() {
using opmath_t = at::opmath_type<scalar_t>;
opmath_t b = iter.original_scalar_value<opmath_t>(2);
iter.remove_operand(2);
using vec_t = Vectorized<opmath_t>;
cpu_kernel_vec(
iter,
[=](scalar_t a) -> scalar_t {
return div_floor_floating(static_cast<opmath_t>(a), b);
},
[=](Vectorized<scalar_t> a) {
return binary_op_scalar(
a, b, [](const vec_t& x, const vec_t& y) {
return div_floor_floating_vec(x, y);
});
});
});
});
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, dtype, "div_floor_cpu", [&]() {
using vec_t = Vectorized<scalar_t>;

View File

@ -72,7 +72,7 @@ static bool fill_mps_tensor_(Tensor& self, uint8_t value) {
if (self.is_contiguous()) {
MPSStream* stream = getCurrentMPSStream();
auto storage_byte_offset = self.storage_offset() * self.itemsize();
stream->fill(mps::getMTLBufferStorage(self), 0, self.storage().nbytes(), storage_byte_offset);
stream->fill(mps::getMTLBufferStorage(self), value, self.nbytes(), storage_byte_offset);
return true;
}
return false;

View File

@ -445,7 +445,7 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
string key = "mps_convolution_backward_weights:" + to_string(stride[0]) + ":" + to_string(stride[1]) + ":" +
to_string(dilation[0]) + ":" + to_string(dilation[1]) + ":" + to_string(padding[0]) + ":" +
to_string(padding[1]) + ":" + to_string(groups) + ":" + mem_format_key +
getTensorsStringKey({grad_output_t, input_t}) + ":" + string([ns_shape_key UTF8String]);
getTensorsStringKey({grad_output_t, input_t, grad_weight_t}) + ":" + string([ns_shape_key UTF8String]);
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSGraphConvolution2DOpDescriptor* conv2dDescriptor_ = [[MPSGraphConvolution2DOpDescriptor new] autorelease];

View File

@ -10,7 +10,7 @@ namespace {
DriverAPI create_driver_api() {
#define OPEN_LIBRARIES(name, n) \
void* handle_##n = dlopen(name, RTLD_LAZY); \
TORCH_INTERNAL_ASSERT(handle_##n);
TORCH_INTERNAL_ASSERT(handle_##n, "Can't open ", #name, ": ", dlerror());
C10_FORALL_DRIVER_LIBRARIES(OPEN_LIBRARIES)
#undef OPEN_LIBRARIES
@ -18,7 +18,7 @@ DriverAPI create_driver_api() {
#define LOOKUP_ENTRY(name, n) \
r.name##_ = ((decltype(&name))dlsym(handle_##n, #name)); \
TORCH_INTERNAL_ASSERT(r.name##_)
TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name, ": ", dlerror())
C10_FORALL_DRIVER_API(LOOKUP_ENTRY)
#undef LOOKUP_ENTRY
return r;

View File

@ -19,7 +19,7 @@
} while (0)
#define C10_FORALL_DRIVER_LIBRARIES(_) \
_("libcuda.so", 0) \
_("libcuda.so.1", 0) \
_("libnvidia-ml.so.1", 1)
#define C10_FORALL_DRIVER_API(_) \

View File

@ -41,8 +41,13 @@ namespace {
}
size_t getDefaultNumThreads() {
CAFFE_ENFORCE(cpuinfo_initialize(), "cpuinfo initialization failed");
int numThreads = cpuinfo_get_processors_count();
auto numThreads = 1U;
if (cpuinfo_initialize()) {
numThreads = std::max(cpuinfo_get_processors_count(), 1U);
} else {
LOG(WARNING) << "cpuinfo initialization failed";
numThreads = std::max(std::thread::hardware_concurrency(), 1U);
}
bool applyCap = false;
#if defined(C10_ANDROID)
@ -109,7 +114,7 @@ size_t getDefaultNumThreads() {
* detect if we are running under tsan, for now capping the default
* threadcount to the tsan limit unconditionally.
*/
int tsanThreadLimit = 63;
auto tsanThreadLimit = 63U;
numThreads = std::min(numThreads, tsanThreadLimit);
return numThreads;

View File

@ -1323,6 +1323,7 @@ def main():
"include/torch/csrc/lazy/ts_backend/*.h",
"include/pybind11/*.h",
"include/pybind11/detail/*.h",
"include/pybind11/eigen/*.h",
"include/TH/*.h*",
"include/TH/generic/*.h*",
"include/THC/*.cuh",

View File

@ -2,105 +2,137 @@
import torch
import torch.distributed.checkpoint as DCP
import torch.nn as nn
from torch.distributed._shard.sharded_tensor.api import ShardedTensor
from torch.distributed.checkpoint.optimizer import load_sharded_optimizer_state_dict
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
import torch.distributed.checkpoint as dist_cp
import torch.distributed as dist
from torch.distributed.checkpoint.default_planner import (
DefaultSavePlanner,
DefaultLoadPlanner,
)
from torch.distributed.checkpoint.optimizer import (
load_sharded_optimizer_state_dict,
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
run_tests,
)
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
with_comms,
)
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.distributed.checkpoint_utils import with_temp_dir
class FsdpOptimStateCheckpoint(DTensorTestBase):
def _create_model(self):
# make weight tensor dim_0 as large as the world size for scaling test
layer1_weight_dim = self.world_size
layer2_weight_dim = self.world_size * 2
layer3_weight_dim = self.world_size * 3
class TestDummyModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.net1 = nn.Sequential(nn.Linear(8, layer1_weight_dim), nn.ReLU())
self.net2 = nn.Sequential(
nn.Linear(layer1_weight_dim, layer2_weight_dim), nn.ReLU()
)
self.net3 = nn.Sequential(
nn.Linear(layer2_weight_dim, layer3_weight_dim), nn.ReLU()
)
def forward(self, x):
return self.net3(self.net2(self.net1(x)))
def get_input(self):
return torch.rand(8, 8, device="cuda")
model = TestDummyModel().cuda()
return model
@property
def backend(self):
return "cpu:gloo,cuda:nccl"
@with_comms
@skip_if_lt_x_gpu(4)
@with_temp_dir
def test_distributed_tensor_planner(self) -> None:
@parametrize("pass_planner", [True, False])
def test_load_sharded_optimizer_state_dict(self, pass_planner) -> None:
CHECKPOINT_DIR = self.temp_dir
planner = DCP.DefaultLoadPlanner() if pass_planner else None
model = FSDP(torch.nn.Linear(8, 8, device="meta"))
model = self._create_model()
model = FSDP(model)
optim = torch.optim.Adam(model.parameters(), lr=0.1)
model(torch.rand(8, 8, device=dist.get_rank())).sum().backward()
# step ahead to initialize the optimizer
model(model.get_input()).sum().backward()
optim.step()
with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
state_dict = {
"model": model.state_dict(),
"optim": FSDP.optim_state_dict(model, optim),
}
FSDP.set_state_dict_type(
model,
StateDictType.SHARDED_STATE_DICT,
)
optim_osd = FSDP.optim_state_dict(model, optim)
dist_cp.save_state_dict(
state_dict=state_dict,
storage_writer=dist_cp.FileSystemWriter(CHECKPOINT_DIR),
planner=DefaultSavePlanner(),
)
state_dict = {
"model": model.state_dict(),
"optim": optim_osd,
}
DCP.save_state_dict(
state_dict=state_dict,
storage_writer=DCP.FileSystemWriter(CHECKPOINT_DIR),
)
# now load the model and ensure the values are the same
model_2 = FSDP(torch.nn.Linear(8, 8, device="meta"))
model_2 = self._create_model()
model_2 = FSDP(model_2)
optim_2 = torch.optim.Adam(model_2.parameters(), lr=0.1)
with FSDP.summon_full_params(model):
with FSDP.summon_full_params(model_2):
self.assertNotEqual(model.weight, model_2.weight)
self.assertNotEqual(model.bias, model_2.bias)
FSDP.set_state_dict_type(
model_2,
StateDictType.SHARDED_STATE_DICT,
)
# Adam lazily creates its state
self.assertEqual(0, len(optim_2.state))
with FSDP.state_dict_type(model_2, StateDictType.SHARDED_STATE_DICT):
state_dict = {
"model": model_2.state_dict(),
# cannot load the optimizer together with the model
}
dist_cp.load_state_dict(
state_dict=state_dict,
storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
planner=DefaultLoadPlanner(),
)
model_2.load_state_dict(state_dict["model"])
optim_state = load_sharded_optimizer_state_dict(
model_state_dict=state_dict["model"],
optimizer_key="optim",
storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
)
flattened_osd = FSDP.optim_state_dict_to_load(
model_2, optim_2, optim_state["optim"]
)
optim_2.load_state_dict(flattened_osd)
with FSDP.summon_full_params(model):
with FSDP.summon_full_params(model_2):
self.assertEqual(model.weight, model_2.weight)
self.assertEqual(model.bias, model_2.bias)
def opt_at(opt, idx):
return list(iter(opt.state.values()))[idx]
# Adam lazily creates its state
self.assertEqual(
opt_at(optim, 0)["exp_avg"], opt_at(optim_2, 0)["exp_avg"]
state_dict = {
"model": model_2.state_dict(),
# cannot load the optimizer together with the model
}
DCP.load_state_dict(
state_dict=state_dict,
storage_reader=DCP.FileSystemReader(CHECKPOINT_DIR),
)
self.assertEqual(
opt_at(optim, 0)["exp_avg_sq"], opt_at(optim_2, 0)["exp_avg_sq"]
model_2.load_state_dict(state_dict["model"])
optim_state = load_sharded_optimizer_state_dict(
model_state_dict=state_dict["model"],
optimizer_key="optim",
storage_reader=DCP.FileSystemReader(CHECKPOINT_DIR),
planner=planner,
)
flattened_osd = FSDP.optim_state_dict_to_load(
model_2, optim_2, optim_state["optim"]
)
optim_2.load_state_dict(flattened_osd)
osd_after_load = FSDP.optim_state_dict(model_2, optim_2)
# Compare optim_state_dict prior to save and after load
before_optim_state = optim_osd["state"]
after_optim_state = osd_after_load["state"]
self.assertEqual(len(before_optim_state), len(after_optim_state))
for fqn, states in before_optim_state.items():
for state_name, state in states.items():
state2 = after_optim_state.get(fqn).get(state_name)
if isinstance(state, ShardedTensor):
self.assertTrue(isinstance(state2, ShardedTensor))
self.assertTrue(torch.allclose(state, state2))
else:
self.assertEqual(state, state2)
instantiate_parametrized_tests(FsdpOptimStateCheckpoint)
if __name__ == "__main__":
run_tests()

View File

@ -516,16 +516,20 @@ class TestForeach(TestCase):
sum(ref((ref_tensors,), ord=ord)).backward()
self.assertEqual([t.grad for t in tensors], [t.grad for t in ref_tensors])
@dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool))
@dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16))
def test_add_scalar_with_empty_list_and_empty_tensor(self, device, dtype):
# TODO: enable empty list case
for tensors in [[torch.randn([0])]]:
for tensors in [[torch.randn([0], device=device, dtype=dtype)],
[torch.empty_strided((0, 1), (0, 0), dtype=dtype, device=device)]]:
res = torch._foreach_add(tensors, 1)
self.assertEqual(res, tensors)
torch._foreach_add_(tensors, 1)
self.assertEqual(res, tensors)
# Regression test for https://github.com/pytorch/pytorch/issues/113156
torch._foreach_mul_(tensors, 1)
@ops(
filter(lambda op: not op.has_no_out_of_place, foreach_binary_op_db),
dtypes=OpDTypes.supported,

View File

@ -1197,17 +1197,22 @@ class TestMPS(TestCaseMPS):
tensor_cpu = tensor_0[:][1].fill_(val)
self.assertEqual(tensor_mps, tensor_cpu)
self.assertEqual(tensor, tensor_0)
shape = [1, 10]
val = 0.0
tensor = torch.ones(shape, device="mps")
val_tensor_mps = torch.tensor(val, device="mps")
tensor_mps = tensor[:, 9].fill_(val_tensor_mps)
# Regression test for https://github.com/pytorch/pytorch/issues/114692
tensor[:, 5].fill_(val_tensor_mps)
tensor_0 = torch.ones(shape, device="cpu")
val_tensor_cpu = torch.tensor(val, device="cpu")
tensor_cpu = tensor_0[:, 9].fill_(val_tensor_cpu)
tensor_0[:, 5].fill_(val_tensor_cpu)
self.assertEqual(tensor_mps, tensor_cpu)
self.assertEqual(tensor_mps.to(device="cpu"), tensor_cpu)
self.assertEqual(tensor.to(device="cpu"), tensor_0)
def test_cdist_large(self, device="mps"):
for cm in ['use_mm_for_euclid_dist_if_necessary', 'use_mm_for_euclid_dist', 'donot_use_mm_for_euclid_dist']:
@ -7982,6 +7987,18 @@ class TestNNMPS(NNTestCase):
actual = F.conv2d(x, y, padding='valid')
self.assertEqual(expect.to('cpu'), actual.to('cpu'))
def test_conv2d_backward_collision(self):
# Test for https://github.com/pytorch/pytorch/issues/112998
x = torch.rand(1, 1, 10, 10, device="mps", requires_grad=True)
m1 = nn.Conv2d(1, 1, 3, stride=2, padding=1).to("mps")
m2 = nn.Conv2d(1, 1, 4, stride=2, padding=1).to("mps")
y1, y2 = m1(x), m2(x)
self.assertEqual(y1.shape, y2.shape)
y1.sum().backward()
# This used to crash with MPSNDArrayConvolutionA14.mm:4352: failed assertion
y2.sum().backward()
def test_gemm_permute_transpose(self):
batch_size = 32
n = 20

View File

@ -5609,6 +5609,18 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
out = F.cosine_similarity(input.to(torch.int8), input, dim=-1)
self.assertEqual(out, 1.)
# Check broadcasting #109333
a = torch.ones(2, 3, dtype=torch.float)
b = torch.ones(1, 1, dtype=torch.float)
out = F.cosine_similarity(a, b)
self.assertEqual(out, torch.ones(2, dtype=torch.float))
a = torch.ones(2, 3, dtype=torch.float)
b = torch.ones(1, dtype=torch.float)
out = F.cosine_similarity(a, b)
self.assertEqual(out, torch.ones(2, dtype=torch.float))
def test_grid_sample_error_checking(self):
input = torch.empty(1, 1, 2, 2)
grid = torch.empty(1, 1, 1, 2)

View File

@ -472,6 +472,21 @@ class TestNumPyInterop(TestCase):
else:
self.assertTrue(t == a)
@onlyCPU
def test_empty_tensors_interop(self, device):
x = torch.rand((), dtype=torch.float16)
y = torch.tensor(np.random.rand(0), dtype=torch.float16)
# Same can be achieved by running
# y = torch.empty_strided((0,), (0,), dtype=torch.float16)
# Regression test for https://github.com/pytorch/pytorch/issues/115068
self.assertEqual(torch.true_divide(x, y).shape, y.shape)
# Regression test for https://github.com/pytorch/pytorch/issues/115066
self.assertEqual(torch.mul(x, y).shape, y.shape)
# Regression test for https://github.com/pytorch/pytorch/issues/113037
self.assertEqual(torch.div(x, y, rounding_mode='floor').shape, y.shape)
instantiate_device_type_tests(TestNumPyInterop, globals())
if __name__ == '__main__':

View File

@ -1604,7 +1604,6 @@ symbolic_tensor_failures.update(symbolic_tensor_segfaults)
outplace_symbolic_tensor_failures = {
xfail('i0', ''), # aten.i0.default - couldn't find symbolic meta function/decomposition
xfail('masked_scatter', ''), # aten.masked_scatter.default - couldn't find symbolic meta function/decomposition
}
inplace_symbolic_tensor_failures = {

View File

@ -9151,6 +9151,13 @@ tensor([[[1.+1.j, 1.+1.j, 1.+1.j, ..., 1.+1.j, 1.+1.j, 1.+1.j],
t2 = t[0:0].view(0, 1)
self.assertEqual(t2.data_ptr(), 0)
def test_invalid_arg_error_handling(self) -> None:
""" Tests that errors from old TH functions are propagated back """
for invalid_val in [-1, 2**65]:
self.assertRaises(RuntimeError, lambda: torch.set_num_threads(invalid_val))
self.assertRaises(RuntimeError, lambda: torch.set_num_interop_threads(invalid_val))
# The following block extends TestTorch with negative dim wrapping tests
# FIXME: replace these with OpInfo sample inputs or systemic OpInfo tests
# Functions to test negative dimension wrapping

View File

@ -66,7 +66,7 @@ def fetch_and_cache(
def get_slow_tests(
dirpath: str, filename: str = SLOW_TESTS_FILE
) -> Optional[Dict[str, float]]:
url = "https://ossci-metrics.s3.amazonaws.com/slow-tests.json"
url = "https://ossci-metrics.s3.amazonaws.com/slow-tests.json?versionId=iWAOsEqlVH1mfs7w5A3KlyTalvubE4Ru"
try:
return fetch_and_cache(dirpath, filename, url, lambda x: x)
except Exception:
@ -98,7 +98,7 @@ def get_disabled_tests(
return disabled_test_from_issues
try:
url = "https://ossci-metrics.s3.amazonaws.com/disabled-tests-condensed.json"
url = "https://ossci-metrics.s3.amazonaws.com/disabled-tests-condensed.json?versionId=JMUOdxgUAeI4yXhzc.dJlCuxVrsfkZTj"
return fetch_and_cache(dirpath, filename, url, process_disabled_test)
except Exception:
print("Couldn't download test skip set, leaving all tests enabled...")

View File

@ -253,7 +253,7 @@ class EventList(list):
'"pid": "CPU functions", '
f'"id": {next_id}, '
f'"cat": "cpu_to_{device_name}", '
'"args": {{}}}}, '
'"args": {}}, '
)
# Note: use torch.profiler to get device kernel trace
next_id += 1

View File

@ -240,6 +240,7 @@ static PyObject* THPModule_getNumThreads(PyObject* module, PyObject* noargs) {
}
static PyObject* THPModule_setNumThreads(PyObject* module, PyObject* arg) {
HANDLE_TH_ERRORS
THPUtils_assert(
THPUtils_checkLong(arg),
"set_num_threads expects an int, "
@ -249,6 +250,7 @@ static PyObject* THPModule_setNumThreads(PyObject* module, PyObject* arg) {
THPUtils_assert(nthreads > 0, "set_num_threads expects a positive integer");
at::set_num_threads(nthreads);
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}
static PyObject* THPModule_getNumInteropThreads(
@ -260,6 +262,7 @@ static PyObject* THPModule_getNumInteropThreads(
static PyObject* THPModule_setNumInteropThreads(
PyObject* module,
PyObject* arg) {
HANDLE_TH_ERRORS
THPUtils_assert(
THPUtils_checkLong(arg),
"set_num_interop_threads expects an int, "
@ -270,6 +273,7 @@ static PyObject* THPModule_setNumInteropThreads(
nthreads > 0, "set_num_interop_threads expects a positive integer");
at::set_num_interop_threads(nthreads);
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}
PyObject* THPModule_setDefaultTensorType(PyObject* _unused, PyObject* type) {

View File

@ -22,6 +22,8 @@ from torch.distributed.checkpoint.metadata import (
TensorStorageMetadata,
ChunkStorageMetadata,
)
from torch.distributed.distributed_c10d import _get_default_group
from torch.distributed.fsdp._shard_utils import _create_chunk_sharded_tensor
from torch.distributed.checkpoint.planner_helpers import (
create_read_items_for_chunk_list,
_create_read_items,
@ -32,7 +34,8 @@ from torch.distributed._tensor import DTensor
from torch.distributed.checkpoint.default_planner import (
DefaultLoadPlanner,
)
from torch.distributed._shard.api import _shard_tensor
from torch.distributed.checkpoint.planner import LoadPlanner
from torch.distributed.checkpoint._nested_dict import unflatten_state_dict
from torch.distributed.checkpoint.utils import (
@ -212,6 +215,7 @@ def load_sharded_optimizer_state_dict(
model_state_dict: STATE_DICT_TYPE,
optimizer_key: str,
storage_reader: dist_cp.StorageReader,
planner: Optional[LoadPlanner] = None,
) -> STATE_DICT_TYPE:
"""
Loads a state_dict in conjunction with FSDP sharded optimizer state.
@ -291,8 +295,12 @@ def load_sharded_optimizer_state_dict(
if value.size.numel() == 1:
state_dict[key] = _alloc_tensor(value.properties, value.size, dp_pg_device_type)
elif dp_pg is None:
state_dict[key] = _shard_tensor(
_alloc_tensor(value.properties, value.size, dp_pg_device_type), sharding_spec
state_dict[key] = _create_chunk_sharded_tensor(
_alloc_tensor(value.properties, value.size, dp_pg_device_type),
rank=dist.get_rank(),
world_size=dist.get_world_size(),
num_devices_per_node=device_module.device_count(),
pg=_get_default_group(),
)
else:
spec_key = key_path[2]
@ -337,7 +345,7 @@ def load_sharded_optimizer_state_dict(
state_dict=state_dict,
storage_reader=storage_reader,
# FIXME the type of planner is wrong in load_state_dict
planner=_ReaderWithOffset(fqn_to_offset) if dp_pg is not None else None,
planner=_ReaderWithOffset(fqn_to_offset) if dp_pg is not None else planner,
)
state_dict = unflatten_state_dict(state_dict, metadata.planner_data)