Compare commits

...

3 Commits

Author SHA1 Message Date
7e8021f878 Automated submodule update: FBGEMM 2025-11-15 14:28:42 -08:00
79fc0a9141 [xpu][fix]Fall back deterministic index_copy to index_put on XPU (#167830)
A minor update to the deterministic-behavior check in the `index_copy_out` implementation. This change ensures that deterministic `index_copy` is dispatched to `index_put` not only for CUDA tensors but also for XPU tensors.
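A minimal sketch of the equivalence this dispatch relies on (illustrative shapes, dim=0, CPU-only; the variable names are not from the PR):

```python
import torch

# Illustrative only: index_copy_ along dim 0 writes the same elements as
# index_put_ with the index tensor in slot 0 of the indices list, which is
# the equivalence the deterministic fallback relies on.
base = torch.zeros(5, 3)
index = torch.tensor([0, 4, 2])
source = torch.arange(9, dtype=torch.float32).reshape(3, 3)

via_copy = base.clone().index_copy_(0, index, source)
via_put = base.clone().index_put_((index,), source)
assert torch.equal(via_copy, via_put)
```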

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167830
Approved by: https://github.com/guangyey, https://github.com/ezyang
2025-11-15 18:09:25 +00:00
d01a7b0241 Back out "MatMul - fix folding logic" (#167884)
Summary:
On specific hardware (A100), Autocast generates a relatively large error on Transformer (torch.nn.TransformerEncoder) when the no_grad decorator is used with dim=256 (and presumably larger).

H100 seems fine, as does A100 with MIG (i.e., with fewer than the full set of SMs).

Backing this out for now; revisiting next week.
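A hypothetical repro sketch of the reported mismatch; the model size, sequence length, and dtype below are illustrative assumptions, and the large gap is only expected on full-SM A100s:

```python
import torch

# Hypothetical repro: compare the autocast output under no_grad against an
# fp32 baseline; the gap was reported to be unexpectedly large on A100.
layer = torch.nn.TransformerEncoderLayer(d_model=256, nhead=8, batch_first=True)
model = torch.nn.TransformerEncoder(layer, num_layers=2).cuda().eval()
x = torch.randn(4, 128, 256, device="cuda")

with torch.no_grad():
    ref = model(x)
    with torch.autocast("cuda", dtype=torch.float16):
        out = model(x)

print((out.float() - ref).abs().max())
```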

Test Plan:
failed jobs:
https://fburl.com/scuba/remote_execution_action/jzcmujgk

Reviewed By: t-ivan-gr

Differential Revision: D87111518

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167884
Approved by: https://github.com/malfet
2025-11-15 08:29:08 +00:00
4 changed files with 19 additions and 9 deletions

View File

@@ -1936,7 +1936,7 @@ static bool should_fold(const Tensor& tensor1, const Tensor& tensor2, bool has_o
// We order the tensors. t1 will be the larger tensor
// We can always transpose tensor2 as the dimensions are always >= 1 (precondition from matmul)
// and tensor1_larger iff tensor2.dim() > tensor1.dim()
const auto t1 = tensor1_larger ? MaybeOwned<Tensor>::borrowed(tensor1)
: MaybeOwned<Tensor>::owned(tensor2.mT());
const int64_t dim_t1 = t1->dim();
@@ -1948,11 +1948,20 @@ static bool should_fold(const Tensor& tensor1, const Tensor& tensor2, bool has_o
return false;
}
-  // If we require a gradient, we should fold to minimize backward memory usage - even if this
-  // leads to a copy in the forward because it is needed in the backward;
-  // the only time we avoid this is with strict pre-allocated memory usage (has_out = True)
-  bool requires_grad = tensor1.requires_grad() || tensor2.requires_grad();
-  if (requires_grad && !has_out) {
+  // In this case we *do* incur an extra copy to avoid creating an unnecessarily large tensor in the backward
+  // Suppose we don't fold here. Let t1.shape = [b, m, n] and t2.shape = [n, k], like in a transformer
+  // t2 will be expanded to a tensor of shape [b, n, k] and then we do t1.bmm(t2_expanded)
+  // The issue appears in the backward.
+  // The output gradient g of this operation would have shape [b, m, k]
+  // The backward wrt. t2 of bmm would be given by t1.mH @ g, which has shape [b, n, k]
+  // Then, the backward of expand is simply `sum(0)`. As such, we are instantiating a tensor
+  // of shape [b, n, k] unnecessarily, which may cause a large memory footprint and, in the
+  // worst case, an OOM
+  bool t2_requires_grad = tensor1_larger ? tensor2.requires_grad() : tensor1.requires_grad();
+  if (t2_requires_grad && !has_out) {
// We should be checking !at::GradMode::is_enabled(), but apparently
// this regresses performance in some cases:
// https://github.com/pytorch/pytorch/issues/118548#issuecomment-1916022394
return true;
}
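A small sketch (illustrative shapes, not from the PR) of the memory argument in the restored comment: folding performs a single mm over a reshaped t1, so the backward for t2 never materializes a [b, n, k] tensor:

```python
import torch

# With t1.shape = [b, m, n] and t2.shape = [n, k], the folded path computes
# one mm over a reshaped t1 instead of a bmm against a broadcast t2, so the
# gradient for t2 stays [n, k] rather than passing through a [b, n, k] buffer.
b, m, n, k = 8, 32, 64, 16
t1 = torch.randn(b, m, n)
t2 = torch.randn(n, k, requires_grad=True)

folded = (t1.reshape(b * m, n) @ t2).reshape(b, m, k)
unfolded = t1 @ t2  # matmul conceptually broadcasts t2 to [b, n, k]
assert torch.allclose(folded, unfolded, atol=1e-5)

folded.sum().backward()
print(t2.grad.shape)  # torch.Size([64, 16]) -- no [b, n, k] intermediate
```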

View File

@@ -1087,7 +1087,8 @@ TORCH_IMPL_FUNC(index_copy_out)
result.copy_(self);
// See Note [Enabling Deterministic Operations]
-  if (result.is_cuda() && globalContext().deterministicAlgorithms()) {
+  if ((result.is_cuda() || result.is_xpu()) &&
+      globalContext().deterministicAlgorithms()) {
torch::List<std::optional<Tensor>> indices;
indices.resize(dim + 1);
indices.set(dim, index);
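For a non-leading dim, the snippet above pads the indices list up to `dim` before setting the index; a hedged Python illustration of the same semantics via advanced indexing:

```python
import torch

# Illustration of the dim=1 case: the C++ above builds an indices list of
# length dim + 1 with `index` in the last slot, i.e. x[:, index] = source.
x = torch.zeros(2, 5)
index = torch.tensor([0, 3])
source = torch.randn(2, 2)

via_copy = x.clone().index_copy_(1, index, source)
via_index = x.clone()
via_index[:, index] = source
assert torch.equal(via_copy, via_index)
```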

View File

@@ -4535,7 +4535,7 @@ def should_fold(tensor1: torch.Tensor, tensor2: torch.Tensor, is_out: bool) -> b
if not (t1.ndim >= 3 and t2.ndim <= 2):
return False
-    if (t1.requires_grad or t2.requires_grad) and not is_out:
+    if t2.requires_grad and not is_out:
return True
if tensor1.ndim == 2:
return False
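A hedged example (the shapes are assumptions, not from the PR) of a call that satisfies the restored predicate: a 3-D operand against a 2-D weight that requires grad, with no out= tensor:

```python
import torch

# With t1.ndim >= 3, t2.ndim == 2, t2.requires_grad, and no out= tensor,
# the decomposition folds the matmul into a single mm over a reshaped t1.
t1 = torch.randn(4, 8, 16)
t2 = torch.randn(16, 32, requires_grad=True)

out = torch.matmul(t1, t2)  # shape [4, 8, 32], eligible for folding
out.sum().backward()
print(t2.grad.shape)  # torch.Size([16, 32])
```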