[HPU] Add HPU as a supported device for NestedTensor (#148659)

This change enables basic NestedTensor operations on HPU and fixes the runtime error previously raised when creating a NestedTensor on an HPU device (see the usage sketch after the list below).

- Extended `NestedTensorImpl` to recognize `hpu` as a valid storage device.
- Added `NestedTensorHPU` to `DispatchKey` parsing in `DispatchKey.cpp`.
- Updated `torchgen/model.py` to include `NestedTensorHPU` in `dispatch_keys`.
- Modified `native_functions.yaml` to enable `NestedTensorHPU` support for the affected ops.
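
A minimal usage sketch of what this enables, assuming an HPU-enabled PyTorch build with the habana_frameworks plugin available (the plugin import and device availability are assumptions, not part of this PR):

    import torch
    import habana_frameworks.torch  # registers the "hpu" device (assumption)

    a = torch.randn(2, 3, device="hpu")
    b = torch.randn(4, 3, device="hpu")
    # Previously this constructor tripped the storage-device assert in
    # NestedTensorImpl and raised a runtime error; HPU storage is now accepted.
    nt = torch.nested.nested_tensor([a, b], device="hpu")
    print(nt.is_nested, nt.device)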

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/148659
Approved by: https://github.com/jeromean, https://github.com/albanD, https://github.com/sujoysaraswati
Author: Nitin Singh
Date: 2025-04-14 03:42:31 +00:00
Committed by: PyTorch MergeBot
Parent: 9aca00102f
Commit: 9458b83729
4 changed files with 91 additions and 89 deletions

aten/src/ATen/NestedTensorImpl.cpp

@ -182,7 +182,7 @@ NestedTensorImpl::NestedTensorImpl(
"coverage, and works with torch.compile.");
auto storage_device = storage_.device();
TORCH_INTERNAL_ASSERT(
storage_device.is_cpu() || storage_device.is_cuda() || storage_device.is_xpu() || storage_device.is_privateuseone(),
storage_device.is_cpu() || storage_device.is_cuda() || storage_device.is_xpu() || storage_device.is_hpu() || storage_device.is_privateuseone(),
"NestedTensorImpl storage must be either CUDA, CPU, XPU or ", get_privateuse1_backend(), " but got ",
storage_device);
validate_nested_tensor_metadata(nested_sizes_, nested_strides_, storage_offsets_);

aten/src/ATen/native/native_functions.yaml

@ -288,13 +288,13 @@
dispatch:
CPU: native_dropout_cpu
CUDA: native_dropout_cuda
NestedTensorCPU, NestedTensorCUDA: native_dropout_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: native_dropout_nested
tags: [nondeterministic_seeded, core]
autogen: native_dropout.out
- func: native_dropout_backward(Tensor grad_output, Tensor mask, float scale) -> Tensor
dispatch:
CPU, NestedTensorCPU, NestedTensorCUDA: native_dropout_backward
CPU, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: native_dropout_backward
CUDA: native_dropout_backward_cuda
autogen: native_dropout_backward.out
tags: pointwise
@ -342,7 +342,7 @@
CompositeExplicitAutograd: abs
SparseCPU, SparseCUDA: abs_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr
NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs
tags: [core, pointwise]
- func: abs_(Tensor(a!) self) -> Tensor(a!)
@ -352,7 +352,7 @@
CompositeExplicitAutograd: abs_
SparseCPU, SparseCUDA: abs_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs_
- func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
@ -431,7 +431,7 @@
dispatch:
SparseCPU, SparseCUDA: sgn_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr
NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn
tags: pointwise
- func: sgn_(Tensor(a!) self) -> Tensor(a!)
@ -440,7 +440,7 @@
dispatch:
SparseCPU, SparseCUDA: sgn_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn_
tags: pointwise
- func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -560,7 +560,7 @@
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr
MkldnnCPU: mkldnn_add
ZeroTensor: add_zerotensor
NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_add_Tensor
tags: [core, pointwise]
- func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
@ -571,7 +571,7 @@
SparseCPU, SparseCUDA, SparseMeta: add_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_
MkldnnCPU: mkldnn_add_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_add__Tensor
tags: pointwise
- func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
@ -703,7 +703,7 @@
structured_delegate: all.out
variants: function, method
dispatch:
NestedTensorCPU, NestedTensorCUDA: NestedTensor_all
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_all
- func: all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
@ -1262,7 +1262,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: logical_not
NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_logical_not
tags: [core, pointwise]
- func: logical_not_(Tensor(a!) self) -> Tensor(a!)
@ -1270,7 +1270,7 @@
variants: method
dispatch:
CompositeExplicitAutograd: logical_not_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_logical_not_
tags: pointwise
- func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -1394,7 +1394,7 @@
dispatch:
SparseCPU, SparseCUDA: cat_sparse
QuantizedCPU: cat_quantized_cpu
NestedTensorCPU, NestedTensorCUDA: cat_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: cat_nested
tags: core
- func: cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
@ -1482,7 +1482,7 @@
device_guard: False
dispatch:
CompositeImplicitAutograd: chunk
NestedTensorCPU, NestedTensorCUDA: chunk_nested_tensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: chunk_nested_tensor
- func: tensor_split.sections(Tensor(a -> *) self, SymInt sections, int dim=0) -> Tensor(a)[]
variants: function, method
@ -1779,7 +1779,7 @@
SparseCPU, SparseCUDA: copy_sparse_wrapper_
CompositeExplicitAutograd: copy_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: copy_sparse_compressed_
NestedTensorCPU, NestedTensorCUDA: copy_nested_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: copy_nested_
autogen: copy.out
- func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor
@ -1799,7 +1799,7 @@
variants: function, method
structured_delegate: cos.out
dispatch:
NestedTensorCPU, NestedTensorCUDA: NestedTensor_cos
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_cos
tags: [core, pointwise]
- func: cos_(Tensor(a!) self) -> Tensor(a!)
@ -2137,7 +2137,7 @@
dispatch:
SparseCPU, SparseCUDA: div_sparse
ZeroTensor: div_zerotensor
NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Tensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_div_Tensor
tags: [core, pointwise]
- func: div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
@ -2190,7 +2190,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: div
NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Scalar
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_div_Scalar
tags: [core, pointwise]
- func: div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
@ -2290,7 +2290,7 @@
- func: embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor
dispatch:
CompositeExplicitAutograd: embedding_symint
NestedTensorCPU, NestedTensorCUDA: NestedTensor_embedding
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_embedding
autogen: embedding.out
tags: core
@ -2496,7 +2496,7 @@
QuantizedCPU, QuantizedCUDA: empty_like_quantized
SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr
NestedTensorCPU, NestedTensorCUDA: empty_like_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: empty_like_nested
autogen: empty_like.out
- func: empty_strided(SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@ -2701,7 +2701,7 @@
QuantizedCPU, QuantizedCUDA: fill_quantized_
Meta: fill_meta_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: fill_sparse_csr_
NestedTensorCPU, NestedTensorCUDA: fill_nested_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: fill_nested_
autogen: fill.Scalar_out
- func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!)
@ -2712,7 +2712,7 @@
MPS: fill_tensor_mps_
QuantizedCPU, QuantizedCUDA: fill_quantized_
Meta: fill_meta_
NestedTensorCPU, NestedTensorCUDA: fill_nested_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: fill_nested_
autogen: fill.Tensor_out
- func: floor(Tensor self) -> Tensor
@ -3187,7 +3187,7 @@
device_guard: False
dispatch:
CPU, CUDA, MPS: isnan
NestedTensorCPU, NestedTensorCUDA: NestedTensor_isnan
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isnan
SparseCPU, SparseCUDA: isnan_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isnan_sparse_csr
autogen: isnan.out
@ -3238,7 +3238,7 @@
device_check: NoCheck
device_guard: False
dispatch:
NestedTensorCPU, NestedTensorCUDA: nested_is_same_size
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_is_same_size
CompositeExplicitAutograd: is_same_size
- func: is_signed(Tensor self) -> bool
@ -3285,7 +3285,7 @@
CUDA: layer_norm_cuda
MPS: layer_norm_mps
CompositeExplicitAutograd: math_native_layer_norm
NestedTensorCPU, NestedTensorCUDA: nested_layer_norm
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_layer_norm
autogen: native_layer_norm.out
tags: core
@ -3294,7 +3294,7 @@
CPU: layer_norm_backward_cpu
CUDA: layer_norm_backward_cuda
MPS: layer_norm_backward_mps
NestedTensorCPU, NestedTensorCUDA: layer_norm_backward_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: layer_norm_backward_nested
autogen: native_layer_norm_backward.out
tags: core
@ -3327,12 +3327,12 @@
python_module: nn
dispatch:
CompositeImplicitAutograd: linear
NestedTensorCPU, NestedTensorCUDA: nested_linear
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_linear
MPS: _mps_linear
- func: linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
dispatch:
NestedTensorCPU, NestedTensorCUDA: nested_linear_backward
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_linear_backward
MPS: mps_linear_backward
autogen: linear_backward.out
@ -3771,17 +3771,17 @@
variants: function, method
dispatch:
CompositeImplicitAutograd: matmul
NestedTensorCPU, NestedTensorCUDA: matmul_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: matmul_nested
- func: matmul_backward(Tensor grad, Tensor self, Tensor other, bool[2] mask) -> (Tensor, Tensor)
dispatch:
NestedTensorCPU, NestedTensorCUDA: matmul_backward_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: matmul_backward_nested
autogen: matmul_backward.out
- func: matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CompositeImplicitAutograd: matmul_out
NestedTensorCPU, NestedTensorCUDA: matmul_out_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: matmul_out_nested
# Alias to linalg.matrix_power
- func: matrix_power(Tensor self, int n) -> Tensor
@ -4227,7 +4227,7 @@
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr
MkldnnCPU: mkldnn_mul
ZeroTensor: mul_zerotensor
NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Tensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul_Tensor
tags: [core, pointwise]
- func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
@ -4238,7 +4238,7 @@
SparseCPU, SparseCUDA: mul_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr_
MkldnnCPU: mkldnn_mul_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Tensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul__Tensor
tags: pointwise
- func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@ -4261,7 +4261,7 @@
dispatch:
CompositeExplicitAutograd: mul
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_scalar_sparse_csr
NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Scalar
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul_Scalar
tags: [core, pointwise]
- func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
@ -4270,7 +4270,7 @@
dispatch:
CompositeExplicitAutograd: mul_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul__scalar_sparse_csr
NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Scalar
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul__Scalar
autogen: mul.Scalar_out
tags: pointwise
# multiply, alias for mul
@ -4336,7 +4336,7 @@
device_guard: False
dispatch:
CompositeImplicitAutograd: narrow_symint
NestedTensorCPU, NestedTensorCUDA: narrow_nested_symint
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: narrow_nested_symint
- func: narrow.Tensor(Tensor(a) self, int dim, Tensor start, SymInt length) -> Tensor(a)
variants: function, method
@ -4475,7 +4475,7 @@
# NB: Although this composite mutates on the inside, it is
# non-differentiable so NonFunctional doesn't apply
CompositeExplicitAutograd: ones_like
NestedTensorCPU, NestedTensorCUDA: ones_like
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: ones_like
autogen: ones_like.out
- func: pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor
@ -4877,7 +4877,7 @@
dispatch:
SparseCPU, SparseCUDA: neg_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr
NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg
tags: [core, pointwise]
- func: neg_(Tensor(a!) self) -> Tensor(a!)
@ -4887,7 +4887,7 @@
dispatch:
SparseCPU, SparseCUDA: neg_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg_
tags: pointwise
- func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -5041,7 +5041,7 @@
MkldnnCPU: mkldnn_relu
QuantizedCPU: relu_quantized_cpu
QuantizedCUDA: relu_quantized_cuda
NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu
SparseCPU, SparseCUDA: relu_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr
tags: [core, pointwise]
@ -5055,7 +5055,7 @@
MkldnnCPU: mkldnn_relu_
QuantizedCPU: relu_quantized_cpu_
QuantizedCUDA: relu_quantized_cuda_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu_
SparseCPU, SparseCUDA: relu_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr_
autogen: relu.out
@ -5101,7 +5101,7 @@
python_module: nn
dispatch:
QuantizedCPU: gelu_quantized_cpu_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_gelu_
- func: gelu(Tensor self, *, str approximate='none') -> Tensor
structured_delegate: gelu.out
@ -5111,7 +5111,7 @@
MkldnnCPU: mkldnn_gelu
QuantizedCPU: gelu_quantized_cpu
QuantizedCUDA: gelu_quantized_cuda
NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_gelu
tags: [core, pointwise]
- func: gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!)
@ -5128,7 +5128,7 @@
python_module: nn
dispatch:
MkldnnCPU: mkldnn_gelu_backward
NestedTensorCPU, NestedTensorCUDA: gelu_backwards_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: gelu_backwards_nested
tags: pointwise
- func: infinitely_differentiable_gelu_backward(Tensor grad, Tensor self) -> Tensor
@ -5193,7 +5193,7 @@
dispatch:
CompositeExplicitAutograd: select_symint
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: select_sparse_csr
NestedTensorCPU, NestedTensorCUDA: select_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: select_nested
tags: core
- func: select_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index) -> Tensor
@ -5209,7 +5209,7 @@
device_check: NoCheck
device_guard: False
dispatch:
NestedTensorCPU, NestedTensorCUDA: _nested_select_backward_symint
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _nested_select_backward_symint
- func: selu(Tensor self) -> Tensor
device_check: NoCheck # TensorIterator
@ -5234,14 +5234,14 @@
structured_delegate: silu.out
python_module: nn
dispatch:
NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_silu
tags: pointwise
- func: silu_(Tensor(a!) self) -> Tensor(a!)
structured_delegate: silu.out
python_module: nn
dispatch:
NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_silu_
tags: pointwise
- func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -5267,7 +5267,7 @@
python_module: nn
dispatch:
CompositeImplicitAutograd: math_silu_backward
NestedTensorCPU, NestedTensorCUDA: silu_backward_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: silu_backward_nested
tags: pointwise
- func: mish(Tensor self) -> Tensor
@ -5346,7 +5346,7 @@
dispatch:
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr
SparseCPU, SparseCUDA: sin_sparse
NestedTensorCPU, NestedTensorCUDA: NestedTensor_sin
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sin
tags: [core, pointwise]
- func: sin_(Tensor(a!) self) -> Tensor(a!)
@ -5430,7 +5430,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: detach
NestedTensorCPU, NestedTensorCUDA: detach
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: detach
# Like `detach()`, but modifies this `Variable` in-place. This method may
# only be called on non-view `Variable`s. You can use `is_view()` to check
@ -5560,7 +5560,7 @@
structured_delegate: _softmax.out
dispatch:
MkldnnCPU: mkldnn_softmax
NestedTensorCPU, NestedTensorCUDA: softmax_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: softmax_nested
tags: core
- func: _softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
@ -5573,7 +5573,7 @@
- func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor
structured_delegate: _softmax_backward_data.out
dispatch:
NestedTensorCPU, NestedTensorCUDA: nested_softmax_backward
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_softmax_backward
- func: _softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) grad_input) -> Tensor(a!)
structured: True
@ -5617,7 +5617,7 @@
device_guard: False
dispatch:
CompositeExplicitAutograd: split_with_sizes
NestedTensorCPU, NestedTensorCUDA: split_with_sizes_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: split_with_sizes_nested
tags: core
- func: hsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]
@ -5645,7 +5645,7 @@
dispatch:
CompositeExplicitAutograd: squeeze
QuantizedCPU, QuantizedCUDA: squeeze_quantized
NestedTensorCPU, NestedTensorCUDA: squeeze_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: squeeze_nested
- func: squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)
variants: function, method
@ -5654,7 +5654,7 @@
dispatch:
CompositeExplicitAutograd: squeeze
QuantizedCPU, QuantizedCUDA: squeeze_quantized
NestedTensorCPU, NestedTensorCUDA: squeeze_dim_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: squeeze_dim_nested
tags: core
- func: squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a)
@ -5670,7 +5670,7 @@
dispatch:
CompositeExplicitAutograd: squeeze
QuantizedCPU, QuantizedCUDA: squeeze_quantized
NestedTensorCPU, NestedTensorCUDA: squeeze_dim_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: squeeze_dim_nested
tags: core
- func: squeeze_(Tensor(a!) self) -> Tensor(a!)
@ -5844,7 +5844,7 @@
structured_delegate: sqrt.out
variants: function, method
dispatch:
NestedTensorCPU, NestedTensorCUDA: NestedTensor_sqrt
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sqrt
SparseCPU, SparseCUDA: sqrt_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr
tags: [core, pointwise]
@ -6035,7 +6035,7 @@
MkldnnCPU: mkldnn_tanh
SparseCPU, SparseCUDA: tanh_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr
NestedTensorCPU, NestedTensorCUDA: NestedTensor_tanh
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh
tags: [core, pointwise]
- func: tanh_(Tensor(a!) self) -> Tensor(a!)
@ -6046,7 +6046,7 @@
MkldnnCPU: mkldnn_tanh_
SparseCPU, SparseCUDA: tanh_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_tanh_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh_
tags: pointwise
- func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -6103,7 +6103,7 @@
MkldnnCPU: mkldnn_relu_backward
SparseCPU, SparseCUDA: threshold_backward_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: threshold_backward_sparse_compressed
NestedTensorCPU, NestedTensorCUDA: threshold_backwards_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: threshold_backwards_nested
tags: pointwise
- func: tile(Tensor self, SymInt[] dims) -> Tensor
@ -6117,7 +6117,7 @@
device_guard: False
dispatch:
CompositeExplicitAutograd: transpose
NestedTensorCPU, NestedTensorCUDA: transpose_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: transpose_nested
- func: transpose.Dimname(Tensor(a) self, Dimname dim0, Dimname dim1) -> Tensor(a)
variants: function, method
@ -6214,13 +6214,13 @@
- func: _nested_tensor_size(Tensor self) -> Tensor
variants: method
dispatch:
NestedTensorCPU, NestedTensorCUDA: _nested_tensor_size
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _nested_tensor_size
autogen: _nested_tensor_size.out
- func: _nested_tensor_strides(Tensor self) -> Tensor
variants: method
dispatch:
NestedTensorCPU, NestedTensorCUDA: _nested_tensor_strides
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _nested_tensor_strides
autogen: _nested_tensor_strides.out
- func: _nested_tensor_storage_offsets(Tensor self) -> Tensor
@ -6233,7 +6233,7 @@
# _nested_from_padded_and_nested_example is available for testing.
- func: _nested_from_padded_and_nested_example(Tensor padded, Tensor nt_example) -> Tensor
dispatch:
NestedTensorCPU, NestedTensorCUDA: NestedTensor_from_padded_and_nested_example
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_from_padded_and_nested_example
autogen: _nested_from_padded_and_nested_example.out
# The input arguments' types to this functions are temporary. When nested tensors switch to using SymInts for their metadata representation
@ -6424,7 +6424,7 @@
CompositeExplicitAutograd: unsqueeze
SparseCPU, SparseCUDA: unsqueeze_sparse
QuantizedCPU, QuantizedCUDA: unsqueeze_quantized
NestedTensorCPU, NestedTensorCUDA: unsqueeze_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: unsqueeze_nested
tags: core
- func: unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!)
@ -6519,14 +6519,14 @@
variants: function, method
dispatch:
CPU, CUDA, MPS: where
NestedTensorCPU, NestedTensorCUDA: NestedTensor_where
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_where
tags: [core, pointwise]
- func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA, MPS: where_self_out
NestedTensorCPU, NestedTensorCUDA: NestedTensor_where_out
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_where_out
- func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor
variants: function
@ -6861,7 +6861,7 @@
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: clone_sparse_compressed
MkldnnCPU: mkldnn_clone
QuantizedCPU, QuantizedCUDA: quantized_clone
NestedTensorCPU, NestedTensorCUDA: clone_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: clone_nested
autogen: clone.out
tags: [core, pointwise]
@ -6895,7 +6895,7 @@
SparseCPU, SparseCUDA, SparseMeta: zero_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: zero_sparse_csr_
MkldnnCPU: mkldnn_zero_
NestedTensorCPU, NestedTensorCUDA: zero_nested_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: zero_nested_
autogen: zero, zero.out
- func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
@ -6915,7 +6915,7 @@
dispatch:
SparseCPU, SparseCUDA: sub_sparse
ZeroTensor: sub_zerotensor
NestedTensorCPU, NestedTensorCUDA: NestedTensor_sub_Tensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sub_Tensor
tags: [core, pointwise]
- func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
@ -7405,7 +7405,7 @@
dispatch:
SparseCPU, SparseCUDA, SparseMeta: values_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: values_sparse_csr
NestedTensorCPU, NestedTensorCUDA: values_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: values_nested
CompositeExplicitAutograd: values_default
device_check: NoCheck
device_guard: False
@ -7464,7 +7464,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: unbind
NestedTensorCPU, NestedTensorCUDA: NestedTensor_unbind
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_unbind
- func: unbind.Dimname(Tensor(a -> *) self, Dimname dim) -> Tensor(a)[]
variants: function, method
@ -7752,7 +7752,7 @@
device_guard: False
dispatch:
CompositeExplicitAutograd: _to_copy
NestedTensorCPU, NestedTensorCUDA: _to_copy_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _to_copy_nested
autogen: _to_copy.out
tags: core
@ -8038,7 +8038,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: masked_fill
NestedTensorCPU, NestedTensorCUDA: NestedTensor_masked_fill
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_masked_fill
tags: pointwise
- func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!)
@ -8095,7 +8095,7 @@
dispatch:
ZeroTensor, Meta, CPU, CUDA, QuantizedCPU, QuantizedCUDA, MPS: view
MkldnnCPU: mkldnn_view
NestedTensorCPU, NestedTensorCUDA: view_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: view_nested
tags: core
# Warning: If you want to change the name or overload name of this
@ -8936,7 +8936,7 @@
variants: method, function
dispatch:
QuantizedCPU: eq_quantized_cpu
NestedTensorCPU, NestedTensorCUDA: eq_scalar_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: eq_scalar_nested
tags: [core, pointwise]
- func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@ -8955,7 +8955,7 @@
variants: method, function
dispatch:
QuantizedCPU: eq_quantized_cpu
NestedTensorCPU, NestedTensorCUDA: eq_tensor_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: eq_tensor_nested
tags: [core, pointwise]
- func: ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
@ -8974,7 +8974,7 @@
variants: method, function
dispatch:
QuantizedCPU: ge_quantized_cpu
NestedTensorCPU, NestedTensorCUDA: ge_scalar_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: ge_scalar_nested
tags: [core, pointwise]
- func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@ -9101,7 +9101,7 @@
variants: method, function
dispatch:
QuantizedCPU: gt_quantized_cpu
NestedTensorCPU, NestedTensorCUDA: gt_scalar_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: gt_scalar_nested
tags: [core, pointwise]
- func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@ -10324,7 +10324,7 @@
MPS: normal_mps_
Meta: normal_meta_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: normal_sparse_csr_
NestedTensorCPU, NestedTensorCUDA: normal_nested_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: normal_nested_
autogen: normal.out
# Only used by the functionalization pass.
@ -10392,7 +10392,7 @@
variants: method, function
dispatch:
CompositeExplicitAutograd: alias
NestedTensorCPU, NestedTensorCUDA: alias_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: alias_nested
tags: core
- func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> ()
@ -13169,7 +13169,7 @@
device_guard: False
dispatch:
CompositeExplicitAutograd: isinf
NestedTensorCPU, NestedTensorCUDA: NestedTensor_isinf
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isinf
SparseCPU, SparseCUDA: isinf_sparse
SparseMeta: isinf_sparse_meta
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isinf_sparse_csr
@ -13185,7 +13185,7 @@
variants: function, method
structured_delegate: isposinf.out
dispatch:
NestedTensorCPU, NestedTensorCUDA: NestedTensor_isposinf
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isposinf
SparseCPU, SparseCUDA: isposinf_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr
tags: pointwise
@ -13203,7 +13203,7 @@
variants: function, method
structured_delegate: isneginf.out
dispatch:
NestedTensorCPU, NestedTensorCUDA: NestedTensor_isneginf
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isneginf
SparseCPU, SparseCUDA: isneginf_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr
tags: pointwise
@ -14479,13 +14479,13 @@
dispatch:
# the NestedTensor keys are necessary because NestedTensor has been removed
# from the CompositeExplicitAutograd keyset see Note [NestedTensor Not Included in Backend Keys]
CompositeExplicitAutograd, NestedTensorCPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_fullcoverage
CompositeExplicitAutograd, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_fullcoverage
autogen: _test_autograd_multiple_dispatch.fullcoverage_out
# Note: this function is only for testing.
- func: _test_autograd_multiple_dispatch.ntonly(Tensor self, bool b) -> Tensor
dispatch:
CompositeImplicitAutograd, NestedTensorCPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_ntonly
CompositeImplicitAutograd, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_ntonly
# Note: this function is only for testing.
- func: _test_autograd_multiple_dispatch_view(Tensor(a) self) -> Tensor(a)
@ -14830,13 +14830,13 @@
- func: _safe_softmax(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
dispatch:
CompositeExplicitAutograd: _safe_softmax
NestedTensorCPU, NestedTensorCUDA: _safe_softmax
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _safe_softmax
# Apparently, putting "forward" in the name will cause Python bindings to be skipped, so "fwd" it is.
- func: _transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None) -> Tensor
variants: function
dispatch:
CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: transformer_encoder_layer_forward
CPU, CUDA, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: transformer_encoder_layer_forward
autogen: _transformer_encoder_layer_fwd.out
- func: _native_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True, int? mask_type=None) -> (Tensor, Tensor)
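
These entries route the listed ops to the same nested kernels already used for NestedTensorCPU and NestedTensorCUDA. A short sketch exercising a few of the newly routed ops, assuming an HPU-enabled build (actual HPU kernel coverage for each op is an assumption here):

    import torch

    nt = torch.nested.nested_tensor(
        [torch.randn(2, 8), torch.randn(5, 8)], device="hpu"
    )
    out = torch.nn.functional.relu(nt)               # NestedTensor_relu
    out = torch.nn.functional.softmax(nt, dim=-1)    # softmax_nested via _softmax
    w = torch.randn(4, 8, device="hpu")
    out = torch.nn.functional.linear(nt, w)          # nested_linear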

c10/core/DispatchKey.cpp

@ -340,6 +340,7 @@ c10::DispatchKey parseDispatchKey(const std::string& k) {
{"NestedTensorCPU", c10::DispatchKey::NestedTensorCPU},
{"NestedTensorCUDA", c10::DispatchKey::NestedTensorCUDA},
{"NestedTensorXPU", c10::DispatchKey::NestedTensorXPU},
{"NestedTensorHPU", c10::DispatchKey::NestedTensorHPU},
{"NestedTensorMeta", c10::DispatchKey::NestedTensorMeta},
{"NestedTensorPrivateUse1", c10::DispatchKey::NestedTensorPrivateUse1},
{"PrivateUse1", c10::DispatchKey::PrivateUse1},

torchgen/model.py

@ -295,6 +295,7 @@ dispatch_keys = [
DispatchKey.NestedTensorCPU,
DispatchKey.NestedTensorCUDA,
DispatchKey.NestedTensorXPU,
DispatchKey.NestedTensorHPU,
# Meta is a magic key: it is automatically generated for structured
# kernels
DispatchKey.Meta,
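
With NestedTensorHPU listed in torchgen's dispatch_keys, the code generator emits registrations for native_functions.yaml entries that mention it (as in the hunks above). A quick sanity check, assuming torchgen is importable from a source checkout:

    from torchgen.model import DispatchKey, dispatch_keys

    assert DispatchKey.NestedTensorHPU in dispatch_keys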