[HPU] Add HPU as a supported device for NestedTensor (#148659)

This change enables basic NestedTensor operations on HPU and fixes the runtime error previously raised when creating a NestedTensor on an HPU device (see the usage sketch after the list below).

- Extended `NestedTensorImpl` to recognize `hpu` as a valid storage device.
- Added `NestedTensorHPU` to `DispatchKey` parsing in `DispatchKey.cpp`.
- Updated `torchgen/model.py` to include `NestedTensorHPU` in `dispatch_keys`.
- Modified `native_functions.yaml` to enable `NestedTensorHPU` support for the affected ops.
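
A minimal usage sketch of what this enables, assuming an HPU-enabled PyTorch build with the habana_frameworks plugin available (the plugin import and device availability are assumptions, not part of this PR):

    import torch
    import habana_frameworks.torch  # registers the "hpu" device (assumption)

    a = torch.randn(2, 3, device="hpu")
    b = torch.randn(4, 3, device="hpu")
    # Previously this constructor tripped the storage-device assert in
    # NestedTensorImpl and raised a runtime error; HPU storage is now accepted.
    nt = torch.nested.nested_tensor([a, b], device="hpu")
    print(nt.is_nested, nt.device)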

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/148659
Approved by: https://github.com/jeromean, https://github.com/albanD, https://github.com/sujoysaraswati
Author: Nitin Singh
Date: 2025-04-14 03:42:31 +00:00
Committed by: PyTorch MergeBot
Parent: 9aca00102f
Commit: 9458b83729
4 changed files with 91 additions and 89 deletions

aten/src/ATen/NestedTensorImpl.cpp

@ -182,7 +182,7 @@ NestedTensorImpl::NestedTensorImpl(
"coverage, and works with torch.compile.");
auto storage_device = storage_.device();
TORCH_INTERNAL_ASSERT(
storage_device.is_cpu() || storage_device.is_cuda() || storage_device.is_xpu() || storage_device.is_privateuseone(),
storage_device.is_cpu() || storage_device.is_cuda() || storage_device.is_xpu() || storage_device.is_hpu() || storage_device.is_privateuseone(),
"NestedTensorImpl storage must be either CUDA, CPU, XPU or ", get_privateuse1_backend(), " but got ",
storage_device);
validate_nested_tensor_metadata(nested_sizes_, nested_strides_, storage_offsets_);

aten/src/ATen/native/native_functions.yaml

@ -288,13 +288,13 @@
dispatch:
CPU: native_dropout_cpu
CUDA: native_dropout_cuda
NestedTensorCPU, NestedTensorCUDA: native_dropout_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: native_dropout_nested
tags: [nondeterministic_seeded, core]
autogen: native_dropout.out
- func: native_dropout_backward(Tensor grad_output, Tensor mask, float scale) -> Tensor
dispatch:
CPU, NestedTensorCPU, NestedTensorCUDA: native_dropout_backward
CPU, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: native_dropout_backward
CUDA: native_dropout_backward_cuda
autogen: native_dropout_backward.out
tags: pointwise
@ -342,7 +342,7 @@
CompositeExplicitAutograd: abs
SparseCPU, SparseCUDA: abs_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr
NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs
tags: [core, pointwise]
- func: abs_(Tensor(a!) self) -> Tensor(a!)
@ -352,7 +352,7 @@
CompositeExplicitAutograd: abs_
SparseCPU, SparseCUDA: abs_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs_
- func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
@ -431,7 +431,7 @@
dispatch:
SparseCPU, SparseCUDA: sgn_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr
NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn
tags: pointwise
- func: sgn_(Tensor(a!) self) -> Tensor(a!)
@ -440,7 +440,7 @@
dispatch:
SparseCPU, SparseCUDA: sgn_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn_
tags: pointwise
- func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -560,7 +560,7 @@
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr
MkldnnCPU: mkldnn_add
ZeroTensor: add_zerotensor
NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_add_Tensor
tags: [core, pointwise]
- func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
@ -571,7 +571,7 @@
SparseCPU, SparseCUDA, SparseMeta: add_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_
MkldnnCPU: mkldnn_add_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_add__Tensor
tags: pointwise
- func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
@ -703,7 +703,7 @@
structured_delegate: all.out
variants: function, method
dispatch:
NestedTensorCPU, NestedTensorCUDA: NestedTensor_all
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_all
- func: all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
@ -1262,7 +1262,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: logical_not
NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_logical_not
tags: [core, pointwise]
- func: logical_not_(Tensor(a!) self) -> Tensor(a!)
@ -1270,7 +1270,7 @@
variants: method
dispatch:
CompositeExplicitAutograd: logical_not_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_logical_not_
tags: pointwise
- func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -1394,7 +1394,7 @@
dispatch:
SparseCPU, SparseCUDA: cat_sparse
QuantizedCPU: cat_quantized_cpu
NestedTensorCPU, NestedTensorCUDA: cat_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: cat_nested
tags: core
- func: cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
@ -1482,7 +1482,7 @@
device_guard: False
dispatch:
CompositeImplicitAutograd: chunk
NestedTensorCPU, NestedTensorCUDA: chunk_nested_tensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: chunk_nested_tensor
- func: tensor_split.sections(Tensor(a -> *) self, SymInt sections, int dim=0) -> Tensor(a)[]
variants: function, method
@ -1779,7 +1779,7 @@
SparseCPU, SparseCUDA: copy_sparse_wrapper_
CompositeExplicitAutograd: copy_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: copy_sparse_compressed_
NestedTensorCPU, NestedTensorCUDA: copy_nested_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: copy_nested_
autogen: copy.out
- func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor
@ -1799,7 +1799,7 @@
variants: function, method
structured_delegate: cos.out
dispatch:
NestedTensorCPU, NestedTensorCUDA: NestedTensor_cos
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_cos
tags: [core, pointwise]
- func: cos_(Tensor(a!) self) -> Tensor(a!)
@ -2137,7 +2137,7 @@
dispatch:
SparseCPU, SparseCUDA: div_sparse
ZeroTensor: div_zerotensor
NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Tensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_div_Tensor
tags: [core, pointwise]
- func: div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
@ -2190,7 +2190,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: div
NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Scalar
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_div_Scalar
tags: [core, pointwise]
- func: div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
@ -2290,7 +2290,7 @@
- func: embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor
dispatch:
CompositeExplicitAutograd: embedding_symint
NestedTensorCPU, NestedTensorCUDA: NestedTensor_embedding
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_embedding
autogen: embedding.out
tags: core
@ -2496,7 +2496,7 @@
QuantizedCPU, QuantizedCUDA: empty_like_quantized
SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr
NestedTensorCPU, NestedTensorCUDA: empty_like_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: empty_like_nested
autogen: empty_like.out
- func: empty_strided(SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@ -2701,7 +2701,7 @@
QuantizedCPU, QuantizedCUDA: fill_quantized_
Meta: fill_meta_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: fill_sparse_csr_
NestedTensorCPU, NestedTensorCUDA: fill_nested_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: fill_nested_
autogen: fill.Scalar_out
- func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!)
@ -2712,7 +2712,7 @@
MPS: fill_tensor_mps_
QuantizedCPU, QuantizedCUDA: fill_quantized_
Meta: fill_meta_
NestedTensorCPU, NestedTensorCUDA: fill_nested_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: fill_nested_
autogen: fill.Tensor_out
- func: floor(Tensor self) -> Tensor
@ -3187,7 +3187,7 @@
device_guard: False
dispatch:
CPU, CUDA, MPS: isnan
NestedTensorCPU, NestedTensorCUDA: NestedTensor_isnan
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isnan
SparseCPU, SparseCUDA: isnan_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isnan_sparse_csr
autogen: isnan.out
@ -3238,7 +3238,7 @@
device_check: NoCheck
device_guard: False
dispatch:
NestedTensorCPU, NestedTensorCUDA: nested_is_same_size
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_is_same_size
CompositeExplicitAutograd: is_same_size
- func: is_signed(Tensor self) -> bool
@ -3285,7 +3285,7 @@
CUDA: layer_norm_cuda
MPS: layer_norm_mps
CompositeExplicitAutograd: math_native_layer_norm
NestedTensorCPU, NestedTensorCUDA: nested_layer_norm
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_layer_norm
autogen: native_layer_norm.out
tags: core
@ -3294,7 +3294,7 @@
CPU: layer_norm_backward_cpu
CUDA: layer_norm_backward_cuda
MPS: layer_norm_backward_mps
NestedTensorCPU, NestedTensorCUDA: layer_norm_backward_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: layer_norm_backward_nested
autogen: native_layer_norm_backward.out
tags: core
@ -3327,12 +3327,12 @@
python_module: nn
dispatch:
CompositeImplicitAutograd: linear
NestedTensorCPU, NestedTensorCUDA: nested_linear
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_linear
MPS: _mps_linear
- func: linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
dispatch:
NestedTensorCPU, NestedTensorCUDA: nested_linear_backward
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_linear_backward
MPS: mps_linear_backward
autogen: linear_backward.out
@ -3771,17 +3771,17 @@
variants: function, method
dispatch:
CompositeImplicitAutograd: matmul
NestedTensorCPU, NestedTensorCUDA: matmul_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: matmul_nested
- func: matmul_backward(Tensor grad, Tensor self, Tensor other, bool[2] mask) -> (Tensor, Tensor)
dispatch:
NestedTensorCPU, NestedTensorCUDA: matmul_backward_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: matmul_backward_nested
autogen: matmul_backward.out
- func: matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CompositeImplicitAutograd: matmul_out
NestedTensorCPU, NestedTensorCUDA: matmul_out_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: matmul_out_nested
# Alias to linalg.matrix_power
- func: matrix_power(Tensor self, int n) -> Tensor
@ -4227,7 +4227,7 @@
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr
MkldnnCPU: mkldnn_mul
ZeroTensor: mul_zerotensor
NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Tensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul_Tensor
tags: [core, pointwise]
- func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
@ -4238,7 +4238,7 @@
SparseCPU, SparseCUDA: mul_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr_
MkldnnCPU: mkldnn_mul_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Tensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul__Tensor
tags: pointwise
- func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@ -4261,7 +4261,7 @@
dispatch:
CompositeExplicitAutograd: mul
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_scalar_sparse_csr
NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Scalar
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul_Scalar
tags: [core, pointwise]
- func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
@ -4270,7 +4270,7 @@
dispatch:
CompositeExplicitAutograd: mul_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul__scalar_sparse_csr
NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Scalar
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul__Scalar
autogen: mul.Scalar_out
tags: pointwise
# multiply, alias for mul
@ -4336,7 +4336,7 @@
device_guard: False
dispatch:
CompositeImplicitAutograd: narrow_symint
NestedTensorCPU, NestedTensorCUDA: narrow_nested_symint
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: narrow_nested_symint
- func: narrow.Tensor(Tensor(a) self, int dim, Tensor start, SymInt length) -> Tensor(a)
variants: function, method
@ -4475,7 +4475,7 @@
# NB: Although this composite mutates on the inside, it is
# non-differentiable so NonFunctional doesn't apply
CompositeExplicitAutograd: ones_like
NestedTensorCPU, NestedTensorCUDA: ones_like
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: ones_like
autogen: ones_like.out
- func: pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor
@ -4877,7 +4877,7 @@
dispatch:
SparseCPU, SparseCUDA: neg_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr
NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg
tags: [core, pointwise]
- func: neg_(Tensor(a!) self) -> Tensor(a!)
@ -4887,7 +4887,7 @@
dispatch:
SparseCPU, SparseCUDA: neg_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg_
tags: pointwise
- func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -5041,7 +5041,7 @@
MkldnnCPU: mkldnn_relu
QuantizedCPU: relu_quantized_cpu
QuantizedCUDA: relu_quantized_cuda
NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu
SparseCPU, SparseCUDA: relu_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr
tags: [core, pointwise]
@ -5055,7 +5055,7 @@
MkldnnCPU: mkldnn_relu_
QuantizedCPU: relu_quantized_cpu_
QuantizedCUDA: relu_quantized_cuda_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu_
SparseCPU, SparseCUDA: relu_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr_
autogen: relu.out
@ -5101,7 +5101,7 @@
python_module: nn
dispatch:
QuantizedCPU: gelu_quantized_cpu_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_gelu_
- func: gelu(Tensor self, *, str approximate='none') -> Tensor
structured_delegate: gelu.out
@ -5111,7 +5111,7 @@
MkldnnCPU: mkldnn_gelu
QuantizedCPU: gelu_quantized_cpu
QuantizedCUDA: gelu_quantized_cuda
NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_gelu
tags: [core, pointwise]
- func: gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!)
@ -5128,7 +5128,7 @@
python_module: nn
dispatch:
MkldnnCPU: mkldnn_gelu_backward
NestedTensorCPU, NestedTensorCUDA: gelu_backwards_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: gelu_backwards_nested
tags: pointwise
- func: infinitely_differentiable_gelu_backward(Tensor grad, Tensor self) -> Tensor
@ -5193,7 +5193,7 @@
dispatch:
CompositeExplicitAutograd: select_symint
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: select_sparse_csr
NestedTensorCPU, NestedTensorCUDA: select_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: select_nested
tags: core
- func: select_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index) -> Tensor
@ -5209,7 +5209,7 @@
device_check: NoCheck
device_guard: False
dispatch:
NestedTensorCPU, NestedTensorCUDA: _nested_select_backward_symint
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _nested_select_backward_symint
- func: selu(Tensor self) -> Tensor
device_check: NoCheck # TensorIterator
@ -5234,14 +5234,14 @@
structured_delegate: silu.out
python_module: nn
dispatch:
NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_silu
tags: pointwise
- func: silu_(Tensor(a!) self) -> Tensor(a!)
structured_delegate: silu.out
python_module: nn
dispatch:
NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_silu_
tags: pointwise
- func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -5267,7 +5267,7 @@
python_module: nn
dispatch:
CompositeImplicitAutograd: math_silu_backward
NestedTensorCPU, NestedTensorCUDA: silu_backward_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: silu_backward_nested
tags: pointwise
- func: mish(Tensor self) -> Tensor
@ -5346,7 +5346,7 @@
dispatch:
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr
SparseCPU, SparseCUDA: sin_sparse
NestedTensorCPU, NestedTensorCUDA: NestedTensor_sin
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sin
tags: [core, pointwise]
- func: sin_(Tensor(a!) self) -> Tensor(a!)
@ -5430,7 +5430,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: detach
NestedTensorCPU, NestedTensorCUDA: detach
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: detach
# Like `detach()`, but modifies this `Variable` in-place. This method may
# only be called on non-view `Variable`s. You can use `is_view()` to check
@ -5560,7 +5560,7 @@
structured_delegate: _softmax.out
dispatch:
MkldnnCPU: mkldnn_softmax
NestedTensorCPU, NestedTensorCUDA: softmax_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: softmax_nested
tags: core
- func: _softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
@ -5573,7 +5573,7 @@
- func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor
structured_delegate: _softmax_backward_data.out
dispatch:
NestedTensorCPU, NestedTensorCUDA: nested_softmax_backward
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_softmax_backward
- func: _softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) grad_input) -> Tensor(a!)
structured: True
@ -5617,7 +5617,7 @@
device_guard: False
dispatch:
CompositeExplicitAutograd: split_with_sizes
NestedTensorCPU, NestedTensorCUDA: split_with_sizes_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: split_with_sizes_nested
tags: core
- func: hsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]
@ -5645,7 +5645,7 @@
dispatch:
CompositeExplicitAutograd: squeeze
QuantizedCPU, QuantizedCUDA: squeeze_quantized
NestedTensorCPU, NestedTensorCUDA: squeeze_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: squeeze_nested
- func: squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)
variants: function, method
@ -5654,7 +5654,7 @@
dispatch:
CompositeExplicitAutograd: squeeze
QuantizedCPU, QuantizedCUDA: squeeze_quantized
NestedTensorCPU, NestedTensorCUDA: squeeze_dim_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: squeeze_dim_nested
tags: core
- func: squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a)
@ -5670,7 +5670,7 @@
dispatch:
CompositeExplicitAutograd: squeeze
QuantizedCPU, QuantizedCUDA: squeeze_quantized
NestedTensorCPU, NestedTensorCUDA: squeeze_dim_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: squeeze_dim_nested
tags: core
- func: squeeze_(Tensor(a!) self) -> Tensor(a!)
@ -5844,7 +5844,7 @@
structured_delegate: sqrt.out
variants: function, method
dispatch:
NestedTensorCPU, NestedTensorCUDA: NestedTensor_sqrt
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sqrt
SparseCPU, SparseCUDA: sqrt_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr
tags: [core, pointwise]
@ -6035,7 +6035,7 @@
MkldnnCPU: mkldnn_tanh
SparseCPU, SparseCUDA: tanh_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr
NestedTensorCPU, NestedTensorCUDA: NestedTensor_tanh
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh
tags: [core, pointwise]
- func: tanh_(Tensor(a!) self) -> Tensor(a!)
@ -6046,7 +6046,7 @@
MkldnnCPU: mkldnn_tanh_
SparseCPU, SparseCUDA: tanh_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_tanh_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh_
tags: pointwise
- func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -6103,7 +6103,7 @@
MkldnnCPU: mkldnn_relu_backward
SparseCPU, SparseCUDA: threshold_backward_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: threshold_backward_sparse_compressed
NestedTensorCPU, NestedTensorCUDA: threshold_backwards_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: threshold_backwards_nested
tags: pointwise
- func: tile(Tensor self, SymInt[] dims) -> Tensor
@ -6117,7 +6117,7 @@
device_guard: False
dispatch:
CompositeExplicitAutograd: transpose
NestedTensorCPU, NestedTensorCUDA: transpose_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: transpose_nested
- func: transpose.Dimname(Tensor(a) self, Dimname dim0, Dimname dim1) -> Tensor(a)
variants: function, method
@ -6214,13 +6214,13 @@
- func: _nested_tensor_size(Tensor self) -> Tensor
variants: method
dispatch:
NestedTensorCPU, NestedTensorCUDA: _nested_tensor_size
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _nested_tensor_size
autogen: _nested_tensor_size.out
- func: _nested_tensor_strides(Tensor self) -> Tensor
variants: method
dispatch:
NestedTensorCPU, NestedTensorCUDA: _nested_tensor_strides
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _nested_tensor_strides
autogen: _nested_tensor_strides.out
- func: _nested_tensor_storage_offsets(Tensor self) -> Tensor
@ -6233,7 +6233,7 @@
# _nested_from_padded_and_nested_example is available for testing.
- func: _nested_from_padded_and_nested_example(Tensor padded, Tensor nt_example) -> Tensor
dispatch:
NestedTensorCPU, NestedTensorCUDA: NestedTensor_from_padded_and_nested_example
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_from_padded_and_nested_example
autogen: _nested_from_padded_and_nested_example.out
# The input arguments' types to this functions are temporary. When nested tensors switch to using SymInts for their metadata representation
@ -6424,7 +6424,7 @@
CompositeExplicitAutograd: unsqueeze
SparseCPU, SparseCUDA: unsqueeze_sparse
QuantizedCPU, QuantizedCUDA: unsqueeze_quantized
NestedTensorCPU, NestedTensorCUDA: unsqueeze_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: unsqueeze_nested
tags: core
- func: unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!)
@ -6519,14 +6519,14 @@
variants: function, method
dispatch:
CPU, CUDA, MPS: where
NestedTensorCPU, NestedTensorCUDA: NestedTensor_where
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_where
tags: [core, pointwise]
- func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA, MPS: where_self_out
NestedTensorCPU, NestedTensorCUDA: NestedTensor_where_out
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_where_out
- func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor
variants: function
@ -6861,7 +6861,7 @@
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: clone_sparse_compressed
MkldnnCPU: mkldnn_clone
QuantizedCPU, QuantizedCUDA: quantized_clone
NestedTensorCPU, NestedTensorCUDA: clone_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: clone_nested
autogen: clone.out
tags: [core, pointwise]
@ -6895,7 +6895,7 @@
SparseCPU, SparseCUDA, SparseMeta: zero_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: zero_sparse_csr_
MkldnnCPU: mkldnn_zero_
NestedTensorCPU, NestedTensorCUDA: zero_nested_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: zero_nested_
autogen: zero, zero.out
- func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
@ -6915,7 +6915,7 @@
dispatch:
SparseCPU, SparseCUDA: sub_sparse
ZeroTensor: sub_zerotensor
NestedTensorCPU, NestedTensorCUDA: NestedTensor_sub_Tensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sub_Tensor
tags: [core, pointwise]
- func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
@ -7405,7 +7405,7 @@
dispatch:
SparseCPU, SparseCUDA, SparseMeta: values_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: values_sparse_csr
NestedTensorCPU, NestedTensorCUDA: values_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: values_nested
CompositeExplicitAutograd: values_default
device_check: NoCheck
device_guard: False
@ -7464,7 +7464,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: unbind
NestedTensorCPU, NestedTensorCUDA: NestedTensor_unbind
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_unbind
- func: unbind.Dimname(Tensor(a -> *) self, Dimname dim) -> Tensor(a)[]
variants: function, method
@ -7752,7 +7752,7 @@
device_guard: False
dispatch:
CompositeExplicitAutograd: _to_copy
NestedTensorCPU, NestedTensorCUDA: _to_copy_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _to_copy_nested
autogen: _to_copy.out
tags: core
@ -8038,7 +8038,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: masked_fill
NestedTensorCPU, NestedTensorCUDA: NestedTensor_masked_fill
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_masked_fill
tags: pointwise
- func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!)
@ -8095,7 +8095,7 @@
dispatch:
ZeroTensor, Meta, CPU, CUDA, QuantizedCPU, QuantizedCUDA, MPS: view
MkldnnCPU: mkldnn_view
NestedTensorCPU, NestedTensorCUDA: view_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: view_nested
tags: core
# Warning: If you want to change the name or overload name of this
@ -8936,7 +8936,7 @@
variants: method, function
dispatch:
QuantizedCPU: eq_quantized_cpu
NestedTensorCPU, NestedTensorCUDA: eq_scalar_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: eq_scalar_nested
tags: [core, pointwise]
- func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@ -8955,7 +8955,7 @@
variants: method, function
dispatch:
QuantizedCPU: eq_quantized_cpu
NestedTensorCPU, NestedTensorCUDA: eq_tensor_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: eq_tensor_nested
tags: [core, pointwise]
- func: ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
@ -8974,7 +8974,7 @@
variants: method, function
dispatch:
QuantizedCPU: ge_quantized_cpu
NestedTensorCPU, NestedTensorCUDA: ge_scalar_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: ge_scalar_nested
tags: [core, pointwise]
- func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@ -9101,7 +9101,7 @@
variants: method, function
dispatch:
QuantizedCPU: gt_quantized_cpu
NestedTensorCPU, NestedTensorCUDA: gt_scalar_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: gt_scalar_nested
tags: [core, pointwise]
- func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@ -10324,7 +10324,7 @@
MPS: normal_mps_
Meta: normal_meta_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: normal_sparse_csr_
NestedTensorCPU, NestedTensorCUDA: normal_nested_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: normal_nested_
autogen: normal.out
# Only used by the functionalization pass.
@ -10392,7 +10392,7 @@
variants: method, function
dispatch:
CompositeExplicitAutograd: alias
NestedTensorCPU, NestedTensorCUDA: alias_nested
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: alias_nested
tags: core
- func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> ()
@ -13169,7 +13169,7 @@
device_guard: False
dispatch:
CompositeExplicitAutograd: isinf
NestedTensorCPU, NestedTensorCUDA: NestedTensor_isinf
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isinf
SparseCPU, SparseCUDA: isinf_sparse
SparseMeta: isinf_sparse_meta
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isinf_sparse_csr
@ -13185,7 +13185,7 @@
variants: function, method
structured_delegate: isposinf.out
dispatch:
NestedTensorCPU, NestedTensorCUDA: NestedTensor_isposinf
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isposinf
SparseCPU, SparseCUDA: isposinf_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr
tags: pointwise
@ -13203,7 +13203,7 @@
variants: function, method
structured_delegate: isneginf.out
dispatch:
NestedTensorCPU, NestedTensorCUDA: NestedTensor_isneginf
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isneginf
SparseCPU, SparseCUDA: isneginf_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr
tags: pointwise
@ -14479,13 +14479,13 @@
dispatch:
# the NestedTensor keys are necessary because NestedTensor has been removed
# from the CompositeExplicitAutograd keyset see Note [NestedTensor Not Included in Backend Keys]
CompositeExplicitAutograd, NestedTensorCPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_fullcoverage
CompositeExplicitAutograd, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_fullcoverage
autogen: _test_autograd_multiple_dispatch.fullcoverage_out
# Note: this function is only for testing.
- func: _test_autograd_multiple_dispatch.ntonly(Tensor self, bool b) -> Tensor
dispatch:
CompositeImplicitAutograd, NestedTensorCPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_ntonly
CompositeImplicitAutograd, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_ntonly
# Note: this function is only for testing.
- func: _test_autograd_multiple_dispatch_view(Tensor(a) self) -> Tensor(a)
@ -14830,13 +14830,13 @@
- func: _safe_softmax(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
dispatch:
CompositeExplicitAutograd: _safe_softmax
NestedTensorCPU, NestedTensorCUDA: _safe_softmax
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _safe_softmax
# Apparently, putting "forward" in the name will cause Python bindings to be skipped, so "fwd" it is.
- func: _transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None) -> Tensor
variants: function
dispatch:
CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: transformer_encoder_layer_forward
CPU, CUDA, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: transformer_encoder_layer_forward
autogen: _transformer_encoder_layer_fwd.out
- func: _native_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True, int? mask_type=None) -> (Tensor, Tensor)
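
These entries route the listed ops to the same nested kernels already used for NestedTensorCPU and NestedTensorCUDA. A short sketch exercising a few of the newly routed ops, assuming an HPU-enabled build (actual HPU kernel coverage for each op is an assumption here):

    import torch

    nt = torch.nested.nested_tensor(
        [torch.randn(2, 8), torch.randn(5, 8)], device="hpu"
    )
    out = torch.nn.functional.relu(nt)               # NestedTensor_relu
    out = torch.nn.functional.softmax(nt, dim=-1)    # softmax_nested via _softmax
    w = torch.randn(4, 8, device="hpu")
    out = torch.nn.functional.linear(nt, w)          # nested_linear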

c10/core/DispatchKey.cpp

@ -340,6 +340,7 @@ c10::DispatchKey parseDispatchKey(const std::string& k) {
{"NestedTensorCPU", c10::DispatchKey::NestedTensorCPU},
{"NestedTensorCUDA", c10::DispatchKey::NestedTensorCUDA},
{"NestedTensorXPU", c10::DispatchKey::NestedTensorXPU},
{"NestedTensorHPU", c10::DispatchKey::NestedTensorHPU},
{"NestedTensorMeta", c10::DispatchKey::NestedTensorMeta},
{"NestedTensorPrivateUse1", c10::DispatchKey::NestedTensorPrivateUse1},
{"PrivateUse1", c10::DispatchKey::PrivateUse1},

torchgen/model.py

@ -295,6 +295,7 @@ dispatch_keys = [
DispatchKey.NestedTensorCPU,
DispatchKey.NestedTensorCUDA,
DispatchKey.NestedTensorXPU,
DispatchKey.NestedTensorHPU,
# Meta is a magic key: it is automatically generated for structured
# kernels
DispatchKey.Meta,
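
With NestedTensorHPU listed in torchgen's dispatch_keys, the code generator emits registrations for native_functions.yaml entries that mention it (as in the hunks above). A quick sanity check, assuming torchgen is importable from a source checkout:

    from torchgen.model import DispatchKey, dispatch_keys

    assert DispatchKey.NestedTensorHPU in dispatch_keys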