Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-31 12:15:03 +08:00

Compare commits: trunk/eb83 ... codex/add- (1 commit)

| Author | SHA1 | Date |
|---|---|---|
|  | e3d00beddd |  |
@ -19,7 +19,7 @@ pip_install \
  transformers==4.36.2

pip_install coloredlogs packaging
pip_install onnxruntime==1.23.1
pip_install onnxruntime==1.23.0
pip_install onnxscript==0.5.4

# Cache the transformers model to be used later by ONNX tests. We need to run the transformers

@ -334,12 +334,12 @@ sympy==1.13.3
#Pinned versions:
#test that import:

onnx==1.19.1
onnx==1.18.0
#Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal
#Pinned versions:
#test that import:

onnxscript==0.5.4
onnxscript==0.5.3
#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
#Pinned versions:
#test that import:

@ -6,7 +6,7 @@ dependencies = [
    "GitPython==3.1.45",
    "docker==7.1.0",
    "pytest==7.3.2",
    "uv==0.9.5"
    "uv==0.8.6"
]

[tool.setuptools]

@ -163,13 +163,8 @@ if [[ "$(uname)" != Darwin ]]; then
  MEMORY_LIMIT_MAX_JOBS=12
  NUM_CPUS=$(( $(nproc) - 2 ))

  if [[ "$(uname)" == Linux ]]; then
    # Defaults here for **binary** linux builds so they can be changed in one place
    export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))}
  else
    # For other builds
    export MAX_JOBS=${NUM_CPUS}
  fi
  # Defaults here for **binary** linux builds so they can be changed in one place
  export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))}

  cat >>"$envfile" <<EOL
  export MAX_JOBS="${MAX_JOBS}"

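For context on the shell arithmetic above: the consolidated default keeps any preset MAX_JOBS and otherwise caps the job count at the memory limit, i.e. min(nproc - 2, MEMORY_LIMIT_MAX_JOBS). Below is a minimal Python sketch of that rule; it is illustrative only, and the function name and the use of os.cpu_count() in place of nproc are assumptions rather than part of the change.

```python
import os

MEMORY_LIMIT_MAX_JOBS = 12  # same cap as in the shell snippet above


def default_max_jobs(env=os.environ):
    """Respect a preset MAX_JOBS; otherwise use min(nproc - 2, memory-limit cap)."""
    if env.get("MAX_JOBS"):
        return int(env["MAX_JOBS"])
    num_cpus = (os.cpu_count() or 2) - 2
    return min(num_cpus, MEMORY_LIMIT_MAX_JOBS)


print(default_max_jobs())
```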
@ -1,354 +0,0 @@
# PyTorch Docstring Writing Guide

This skill describes how to write docstrings for functions and methods in the PyTorch project, following the conventions in `torch/_tensor_docs.py` and `torch/nn/functional.py`.

## General Principles

- Use **raw strings** (`r"""..."""`) for all docstrings to avoid issues with LaTeX/math backslashes
- Follow **Sphinx/reStructuredText** (reST) format for documentation
- Be **concise but complete** - include all essential information
- Always include **examples** when possible
- Use **cross-references** to related functions/classes

## Docstring Structure

### 1. Function Signature (First Line)

Start with the function signature showing all parameters:

```python
r"""function_name(param1, param2, *, kwarg1=default1, kwarg2=default2) -> ReturnType
```

**Notes:**
- Include the function name
- Show positional and keyword-only arguments (use `*` separator)
- Include default values
- Show return type annotation
- This line should NOT end with a period

### 2. Brief Description

Provide a one-line description of what the function does:

```python
r"""conv2d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1) -> Tensor

Applies a 2D convolution over an input image composed of several input
planes.
```

### 3. Mathematical Formulas (if applicable)

Use Sphinx math directives for mathematical expressions:

```python
.. math::
    \text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}
```

Or inline math: `:math:\`x^2\``

### 4. Cross-References

Link to related classes and functions using Sphinx roles:

- `:class:\`~torch.nn.ModuleName\`` - Link to a class
- `:func:\`torch.function_name\`` - Link to a function
- `:meth:\`~Tensor.method_name\`` - Link to a method
- `:attr:\`attribute_name\`` - Reference an attribute
- The `~` prefix shows only the last component (e.g., `Conv2d` instead of `torch.nn.Conv2d`)

**Example:**
```python
See :class:`~torch.nn.Conv2d` for details and output shape.
```

### 5. Notes and Warnings

Use admonitions for important information:

```python
.. note::
    This function doesn't work directly with NLLLoss,
    which expects the Log to be computed between the Softmax and itself.
    Use log_softmax instead (it's faster and has better numerical properties).

.. warning::
    :func:`new_tensor` always copies :attr:`data`. If you have a Tensor
    ``data`` and want to avoid a copy, use :func:`torch.Tensor.requires_grad_`
    or :func:`torch.Tensor.detach`.
```

### 6. Args Section

Document all parameters with type annotations and descriptions:

```python
Args:
    input (Tensor): input tensor of shape :math:`(\text{minibatch} , \text{in\_channels} , iH , iW)`
    weight (Tensor): filters of shape :math:`(\text{out\_channels} , kH , kW)`
    bias (Tensor, optional): optional bias tensor of shape :math:`(\text{out\_channels})`. Default: ``None``
    stride (int or tuple): the stride of the convolving kernel. Can be a single number or a
      tuple `(sH, sW)`. Default: 1
```

**Formatting rules:**
- Parameter name in **lowercase**
- Type in parentheses: `(Type)`, `(Type, optional)` for optional parameters
- Description follows the type
- For optional parameters, include "Default: ``value``" at the end
- Use double backticks for inline code: ``` ``None`` ```
- Indent continuation lines by 2 spaces

### 7. Keyword Args Section (if applicable)

Sometimes keyword arguments are documented separately:

```python
Keyword args:
    dtype (:class:`torch.dtype`, optional): the desired type of returned tensor.
        Default: if None, same :class:`torch.dtype` as this tensor.
    device (:class:`torch.device`, optional): the desired device of returned tensor.
        Default: if None, same :class:`torch.device` as this tensor.
    requires_grad (bool, optional): If autograd should record operations on the
        returned tensor. Default: ``False``.
```

### 8. Returns Section (if needed)

Document the return value:

```python
Returns:
    Tensor: Sampled tensor of same shape as `logits` from the Gumbel-Softmax distribution.
        If ``hard=True``, the returned samples will be one-hot, otherwise they will
        be probability distributions that sum to 1 across `dim`.
```

Or simply include it in the function signature line if obvious from context.

### 9. Examples Section

Always include examples when possible:

```python
Examples::

    >>> inputs = torch.randn(33, 16, 30)
    >>> filters = torch.randn(20, 16, 5)
    >>> F.conv1d(inputs, filters)

    >>> # With square kernels and equal stride
    >>> filters = torch.randn(8, 4, 3, 3)
    >>> inputs = torch.randn(1, 4, 5, 5)
    >>> F.conv2d(inputs, filters, padding=1)
```

**Formatting rules:**
- Use `Examples::` with double colon
- Use `>>>` prompt for Python code
- Include comments with `#` when helpful
- Show actual output when it helps understanding (indent without `>>>`)
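For illustration, here is a minimal added sketch (not from the original guide) of an example that also shows its output, indented without the `>>>` prompt; the values are illustrative:

```python
Examples::

    >>> a = torch.tensor([-1.0, 2.0, -3.0])
    >>> torch.abs(a)
    tensor([1., 2., 3.])
```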

### 10. External References

Link to papers or external documentation:

```python
.. _Link Name:
    https://arxiv.org/abs/1611.00712
```

Reference them in text: ```See `Link Name`_```

## Method Types

### Native Python Functions

For regular Python functions, use a standard docstring:

```python
def relu(input: Tensor, inplace: bool = False) -> Tensor:
    r"""relu(input, inplace=False) -> Tensor

    Applies the rectified linear unit function element-wise. See
    :class:`~torch.nn.ReLU` for more details.
    """
    # implementation
```

### C-Bound Functions (using add_docstr)

For C-bound functions, use `_add_docstr`:

```python
conv1d = _add_docstr(
    torch.conv1d,
    r"""
conv1d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1) -> Tensor

Applies a 1D convolution over an input signal composed of several input
planes.

See :class:`~torch.nn.Conv1d` for details and output shape.

Args:
    input: input tensor of shape :math:`(\text{minibatch} , \text{in\_channels} , iW)`
    weight: filters of shape :math:`(\text{out\_channels} , kW)`
    ...
""",
)
```

### In-Place Variants

For in-place operations (ending with `_`), reference the original:

```python
add_docstr_all(
    "abs_",
    r"""
abs_() -> Tensor

In-place version of :meth:`~Tensor.abs`
""",
)
```

### Alias Functions

For aliases, simply reference the original:

```python
add_docstr_all(
    "absolute",
    r"""
absolute() -> Tensor

Alias for :func:`abs`
""",
)
```

## Common Patterns

### Shape Documentation

Use LaTeX math notation for tensor shapes:

```python
:math:`(\text{minibatch} , \text{in\_channels} , iH , iW)`
```

### Reusable Argument Definitions

For commonly used arguments, define them once and reuse:

```python
common_args = parse_kwargs(
    """
    dtype (:class:`torch.dtype`, optional): the desired type of returned tensor.
        Default: if None, same as this tensor.
"""
)

# Then use with .format():
r"""
...

Keyword args:
    {dtype}
    {device}
""".format(**common_args)
```

### Template Insertion

Insert reproducibility notes or other common text:

```python
r"""
{tf32_note}

{cudnn_reproducibility_note}
""".format(**reproducibility_notes, **tf32_notes)
```

## Complete Example

Here's a complete example showing all elements:

```python
def gumbel_softmax(
    logits: Tensor,
    tau: float = 1,
    hard: bool = False,
    eps: float = 1e-10,
    dim: int = -1,
) -> Tensor:
    r"""
    Sample from the Gumbel-Softmax distribution and optionally discretize.

    Args:
        logits (Tensor): `[..., num_features]` unnormalized log probabilities
        tau (float): non-negative scalar temperature
        hard (bool): if ``True``, the returned samples will be discretized as one-hot vectors,
              but will be differentiated as if it is the soft sample in autograd. Default: ``False``
        dim (int): A dimension along which softmax will be computed. Default: -1

    Returns:
        Tensor: Sampled tensor of same shape as `logits` from the Gumbel-Softmax distribution.
            If ``hard=True``, the returned samples will be one-hot, otherwise they will
            be probability distributions that sum to 1 across `dim`.

    .. note::
        This function is here for legacy reasons, may be removed from nn.Functional in the future.

    Examples::
        >>> logits = torch.randn(20, 32)
        >>> # Sample soft categorical using reparametrization trick:
        >>> F.gumbel_softmax(logits, tau=1, hard=False)
        >>> # Sample hard categorical using "Straight-through" trick:
        >>> F.gumbel_softmax(logits, tau=1, hard=True)

    .. _Link 1:
        https://arxiv.org/abs/1611.00712
    """
    # implementation
```

## Quick Checklist

When writing a PyTorch docstring, ensure:

- [ ] Use raw string (`r"""`)
- [ ] Include function signature on first line
- [ ] Provide brief description
- [ ] Document all parameters in Args section with types
- [ ] Include default values for optional parameters
- [ ] Use Sphinx cross-references (`:func:`, `:class:`, `:meth:`)
- [ ] Add mathematical formulas if applicable
- [ ] Include at least one example in Examples section
- [ ] Add warnings/notes for important caveats
- [ ] Link to related module class with `:class:`
- [ ] Use proper math notation for tensor shapes
- [ ] Follow consistent formatting and indentation

## Common Sphinx Roles Reference

- `:class:\`~torch.nn.Module\`` - Class reference
- `:func:\`torch.function\`` - Function reference
- `:meth:\`~Tensor.method\`` - Method reference
- `:attr:\`attribute\`` - Attribute reference
- `:math:\`equation\`` - Inline math
- `:ref:\`label\`` - Internal reference
- ``` ``code`` ``` - Inline code (use double backticks)
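As an added illustration (not part of the original skill file), several of these roles might be combined in one docstring passage, roughly like this sketch:

```python
r"""
See :class:`~torch.nn.ReLU` and :func:`torch.nn.functional.relu`. The
in-place variant is :meth:`~Tensor.relu_`; :attr:`input` must be a
``Tensor`` and the result applies :math:`\max(0, x)` elementwise.
"""
```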

## Additional Notes

- **Indentation**: Use 4 spaces for code, 2 spaces for continuation of parameter descriptions
- **Line length**: Try to keep lines under 100 characters when possible
- **Periods**: End sentences with periods, but not the signature line
- **Backticks**: Use double backticks for code: ``` ``True`` ``None`` ``False`` ```
- **Types**: Common types are `Tensor`, `int`, `float`, `bool`, `str`, `tuple`, `list`, etc.
							
								
								
									
.github/actions/setup-rocm/action.yml (vendored): 7 changes
							| @ -124,10 +124,3 @@ runs: | ||||
|       id: login-ecr | ||||
|       continue-on-error: true | ||||
|       uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 | ||||
|  | ||||
|     - name: Preserve github env variables for use in docker | ||||
|       shell: bash | ||||
|       run: | | ||||
|         env | grep '^GITHUB' >> "${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" | ||||
|         env | grep '^CI' >> "${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" | ||||
|         env | grep '^RUNNER' >> "${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" | ||||
|  | ||||
							
								
								
									
.github/ci_commit_pins/vision.txt (vendored): 2 changes
							| @ -1 +1 @@ | ||||
| 1752fe6809b74921644866275ab80244b96e80bc | ||||
| faffd5cf673615583da6517275e361cb3dbc77e6 | ||||
|  | ||||
							
								
								
									
.github/ci_configs/vllm/Dockerfile (vendored): 5 changes
							| @ -283,9 +283,6 @@ RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \ | ||||
|         uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \ | ||||
|     fi | ||||
|  | ||||
| RUN --mount=type=cache,target=/root/.cache/uv \ | ||||
|     uv pip install --system --pre apache-tvm-ffi==0.1.0b15 | ||||
|  | ||||
| # Install the vllm wheel from previous stage | ||||
| RUN --mount=type=cache,target=/root/.cache/uv \ | ||||
|     uv pip install --system /wheels/vllm/*.whl --verbose | ||||
| @ -298,8 +295,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ | ||||
| ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0' | ||||
| ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} | ||||
|  | ||||
| # TODO(elainewy): remove this once vllm commit is updated, and install flashinfer from pip | ||||
| # see https://github.com/pytorch/pytorch/pull/165274#issuecomment-3408531784 | ||||
| ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" | ||||
| ARG FLASHINFER_GIT_REF="v0.2.14.post1" | ||||
|  | ||||
|  | ||||
							
								
								
									
.github/label_to_label.yml (vendored): 9 changes
							| @ -15,11 +15,6 @@ | ||||
|   - "module: reinplacing" | ||||
|   then: | ||||
|   - "module: pt2-dispatcher" | ||||
| - any: | ||||
|   - "vllm-compile" | ||||
|   then: | ||||
|   - "module: vllm" | ||||
|   - "oncall: pt2" | ||||
| - any: | ||||
|   - "module: vmap" | ||||
|   then: | ||||
| @ -32,6 +27,10 @@ | ||||
|   - "module: pt2 optimizer" | ||||
|   then: | ||||
|   - "module: dynamo" | ||||
| - any: | ||||
|   - "module: flex attention" | ||||
|   then: | ||||
|   - "module: higher order operators" | ||||
| - any: | ||||
|   - "module: aotinductor" | ||||
|   then: | ||||
|  | ||||
| @ -79,9 +79,9 @@ jobs: | ||||
|     runs-on: "windows-11-arm64-preview" | ||||
|     {%- else %} | ||||
|     {%- if branches == "nightly" %} | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     {%- else %} | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge.nonephemeral" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" | ||||
|     {%- endif %} | ||||
|     {%- endif %} | ||||
|     timeout-minutes: !{{ common.timeout_minutes_windows_binary }} | ||||
|  | ||||
							
								
								
									
.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml (generated, vendored): 8 changes
							| @ -44,7 +44,7 @@ jobs: | ||||
|   libtorch-cpu-shared-with-deps-debug-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -291,7 +291,7 @@ jobs: | ||||
|   libtorch-cuda12_6-shared-with-deps-debug-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -541,7 +541,7 @@ jobs: | ||||
|   libtorch-cuda12_8-shared-with-deps-debug-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -791,7 +791,7 @@ jobs: | ||||
|   libtorch-cuda13_0-shared-with-deps-debug-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
|  | ||||
							
								
								
									
.github/workflows/generated-windows-binary-libtorch-release-nightly.yml (generated, vendored): 8 changes
							| @ -44,7 +44,7 @@ jobs: | ||||
|   libtorch-cpu-shared-with-deps-release-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -291,7 +291,7 @@ jobs: | ||||
|   libtorch-cuda12_6-shared-with-deps-release-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -541,7 +541,7 @@ jobs: | ||||
|   libtorch-cuda12_8-shared-with-deps-release-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -791,7 +791,7 @@ jobs: | ||||
|   libtorch-cuda13_0-shared-with-deps-release-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
|  | ||||
							
								
								
									
.github/workflows/generated-windows-binary-wheel-nightly.yml (generated, vendored): 70 changes
							| @ -44,7 +44,7 @@ jobs: | ||||
|   wheel-py3_10-cpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -279,7 +279,7 @@ jobs: | ||||
|   wheel-py3_10-cuda12_6-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -517,7 +517,7 @@ jobs: | ||||
|   wheel-py3_10-cuda12_8-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -755,7 +755,7 @@ jobs: | ||||
|   wheel-py3_10-cuda13_0-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -993,7 +993,7 @@ jobs: | ||||
|   wheel-py3_10-xpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -1229,7 +1229,7 @@ jobs: | ||||
|   wheel-py3_11-cpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -1464,7 +1464,7 @@ jobs: | ||||
|   wheel-py3_11-cuda12_6-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -1702,7 +1702,7 @@ jobs: | ||||
|   wheel-py3_11-cuda12_8-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -1940,7 +1940,7 @@ jobs: | ||||
|   wheel-py3_11-cuda13_0-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -2178,7 +2178,7 @@ jobs: | ||||
|   wheel-py3_11-xpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -2414,7 +2414,7 @@ jobs: | ||||
|   wheel-py3_12-cpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -2649,7 +2649,7 @@ jobs: | ||||
|   wheel-py3_12-cuda12_6-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -2887,7 +2887,7 @@ jobs: | ||||
|   wheel-py3_12-cuda12_8-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -3125,7 +3125,7 @@ jobs: | ||||
|   wheel-py3_12-cuda13_0-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -3363,7 +3363,7 @@ jobs: | ||||
|   wheel-py3_12-xpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -3599,7 +3599,7 @@ jobs: | ||||
|   wheel-py3_13-cpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -3834,7 +3834,7 @@ jobs: | ||||
|   wheel-py3_13-cuda12_6-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -4072,7 +4072,7 @@ jobs: | ||||
|   wheel-py3_13-cuda12_8-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -4310,7 +4310,7 @@ jobs: | ||||
|   wheel-py3_13-cuda13_0-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -4548,7 +4548,7 @@ jobs: | ||||
|   wheel-py3_13-xpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -4784,7 +4784,7 @@ jobs: | ||||
|   wheel-py3_13t-cpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -5019,7 +5019,7 @@ jobs: | ||||
|   wheel-py3_13t-cuda12_6-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -5257,7 +5257,7 @@ jobs: | ||||
|   wheel-py3_13t-cuda12_8-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -5495,7 +5495,7 @@ jobs: | ||||
|   wheel-py3_13t-cuda13_0-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -5733,7 +5733,7 @@ jobs: | ||||
|   wheel-py3_13t-xpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -5969,7 +5969,7 @@ jobs: | ||||
|   wheel-py3_14-cpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -6204,7 +6204,7 @@ jobs: | ||||
|   wheel-py3_14-cuda12_6-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -6442,7 +6442,7 @@ jobs: | ||||
|   wheel-py3_14-cuda12_8-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -6680,7 +6680,7 @@ jobs: | ||||
|   wheel-py3_14-cuda13_0-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -6918,7 +6918,7 @@ jobs: | ||||
|   wheel-py3_14-xpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -7154,7 +7154,7 @@ jobs: | ||||
|   wheel-py3_14t-cpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -7389,7 +7389,7 @@ jobs: | ||||
|   wheel-py3_14t-cuda12_6-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -7627,7 +7627,7 @@ jobs: | ||||
|   wheel-py3_14t-cuda12_8-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -7865,7 +7865,7 @@ jobs: | ||||
|   wheel-py3_14t-cuda13_0-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -8103,7 +8103,7 @@ jobs: | ||||
|   wheel-py3_14t-xpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
|  | ||||
							
								
								
									
.github/workflows/inductor-periodic.yml (vendored): 1 change
							| @ -88,6 +88,7 @@ jobs: | ||||
|     with: | ||||
|       build-environment: linux-jammy-rocm-py3_10 | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks | ||||
|       sync-tag: rocm-build | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, | ||||
|  | ||||
							
								
								
									
.github/workflows/periodic.yml (vendored): 15 changes
							| @ -147,16 +147,15 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9 | ||||
|       cuda-arch-list: 8.9 | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "default", shard: 1, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, | ||||
|           { config: "default", shard: 2, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, | ||||
|           { config: "default", shard: 3, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, | ||||
|           { config: "default", shard: 4, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, | ||||
|           { config: "default", shard: 5, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, | ||||
|           { config: "default", shard: 6, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, | ||||
|           { config: "default", shard: 7, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, | ||||
|           { config: "default", shard: 1, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, | ||||
|           { config: "default", shard: 2, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, | ||||
|           { config: "default", shard: 3, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, | ||||
|           { config: "default", shard: 4, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, | ||||
|           { config: "default", shard: 5, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, | ||||
|           { config: "default", shard: 6, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, | ||||
|           { config: "default", shard: 7, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|  | ||||
							
								
								
									
.github/workflows/pull.yml (vendored): 3 changes
							| @ -347,8 +347,7 @@ jobs: | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       # This should sync with the build in xpu.yml but xpu uses a larger runner | ||||
|       # sync-tag: linux-xpu-n-build | ||||
|       sync-tag: linux-xpu-n-build | ||||
|       runner_prefix: ${{ needs.get-label-type.outputs.label-type }} | ||||
|       build-environment: linux-jammy-xpu-n-py3.10 | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 | ||||
|  | ||||
							
								
								
									
.github/workflows/rocm-mi300.yml (vendored): 1 change
							| @ -45,6 +45,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-noble-rocm-py3.12-mi300 | ||||
|       docker-image-name: ci-image:pytorch-linux-noble-rocm-n-py3 | ||||
|       sync-tag: rocm-build | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" }, | ||||
|  | ||||
							
								
								
									
.github/workflows/rocm-mi355.yml (vendored): 1 change
							| @ -42,6 +42,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-noble-rocm-py3.12-mi355 | ||||
|       docker-image-name: ci-image:pytorch-linux-noble-rocm-n-py3 | ||||
|       sync-tag: rocm-build | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" }, | ||||
|  | ||||
							
								
								
									
.github/workflows/rocm-navi31.yml (vendored): 12 changes
							| @ -26,23 +26,11 @@ jobs: | ||||
|       id-token: write | ||||
|       contents: read | ||||
|  | ||||
|   get-label-type: | ||||
|     name: get-label-type | ||||
|     uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main | ||||
|     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} | ||||
|     with: | ||||
|       triggering_actor: ${{ github.triggering_actor }} | ||||
|       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} | ||||
|       curr_branch: ${{ github.head_ref || github.ref_name }} | ||||
|       curr_ref_type: ${{ github.ref_type }} | ||||
|  | ||||
|   linux-jammy-rocm-py3_10-build: | ||||
|     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} | ||||
|     name: linux-jammy-rocm-py3.10 | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-rocm-py3.10 | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 | ||||
|       sync-tag: rocm-build | ||||
|  | ||||
							
								
								
									
.github/workflows/rocm.yml (vendored): 12 changes
							| @ -26,23 +26,11 @@ jobs: | ||||
|       id-token: write | ||||
|       contents: read | ||||
|  | ||||
|   get-label-type: | ||||
|     name: get-label-type | ||||
|     uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main | ||||
|     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} | ||||
|     with: | ||||
|       triggering_actor: ${{ github.triggering_actor }} | ||||
|       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} | ||||
|       curr_branch: ${{ github.head_ref || github.ref_name }} | ||||
|       curr_ref_type: ${{ github.ref_type }} | ||||
|  | ||||
|   linux-jammy-rocm-py3_10-build: | ||||
|     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} | ||||
|     name: linux-jammy-rocm-py3.10 | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-rocm-py3.10 | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 | ||||
|       sync-tag: rocm-build | ||||
|  | ||||
							
								
								
									
.github/workflows/trunk-tagging.yml (vendored): 149 changes
							| @ -58,10 +58,8 @@ jobs: | ||||
|           else | ||||
|             COMMIT_SHA="${{ github.sha }}" | ||||
|           fi | ||||
|           { | ||||
|             echo "sha=${COMMIT_SHA}" | ||||
|             echo "tag_name=trunk/${COMMIT_SHA}" | ||||
|           } >> "${GITHUB_OUTPUT}" | ||||
|           echo "sha=${COMMIT_SHA}" >> "${GITHUB_OUTPUT}" | ||||
|           echo "tag_name=trunk/${COMMIT_SHA}" >> "${GITHUB_OUTPUT}" | ||||
|  | ||||
|       - name: Validate commit SHA | ||||
|         run: | | ||||
| @ -89,7 +87,7 @@ jobs: | ||||
|             echo "✅ Commit ${COMMIT_SHA} is valid (automatic push trigger)" | ||||
|           fi | ||||
|  | ||||
|       - name: Create and push tag(s) with retry | ||||
|       - name: Create and push tag with retry | ||||
|         id: check_tag | ||||
|         env: | ||||
|           TAG_NAME: ${{ steps.commit.outputs.tag_name }} | ||||
| @ -114,23 +112,14 @@ jobs: | ||||
|             return 1 | ||||
|           } | ||||
|  | ||||
|           # Counters for summary reporting | ||||
|           created_count=0 | ||||
|           skipped_count=0 | ||||
|           failed_count=0 | ||||
|           # Exit early if tag already exists | ||||
|           if check_tag_exists; then | ||||
|             echo "✅ Tag already exists - no action needed" | ||||
|             echo "exists=true" >> "${GITHUB_OUTPUT}" | ||||
|             exit 0 | ||||
|           fi | ||||
|  | ||||
|           # Always write outputs once on exit | ||||
|           finish() { | ||||
|             set +e | ||||
|             if [ -n "${GITHUB_OUTPUT:-}" ]; then | ||||
|               { | ||||
|                 echo "created_count=${created_count}" | ||||
|                 echo "skipped_count=${skipped_count}" | ||||
|                 echo "failed_count=${failed_count}" | ||||
|               } >> "${GITHUB_OUTPUT}" | ||||
|             fi | ||||
|           } | ||||
|           trap finish EXIT | ||||
|           echo "Tag ${TAG_NAME} does not exist, proceeding with creation" | ||||
|  | ||||
|           # Retry configuration | ||||
|           MAX_RETRIES=5 | ||||
| @ -205,111 +194,31 @@ jobs: | ||||
|             } | ||||
|           } | ||||
|  | ||||
|           # New behavior for push events: enumerate commits in the push and tag each one. | ||||
|           # For workflow_dispatch, retain existing single-SHA behavior. | ||||
|  | ||||
|           # Always fetch tags once up front to improve idempotency in loops | ||||
|           git fetch origin --tags --quiet || true | ||||
|  | ||||
|           if [ "${{ github.event_name }}" = "push" ]; then | ||||
|             BEFORE_SHA="${{ github.event.before }}" | ||||
|             AFTER_SHA="${{ github.sha }}"  # same as event.after | ||||
|  | ||||
|             # List commits introduced by this push (old..new), oldest first for stable ordering | ||||
|             commits_file="$(mktemp)" | ||||
|             git rev-list --reverse "${BEFORE_SHA}..${AFTER_SHA}" > "${commits_file}" | ||||
|  | ||||
|             if [ ! -s "${commits_file}" ]; then | ||||
|               echo "No new commits found between ${BEFORE_SHA}..${AFTER_SHA}; nothing to tag." | ||||
|               rm -f "${commits_file}" | ||||
|               exit 0 | ||||
|             fi | ||||
|  | ||||
|             commit_count="$(wc -l < "${commits_file}" | tr -d ' ')" | ||||
|             echo "Found ${commit_count} commit(s) to tag for push:" | ||||
|             while IFS= read -r sha; do | ||||
|               printf '  %s\n' "${sha}" | ||||
|             done < "${commits_file}" | ||||
|  | ||||
|             while IFS= read -r sha; do | ||||
|               TAG_NAME="trunk/${sha}" | ||||
|               COMMIT_SHA="${sha}" | ||||
|  | ||||
|               # If tag already exists locally or remotely, skip (idempotent) | ||||
|               if check_tag_exists; then | ||||
|                 echo "✅ Tag ${TAG_NAME} already exists - skipping" | ||||
|                 skipped_count=$((skipped_count + 1)) | ||||
|                 continue | ||||
|               fi | ||||
|  | ||||
|               echo "Tag ${TAG_NAME} does not exist, proceeding with creation" | ||||
|  | ||||
|               if retry_with_backoff "tag_with_retry" "Creating tag ${TAG_NAME} for commit ${COMMIT_SHA}"; then | ||||
|                 created_count=$((created_count + 1)) | ||||
|               else | ||||
|                 echo "Tag creation failed after all retry attempts for ${TAG_NAME}" | ||||
|                 failed_count=$((failed_count + 1)) | ||||
|               fi | ||||
|             done < "${commits_file}" | ||||
|  | ||||
|             rm -f "${commits_file}" | ||||
|  | ||||
|             if [ "${failed_count}" -gt 0 ]; then | ||||
|               exit 1 | ||||
|             fi | ||||
|           # Execute with retry | ||||
|           if retry_with_backoff "tag_with_retry" "Creating tag ${TAG_NAME} for commit ${COMMIT_SHA}"; then | ||||
|             echo "exists=false" >> "${GITHUB_OUTPUT}" | ||||
|             exit 0 | ||||
|           else | ||||
|             # workflow_dispatch path (single SHA tagging preserved) | ||||
|  | ||||
|             # Exit early if tag already exists | ||||
|             if check_tag_exists; then | ||||
|               echo "✅ Tag already exists - no action needed" | ||||
|               skipped_count=1 | ||||
|               exit 0 | ||||
|             fi | ||||
|  | ||||
|             echo "Tag ${TAG_NAME} does not exist, proceeding with creation" | ||||
|  | ||||
|             if retry_with_backoff "tag_with_retry" "Creating tag ${TAG_NAME} for commit ${COMMIT_SHA}"; then | ||||
|               created_count=1 | ||||
|               exit 0 | ||||
|             else | ||||
|               echo "Tag creation failed after all retry attempts" | ||||
|               failed_count=1 | ||||
|               exit 1 | ||||
|             fi | ||||
|             echo "Tag creation failed after all retry attempts" | ||||
|             exit 1 | ||||
|           fi | ||||
|  | ||||
|       - name: Tag creation summary | ||||
|         if: always() | ||||
|         run: | | ||||
|           if [ "${{ github.event_name }}" = "push" ]; then | ||||
|             echo "Trigger: push on main" | ||||
|             echo "Created: ${{ steps.check_tag.outputs.created_count }}" | ||||
|             echo "Skipped (already existed): ${{ steps.check_tag.outputs.skipped_count }}" | ||||
|             echo "Failed: ${{ steps.check_tag.outputs.failed_count }}" | ||||
|             if [ "${{ steps.check_tag.outputs.failed_count }}" = "0" ]; then | ||||
|               echo "✅ Completed tagging for push range ${{ github.event.before }}..${{ github.sha }}" | ||||
|             else | ||||
|               echo "❌ Some tags failed to create for push range ${{ github.event.before }}..${{ github.sha }}" | ||||
|             fi | ||||
|           if [ "${{ steps.check_tag.outputs.exists }}" = "true" ]; then | ||||
|             echo "✅ Tag ${{ steps.commit.outputs.tag_name }} already existed - no action needed" | ||||
|           elif [ "${{ job.status }}" = "success" ]; then | ||||
|             echo "✅ Successfully created tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}" | ||||
|           else | ||||
|             if [ "${{ steps.check_tag.outputs.failed_count }}" = "0" ]; then | ||||
|               if [ "${{ steps.check_tag.outputs.created_count }}" = "0" ]; then | ||||
|                 echo "✅ Tag ${{ steps.commit.outputs.tag_name }} already existed - no action needed" | ||||
|               else | ||||
|                 echo "✅ Successfully created tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}" | ||||
|               fi | ||||
|             else | ||||
|               echo "❌ Failed to create tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}" | ||||
|             fi | ||||
|  | ||||
|             echo "" | ||||
|             echo "Tag details:" | ||||
|             echo "  Name: ${{ steps.commit.outputs.tag_name }}" | ||||
|             echo "  Commit: ${{ steps.commit.outputs.sha }}" | ||||
|             echo "  Trigger: ${{ github.event_name }}" | ||||
|             if [ -n "${{ github.event.inputs.commit_sha }}" ]; then | ||||
|               echo "  Manual commit: ${{ github.event.inputs.commit_sha }}" | ||||
|             fi | ||||
|             echo "❌ Failed to create tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}" | ||||
|           fi | ||||
|  | ||||
|           echo "" | ||||
|           echo "Tag details:" | ||||
|           echo "  Name: ${{ steps.commit.outputs.tag_name }}" | ||||
|           echo "  Commit: ${{ steps.commit.outputs.sha }}" | ||||
|           echo "  Trigger: ${{ github.event_name }}" | ||||
|           if [ -n "${{ github.event.inputs.commit_sha }}" ]; then | ||||
|             echo "  Manual commit: ${{ github.event.inputs.commit_sha }}" | ||||
|           fi | ||||
|  | ||||
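The step above combines an idempotent existence check (check_tag_exists), tag creation via tag_with_retry, and retry_with_backoff with MAX_RETRIES=5. As a rough, non-authoritative sketch of that retry-with-exponential-backoff pattern (the real step is shell inside the workflow; the doubling delay and the bool-returning action are assumptions for illustration):

#include <chrono>
#include <functional>
#include <thread>

// Sketch only: retry an action up to max_retries times, doubling the wait
// between attempts, and report whether any attempt succeeded.
bool retry_with_backoff(const std::function<bool()>& action,
                        int max_retries = 5,
                        std::chrono::seconds delay = std::chrono::seconds(1)) {
  for (int attempt = 1; attempt <= max_retries; ++attempt) {
    if (action()) {
      return true;                  // e.g. the tag was created and pushed
    }
    if (attempt < max_retries) {
      std::this_thread::sleep_for(delay);
      delay *= 2;                   // exponential backoff between attempts
    }
  }
  return false;                     // caller counts this as a failed tag
}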
| @ -1138,8 +1138,11 @@ command = [ | ||||
| [[linter]] | ||||
| code = 'WORKFLOWSYNC' | ||||
| include_patterns = [ | ||||
|     '.github/workflows/*.yml', | ||||
|     '.github/workflows/*.yaml', | ||||
|     '.github/workflows/pull.yml', | ||||
|     '.github/workflows/trunk.yml', | ||||
|     '.github/workflows/periodic.yml', | ||||
|     '.github/workflows/mac-mps.yml', | ||||
|     '.github/workflows/slow.yml', | ||||
| ] | ||||
| command = [ | ||||
|     'python3', | ||||
|  | ||||
| @ -289,15 +289,14 @@ IF(USE_FBGEMM_GENAI) | ||||
|  | ||||
|     set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) | ||||
|  | ||||
|     set(fbgemm_genai_cuh | ||||
|     set(fbgemm_genai_mx8mx8bf16_grouped | ||||
|       "${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/" | ||||
|       "${FBGEMM_GENAI_SRCS}/" | ||||
|     ) | ||||
|  | ||||
|     target_include_directories(fbgemm_genai PRIVATE | ||||
|       ${FBGEMM_THIRD_PARTY}/cutlass/include | ||||
|       ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include | ||||
|       ${fbgemm_genai_cuh} | ||||
|       ${fbgemm_genai_mx8mx8bf16_grouped} | ||||
|       ${FBGEMM_GENAI_SRCS}/common/include/   # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp | ||||
|       ${FBGEMM_GENAI_SRCS}/include/          # includes fbgemm_gpu/torch_ops.h | ||||
|     ) | ||||
|  | ||||
| @ -19,7 +19,6 @@ | ||||
| #include <ATen/detail/MPSHooksInterface.h> | ||||
| #include <ATen/detail/MTIAHooksInterface.h> | ||||
| #include <ATen/detail/PrivateUse1HooksInterface.h> | ||||
| #include <ATen/detail/XLAHooksInterface.h> | ||||
| #include <ATen/detail/XPUHooksInterface.h> | ||||
| #include <c10/core/QEngine.h> | ||||
| #include <c10/core/impl/DeviceGuardImplInterface.h> | ||||
| @ -89,8 +88,6 @@ class TORCH_API Context { | ||||
|       return at::detail::getHIPHooks(); | ||||
|     } else if (opt_device_type == at::kHPU) { | ||||
|       return at::detail::getHPUHooks(); | ||||
|     } else if (opt_device_type == at::kXLA) { | ||||
|       return at::detail::getXLAHooks(); | ||||
|     } else { | ||||
|       TORCH_CHECK( | ||||
|           false, | ||||
| @ -199,7 +196,7 @@ class TORCH_API Context { | ||||
|     return c10::impl::hasDeviceGuardImpl(c10::DeviceType::IPU); | ||||
|   } | ||||
|   static bool hasXLA() { | ||||
|     return detail::getXLAHooks().hasXLA(); | ||||
|     return c10::impl::hasDeviceGuardImpl(c10::DeviceType::XLA); | ||||
|   } | ||||
|   static bool hasXPU() { | ||||
|     return detail::getXPUHooks().hasXPU(); | ||||
|  | ||||
| @ -39,7 +39,7 @@ struct HostBlock { | ||||
| }; | ||||
|  | ||||
| template <typename B> | ||||
| struct alignas(hardware_destructive_interference_size) FreeBlockList { | ||||
| struct alignas(64) FreeBlockList { | ||||
|   std::mutex mutex_; | ||||
|   std::deque<B*> list_; | ||||
| }; | ||||
| @ -122,7 +122,7 @@ struct TORCH_API HostStats { | ||||
| // Struct containing memory allocator summary statistics for host, as they | ||||
| // are staged for reporting. This is a temporary struct that is used to | ||||
| // avoid locking the allocator while collecting stats. | ||||
| struct alignas(hardware_destructive_interference_size) HostStatsStaged { | ||||
| struct alignas(64) HostStatsStaged { | ||||
|   std::mutex timing_mutex_; | ||||
|   // COUNT: total allocations (active + free) | ||||
|   // LOCK: access to this stat is protected by the allocator's blocks_mutex_ | ||||
| @ -669,7 +669,7 @@ struct CachingHostAllocatorImpl { | ||||
|     TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event"); | ||||
|   } | ||||
|  | ||||
|   alignas(hardware_destructive_interference_size) std::mutex blocks_mutex_; | ||||
|   alignas(64) std::mutex blocks_mutex_; | ||||
|   ska::flat_hash_set<B*> blocks_; // block list | ||||
|   ska::flat_hash_map<void*, B*> ptr_to_block_; | ||||
|  | ||||
| @ -677,17 +677,17 @@ struct CachingHostAllocatorImpl { | ||||
|   // size. This allows us to quickly find a free block of the right size. | ||||
|   // We use a deque to store each per-size free list and guard every list with | ||||
|   // its own mutex. | ||||
|   alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>> free_list_ = | ||||
|   alignas(64) std::vector<FreeBlockList<B>> free_list_ = | ||||
|       std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX); | ||||
|  | ||||
|   alignas(hardware_destructive_interference_size) std::mutex events_mutex_; | ||||
|   alignas(64) std::mutex events_mutex_; | ||||
|   std::deque<std::pair<E, B*>> events_; // event queue paired with block | ||||
|  | ||||
|   // Indicates whether the object is active. | ||||
|   // Set to false in the destructor to signal background threads to stop. | ||||
|   std::atomic<bool> active_{true}; | ||||
| protected: | ||||
|   alignas(hardware_destructive_interference_size) HostStatsStaged stats_; | ||||
|   alignas(64) HostStatsStaged stats_; | ||||
| }; | ||||
|  | ||||
| struct TORCH_API HostAllocator : public at::Allocator { | ||||
|  | ||||
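The alignas edits above replace the named constant hardware_destructive_interference_size with a literal 64, keeping each mutex, free list, and staged-stats block on its own cache line so threads touching different members do not false-share. A minimal sketch of the idea, assuming a 64-byte cache line (which is exactly what the literal hard-codes):

#include <atomic>
#include <cstdint>
#include <mutex>

// Members used by different threads are each aligned to a 64-byte boundary so
// they land on separate cache lines, mirroring the annotations in the
// allocator above.
struct PerQueueState {
  alignas(64) std::mutex blocks_mutex;        // guards the block list
  alignas(64) std::mutex events_mutex;        // guards the event queue
  alignas(64) std::atomic<uint64_t> allocs{0};
};

static_assert(alignof(PerQueueState) == 64, "padded out to a full cache line");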
| @ -59,7 +59,9 @@ struct TORCH_API Generator { | ||||
|  | ||||
|   explicit Generator(c10::intrusive_ptr<c10::GeneratorImpl> gen_impl) | ||||
|    : impl_(std::move(gen_impl)) { | ||||
|     TORCH_CHECK(impl_.get(), "GeneratorImpl with nullptr is not supported"); | ||||
|     if (impl_.get() == nullptr) { | ||||
|       throw std::runtime_error("GeneratorImpl with nullptr is not supported"); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   bool operator==(const Generator& rhs) const { | ||||
|  | ||||
| @ -111,7 +111,9 @@ class TORCH_API TensorBase { | ||||
|   explicit TensorBase( | ||||
|       c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl) | ||||
|       : impl_(std::move(tensor_impl)) { | ||||
|     TORCH_CHECK(impl_.get(), "TensorImpl with nullptr is not supported"); | ||||
|     if (impl_.get() == nullptr) { | ||||
|       throw std::runtime_error("TensorImpl with nullptr is not supported"); | ||||
|     } | ||||
|   } | ||||
|   TensorBase(const TensorBase&) = default; | ||||
|   TensorBase(TensorBase&&) noexcept = default; | ||||
|  | ||||
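With the constructors above, passing an empty intrusive_ptr now surfaces as a plain std::runtime_error rather than a TORCH_CHECK failure. A hedged illustration (the test scaffolding is assumed, not part of the diff):

#include <ATen/core/Generator.h>
#include <iostream>
#include <stdexcept>

int main() {
  try {
    // An empty (null) GeneratorImpl pointer is rejected in the constructor.
    at::Generator gen{c10::intrusive_ptr<c10::GeneratorImpl>()};
  } catch (const std::runtime_error& e) {
    std::cout << "rejected: " << e.what() << "\n";
  }
  return 0;
}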
| @ -109,10 +109,6 @@ TORCH_LIBRARY_IMPL(_, AutogradHPU, m) { | ||||
|   m.fallback(AUTOGRAD_FALLBACK); | ||||
| } | ||||
|  | ||||
| TORCH_LIBRARY_IMPL(_, AutogradPrivateUse1, m) { | ||||
|   m.fallback(AUTOGRAD_FALLBACK); | ||||
| } | ||||
|  | ||||
| #undef AUTOGRAD_FALLBACK | ||||
|  | ||||
| } // namespace | ||||
|  | ||||
| @ -442,17 +442,11 @@ RegistrationHandleRAII Dispatcher::registerFallback(DispatchKey dispatchKey, Ker | ||||
|  | ||||
|   auto idx = getDispatchTableIndexForDispatchKey(dispatchKey); | ||||
|   TORCH_CHECK(idx >= 0 && static_cast<uint64_t>(idx) < backendFallbackKernels_.size(), "idx=", idx); | ||||
|   // NB: Preserve BC for registering a fallback for AutogradPrivateUse1 multiple times; | ||||
|   // refer to https://github.com/pytorch/pytorch/issues/163979 for more information. | ||||
|   TORCH_CHECK( | ||||
|       dispatchKey == DispatchKey::AutogradPrivateUse1 || | ||||
|           !backendFallbackKernels_[idx].kernel.isValid(), | ||||
|       "Tried to register multiple backend fallbacks for the same dispatch key ", | ||||
|       dispatchKey, | ||||
|       "; previous registration ", | ||||
|       backendFallbackKernels_[idx].debug, | ||||
|       ", new registration ", | ||||
|       debug); | ||||
|     !backendFallbackKernels_[idx].kernel.isValid(), | ||||
|     "Tried to register multiple backend fallbacks for the same dispatch key ", dispatchKey, "; previous registration ", | ||||
|     backendFallbackKernels_[idx].debug, ", new registration ", debug | ||||
|   ); | ||||
|   // NB: inferred function schema is always nullptr for fallbacks, as fallbacks | ||||
|   // cannot be unboxed | ||||
|   backendFallbackKernels_[idx] = impl::AnnotatedKernel(std::move(kernel), nullptr, std::move(debug)); | ||||
|  | ||||
| @ -68,7 +68,11 @@ Symbol InternedStrings::_symbol(const std::string& s) { | ||||
|     return it->second; | ||||
|  | ||||
|   auto pos = s.find("::"); | ||||
|   TORCH_CHECK(pos != std::string::npos, "all symbols must have a namespace, <namespace>::<string>, but found: ", s); | ||||
|   if (pos == std::string::npos) { | ||||
|     std::stringstream ss; | ||||
|     ss << "all symbols must have a namespace, <namespace>::<string>, but found: " << s; | ||||
|     throw std::runtime_error(ss.str()); | ||||
|   } | ||||
|   Symbol ns = _symbol("namespaces::" + s.substr(0, pos)); | ||||
|  | ||||
|   Symbol sym(sym_to_info_.size()); | ||||
| @ -117,7 +121,12 @@ std::string Symbol::domainString() const { | ||||
| } | ||||
|  | ||||
| Symbol Symbol::fromDomainAndUnqualString(const std::string & d, const std::string & s) { | ||||
|   TORCH_CHECK(d.compare(0, domain_prefix().size(), domain_prefix()) == 0, "Symbol: domain string is expected to be prefixed with '", domain_prefix(), "', e.g. 'org.pytorch.aten'"); | ||||
|   if (d.compare(0, domain_prefix().size(), domain_prefix()) != 0) { | ||||
|     std::ostringstream ss; | ||||
|     ss << "Symbol: domain string is expected to be prefixed with '" | ||||
|        << domain_prefix() << "', e.g. 'org.pytorch.aten'"; | ||||
|     throw std::runtime_error(ss.str()); | ||||
|   } | ||||
|   std::string qualString = d.substr(domain_prefix().size()) + "::" + s; | ||||
|   return fromQualString(qualString); | ||||
| } | ||||
|  | ||||
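The checks above enforce the convention that every interned symbol is written as <namespace>::<string>, and that domain strings carry domain_prefix() (e.g. 'org.pytorch.aten'). A small hedged sketch of the same split, as an illustration rather than the actual InternedStrings implementation:

#include <stdexcept>
#include <string>
#include <utility>

// Split a qualified symbol such as "aten::add" into {namespace, name},
// enforcing the <namespace>::<string> convention checked above.
std::pair<std::string, std::string> split_qualified(const std::string& qual) {
  auto pos = qual.find("::");
  if (pos == std::string::npos) {
    throw std::runtime_error(
        "all symbols must have a namespace, <namespace>::<string>, but found: " + qual);
  }
  return {qual.substr(0, pos), qual.substr(pos + 2)};
}
// split_qualified("aten::add") == {"aten", "add"}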
| @ -7,7 +7,6 @@ | ||||
| #include <ATen/core/jit_type.h> | ||||
| #include <ATen/core/stack.h> | ||||
| #include <ATen/core/type_factory.h> | ||||
| #include <c10/util/Exception.h> | ||||
| #include <c10/util/StringUtil.h> | ||||
| #include <c10/util/hash.h> | ||||
| #include <c10/util/irange.h> | ||||
| @ -413,7 +412,7 @@ size_t IValue::hash(const IValue& v) { | ||||
|     case Tag::Enum: | ||||
|     case Tag::Stream: | ||||
|     case Tag::Uninitialized: | ||||
|       TORCH_CHECK(false, | ||||
|       throw std::runtime_error( | ||||
|           "unhashable type: '" + v.type()->repr_str() + "'"); | ||||
|   } | ||||
|   // the above switch should be exhaustive | ||||
|  | ||||
| @ -8,7 +8,6 @@ | ||||
| #include <ATen/core/type_factory.h> | ||||
| #include <ATen/core/qualified_name.h> | ||||
| #include <c10/util/TypeList.h> | ||||
| #include <c10/util/Exception.h> | ||||
| #include <optional> | ||||
| #include <c10/core/SymFloat.h> | ||||
| #include <c10/core/SymBool.h> | ||||
| @ -117,8 +116,10 @@ struct SingleElementType : public SharedType { | ||||
|  | ||||
|  protected: | ||||
|   SingleElementType(TypePtr elem) : SharedType(Kind), elem(std::move(elem)) { | ||||
|     TORCH_CHECK(this->elem, c10::str( | ||||
|     if (!this->elem) { | ||||
|       throw std::runtime_error(c10::str( | ||||
|             "Can not create ", typeKindToString(Kind), " with None type")); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|  private: | ||||
| @ -415,12 +416,16 @@ struct TORCH_API SymbolicShape { | ||||
|   } | ||||
|  | ||||
|   ShapeSymbol operator[](size_t i) const { | ||||
|     TORCH_CHECK(dims_, "Rank isn't fixed"); | ||||
|     if (!dims_) { | ||||
|       throw std::runtime_error("Rank isn't fixed"); | ||||
|     } | ||||
|     return (*dims_).at(i); | ||||
|   } | ||||
|  | ||||
|   ShapeSymbol at(size_t i) const { | ||||
|     TORCH_CHECK(dims_, "Rank isn't fixed"); | ||||
|     if (!dims_) { | ||||
|       throw std::runtime_error("Rank isn't fixed"); | ||||
|     } | ||||
|     return (*dims_).at(i); | ||||
|   } | ||||
|  | ||||
| @ -515,7 +520,9 @@ struct VaryingShape { | ||||
|   } | ||||
|  | ||||
|   const std::optional<T> &operator[](size_t i) const { | ||||
|     TORCH_CHECK(dims_, "Rank isn't fixed"); | ||||
|     if (!dims_) { | ||||
|       throw std::runtime_error("Rank isn't fixed"); | ||||
|     } | ||||
|     return (*dims_).at(i); | ||||
|   } | ||||
|  | ||||
| @ -950,7 +957,9 @@ struct TORCH_API DictType : public SharedType { | ||||
|  | ||||
|   TypePtr createWithContained( | ||||
|       std::vector<TypePtr> contained_types) const override { | ||||
|     TORCH_CHECK(contained_types.size() == 2, "Expected 2 contained types"); | ||||
|     if (contained_types.size() != 2) { | ||||
|       throw std::runtime_error("Expected 2 contained types"); | ||||
|     } | ||||
|     return create(std::move(contained_types.at(0)), std::move(contained_types.at(1))); | ||||
|   } | ||||
|  | ||||
|  | ||||
| @ -8,7 +8,6 @@ | ||||
| #include <ATen/core/jit_type.h> | ||||
| #include <c10/macros/Macros.h> | ||||
| #include <c10/util/env.h> | ||||
| #include <c10/util/Exception.h> | ||||
| #include <c10/util/flat_hash_map.h> | ||||
| #include <c10/util/irange.h> | ||||
| #include <array> | ||||
| @ -827,7 +826,9 @@ TupleType::TupleType( | ||||
|     : NamedType(TypeKind::TupleType, std::move(name)), | ||||
|       elements_(std::move(elements)), | ||||
|       has_free_variables_(std::any_of(elements_.begin(), elements_.end(), [](const TypePtr& v) { | ||||
|         TORCH_CHECK(v, "Can not create tuple with None type"); | ||||
|         if (!v) { | ||||
|           throw std::runtime_error("Can not create tuple with None type"); | ||||
|         } | ||||
|         return v->hasFreeVariables(); | ||||
|       })), schema_(std::move(schema)) { | ||||
|  | ||||
|  | ||||
| @ -6,11 +6,9 @@ | ||||
| #ifdef __aarch64__ | ||||
| #if !defined(CPU_CAPABILITY_SVE) | ||||
| #include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h> | ||||
| #include <ATen/cpu/vec/vec128/vec128_double_neon.h> | ||||
| #include <ATen/cpu/vec/vec128/vec128_float_neon.h> | ||||
| #include <ATen/cpu/vec/vec128/vec128_half_neon.h> | ||||
| #include <ATen/cpu/vec/vec128/vec128_int_aarch64.h> | ||||
| #include <ATen/cpu/vec/vec128/vec128_uint_aarch64.h> | ||||
| #endif | ||||
|  | ||||
| #include <ATen/cpu/vec/vec128/vec128_convert.h> | ||||
|  | ||||
| @ -354,47 +354,9 @@ class Vectorized<c10::BFloat16> : public Vectorized16< | ||||
|  | ||||
|   DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs) | ||||
|   Vectorized frac() const; | ||||
|   DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg) | ||||
|   DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(trunc) | ||||
|   DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(sqrt) | ||||
|  | ||||
| #ifdef __ARM_FEATURE_BF16 | ||||
|   Vectorized<c10::BFloat16> neg() const { | ||||
|     return -values; | ||||
|   } | ||||
|   Vectorized<c10::BFloat16> reciprocal() const { | ||||
|     return 1.0f / values; | ||||
|   } | ||||
|   Vectorized<c10::BFloat16> operator==( | ||||
|       const Vectorized<c10::BFloat16>& other) const { | ||||
|     return values == other.values; | ||||
|   } | ||||
|  | ||||
|   Vectorized<c10::BFloat16> operator!=( | ||||
|       const Vectorized<c10::BFloat16>& other) const { | ||||
|     return values != other.values; | ||||
|   } | ||||
|  | ||||
|   Vectorized<c10::BFloat16> operator<( | ||||
|       const Vectorized<c10::BFloat16>& other) const { | ||||
|     return values < other.values; | ||||
|   } | ||||
|  | ||||
|   Vectorized<c10::BFloat16> operator<=( | ||||
|       const Vectorized<c10::BFloat16>& other) const { | ||||
|     return values <= other.values; | ||||
|   } | ||||
|  | ||||
|   Vectorized<c10::BFloat16> operator>( | ||||
|       const Vectorized<c10::BFloat16>& other) const { | ||||
|     return values > other.values; | ||||
|   } | ||||
|  | ||||
|   Vectorized<c10::BFloat16> operator>=( | ||||
|       const Vectorized<c10::BFloat16>& other) const { | ||||
|     return values >= other.values; | ||||
|   } | ||||
| #else | ||||
|   DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg) | ||||
|   DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(reciprocal) | ||||
|   DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator==) | ||||
|   DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator!=) | ||||
| @ -402,7 +364,6 @@ class Vectorized<c10::BFloat16> : public Vectorized16< | ||||
|   DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<=) | ||||
|   DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>) | ||||
|   DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>=) | ||||
| #endif | ||||
|  | ||||
| #undef DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD | ||||
| #undef DEFINE_BINARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD | ||||
| @ -451,52 +412,28 @@ template <> | ||||
| Vectorized<c10::BFloat16> inline operator+( | ||||
|     const Vectorized<c10::BFloat16>& a, | ||||
|     const Vectorized<c10::BFloat16>& b) { | ||||
| #ifdef __ARM_FEATURE_BF16 | ||||
|   bfloat16x8_t x = a; | ||||
|   bfloat16x8_t y = b; | ||||
|   return x + y; | ||||
| #else | ||||
|   return binary_operator_via_float(std::plus<Vectorized<float>>(), a, b); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<c10::BFloat16> inline operator-( | ||||
|     const Vectorized<c10::BFloat16>& a, | ||||
|     const Vectorized<c10::BFloat16>& b) { | ||||
| #ifdef __ARM_FEATURE_BF16 | ||||
|   bfloat16x8_t x = a; | ||||
|   bfloat16x8_t y = b; | ||||
|   return x - y; | ||||
| #else | ||||
|   return binary_operator_via_float(std::minus<Vectorized<float>>(), a, b); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<c10::BFloat16> inline operator*( | ||||
|     const Vectorized<c10::BFloat16>& a, | ||||
|     const Vectorized<c10::BFloat16>& b) { | ||||
| #ifdef __ARM_FEATURE_BF16 | ||||
|   bfloat16x8_t x = a; | ||||
|   bfloat16x8_t y = b; | ||||
|   return x * y; | ||||
| #else | ||||
|   return binary_operator_via_float(std::multiplies<Vectorized<float>>(), a, b); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<c10::BFloat16> inline operator/( | ||||
|     const Vectorized<c10::BFloat16>& a, | ||||
|     const Vectorized<c10::BFloat16>& b) { | ||||
| #ifdef __ARM_FEATURE_BF16 | ||||
|   bfloat16x8_t x = a; | ||||
|   bfloat16x8_t y = b; | ||||
|   return x / y; | ||||
| #else | ||||
|   return binary_operator_via_float(std::divides<Vectorized<float>>(), a, b); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| // frac. Implement this here so we can use subtraction | ||||
| @ -607,19 +544,12 @@ Vectorized<c10::BFloat16> inline fmadd( | ||||
|     const Vectorized<c10::BFloat16>& a, | ||||
|     const Vectorized<c10::BFloat16>& b, | ||||
|     const Vectorized<c10::BFloat16>& c) { | ||||
| #ifdef __ARM_FEATURE_BF16 | ||||
|   bfloat16x8_t x = a; | ||||
|   bfloat16x8_t y = b; | ||||
|   bfloat16x8_t z = c; | ||||
|   return x * y + z; | ||||
| #else | ||||
|   // NOTE [BF16 FMA]: There isn't an FMA that accumulates into BF16!  Also, | ||||
|   // vbfmlalbq_f32 and vbfmlaltq_f32 take the even and odd-numbered | ||||
|   // elements, not the bottom and top half, so they don't seem | ||||
|   // particularly useful here. Ideally we would include dot product in | ||||
|   // the Vectorized interface... | ||||
|   return a * b + c; | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template <> | ||||
| @ -627,15 +557,8 @@ Vectorized<c10::BFloat16> inline fnmadd( | ||||
|     const Vectorized<c10::BFloat16>& a, | ||||
|     const Vectorized<c10::BFloat16>& b, | ||||
|     const Vectorized<c10::BFloat16>& c) { | ||||
| #ifdef __ARM_FEATURE_BF16 | ||||
|   bfloat16x8_t x = a; | ||||
|   bfloat16x8_t y = b; | ||||
|   bfloat16x8_t z = c; | ||||
|   return (-x) * y + z; | ||||
| #else | ||||
|   // See NOTE [BF16 FMA] above. | ||||
|   return -a * b + c; | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template <> | ||||
| @ -643,15 +566,8 @@ Vectorized<c10::BFloat16> inline fmsub( | ||||
|     const Vectorized<c10::BFloat16>& a, | ||||
|     const Vectorized<c10::BFloat16>& b, | ||||
|     const Vectorized<c10::BFloat16>& c) { | ||||
| #ifdef __ARM_FEATURE_BF16 | ||||
|   bfloat16x8_t x = a; | ||||
|   bfloat16x8_t y = b; | ||||
|   bfloat16x8_t z = c; | ||||
|   return x * y - z; | ||||
| #else | ||||
|   // See NOTE [BF16 FMA] above. | ||||
|   return a * b - c; | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template <> | ||||
| @ -659,15 +575,8 @@ Vectorized<c10::BFloat16> inline fnmsub( | ||||
|     const Vectorized<c10::BFloat16>& a, | ||||
|     const Vectorized<c10::BFloat16>& b, | ||||
|     const Vectorized<c10::BFloat16>& c) { | ||||
| #ifdef __ARM_FEATURE_BF16 | ||||
|   bfloat16x8_t x = a; | ||||
|   bfloat16x8_t y = b; | ||||
|   bfloat16x8_t z = c; | ||||
|   return (-x) * y - z; | ||||
| #else | ||||
|   // See NOTE [BF16 FMA] above. | ||||
|   return -a * b - c; | ||||
| #endif | ||||
| } | ||||
|  | ||||
| #endif // !defined(C10_MOBILE) && defined(__aarch64__) | ||||
|  | ||||
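The __ARM_FEATURE_BF16 branches removed above used native bfloat16x8_t arithmetic; what remains routes every bf16 operation through float, and, per NOTE [BF16 FMA], fmadd(a, b, c) is just a * b + c with two roundings. A hedged usage sketch of that fmadd specialization on aarch64 builds (buffer names and the scalar tail are illustrative assumptions, not from the diff):

#include <ATen/cpu/vec/vec.h>
#include <c10/util/BFloat16.h>
#include <cstdint>

// out[i] = x[i] * y[i] + z[i] using the Vectorized<c10::BFloat16> fmadd above.
void fused_madd_bf16(const c10::BFloat16* x, const c10::BFloat16* y,
                     const c10::BFloat16* z, c10::BFloat16* out, int64_t n) {
  using Vec = at::vec::Vectorized<c10::BFloat16>;
  int64_t i = 0;
  for (; i + Vec::size() <= n; i += Vec::size()) {
    Vec vx = Vec::loadu(x + i);
    Vec vy = Vec::loadu(y + i);
    Vec vz = Vec::loadu(z + i);
    // No bf16-accumulating FMA exists, so this is a multiply then an add,
    // each computed via float (see NOTE [BF16 FMA] above).
    at::vec::fmadd(vx, vy, vz).store(out + i);
  }
  for (; i < n; ++i) {  // scalar tail for the leftover elements
    out[i] = static_cast<float>(x[i]) * static_cast<float>(y[i]) +
             static_cast<float>(z[i]);
  }
}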
| @ -5,114 +5,6 @@ | ||||
| namespace at::vec { | ||||
| inline namespace CPU_CAPABILITY { | ||||
| #if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)) | ||||
|  | ||||
| // Enable auto-vectorization for GCC-13+ and clang-17+ | ||||
| // GCC-12 has a bug: gcc.gnu.org/bugzilla/show_bug.cgi?id=117001 | ||||
| #if __GNUC__ > 12 || (defined(__clang__) && (__clang_major__ >= 17)) | ||||
|  | ||||
| template <typename from_type, typename to_type> | ||||
| inline void convertImpl( | ||||
|     const from_type* __restrict src, | ||||
|     to_type* __restrict dst, | ||||
|     int64_t n) { | ||||
|   uint64_t len = static_cast<uint64_t>(n); | ||||
|   for (uint64_t i = 0; i < len; i++) { | ||||
|     dst[i] = static_cast<to_type>(src[i]); | ||||
|   } | ||||
| } | ||||
|  | ||||
| #define CONVERT_TEMPLATE(from_type, to_type)                           \ | ||||
|   template <>                                                          \ | ||||
|   inline void convert(const from_type* src, to_type* dst, int64_t n) { \ | ||||
|     return convertImpl<from_type, to_type>(src, dst, n);               \ | ||||
|   } | ||||
|  | ||||
| CONVERT_TEMPLATE(uint8_t, uint8_t) | ||||
| CONVERT_TEMPLATE(uint8_t, int8_t) | ||||
| CONVERT_TEMPLATE(uint8_t, int16_t) | ||||
| CONVERT_TEMPLATE(uint8_t, int32_t) | ||||
| CONVERT_TEMPLATE(uint8_t, int64_t) | ||||
| CONVERT_TEMPLATE(uint8_t, float) | ||||
| CONVERT_TEMPLATE(uint8_t, double) | ||||
| CONVERT_TEMPLATE(int8_t, uint8_t) | ||||
| CONVERT_TEMPLATE(int8_t, int8_t) | ||||
| CONVERT_TEMPLATE(int8_t, int16_t) | ||||
| CONVERT_TEMPLATE(int8_t, int32_t) | ||||
| CONVERT_TEMPLATE(int8_t, int64_t) | ||||
| CONVERT_TEMPLATE(int8_t, float) | ||||
| CONVERT_TEMPLATE(int8_t, double) | ||||
| CONVERT_TEMPLATE(int16_t, uint8_t) | ||||
| CONVERT_TEMPLATE(int16_t, int8_t) | ||||
| CONVERT_TEMPLATE(int16_t, int16_t) | ||||
| CONVERT_TEMPLATE(int16_t, int32_t) | ||||
| CONVERT_TEMPLATE(int16_t, int64_t) | ||||
| CONVERT_TEMPLATE(int16_t, float) | ||||
| CONVERT_TEMPLATE(int16_t, double) | ||||
| CONVERT_TEMPLATE(int32_t, uint8_t) | ||||
| CONVERT_TEMPLATE(int32_t, int8_t) | ||||
| CONVERT_TEMPLATE(int32_t, int16_t) | ||||
| CONVERT_TEMPLATE(int32_t, int32_t) | ||||
| CONVERT_TEMPLATE(int32_t, int64_t) | ||||
| CONVERT_TEMPLATE(int32_t, float) | ||||
| CONVERT_TEMPLATE(int32_t, double) | ||||
| CONVERT_TEMPLATE(int64_t, uint8_t) | ||||
| CONVERT_TEMPLATE(int64_t, int8_t) | ||||
| CONVERT_TEMPLATE(int64_t, int16_t) | ||||
| CONVERT_TEMPLATE(int64_t, int32_t) | ||||
| CONVERT_TEMPLATE(int64_t, int64_t) | ||||
| CONVERT_TEMPLATE(int64_t, float) | ||||
| CONVERT_TEMPLATE(int64_t, double) | ||||
| CONVERT_TEMPLATE(float, uint8_t) | ||||
| CONVERT_TEMPLATE(float, int8_t) | ||||
| CONVERT_TEMPLATE(float, int16_t) | ||||
| CONVERT_TEMPLATE(float, int32_t) | ||||
| CONVERT_TEMPLATE(float, int64_t) | ||||
| CONVERT_TEMPLATE(float, float) | ||||
| CONVERT_TEMPLATE(float, double) | ||||
| CONVERT_TEMPLATE(double, uint8_t) | ||||
| CONVERT_TEMPLATE(double, int8_t) | ||||
| CONVERT_TEMPLATE(double, int16_t) | ||||
| CONVERT_TEMPLATE(double, int32_t) | ||||
| CONVERT_TEMPLATE(double, int64_t) | ||||
| CONVERT_TEMPLATE(double, float) | ||||
| CONVERT_TEMPLATE(double, double) | ||||
| #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | ||||
| CONVERT_TEMPLATE(float16_t, uint8_t) | ||||
| CONVERT_TEMPLATE(float16_t, int8_t) | ||||
| CONVERT_TEMPLATE(float16_t, int16_t) | ||||
| CONVERT_TEMPLATE(float16_t, int32_t) | ||||
| CONVERT_TEMPLATE(float16_t, int64_t) | ||||
| CONVERT_TEMPLATE(float16_t, float16_t) | ||||
| CONVERT_TEMPLATE(float16_t, float) | ||||
| CONVERT_TEMPLATE(float16_t, double) | ||||
| CONVERT_TEMPLATE(uint8_t, float16_t) | ||||
| CONVERT_TEMPLATE(int8_t, float16_t) | ||||
| CONVERT_TEMPLATE(int16_t, float16_t) | ||||
| CONVERT_TEMPLATE(int32_t, float16_t) | ||||
| CONVERT_TEMPLATE(int64_t, float16_t) | ||||
| CONVERT_TEMPLATE(float, float16_t) | ||||
| CONVERT_TEMPLATE(double, float16_t) | ||||
| #endif | ||||
| #ifdef __ARM_FEATURE_BF16 | ||||
| CONVERT_TEMPLATE(bfloat16_t, uint8_t) | ||||
| CONVERT_TEMPLATE(bfloat16_t, int8_t) | ||||
| CONVERT_TEMPLATE(bfloat16_t, int16_t) | ||||
| CONVERT_TEMPLATE(bfloat16_t, int32_t) | ||||
| CONVERT_TEMPLATE(bfloat16_t, int64_t) | ||||
| CONVERT_TEMPLATE(bfloat16_t, bfloat16_t) | ||||
| CONVERT_TEMPLATE(bfloat16_t, float) | ||||
| CONVERT_TEMPLATE(bfloat16_t, double) | ||||
| CONVERT_TEMPLATE(uint8_t, bfloat16_t) | ||||
| CONVERT_TEMPLATE(int8_t, bfloat16_t) | ||||
| CONVERT_TEMPLATE(int16_t, bfloat16_t) | ||||
| CONVERT_TEMPLATE(int32_t, bfloat16_t) | ||||
| CONVERT_TEMPLATE(int64_t, bfloat16_t) | ||||
| CONVERT_TEMPLATE(float, bfloat16_t) | ||||
| CONVERT_TEMPLATE(double, bfloat16_t) | ||||
| #endif | ||||
|  | ||||
| #endif | ||||
|  | ||||
| template <typename src_t> | ||||
| struct VecConvert< | ||||
|     float, | ||||
|  | ||||
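The CONVERT_TEMPLATE block above stamps out convert<from, to> specializations as plain element-wise loops and relies on GCC 13+/clang 17+ to auto-vectorize them. A hedged usage sketch (the wrapper function is an assumption; at::vec::convert is the entry point the specializations plug into):

#include <ATen/cpu/vec/vec.h>
#include <cstdint>

// Widening int8 data to float hits the CONVERT_TEMPLATE(int8_t, float)
// specialization on builds that compile this header; other builds fall back
// to the generic element-wise loop in vec_base.h.
void widen_int8_to_float(const int8_t* src, float* dst, int64_t n) {
  at::vec::convert(src, dst, n);
}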
| @ -1,586 +0,0 @@ | ||||
| #pragma once | ||||
|  | ||||
| #include <ATen/cpu/vec/intrinsics.h> | ||||
| #include <ATen/cpu/vec/vec_base.h> | ||||
| #include <c10/macros/Macros.h> | ||||
| #include <c10/util/irange.h> | ||||
| #include <cmath> | ||||
|  | ||||
| namespace at::vec { | ||||
| // Note [CPU_CAPABILITY namespace] | ||||
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
| // This header, and all of its subheaders, will be compiled with | ||||
| // different architecture flags for each supported set of vector | ||||
| // intrinsics. So we need to make sure they aren't inadvertently | ||||
| // linked together. We do this by declaring objects in an `inline | ||||
| // namespace` which changes the name mangling, but can still be | ||||
| // accessed as `at::vec`. | ||||
| inline namespace CPU_CAPABILITY { | ||||
|  | ||||
| template <> | ||||
| struct is_vec_specialized_for<double> : std::bool_constant<true> {}; | ||||
|  | ||||
| template <> | ||||
| class Vectorized<double> { | ||||
|  private: | ||||
|   float64x2_t values; | ||||
|  | ||||
|  public: | ||||
|   using value_type = double; | ||||
|   using size_type = int; | ||||
|   static constexpr size_type size() { | ||||
|     return 2; | ||||
|   } | ||||
|   Vectorized() { | ||||
|     values = vdupq_n_f64(0.0); | ||||
|   } | ||||
|   Vectorized(float64x2_t v) : values(v) {} | ||||
|   Vectorized(double val) { | ||||
|     values = vdupq_n_f64(val); | ||||
|   } | ||||
|   template < | ||||
|       typename... Args, | ||||
|       typename = std::enable_if_t<(sizeof...(Args) == size())>> | ||||
|   Vectorized(Args... vals) { | ||||
|     __at_align__ double buffer[size()] = {vals...}; | ||||
|     values = vld1q_f64(buffer); | ||||
|   } | ||||
|   operator float64x2_t() const { | ||||
|     return values; | ||||
|   } | ||||
|   template <int64_t mask> | ||||
|   static Vectorized<double> blend( | ||||
|       const Vectorized<double>& a, | ||||
|       const Vectorized<double>& b) { | ||||
|     // Build an array of flags: each element is all ones if the corresponding | ||||
|     // bit in 'mask' is set, all zeros otherwise. | ||||
|     uint64x2_t maskArray = { | ||||
|         (mask & 1ULL) ? 0xFFFFFFFFFFFFFFFF : 0, | ||||
|         (mask & 2ULL) ? 0xFFFFFFFFFFFFFFFF : 0}; | ||||
|     // Use BSL to select elements from b where the mask is 1, else from a | ||||
|     return vbslq_f64(maskArray, b.values, a.values); | ||||
|   } | ||||
|   static Vectorized<double> blendv( | ||||
|       const Vectorized<double>& a, | ||||
|       const Vectorized<double>& b, | ||||
|       const Vectorized<double>& mask_) { | ||||
|     return vbslq_f64(vreinterpretq_u64_f64(mask_.values), b.values, a.values); | ||||
|   } | ||||
|   template <typename step_t> | ||||
|   static Vectorized<double> arange( | ||||
|       double base = 0., | ||||
|       step_t step = static_cast<step_t>(1)) { | ||||
|     return {base, base + static_cast<double>(step)}; | ||||
|   } | ||||
|   static inline Vectorized<double> set( | ||||
|       const Vectorized<double>& a, | ||||
|       const Vectorized<double>& b, | ||||
|       int64_t count = size()) { | ||||
|     if (count == 0) { | ||||
|       return a; | ||||
|     } else if (count >= 2) { | ||||
|       return b; | ||||
|     } else { | ||||
|       float64x2_t c = {b.values[0], a.values[1]}; | ||||
|       return c; | ||||
|     } | ||||
|   } | ||||
|   static Vectorized<double> loadu(const void* ptr, int64_t count = size()) { | ||||
|     if (count == size()) { | ||||
|       return vld1q_f64(reinterpret_cast<const double*>(ptr)); | ||||
|     } else if (count == 1) { | ||||
|       float64x1_t x = vld1_f64(reinterpret_cast<const double*>(ptr)); | ||||
|       float64x1_t z = {0.0}; | ||||
|       return vcombine_f64(x, z); | ||||
|     } else { | ||||
|       return vdupq_n_f64(0.0); | ||||
|     } | ||||
|   } | ||||
|   void store(void* ptr, int64_t count = size()) const { | ||||
|     if (count == size()) { | ||||
|       vst1q_f64(reinterpret_cast<double*>(ptr), values); | ||||
|     } else if (count == 1) { | ||||
|       vst1_f64(reinterpret_cast<double*>(ptr), vget_low_f64(values)); | ||||
|     } | ||||
|   } | ||||
|   const double& operator[](int idx) const = delete; | ||||
|   double& operator[](int idx) = delete; | ||||
|   int64_t zero_mask() const { | ||||
|     // Returns an integer mask in which each zero element is translated to a | ||||
|     // 1 bit and every non-zero element to a 0 bit. | ||||
|     uint64x2_t cmpReg = vceqzq_f64(values); | ||||
|     uint64x2_t mask = {1, 2}; | ||||
|     uint64x2_t res = vandq_u64(cmpReg, mask); | ||||
|     return res[0] | res[1]; | ||||
|   } | ||||
|   Vectorized<double> isnan() const { | ||||
|     // NaN check | ||||
|     return vreinterpretq_f64_u32( | ||||
|         vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(values, values)))); | ||||
|   } | ||||
|   bool has_inf_nan() const { | ||||
|     Vectorized<double> x = vsubq_f64(values, values); | ||||
|     float64x2_t r = x.isnan(); | ||||
|     uint64x2_t u = vreinterpretq_u64_f64(r); | ||||
|     return u[0] | u[1]; | ||||
|   } | ||||
|   Vectorized<double> map(double (*f)(double)) const { | ||||
|     float64x2_t result; | ||||
|     result[0] = f(values[0]); | ||||
|     result[1] = f(values[1]); | ||||
|     return result; | ||||
|   } | ||||
|   Vectorized<double> map2( | ||||
|       const Vectorized<double>& second, | ||||
|       double (*const f)(double, double)) const { | ||||
|     float64x2_t result; | ||||
|     result[0] = f(values[0], second.values[0]); | ||||
|     result[1] = f(values[1], second.values[1]); | ||||
|     return result; | ||||
|   } | ||||
|   Vectorized<double> abs() const { | ||||
|     return vabsq_f64(values); | ||||
|   } | ||||
|   Vectorized<double> angle() const { | ||||
|     auto zero = Vectorized<double>(0.0); | ||||
|     auto pi = Vectorized<double>(c10::pi<double>); | ||||
|     auto tmp = blendv(zero, pi, vreinterpretq_f64_u64(vcltzq_f64(values))); | ||||
|     return blendv(tmp, *this, isnan()); | ||||
|   } | ||||
|   Vectorized<double> real() const { | ||||
|     return *this; | ||||
|   } | ||||
|   Vectorized<double> imag() const { | ||||
|     return Vectorized<double>(0.0); | ||||
|   } | ||||
|   Vectorized<double> conj() const { | ||||
|     return *this; | ||||
|   } | ||||
|   Vectorized<double> acos() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_acosd2_u10(values)), map(std::acos)); | ||||
|   } | ||||
|   Vectorized<double> acosh() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_acoshd2_u10(values)), map(std::acosh)); | ||||
|   } | ||||
|   Vectorized<double> asin() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_asind2_u10(values)), map(std::asin)); | ||||
|   } | ||||
|   Vectorized<double> asinh() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_asinhd2_u10(values)), map(std::asinh)); | ||||
|   } | ||||
|   Vectorized<double> atan() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_atand2_u10(values)), map(std::atan)); | ||||
|   } | ||||
|   Vectorized<double> atanh() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_atanhd2_u10(values)), map(std::atanh)); | ||||
|   } | ||||
|   Vectorized<double> atan2(const Vectorized<double>& b) const {USE_SLEEF( | ||||
|       { return Vectorized<double>(Sleef_atan2d2_u10(values, b)); }, | ||||
|       { | ||||
|         __at_align__ double tmp[size()]; | ||||
|         __at_align__ double tmp_b[size()]; | ||||
|         store(tmp); | ||||
|         b.store(tmp_b); | ||||
|         for (int64_t i = 0; i < size(); i++) { | ||||
|           tmp[i] = std::atan2(tmp[i], tmp_b[i]); | ||||
|         } | ||||
|         return loadu(tmp); | ||||
|       })} Vectorized<double> copysign(const Vectorized<double>& sign) const { | ||||
|       USE_SLEEF( | ||||
|           { return Vectorized<double>(Sleef_copysignd2(values, sign)); }, | ||||
|           { | ||||
|             __at_align__ double tmp[size()]; | ||||
|             __at_align__ double tmp_sign[size()]; | ||||
|             store(tmp); | ||||
|             sign.store(tmp_sign); | ||||
|             for (int64_t i = 0; i < size(); i++) { | ||||
|               tmp[i] = std::copysign(tmp[i], tmp_sign[i]); | ||||
|             } | ||||
|             return loadu(tmp); | ||||
|           })} Vectorized<double> erf() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_erfd2_u10(values)), map(std::erf)); | ||||
|   } | ||||
|   Vectorized<double> erfc() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_erfcd2_u15(values)), map(std::erfc)); | ||||
|   } | ||||
|   Vectorized<double> exp() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_expd2_u10(values)), map(std::exp)); | ||||
|   } | ||||
|   Vectorized<double> exp2() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_exp2d2_u10(values)), map(std::exp2)); | ||||
|   } | ||||
|   Vectorized<double> expm1() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_expm1d2_u10(values)), map(std::expm1)); | ||||
|   } | ||||
|   Vectorized<double> fmod(const Vectorized<double>& q) const {USE_SLEEF( | ||||
|       { return Vectorized<double>(Sleef_fmodd2(values, q)); }, | ||||
|       { | ||||
|         __at_align__ double tmp[size()]; | ||||
|         __at_align__ double tmp_q[size()]; | ||||
|         store(tmp); | ||||
|         q.store(tmp_q); | ||||
|         for (int64_t i = 0; i < size(); i++) { | ||||
|           tmp[i] = std::fmod(tmp[i], tmp_q[i]); | ||||
|         } | ||||
|         return loadu(tmp); | ||||
|       })} Vectorized<double> hypot(const Vectorized<double>& b) const { | ||||
|       USE_SLEEF( | ||||
|           { return Vectorized<double>(Sleef_hypotd2_u05(values, b)); }, | ||||
|           { | ||||
|             __at_align__ double tmp[size()]; | ||||
|             __at_align__ double tmp_b[size()]; | ||||
|             store(tmp); | ||||
|             b.store(tmp_b); | ||||
|             for (int64_t i = 0; i < size(); i++) { | ||||
|               tmp[i] = std::hypot(tmp[i], tmp_b[i]); | ||||
|             } | ||||
|             return loadu(tmp); | ||||
|           })} Vectorized<double> i0() const { | ||||
|     return map(calc_i0); | ||||
|   } | ||||
|   Vectorized<double> nextafter(const Vectorized<double>& b) const {USE_SLEEF( | ||||
|       { return Vectorized<double>(Sleef_nextafterd2(values, b)); }, | ||||
|       { | ||||
|         __at_align__ double tmp[size()]; | ||||
|         __at_align__ double tmp_b[size()]; | ||||
|         store(tmp); | ||||
|         b.store(tmp_b); | ||||
|         for (int64_t i = 0; i < size(); ++i) { | ||||
|           tmp[i] = std::nextafter(tmp[i], tmp_b[i]); | ||||
|         } | ||||
|         return loadu(tmp); | ||||
|       })} Vectorized<double> log() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_logd2_u10(values)), map(std::log)); | ||||
|   } | ||||
|   Vectorized<double> log2() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_log2d2_u10(values)), map(std::log2)); | ||||
|   } | ||||
|   Vectorized<double> log10() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_log10d2_u10(values)), map(std::log10)); | ||||
|   } | ||||
|   Vectorized<double> log1p() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_log1pd2_u10(values)), map(std::log1p)); | ||||
|   } | ||||
|   Vectorized<double> frac() const; | ||||
|   Vectorized<double> sin() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_sind2_u10(values)), map(std::sin)); | ||||
|   } | ||||
|   Vectorized<double> sinh() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_sinhd2_u10(values)), map(std::sinh)); | ||||
|   } | ||||
|   Vectorized<double> cos() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_cosd2_u10(values)), map(std::cos)); | ||||
|   } | ||||
|   Vectorized<double> cosh() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_coshd2_u10(values)), map(std::cosh)); | ||||
|   } | ||||
|   Vectorized<double> pow(const Vectorized<double>& b) const {USE_SLEEF( | ||||
|       { return Vectorized<double>(Sleef_powd2_u10(values, b)); }, | ||||
|       { | ||||
|         __at_align__ double tmp[size()]; | ||||
|         __at_align__ double tmp_b[size()]; | ||||
|         store(tmp); | ||||
|         b.store(tmp_b); | ||||
|         for (int64_t i = 0; i < size(); i++) { | ||||
|           tmp[i] = std::pow(tmp[i], tmp_b[i]); | ||||
|         } | ||||
|         return loadu(tmp); | ||||
|       })} // Comparison using the _CMP_**_OQ predicate. | ||||
|           //   `O`: get false if an operand is NaN | ||||
|           //   `Q`: do not raise if an operand is NaN | ||||
|   Vectorized<double> tan() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_tand2_u10(values)), map(std::tan)); | ||||
|   } | ||||
|   Vectorized<double> tanh() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_tanhd2_u10(values)), map(std::tanh)); | ||||
|   } | ||||
|   Vectorized<double> lgamma() const { | ||||
|     return USE_SLEEF( | ||||
|         Vectorized<double>(Sleef_lgammad2_u10(values)), map(std::lgamma)); | ||||
|   } | ||||
|   Vectorized<double> erfinv() const { | ||||
|     return map(calc_erfinv); | ||||
|   } | ||||
|   Vectorized<double> exp_u20() const { | ||||
|     return exp(); | ||||
|   } | ||||
|   Vectorized<double> fexp_u20() const { | ||||
|     return exp(); | ||||
|   } | ||||
|   Vectorized<double> i0e() const { | ||||
|     return map(calc_i0e); | ||||
|   } | ||||
|   Vectorized<double> digamma() const { | ||||
|     return map(calc_digamma); | ||||
|   } | ||||
|   Vectorized<double> igamma(const Vectorized<double>& x) const { | ||||
|     __at_align__ double tmp[size()]; | ||||
|     __at_align__ double tmp_x[size()]; | ||||
|     store(tmp); | ||||
|     x.store(tmp_x); | ||||
|     for (int64_t i = 0; i < size(); i++) { | ||||
|       tmp[i] = calc_igamma(tmp[i], tmp_x[i]); | ||||
|     } | ||||
|     return loadu(tmp); | ||||
|   } | ||||
|   Vectorized<double> igammac(const Vectorized<double>& x) const { | ||||
|     __at_align__ double tmp[size()]; | ||||
|     __at_align__ double tmp_x[size()]; | ||||
|     store(tmp); | ||||
|     x.store(tmp_x); | ||||
|     for (int64_t i = 0; i < size(); i++) { | ||||
|       tmp[i] = calc_igammac(tmp[i], tmp_x[i]); | ||||
|     } | ||||
|     return loadu(tmp); | ||||
|   } | ||||
|   Vectorized<double> ceil() const { | ||||
|     return vrndpq_f64(values); | ||||
|   } | ||||
|   Vectorized<double> floor() const { | ||||
|     return vrndmq_f64(values); | ||||
|   } | ||||
|   Vectorized<double> neg() const { | ||||
|     return vnegq_f64(values); | ||||
|   } | ||||
|   Vectorized<double> round() const { | ||||
|     return vrndiq_f64(values); | ||||
|   } | ||||
|   Vectorized<double> trunc() const { | ||||
|     return vrndq_f64(values); | ||||
|   } | ||||
|   Vectorized<double> sqrt() const { | ||||
|     return vsqrtq_f64(values); | ||||
|   } | ||||
|   Vectorized<double> reciprocal() const { | ||||
|     return vdivq_f64(vdupq_n_f64(1.0), values); | ||||
|   } | ||||
|   Vectorized<double> rsqrt() const { | ||||
|     return vdivq_f64(vdupq_n_f64(1.0), vsqrtq_f64(values)); | ||||
|   } | ||||
|   double reduce_add() const { | ||||
|     return vaddvq_f64(values); | ||||
|   } | ||||
|   double reduce_max() const { | ||||
|     return vmaxvq_f64(values); | ||||
|   } | ||||
|   Vectorized<double> operator==(const Vectorized<double>& other) const { | ||||
|     return Vectorized<double>( | ||||
|         vreinterpretq_f64_u64(vceqq_f64(values, other.values))); | ||||
|   } | ||||
|  | ||||
|   Vectorized<double> operator!=(const Vectorized<double>& other) const { | ||||
|     float64x2_t r0 = vreinterpretq_f64_u32( | ||||
|         vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(values, other.values)))); | ||||
|     return Vectorized<double>(r0); | ||||
|   } | ||||
|  | ||||
|   Vectorized<double> operator<(const Vectorized<double>& other) const { | ||||
|     return Vectorized<double>( | ||||
|         vreinterpretq_f64_u64(vcltq_f64(values, other.values))); | ||||
|   } | ||||
|  | ||||
|   Vectorized<double> operator<=(const Vectorized<double>& other) const { | ||||
|     return Vectorized<double>( | ||||
|         vreinterpretq_f64_u64(vcleq_f64(values, other.values))); | ||||
|   } | ||||
|  | ||||
|   Vectorized<double> operator>(const Vectorized<double>& other) const { | ||||
|     return Vectorized<double>( | ||||
|         vreinterpretq_f64_u64(vcgtq_f64(values, other.values))); | ||||
|   } | ||||
|  | ||||
|   Vectorized<double> operator>=(const Vectorized<double>& other) const { | ||||
|     return Vectorized<double>( | ||||
|         vreinterpretq_f64_u64(vcgeq_f64(values, other.values))); | ||||
|   } | ||||
|  | ||||
|   Vectorized<double> eq(const Vectorized<double>& other) const; | ||||
|   Vectorized<double> ne(const Vectorized<double>& other) const; | ||||
|   Vectorized<double> gt(const Vectorized<double>& other) const; | ||||
|   Vectorized<double> ge(const Vectorized<double>& other) const; | ||||
|   Vectorized<double> lt(const Vectorized<double>& other) const; | ||||
|   Vectorized<double> le(const Vectorized<double>& other) const; | ||||
| }; | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline operator+( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& b) { | ||||
|   return vaddq_f64(a, b); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline operator-( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& b) { | ||||
|   return vsubq_f64(a, b); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline operator*( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& b) { | ||||
|   return vmulq_f64(a, b); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline operator/( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& b) { | ||||
|   return vdivq_f64(a, b); | ||||
| } | ||||
|  | ||||
| // frac. Implement this here so we can use subtraction | ||||
| Vectorized<double> inline Vectorized<double>::frac() const { | ||||
|   return *this - this->trunc(); | ||||
| } | ||||
|  | ||||
| // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if | ||||
| // either input is a NaN. | ||||
| template <> | ||||
| Vectorized<double> inline maximum( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& b) { | ||||
|   return vmaxq_f64(a, b); | ||||
| } | ||||
|  | ||||
| // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if | ||||
| // either input is a NaN. | ||||
| template <> | ||||
| Vectorized<double> inline minimum( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& b) { | ||||
|   return vminq_f64(a, b); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline clamp( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& min, | ||||
|     const Vectorized<double>& max) { | ||||
|   return vminq_f64(max, vmaxq_f64(min, a)); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline clamp_max( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& max) { | ||||
|   return vminq_f64(max, a); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline clamp_min( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& min) { | ||||
|   return vmaxq_f64(min, a); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline operator&( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& b) { | ||||
|   return vreinterpretq_f64_u64( | ||||
|       vandq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b))); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline operator|( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& b) { | ||||
|   return vreinterpretq_f64_u64( | ||||
|       vorrq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b))); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline operator^( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& b) { | ||||
|   return vreinterpretq_f64_u64( | ||||
|       veorq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b))); | ||||
| } | ||||
|  | ||||
| inline Vectorized<double> Vectorized<double>::eq( | ||||
|     const Vectorized<double>& other) const { | ||||
|   return (*this == other) & Vectorized<double>(1.0); | ||||
| } | ||||
|  | ||||
| inline Vectorized<double> Vectorized<double>::ne( | ||||
|     const Vectorized<double>& other) const { | ||||
|   return (*this != other) & Vectorized<double>(1.0); | ||||
| } | ||||
|  | ||||
| inline Vectorized<double> Vectorized<double>::gt( | ||||
|     const Vectorized<double>& other) const { | ||||
|   return (*this > other) & Vectorized<double>(1.0); | ||||
| } | ||||
|  | ||||
| inline Vectorized<double> Vectorized<double>::ge( | ||||
|     const Vectorized<double>& other) const { | ||||
|   return (*this >= other) & Vectorized<double>(1.0); | ||||
| } | ||||
|  | ||||
| inline Vectorized<double> Vectorized<double>::lt( | ||||
|     const Vectorized<double>& other) const { | ||||
|   return (*this < other) & Vectorized<double>(1.0); | ||||
| } | ||||
|  | ||||
| inline Vectorized<double> Vectorized<double>::le( | ||||
|     const Vectorized<double>& other) const { | ||||
|   return (*this <= other) & Vectorized<double>(1.0); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline fmadd( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& b, | ||||
|     const Vectorized<double>& c) { | ||||
|   return vfmaq_f64(c, a, b); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline fnmadd( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& b, | ||||
|     const Vectorized<double>& c) { | ||||
|   return vfmsq_f64(c, a, b); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline fmsub( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& b, | ||||
|     const Vectorized<double>& c) { | ||||
|   return vfmaq_f64(vnegq_f64(c), a, b); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline fnmsub( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& b, | ||||
|     const Vectorized<double>& c) { | ||||
|   return vfmsq_f64(vnegq_f64(c), a, b); | ||||
| } | ||||
|  | ||||
| } // namespace CPU_CAPABILITY | ||||
| } // namespace at::vec | ||||
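The file removed above provided the NEON specialization of Vectorized<double>; its surface (loadu, the arithmetic operators, reduce_add, store) is what the listing defines. A hedged usage example of that listed API, with the function name and scalar tail as illustrative assumptions:

#include <ATen/cpu/vec/vec.h>
#include <cstdint>

// Horizontal sum of a double buffer: two lanes at a time via loadu/operator+,
// reduce_add to collapse the accumulator, then a scalar tail.
double sum_doubles(const double* data, int64_t n) {
  using Vec = at::vec::Vectorized<double>;
  Vec acc(0.0);
  int64_t i = 0;
  for (; i + Vec::size() <= n; i += Vec::size()) {
    acc = acc + Vec::loadu(data + i);
  }
  double total = acc.reduce_add();
  for (; i < n; ++i) {
    total += data[i];
  }
  return total;
}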
| @ -540,6 +540,42 @@ inline Vectorized<float> Vectorized<float>::le( | ||||
|   return (*this <= other) & Vectorized<float>(1.0f); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| inline void convert(const float* src, int32_t* dst, int64_t n) { | ||||
|   int64_t i; | ||||
| #ifndef __msvc_cl__ | ||||
| #pragma unroll | ||||
| #endif | ||||
|   for (i = 0; i <= (n - Vectorized<float>::size()); | ||||
|        i += Vectorized<float>::size()) { | ||||
|     vst1q_s32(dst + i, vcvtq_s32_f32(vld1q_f32(src + i))); | ||||
|   } | ||||
| #ifndef __msvc_cl__ | ||||
| #pragma unroll | ||||
| #endif | ||||
|   for (; i < n; i++) { | ||||
|     dst[i] = static_cast<int32_t>(src[i]); | ||||
|   } | ||||
| } | ||||
|  | ||||
| template <> | ||||
| inline void convert(const int32_t* src, float* dst, int64_t n) { | ||||
|   int64_t i; | ||||
| #ifndef __msvc_cl__ | ||||
| #pragma unroll | ||||
| #endif | ||||
|   for (i = 0; i <= (n - Vectorized<float>::size()); | ||||
|        i += Vectorized<float>::size()) { | ||||
|     vst1q_f32(dst + i, vcvtq_f32_s32(vld1q_s32(src + i))); | ||||
|   } | ||||
| #ifndef __msvc_cl__ | ||||
| #pragma unroll | ||||
| #endif | ||||
|   for (; i < n; i++) { | ||||
|     dst[i] = static_cast<float>(src[i]); | ||||
|   } | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<float> inline fmadd( | ||||
|     const Vectorized<float>& a, | ||||
|  | ||||
| @ -569,6 +569,46 @@ inline Vectorized<c10::Half> Vectorized<c10::Half>::le( | ||||
|   return (*this <= other) & Vectorized<c10::Half>(1); | ||||
| } | ||||
|  | ||||
| // These are global functions, so the defaults in vec_base.h should | ||||
| // work fine if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is not available. | ||||
| #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | ||||
| template <> | ||||
| inline void convert(const float16_t* src, int16_t* dst, int64_t n) { | ||||
|   int64_t i; | ||||
| #ifndef __msvc_cl__ | ||||
| #pragma unroll | ||||
| #endif | ||||
|   for (i = 0; i <= (n - Vectorized<c10::Half>::size()); | ||||
|        i += Vectorized<c10::Half>::size()) { | ||||
|     vst1q_s16(dst + i, vcvtq_s16_f16(vld1q_f16(src + i))); | ||||
|   } | ||||
| #ifndef __msvc_cl__ | ||||
| #pragma unroll | ||||
| #endif | ||||
|   for (; i < n; i++) { | ||||
|     dst[i] = static_cast<int16_t>(src[i]); | ||||
|   } | ||||
| } | ||||
|  | ||||
| template <> | ||||
| inline void convert(const int16_t* src, float16_t* dst, int64_t n) { | ||||
|   int64_t i; | ||||
| #ifndef __msvc_cl__ | ||||
| #pragma unroll | ||||
| #endif | ||||
|   for (i = 0; i <= (n - Vectorized<c10::Half>::size()); | ||||
|        i += Vectorized<c10::Half>::size()) { | ||||
|     vst1q_f16(dst + i, vcvtq_f16_s16(vld1q_s16(src + i))); | ||||
|   } | ||||
| #ifndef __msvc_cl__ | ||||
| #pragma unroll | ||||
| #endif | ||||
|   for (; i < n; i++) { | ||||
|     dst[i] = static_cast<float16_t>(src[i]); | ||||
|   } | ||||
| } | ||||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | ||||
|  | ||||
| template <> | ||||
| Vectorized<c10::Half> inline fmadd( | ||||
|     const Vectorized<c10::Half>& a, | ||||
|  | ||||
| @ -1,378 +0,0 @@ | ||||
| #pragma once | ||||
|  | ||||
| #include <ATen/cpu/vec/intrinsics.h> | ||||
| #include <ATen/cpu/vec/vec_base.h> | ||||
| #include <c10/macros/Macros.h> | ||||
| #include <c10/util/irange.h> | ||||
|  | ||||
| namespace at::vec { | ||||
| // Note [CPU_CAPABILITY namespace] | ||||
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
| // This header, and all of its subheaders, will be compiled with | ||||
| // different architecture flags for each supported set of vector | ||||
| // intrinsics. So we need to make sure they aren't inadvertently | ||||
| // linked together. We do this by declaring objects in an `inline | ||||
| // namespace` which changes the name mangling, but can still be | ||||
| // accessed as `at::vec`. | ||||
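| // For example, with CPU_CAPABILITY defined as DEFAULT, a Vectorized<T> defined | ||||
| // below mangles as at::vec::DEFAULT::Vectorized<T>, yet code compiled with the | ||||
| // same flags can still refer to it simply as at::vec::Vectorized<T>. | ||||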
| inline namespace CPU_CAPABILITY { | ||||
|  | ||||
| #define VEC_UINT_NEON_TEMPLATE(vl, bit)                                       \ | ||||
|   template <>                                                                 \ | ||||
|   struct is_vec_specialized_for<uint##bit##_t> : std::bool_constant<true> {}; \ | ||||
|                                                                               \ | ||||
|   template <>                                                                 \ | ||||
|   class Vectorized<uint##bit##_t> {                                           \ | ||||
|     using neon_type = uint##bit##x##vl##_t;                                   \ | ||||
|                                                                               \ | ||||
|    private:                                                                   \ | ||||
|     neon_type values;                                                         \ | ||||
|                                                                               \ | ||||
|    public:                                                                    \ | ||||
|     using value_type = uint##bit##_t;                                         \ | ||||
|     using size_type = int;                                                    \ | ||||
|     static constexpr size_type size() {                                       \ | ||||
|       return vl;                                                              \ | ||||
|     }                                                                         \ | ||||
|     Vectorized() {                                                            \ | ||||
|       values = vdupq_n_u##bit(0);                                             \ | ||||
|     }                                                                         \ | ||||
|     Vectorized(neon_type v) : values(v) {}                                    \ | ||||
|     Vectorized(uint##bit##_t val);                                            \ | ||||
|     template <                                                                \ | ||||
|         typename... Args,                                                     \ | ||||
|         typename = std::enable_if_t<(sizeof...(Args) == size())>>             \ | ||||
|     Vectorized(Args... vals) {                                                \ | ||||
|       __at_align__ uint##bit##_t buffer[size()] = {vals...};                  \ | ||||
|       values = vld1q_u##bit(buffer);                                          \ | ||||
|     }                                                                         \ | ||||
|     operator neon_type() const {                                              \ | ||||
|       return values;                                                          \ | ||||
|     }                                                                         \ | ||||
|     static Vectorized<uint##bit##_t> loadu(                                   \ | ||||
|         const void* ptr,                                                      \ | ||||
|         uint64_t count = size());                                             \ | ||||
|     void store(void* ptr, uint64_t count = size()) const;                     \ | ||||
|     template <uint64_t mask>                                                  \ | ||||
|     static Vectorized<uint##bit##_t> blend(                                   \ | ||||
|         const Vectorized<uint##bit##_t>& a,                                   \ | ||||
|         const Vectorized<uint##bit##_t>& b);                                  \ | ||||
|     static Vectorized<uint##bit##_t> blendv(                                  \ | ||||
|         const Vectorized<uint##bit##_t>& a,                                   \ | ||||
|         const Vectorized<uint##bit##_t>& b,                                   \ | ||||
|         const Vectorized<uint##bit##_t>& mask_) {                             \ | ||||
|       return vbslq_u##bit(mask_.values, b, a);                                \ | ||||
|     }                                                                         \ | ||||
|     template <typename step_t>                                                \ | ||||
|     static Vectorized<uint##bit##_t> arange(                                  \ | ||||
|         value_type base = 0,                                                  \ | ||||
|         step_t step = static_cast<step_t>(1));                                \ | ||||
|     static Vectorized<uint##bit##_t> set(                                     \ | ||||
|         const Vectorized<uint##bit##_t>& a,                                   \ | ||||
|         const Vectorized<uint##bit##_t>& b,                                   \ | ||||
|         uint64_t count = size());                                             \ | ||||
|     const uint##bit##_t& operator[](uint idx) const = delete;                 \ | ||||
|     uint##bit##_t& operator[](uint idx) = delete;                             \ | ||||
|     Vectorized<uint##bit##_t> abs() const {                                   \ | ||||
|       return values;                                                          \ | ||||
|     }                                                                         \ | ||||
|     Vectorized<uint##bit##_t> real() const {                                  \ | ||||
|       return values;                                                          \ | ||||
|     }                                                                         \ | ||||
|     Vectorized<uint##bit##_t> imag() const {                                  \ | ||||
|       return vdupq_n_u##bit(0);                                               \ | ||||
|     }                                                                         \ | ||||
|     Vectorized<uint##bit##_t> conj() const {                                  \ | ||||
|       return values;                                                          \ | ||||
|     }                                                                         \ | ||||
|     Vectorized<uint##bit##_t> neg() const {                                   \ | ||||
|       return vreinterpretq_u##bit##_s##bit(                                   \ | ||||
|           vnegq_s##bit(vreinterpretq_s##bit##_u##bit(values)));               \ | ||||
|     }                                                                         \ | ||||
|     uint##bit##_t reduce_add() const {                                        \ | ||||
|       return vaddvq_u##bit(values);                                           \ | ||||
|     }                                                                         \ | ||||
|     uint##bit##_t reduce_max() const;                                         \ | ||||
|     Vectorized<uint##bit##_t> operator==(                                     \ | ||||
|         const Vectorized<uint##bit##_t>& other) const {                       \ | ||||
|       return Vectorized<value_type>(vceqq_u##bit(values, other.values));      \ | ||||
|     }                                                                         \ | ||||
|     Vectorized<uint##bit##_t> operator!=(                                     \ | ||||
|         const Vectorized<uint##bit##_t>& other) const;                        \ | ||||
|     Vectorized<uint##bit##_t> operator<(                                      \ | ||||
|         const Vectorized<uint##bit##_t>& other) const {                       \ | ||||
|       return Vectorized<value_type>(vcltq_u##bit(values, other.values));      \ | ||||
|     }                                                                         \ | ||||
|     Vectorized<uint##bit##_t> operator<=(                                     \ | ||||
|         const Vectorized<uint##bit##_t>& other) const {                       \ | ||||
|       return Vectorized<value_type>(vcleq_u##bit(values, other.values));      \ | ||||
|     }                                                                         \ | ||||
|     Vectorized<uint##bit##_t> operator>(                                      \ | ||||
|         const Vectorized<uint##bit##_t>& other) const {                       \ | ||||
|       return Vectorized<value_type>(vcgtq_u##bit(values, other.values));      \ | ||||
|     }                                                                         \ | ||||
|     Vectorized<uint##bit##_t> operator>=(                                     \ | ||||
|         const Vectorized<uint##bit##_t>& other) const {                       \ | ||||
|       return Vectorized<value_type>(vcgeq_u##bit(values, other.values));      \ | ||||
|     }                                                                         \ | ||||
|     Vectorized<uint##bit##_t> eq(                                             \ | ||||
|         const Vectorized<uint##bit##_t>& other) const;                        \ | ||||
|     Vectorized<uint##bit##_t> ne(                                             \ | ||||
|         const Vectorized<uint##bit##_t>& other) const;                        \ | ||||
|     Vectorized<uint##bit##_t> gt(                                             \ | ||||
|         const Vectorized<uint##bit##_t>& other) const;                        \ | ||||
|     Vectorized<uint##bit##_t> ge(                                             \ | ||||
|         const Vectorized<uint##bit##_t>& other) const;                        \ | ||||
|     Vectorized<uint##bit##_t> lt(                                             \ | ||||
|         const Vectorized<uint##bit##_t>& other) const;                        \ | ||||
|     Vectorized<uint##bit##_t> le(                                             \ | ||||
|         const Vectorized<uint##bit##_t>& other) const;                        \ | ||||
|   };                                                                          \ | ||||
|   template <>                                                                 \ | ||||
|   Vectorized<uint##bit##_t> inline operator+(                                 \ | ||||
|       const Vectorized<uint##bit##_t>& a,                                     \ | ||||
|       const Vectorized<uint##bit##_t>& b) {                                   \ | ||||
|     return vaddq_u##bit(a, b);                                                \ | ||||
|   }                                                                           \ | ||||
|   template <>                                                                 \ | ||||
|   Vectorized<uint##bit##_t> inline operator-(                                 \ | ||||
|       const Vectorized<uint##bit##_t>& a,                                     \ | ||||
|       const Vectorized<uint##bit##_t>& b) {                                   \ | ||||
|     return vsubq_u##bit(a, b);                                                \ | ||||
|   }                                                                           \ | ||||
|   template <>                                                                 \ | ||||
|   Vectorized<uint##bit##_t> inline operator&(                                 \ | ||||
|       const Vectorized<uint##bit##_t>& a,                                     \ | ||||
|       const Vectorized<uint##bit##_t>& b) {                                   \ | ||||
|     return vandq_u##bit(a, b);                                                \ | ||||
|   }                                                                           \ | ||||
|   template <>                                                                 \ | ||||
|   Vectorized<uint##bit##_t> inline operator|(                                 \ | ||||
|       const Vectorized<uint##bit##_t>& a,                                     \ | ||||
|       const Vectorized<uint##bit##_t>& b) {                                   \ | ||||
|     return vorrq_u##bit(a, b);                                                \ | ||||
|   }                                                                           \ | ||||
|   template <>                                                                 \ | ||||
|   Vectorized<uint##bit##_t> inline operator^(                                 \ | ||||
|       const Vectorized<uint##bit##_t>& a,                                     \ | ||||
|       const Vectorized<uint##bit##_t>& b) {                                   \ | ||||
|     return veorq_u##bit(a, b);                                                \ | ||||
|   }                                                                           \ | ||||
|   Vectorized<uint##bit##_t> inline Vectorized<uint##bit##_t>::eq(             \ | ||||
|       const Vectorized<uint##bit##_t>& other) const {                         \ | ||||
|     return (*this == other) & Vectorized<uint##bit##_t>(1);                   \ | ||||
|   }                                                                           \ | ||||
|   Vectorized<uint##bit##_t> inline Vectorized<uint##bit##_t>::ne(             \ | ||||
|       const Vectorized<uint##bit##_t>& other) const {                         \ | ||||
|     return (*this != other) & Vectorized<uint##bit##_t>(1);                   \ | ||||
|   }                                                                           \ | ||||
|   Vectorized<uint##bit##_t> inline Vectorized<uint##bit##_t>::gt(             \ | ||||
|       const Vectorized<uint##bit##_t>& other) const {                         \ | ||||
|     return (*this > other) & Vectorized<uint##bit##_t>(1);                    \ | ||||
|   }                                                                           \ | ||||
|   Vectorized<uint##bit##_t> inline Vectorized<uint##bit##_t>::ge(             \ | ||||
|       const Vectorized<uint##bit##_t>& other) const {                         \ | ||||
|     return (*this >= other) & Vectorized<uint##bit##_t>(1);                   \ | ||||
|   }                                                                           \ | ||||
|   Vectorized<uint##bit##_t> inline Vectorized<uint##bit##_t>::lt(             \ | ||||
|       const Vectorized<uint##bit##_t>& other) const {                         \ | ||||
|     return (*this < other) & Vectorized<uint##bit##_t>(1);                    \ | ||||
|   }                                                                           \ | ||||
|   Vectorized<uint##bit##_t> inline Vectorized<uint##bit##_t>::le(             \ | ||||
|       const Vectorized<uint##bit##_t>& other) const {                         \ | ||||
|     return (*this <= other) & Vectorized<uint##bit##_t>(1);                   \ | ||||
|   } | ||||
|  | ||||
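| // Instantiate the uint8_t specialization: 16 lanes of uint8_t held in a | ||||
| // 128-bit uint8x16_t register. | ||||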
| VEC_UINT_NEON_TEMPLATE(16, 8) | ||||
|  | ||||
| inline uint8_t Vectorized<uint8_t>::reduce_max() const { | ||||
|   return vmaxvq_u8(values); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<uint8_t> inline operator*( | ||||
|     const Vectorized<uint8_t>& a, | ||||
|     const Vectorized<uint8_t>& b) { | ||||
|   return vmulq_u8(a, b); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| inline Vectorized<uint8_t> operator~(const Vectorized<uint8_t>& a) { | ||||
|   return vmvnq_u8(a); | ||||
| } | ||||
|  | ||||
| inline Vectorized<uint8_t> Vectorized<uint8_t>::operator!=( | ||||
|     const Vectorized<uint8_t>& other) const { | ||||
|   return ~(*this == other); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<uint8_t> inline minimum( | ||||
|     const Vectorized<uint8_t>& a, | ||||
|     const Vectorized<uint8_t>& b) { | ||||
|   return vminq_u8(a, b); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<uint8_t> inline maximum( | ||||
|     const Vectorized<uint8_t>& a, | ||||
|     const Vectorized<uint8_t>& b) { | ||||
|   return vmaxq_u8(a, b); | ||||
| } | ||||
|  | ||||
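| // Compile-time blend: bit i of 'mask' selects element i from b when set, and | ||||
| // from a when clear. For example, blend<0b0011>(a, b) yields | ||||
| // {b[0], b[1], a[2], ..., a[15]}. | ||||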
| template <uint64_t mask> | ||||
| Vectorized<uint8_t> Vectorized<uint8_t>::blend( | ||||
|     const Vectorized<uint8_t>& a, | ||||
|     const Vectorized<uint8_t>& b) { | ||||
|   // Build an array of lane selectors: element i is 0xFF (all bits set) if bit | ||||
|   // i of 'mask' is set, 0 otherwise. | ||||
|   uint8x16_t maskArray = { | ||||
|       (mask & 1LL) ? 0xFF : 0, | ||||
|       (mask & 2LL) ? 0xFF : 0, | ||||
|       (mask & 4LL) ? 0xFF : 0, | ||||
|       (mask & 8LL) ? 0xFF : 0, | ||||
|       (mask & 16LL) ? 0xFF : 0, | ||||
|       (mask & 32LL) ? 0xFF : 0, | ||||
|       (mask & 64LL) ? 0xFF : 0, | ||||
|       (mask & 128LL) ? 0xFF : 0, | ||||
|       (mask & 256LL) ? 0xFF : 0, | ||||
|       (mask & 512LL) ? 0xFF : 0, | ||||
|       (mask & 1024LL) ? 0xFF : 0, | ||||
|       (mask & 2048LL) ? 0xFF : 0, | ||||
|       (mask & 4096LL) ? 0xFF : 0, | ||||
|       (mask & 8192LL) ? 0xFF : 0, | ||||
|       (mask & 16384LL) ? 0xFF : 0, | ||||
|       (mask & 32768LL) ? 0xFF : 0}; | ||||
|   // Use BSL to select elements from b where the mask is 1, else from a | ||||
|   return vbslq_u8(maskArray, b.values, a.values); | ||||
| } | ||||
|  | ||||
| #define VEC_UINT_NEON_OPS(vl, bit)                                             \ | ||||
|   inline Vectorized<uint##bit##_t>::Vectorized(uint##bit##_t val) {            \ | ||||
|     values = vdupq_n_u##bit(val);                                              \ | ||||
|   }                                                                            \ | ||||
|   inline Vectorized<uint##bit##_t> Vectorized<uint##bit##_t>::loadu(           \ | ||||
|       const void* ptr, uint64_t count) {                                       \ | ||||
|     if (count == size()) {                                                     \ | ||||
|       return vld1q_u##bit(reinterpret_cast<const uint##bit##_t*>(ptr));        \ | ||||
|     } else {                                                                   \ | ||||
|       __at_align__ uint##bit##_t tmp_values[size()];                           \ | ||||
|       for (const auto i : c10::irange(size())) {                               \ | ||||
|         tmp_values[i] = 0;                                                     \ | ||||
|       }                                                                        \ | ||||
|       std::memcpy(                                                             \ | ||||
|           tmp_values,                                                          \ | ||||
|           reinterpret_cast<const uint##bit##_t*>(ptr),                         \ | ||||
|           count * sizeof(uint##bit##_t));                                      \ | ||||
|       return vld1q_u##bit(reinterpret_cast<const uint##bit##_t*>(tmp_values)); \ | ||||
|     }                                                                          \ | ||||
|   }                                                                            \ | ||||
|   inline void Vectorized<uint##bit##_t>::store(void* ptr, uint64_t count)      \ | ||||
|       const {                                                                  \ | ||||
|     if (count == size()) {                                                     \ | ||||
|       vst1q_u##bit(reinterpret_cast<uint##bit##_t*>(ptr), values);             \ | ||||
|     } else {                                                                   \ | ||||
|       uint##bit##_t tmp_values[size()];                                        \ | ||||
|       vst1q_u##bit(reinterpret_cast<uint##bit##_t*>(tmp_values), values);      \ | ||||
|       std::memcpy(ptr, tmp_values, count * sizeof(uint##bit##_t));             \ | ||||
|     }                                                                          \ | ||||
|   } | ||||
|  | ||||
| VEC_UINT_NEON_OPS(16, 8) | ||||
|  | ||||
| template <typename step_t> | ||||
| inline Vectorized<uint8_t> Vectorized<uint8_t>::arange( | ||||
|     uint8_t base, | ||||
|     step_t step) { | ||||
|   const Vectorized<uint8_t> base_vec(base); | ||||
|   const Vectorized<uint8_t> step_vec(step); | ||||
|   const uint8x16_t step_sizes = { | ||||
|       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; | ||||
|   return vmlaq_u8(base_vec, step_sizes, step_vec); | ||||
| } | ||||
|  | ||||
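| // Element-wise shifts for uint8_t lanes. Shift amounts are clamped to 8 (the | ||||
| // element width in bits) before shifting. | ||||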
| template <> | ||||
| Vectorized<uint8_t> inline operator>>( | ||||
|     const Vectorized<uint8_t>& a, | ||||
|     const Vectorized<uint8_t>& b) { | ||||
|   uint8x16_t x = a; | ||||
|   uint8x16_t bound = vdupq_n_u8(8); | ||||
|   uint8x16_t z = vminq_u8(b, bound); | ||||
|   return x >> z; | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<uint8_t> inline operator<<( | ||||
|     const Vectorized<uint8_t>& a, | ||||
|     const Vectorized<uint8_t>& b) { | ||||
|   uint8x16_t bound = vdupq_n_u8(8); | ||||
|   uint8x16_t z = vminq_u8(b, bound); | ||||
|   return vshlq_u8(a, vreinterpretq_s8_u8(z)); | ||||
| } | ||||
|  | ||||
| inline Vectorized<uint8_t> Vectorized<uint8_t>::set( | ||||
|     const Vectorized<uint8_t>& a, | ||||
|     const Vectorized<uint8_t>& b, | ||||
|     uint64_t count) { | ||||
|   if (count == 0) { | ||||
|     return a; | ||||
|   } else if (count >= 16) { | ||||
|     return b; | ||||
|   } else { | ||||
|     // Build an array of lane selectors: element i is 0xFF (all bits set) if | ||||
|     // i < count, 0 otherwise. | ||||
|     uint8x16_t maskArray = { | ||||
|         static_cast<uint8_t>((count >= 1LL) ? 0xFF : 0), | ||||
|         static_cast<uint8_t>((count >= 2LL) ? 0xFF : 0), | ||||
|         static_cast<uint8_t>((count >= 3LL) ? 0xFF : 0), | ||||
|         static_cast<uint8_t>((count >= 4LL) ? 0xFF : 0), | ||||
|         static_cast<uint8_t>((count >= 5LL) ? 0xFF : 0), | ||||
|         static_cast<uint8_t>((count >= 6LL) ? 0xFF : 0), | ||||
|         static_cast<uint8_t>((count >= 7LL) ? 0xFF : 0), | ||||
|         static_cast<uint8_t>((count >= 8LL) ? 0xFF : 0), | ||||
|         static_cast<uint8_t>((count >= 9LL) ? 0xFF : 0), | ||||
|         static_cast<uint8_t>((count >= 10LL) ? 0xFF : 0), | ||||
|         static_cast<uint8_t>((count >= 11LL) ? 0xFF : 0), | ||||
|         static_cast<uint8_t>((count >= 12LL) ? 0xFF : 0), | ||||
|         static_cast<uint8_t>((count >= 13LL) ? 0xFF : 0), | ||||
|         static_cast<uint8_t>((count >= 14LL) ? 0xFF : 0), | ||||
|         static_cast<uint8_t>((count >= 15LL) ? 0xFF : 0), | ||||
|         0}; | ||||
|  | ||||
|     // Use BSL to select elements from b where the mask is 1, else from a | ||||
|     return vbslq_u8(maskArray, b.values, a.values); | ||||
|   } | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<uint8_t> inline operator/( | ||||
|     const Vectorized<uint8_t>& a, | ||||
|     const Vectorized<uint8_t>& b) { | ||||
|   uint8x16_t x = a; | ||||
|   uint8x16_t y = b; | ||||
|   return x / y; | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<uint8_t> inline clamp( | ||||
|     const Vectorized<uint8_t>& a, | ||||
|     const Vectorized<uint8_t>& min, | ||||
|     const Vectorized<uint8_t>& max) { | ||||
|   return minimum(max, maximum(min, a)); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<uint8_t> inline clamp_max( | ||||
|     const Vectorized<uint8_t>& a, | ||||
|     const Vectorized<uint8_t>& max) { | ||||
|   return minimum(max, a); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<uint8_t> inline clamp_min( | ||||
|     const Vectorized<uint8_t>& a, | ||||
|     const Vectorized<uint8_t>& min) { | ||||
|   return maximum(min, a); | ||||
| } | ||||
|  | ||||
| } // namespace CPU_CAPABILITY | ||||
| } // namespace at::vec | ||||
| @ -1390,7 +1390,7 @@ std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float( | ||||
|  | ||||
| std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float( | ||||
|     at::vec::Vectorized<uint8_t> src) { | ||||
|   auto u8x8 = vget_low_u8(src); | ||||
|   auto u8x8 = vld1_u8(src.operator const uint8_t*()); | ||||
|   auto u16x8 = vmovl_u8(u8x8); | ||||
|   auto u32x4_hi = vmovl_u16(vget_high_u16(u16x8)); | ||||
|   auto u32x4_lo = vmovl_u16(vget_low_u16(u16x8)); | ||||
| @ -1412,7 +1412,7 @@ Vectorized<float> inline convert_int8_half_register_to_float( | ||||
|  | ||||
| Vectorized<float> inline convert_int8_half_register_to_float( | ||||
|     at::vec::Vectorized<uint8_t> src) { | ||||
|   auto u8x8 = vget_low_u8(src); | ||||
|   auto u8x8 = vld1_u8(src.operator const uint8_t*()); | ||||
|   auto u16x8 = vmovl_u8(u8x8); | ||||
|   auto u32x4_lo = vmovl_u16(vget_low_u16(u16x8)); | ||||
|  | ||||
|  | ||||
| @ -1,192 +0,0 @@ | ||||
| #include <ATen/cuda/CUDAGreenContext.h> | ||||
|  | ||||
| namespace at::cuda { | ||||
|   GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) { | ||||
| #if CUDA_HAS_GREEN_CONTEXT | ||||
|     int driver_version; | ||||
|     C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version)); | ||||
|     TORCH_CHECK( | ||||
|         driver_version >= 12080, "cuda driver too old to use green context!"); | ||||
|     CUcontext pctx = nullptr; | ||||
|     C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx)); | ||||
|     if (C10_UNLIKELY(!pctx)) { | ||||
|       TORCH_WARN( | ||||
|           "Attempted to create a green context but" | ||||
|           " there was no primary context! Creating a primary context..."); | ||||
|  | ||||
|       cudaFree(0); | ||||
|     } | ||||
|  | ||||
|     CUdevice device; | ||||
|     device_id_ = device_id; | ||||
|     C10_CUDA_DRIVER_CHECK( | ||||
|         c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id)); | ||||
|  | ||||
|     // Get device resources | ||||
|     CUdevResource device_resource; | ||||
|     C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_( | ||||
|         device, &device_resource, CU_DEV_RESOURCE_TYPE_SM)); | ||||
|  | ||||
|     // Split resources | ||||
|     std::vector<CUdevResource> result(1); | ||||
|     auto result_data = result.data(); | ||||
|     unsigned int nb_groups = 1; | ||||
|     CUdevResource remaining; | ||||
|  | ||||
|     C10_CUDA_DRIVER_CHECK( | ||||
|         c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_( | ||||
|             result_data, | ||||
|             &nb_groups, | ||||
|             &device_resource, | ||||
|             &remaining, | ||||
|             0, // default flags | ||||
|             num_sms)); | ||||
|  | ||||
|     TORCH_CHECK(nb_groups == 1, "Failed to create single resource group"); | ||||
|  | ||||
|     // Generate resource descriptor | ||||
|     CUdevResourceDesc desc; | ||||
|     C10_CUDA_DRIVER_CHECK( | ||||
|         c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_( | ||||
|             &desc, result_data, 1)); | ||||
|  | ||||
|     // Create green context | ||||
|     // CU_GREEN_CTX_DEFAULT_STREAM is required per docs: | ||||
|     // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html | ||||
|     C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_( | ||||
|         &green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM)); | ||||
|  | ||||
|     // Convert to regular context | ||||
|     C10_CUDA_DRIVER_CHECK( | ||||
|         c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_)); | ||||
|     TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!"); | ||||
| #else | ||||
|     TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!"); | ||||
| #endif | ||||
|   } | ||||
|  | ||||
|   std::unique_ptr<GreenContext> GreenContext::create( | ||||
|       uint32_t num_sms, | ||||
|       std::optional<uint32_t> device_id) { | ||||
| #if CUDA_HAS_GREEN_CONTEXT | ||||
|     if (!device_id.has_value()) { | ||||
|       device_id = at::cuda::current_device(); | ||||
|     } | ||||
|     return std::make_unique<GreenContext>(device_id.value(), num_sms); | ||||
| #else | ||||
|     TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!"); | ||||
| #endif | ||||
|   } | ||||
|  | ||||
|   // Implement move operations | ||||
|   GreenContext::GreenContext(GreenContext&& other) noexcept { | ||||
| #if CUDA_HAS_GREEN_CONTEXT | ||||
|     device_id_ = std::exchange(other.device_id_, -1); | ||||
|     green_ctx_ = std::exchange(other.green_ctx_, nullptr); | ||||
|     context_ = std::exchange(other.context_, nullptr); | ||||
|     parent_stream_ = std::exchange(other.parent_stream_, nullptr); | ||||
| #else | ||||
|     TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!"); | ||||
| #endif | ||||
|   } | ||||
|  | ||||
|   GreenContext& GreenContext::operator=(GreenContext&& other) noexcept { | ||||
| #if CUDA_HAS_GREEN_CONTEXT | ||||
|     if (this != &other) { | ||||
|       // Clean up current resources | ||||
|       if (green_ctx_) { | ||||
|         CUcontext current = nullptr; | ||||
|         C10_CUDA_DRIVER_CHECK( | ||||
|             c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(¤t)); | ||||
|         if (current == context_) { | ||||
|           TORCH_CHECK( | ||||
|               false, | ||||
|               "attempting to overwrite current green ctx " | ||||
|               "when it is active!"); | ||||
|         } | ||||
|         C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxDestroy_(green_ctx_)); | ||||
|       } | ||||
|  | ||||
|       // Take ownership of other's resources | ||||
|       device_id_ = std::exchange(other.device_id_, -1); | ||||
|       green_ctx_ = std::exchange(other.green_ctx_, nullptr); | ||||
|       context_ = std::exchange(other.context_, nullptr); | ||||
|       parent_stream_ = std::exchange(other.parent_stream_, nullptr); | ||||
|     } | ||||
|     return *this; | ||||
| #else | ||||
|     TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!"); | ||||
| #endif | ||||
|   } | ||||
|  | ||||
|   GreenContext::~GreenContext() noexcept { | ||||
| #if CUDA_HAS_GREEN_CONTEXT | ||||
|     C10_CUDA_DRIVER_CHECK( | ||||
|         c10::cuda::DriverAPI::get()->cuGreenCtxDestroy_(green_ctx_)); | ||||
| #else | ||||
|     TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!"); | ||||
| #endif | ||||
|   } | ||||
|  | ||||
|   // Get the underlying CUDA context | ||||
|   CUcontext GreenContext::getContext() const { | ||||
| #if CUDA_HAS_GREEN_CONTEXT | ||||
|     return context_; | ||||
| #else | ||||
|     TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!"); | ||||
| #endif | ||||
|   } | ||||
|  | ||||
|   // Get the underlying green context | ||||
| #if CUDA_HAS_GREEN_CONTEXT | ||||
|   CUgreenCtx GreenContext::getGreenContext() const { | ||||
|     return green_ctx_; | ||||
|   } | ||||
| #endif | ||||
|  | ||||
|   // Make this context current | ||||
|   void GreenContext::setContext() { | ||||
| #if CUDA_HAS_GREEN_CONTEXT | ||||
|     auto current_stream = c10::cuda::getCurrentCUDAStream(); | ||||
|     parent_stream_ = current_stream.stream(); | ||||
|  | ||||
|     at::cuda::CUDAEvent ev; | ||||
|     ev.record(current_stream); | ||||
|  | ||||
|     CUcontext current = nullptr; | ||||
|     C10_CUDA_DRIVER_CHECK( | ||||
|         c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(¤t)); | ||||
|     if (!current) { | ||||
|       C10_CUDA_DRIVER_CHECK( | ||||
|           c10::cuda::DriverAPI::get()->cuCtxSetCurrent_(context_)); | ||||
|     } else { | ||||
|       C10_CUDA_DRIVER_CHECK( | ||||
|           c10::cuda::DriverAPI::get()->cuCtxPushCurrent_(context_)); | ||||
|     } | ||||
|     // currently hardcodes the new green context to use the default stream | ||||
|     // TODO(eqy): consider creating a new stream if e.g., it allows interop | ||||
|     // with CUDA Graph captures etc. | ||||
|     auto default_stream = c10::cuda::getDefaultCUDAStream(); | ||||
|     ev.block(default_stream); | ||||
|     c10::cuda::setCurrentCUDAStream(default_stream); | ||||
| #else | ||||
|     TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!"); | ||||
| #endif | ||||
|   } | ||||
|  | ||||
|   void GreenContext::popContext() { | ||||
| #if CUDA_HAS_GREEN_CONTEXT | ||||
|     // see above note about stream being hardcoded to the default stream | ||||
|     at::cuda::CUDAEvent ev; | ||||
|     ev.record(c10::cuda::getCurrentCUDAStream()); | ||||
|     CUcontext popped; | ||||
|     C10_CUDA_DRIVER_CHECK( | ||||
|         c10::cuda::DriverAPI::get()->cuCtxPopCurrent_(&popped)); | ||||
|     TORCH_INTERNAL_ASSERT( | ||||
|         popped == context_, "expected popped context to be the current ctx"); | ||||
|     ev.block(c10::cuda::getStreamFromExternal(parent_stream_, device_id_)); | ||||
| #else | ||||
|     TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!"); | ||||
| #endif | ||||
|   } | ||||
| } // namespace at::cuda | ||||
| @ -1,53 +0,0 @@ | ||||
| #pragma once | ||||
| #include <ATen/cuda/CUDAEvent.h> | ||||
|  | ||||
| #if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) | ||||
| #include <c10/cuda/driver_api.h> | ||||
| #include <cuda.h> | ||||
| #include <memory> | ||||
| #include <stdexcept> | ||||
| #include <vector> | ||||
| #define CUDA_HAS_GREEN_CONTEXT 1 | ||||
| #else | ||||
| #define CUDA_HAS_GREEN_CONTEXT 0 | ||||
| #endif | ||||
|  | ||||
| namespace at::cuda { | ||||
|  | ||||
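| // Illustrative usage sketch (not part of this header): carve out a subset of | ||||
| // the device's SMs and run work inside the resulting green context: | ||||
| // | ||||
| //   auto gc = GreenContext::create(/*num_sms=*/16, /*device_id=*/std::nullopt); | ||||
| //   gc->setContext();  // make the green context current (uses its default stream) | ||||
| //   // ... launch kernels ... | ||||
| //   gc->popContext();  // restore the parent context and stream | ||||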
| class TORCH_CUDA_CPP_API GreenContext { | ||||
|  public: | ||||
|   GreenContext(uint32_t device_id, uint32_t num_sms); | ||||
|  | ||||
|   static std::unique_ptr<GreenContext> create(uint32_t num_sms, std::optional<uint32_t> device_id); | ||||
|  | ||||
|   // Delete copy constructor and assignment | ||||
|   GreenContext(const GreenContext&) = delete; | ||||
|   GreenContext& operator=(const GreenContext&) = delete; | ||||
|  | ||||
|   // Implement move operations | ||||
|   GreenContext(GreenContext&& other) noexcept; | ||||
|   GreenContext& operator=(GreenContext&& other) noexcept; | ||||
|   ~GreenContext() noexcept; | ||||
|  | ||||
|   // Get the underlying CUDA context | ||||
|   CUcontext getContext() const; | ||||
|  | ||||
|   // Get the underlying green context | ||||
| #if CUDA_HAS_GREEN_CONTEXT | ||||
|   CUgreenCtx getGreenContext() const; | ||||
| #endif | ||||
|  | ||||
|   // Make this context current | ||||
|   void setContext(); | ||||
|  | ||||
|   void popContext(); | ||||
|  | ||||
|  private: | ||||
| #if CUDA_HAS_GREEN_CONTEXT | ||||
|   int32_t device_id_ = -1; | ||||
|   CUgreenCtx green_ctx_ = nullptr; | ||||
|   CUcontext context_ = nullptr; | ||||
|   cudaStream_t parent_stream_ = nullptr; | ||||
| #endif | ||||
| }; | ||||
| } // namespace at::cuda | ||||
| @ -70,7 +70,11 @@ | ||||
| #define ATEN_CUB_MAXIMUM() NO_ROCM(at_cuda_detail)ROCM_HIPCUB(::cub)::Max() | ||||
| #endif | ||||
|  | ||||
| #if defined(USE_ROCM) | ||||
| #if (!defined(USE_ROCM) && !CUB_SUPPORTS_NV_BFLOAT16()) || defined(USE_ROCM) | ||||
|  | ||||
| #if !defined(USE_ROCM) | ||||
| namespace at_cuda_detail { | ||||
| #endif | ||||
|  | ||||
| // backport https://github.com/NVIDIA/cub/pull/306 for c10::BFloat16 | ||||
|  | ||||
| @ -92,6 +96,10 @@ template <> | ||||
| struct ROCM_HIPCUB(cub)::NumericTraits<c10::BFloat16>: | ||||
|        ROCM_HIPCUB(cub)::BaseTraits<ROCM_HIPCUB(cub)::FLOATING_POINT, true, false, unsigned short, c10::BFloat16> {}; | ||||
|  | ||||
| #if !defined(USE_ROCM) | ||||
| } // namespace at_cuda_detail | ||||
| #endif | ||||
|  | ||||
| #endif | ||||
|  | ||||
| #if !defined(USE_ROCM) | ||||
| @ -113,7 +121,7 @@ struct cuda_type<c10::Half> { | ||||
|   using type = __half; | ||||
| }; | ||||
|  | ||||
| #if !defined(USE_ROCM) | ||||
| #if !defined(USE_ROCM) && CUB_SUPPORTS_NV_BFLOAT16() | ||||
|  | ||||
| template<> | ||||
| struct cuda_type<c10::BFloat16> { | ||||
| @ -195,6 +203,36 @@ __global__ void transform_vals(InputIteratorT1 a, InputIteratorT2 b, OutputItera | ||||
|   *out = scan_op(static_cast<acc_t>(*a), static_cast<acc_t>(*b)); | ||||
| } | ||||
|  | ||||
| #if !CUB_SUPPORTS_FUTURE_VALUE() | ||||
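| // Emulates cub::FutureValue on older cub versions: element 0 yields the | ||||
| // carried-in value from *first, and element i (for i > 0) forwards to | ||||
| // iter[i - 1]. | ||||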
| template<typename ValueT, typename InputIteratorT> | ||||
| struct chained_iterator { | ||||
|   using iterator_category = std::random_access_iterator_tag; | ||||
|   using difference_type   = std::ptrdiff_t; | ||||
|   using value_type        = ValueT; | ||||
|   using pointer           = ValueT*; | ||||
|   using reference         = ValueT&; | ||||
|  | ||||
|   InputIteratorT iter; | ||||
|   ValueT *first; | ||||
|   difference_type offset = 0; | ||||
|  | ||||
|   __device__ ValueT operator[](difference_type i) { | ||||
|     i += offset; | ||||
|     if (i == 0) { | ||||
|       return *first; | ||||
|     } else { | ||||
|       return ValueT(iter[i - 1]); | ||||
|     } | ||||
|   } | ||||
|   __device__ chained_iterator operator+(difference_type i) { | ||||
|     return chained_iterator{iter, first, i}; | ||||
|   } | ||||
|   __device__ ValueT operator*() { | ||||
|     return (*this)[0]; | ||||
|   } | ||||
| }; | ||||
| #endif | ||||
|  | ||||
| // even though cub is supposed to support tensors with int_max elements, in reality it doesn't, | ||||
| // so split at int_max/2 | ||||
| constexpr int max_cub_size = std::numeric_limits<int>::max() / 2 + 1; // 2**30 | ||||
| @ -239,6 +277,25 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT | ||||
|         first_elem_ptr, | ||||
|         scan_op); | ||||
|     C10_CUDA_KERNEL_LAUNCH_CHECK(); | ||||
| #if !CUB_SUPPORTS_FUTURE_VALUE() | ||||
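|     // Without FutureValue support, wrap the input so that index 0 reads the | ||||
|     // carry value produced by transform_vals above instead of the raw input. | ||||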
|     using ArgIndexInputIterator = NO_ROCM(at_cuda_detail)::cub::ArgIndexInputIterator<InputIteratorT>; | ||||
|     using tuple = typename ArgIndexInputIterator::value_type; | ||||
|     auto input_iter_transform = [=] __device__ (const tuple &x)->input_t  { | ||||
|       if (x.key == 0) { | ||||
|         return *first_elem_ptr; | ||||
|       } else { | ||||
|         return x.value; | ||||
|       } | ||||
|     }; | ||||
|     auto input_ = ATEN_CUB_TRANSFORM_ITERATOR(input_t, decltype(input_iter_transform), ArgIndexInputIterator)( | ||||
|       ArgIndexInputIterator(input + i), input_iter_transform); | ||||
|     CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::InclusiveScan, | ||||
|         input_, | ||||
|         output + i, | ||||
|         scan_op, | ||||
|         size_cub, | ||||
|         at::cuda::getCurrentCUDAStream()); | ||||
| #else | ||||
|     CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::ExclusiveScan, | ||||
|         input + i + 1, | ||||
|         output + i, | ||||
| @ -246,6 +303,7 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT | ||||
|         ::at_cuda_detail::cub::FutureValue<input_t>(first_elem_ptr), | ||||
|         size_cub, | ||||
|         at::cuda::getCurrentCUDAStream()); | ||||
| #endif | ||||
|   } | ||||
| #endif | ||||
| } | ||||
| @ -497,6 +555,16 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT | ||||
|         first_elem_ptr, | ||||
|         scan_op); | ||||
|     C10_CUDA_KERNEL_LAUNCH_CHECK(); | ||||
| #if !CUB_SUPPORTS_FUTURE_VALUE() | ||||
|     auto input_ = impl::chained_iterator<InitValueT, InputIteratorT>{ | ||||
|       input + i, first_elem_ptr}; | ||||
|     CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::InclusiveScan, | ||||
|         input_, | ||||
|         output + i, | ||||
|         scan_op, | ||||
|         size_cub, | ||||
|         at::cuda::getCurrentCUDAStream()); | ||||
| #else | ||||
|     CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::ExclusiveScan, | ||||
|         input + i, | ||||
|         output + i, | ||||
| @ -504,6 +572,7 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT | ||||
|         ::at_cuda_detail::cub::FutureValue<InitValueT>(first_elem_ptr), | ||||
|         size_cub, | ||||
|         at::cuda::getCurrentCUDAStream()); | ||||
| #endif | ||||
|   } | ||||
| #endif | ||||
| } | ||||
|  | ||||
| @ -10,6 +10,14 @@ | ||||
| #define CUB_VERSION 200001 | ||||
| #endif | ||||
|  | ||||
| // cub sort support for __nv_bfloat16 is added to cub 1.13 in: | ||||
| // https://github.com/NVIDIA/cub/pull/306 | ||||
| #if CUB_VERSION >= 101300 | ||||
| #define CUB_SUPPORTS_NV_BFLOAT16() true | ||||
| #else | ||||
| #define CUB_SUPPORTS_NV_BFLOAT16() false | ||||
| #endif | ||||
|  | ||||
| // cub support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in: | ||||
| // https://github.com/NVIDIA/cub/pull/326 | ||||
| // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake | ||||
| @ -20,6 +28,14 @@ | ||||
| #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false | ||||
| #endif | ||||
|  | ||||
| // cub support for cub::FutureValue is added to cub 1.15 in: | ||||
| // https://github.com/NVIDIA/cub/pull/305 | ||||
| #if CUB_VERSION >= 101500 | ||||
| #define CUB_SUPPORTS_FUTURE_VALUE() true | ||||
| #else | ||||
| #define CUB_SUPPORTS_FUTURE_VALUE() false | ||||
| #endif | ||||
|  | ||||
| // There were many bc-breaking changes in major version release of CCCL v3.0.0 | ||||
| // Please see https://nvidia.github.io/cccl/cccl/3.0_migration_guide.html | ||||
| #if CUB_VERSION >= 200800 | ||||
|  | ||||
| @ -1,23 +0,0 @@ | ||||
| #include <ATen/detail/XLAHooksInterface.h> | ||||
|  | ||||
| namespace at { | ||||
| namespace detail { | ||||
|  | ||||
| const XLAHooksInterface& getXLAHooks() { | ||||
|   auto create_impl = [] { | ||||
|     // Create XLA hooks using the registry | ||||
|     auto hooks = XLAHooksRegistry()->Create("torch_xla::detail::XLAHooks", XLAHooksArgs{}); | ||||
|     if (hooks) { | ||||
|       return hooks; | ||||
|     } | ||||
|     // If hooks creation fails, fall back to default implementation | ||||
|     return std::make_unique<XLAHooksInterface>(); | ||||
|   }; | ||||
|   static auto hooks = create_impl(); | ||||
|   return *hooks; | ||||
| } | ||||
| } // namespace detail | ||||
|  | ||||
| C10_DEFINE_REGISTRY(XLAHooksRegistry, XLAHooksInterface, XLAHooksArgs) | ||||
|  | ||||
| } // namespace at | ||||
| @ -1,79 +0,0 @@ | ||||
| #pragma once | ||||
|  | ||||
| #include <c10/core/Device.h> | ||||
| #include <c10/util/Exception.h> | ||||
| #include <c10/util/Registry.h> | ||||
|  | ||||
| #include <ATen/detail/AcceleratorHooksInterface.h> | ||||
|  | ||||
| C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter") | ||||
|  | ||||
| namespace at { | ||||
|  | ||||
| constexpr const char* XLA_HELP = | ||||
|   "This error has occurred because you are trying " | ||||
|   "to use some XLA functionality, but the XLA library has not been " | ||||
|   "loaded by the dynamic linker. You must load xla libraries by `import torch_xla`"; | ||||
|  | ||||
| struct TORCH_API XLAHooksInterface : AcceleratorHooksInterface { | ||||
|   ~XLAHooksInterface() override = default; | ||||
|  | ||||
|   void init() const override { | ||||
|     TORCH_CHECK(false, "Cannot initialize XLA without torch_xla library. ", XLA_HELP); | ||||
|   } | ||||
|  | ||||
|   virtual bool hasXLA() const { | ||||
|     return false; | ||||
|   } | ||||
|  | ||||
|   virtual std::string showConfig() const { | ||||
|     TORCH_CHECK( | ||||
|         false, | ||||
|         "Cannot query detailed XLA version without torch_xla library. ", | ||||
|         XLA_HELP); | ||||
|   } | ||||
|  | ||||
|   const Generator& getDefaultGenerator( | ||||
|       [[maybe_unused]] DeviceIndex device_index = -1) const override { | ||||
|     TORCH_CHECK( | ||||
|         false, "Cannot get default XLA generator without torch_xla library. ", XLA_HELP); | ||||
|   } | ||||
|  | ||||
|   Generator getNewGenerator( | ||||
|       [[maybe_unused]] DeviceIndex device_index = -1) const override { | ||||
|     TORCH_CHECK(false, "Cannot get XLA generator without torch_xla library. ", XLA_HELP); | ||||
|   } | ||||
|  | ||||
|   virtual DeviceIndex getCurrentDevice() const override { | ||||
|     TORCH_CHECK(false, "Cannot get current XLA device without torch_xla library. ", XLA_HELP); | ||||
|   } | ||||
|  | ||||
|   Device getDeviceFromPtr(void* /*data*/) const override { | ||||
|     TORCH_CHECK(false, "Cannot get device of pointer on XLA without torch_xla library. ", XLA_HELP); | ||||
|   } | ||||
|  | ||||
|   Allocator* getPinnedMemoryAllocator() const override { | ||||
|     TORCH_CHECK(false, "Cannot get XLA pinned memory allocator without torch_xla library. ", XLA_HELP); | ||||
|   } | ||||
|  | ||||
|   bool isPinnedPtr(const void* data) const override { | ||||
|     return false; | ||||
|   } | ||||
|  | ||||
|   bool hasPrimaryContext(DeviceIndex device_index) const override { | ||||
|     TORCH_CHECK(false, "Cannot query primary context without torch_xla library. ", XLA_HELP); | ||||
|   } | ||||
|  | ||||
| }; | ||||
|  | ||||
| struct TORCH_API XLAHooksArgs {}; | ||||
|  | ||||
| TORCH_DECLARE_REGISTRY(XLAHooksRegistry, XLAHooksInterface, XLAHooksArgs); | ||||
| #define REGISTER_XLA_HOOKS(clsname) \ | ||||
|   C10_REGISTER_CLASS(XLAHooksRegistry, clsname, clsname) | ||||
|  | ||||
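| // An out-of-tree backend (torch_xla) is expected to register its concrete | ||||
| // implementation via REGISTER_XLA_HOOKS; getXLAHooks() then creates it by name | ||||
| // on first use and falls back to this stub when the library is not loaded. | ||||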
| namespace detail { | ||||
| TORCH_API const XLAHooksInterface& getXLAHooks(); | ||||
| } // namespace detail | ||||
| } // namespace at | ||||
| C10_DIAGNOSTIC_POP() | ||||
| @ -3620,7 +3620,7 @@ Tensor& _int_mm_out_cpu(const Tensor& self, const Tensor& mat2, Tensor& result) | ||||
|     try { | ||||
|       mkldnn_matmul_i8i8i32(self, mat2, result); | ||||
|       dispatched = true; | ||||
|     } catch ([[maybe_unused]] const std::exception& e) { | ||||
|     } catch (const std::exception& e) { | ||||
|       TORCH_WARN(func_name, " failed, switching to BLAS gemm: ", e.what()); | ||||
|     } | ||||
|   } | ||||
|  | ||||
| @ -11,8 +11,6 @@ inline void check_pixel_shuffle_shapes(const Tensor& self, int64_t upscale_facto | ||||
|               "pixel_shuffle expects a positive upscale_factor, but got ", | ||||
|               upscale_factor); | ||||
|   int64_t c = self.size(-3); | ||||
|   TORCH_CHECK_VALUE(upscale_factor <= std::numeric_limits<decltype(upscale_factor)>::max() / upscale_factor, | ||||
|         "upscale factor is too large, (upscale_factor)^2 overflowed: upscale_factor=", upscale_factor); | ||||
|   int64_t upscale_factor_squared = upscale_factor * upscale_factor; | ||||
|   TORCH_CHECK(c % upscale_factor_squared == 0, | ||||
|               "pixel_shuffle expects its input's 'channel' dimension to be divisible by the square of " | ||||
|  | ||||
| @ -141,6 +141,8 @@ void compute_triu_tril(const Tensor& self, int64_t k, const Tensor &result) { | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   checkTrilTriuMemoryOverlap(result, self); | ||||
|  | ||||
|   bool inplace_op = self.is_same(result); | ||||
|  | ||||
|   bool inplace_update = false; | ||||
|  | ||||
| @ -1,3 +1,4 @@ | ||||
| #include <ATen/MemoryOverlap.h> | ||||
| #include <ATen/core/Tensor.h> | ||||
| #include <ATen/native/LinearAlgebraUtils.h> | ||||
|  | ||||
| @ -54,4 +55,13 @@ static inline std::tuple<bool, Tensor> checkTrilTriuBatchContiguous(const Tensor | ||||
|   return std::make_tuple(true, tensor); | ||||
| } | ||||
|  | ||||
| static inline void checkTrilTriuMemoryOverlap(const Tensor& result, const Tensor& self) { | ||||
|   if (result.is_same(self)) { | ||||
|     at::assert_no_internal_overlap(result); | ||||
|   } else { | ||||
|     at::assert_no_internal_overlap(result); | ||||
|     at::assert_no_overlap(result, self); | ||||
|   } | ||||
| } | ||||
|  | ||||
| }  // namespace at::native | ||||
|  | ||||
| @ -259,20 +259,11 @@ inline void winograd_f2k3_input_transform_inplace__rvv( | ||||
|   const vfloat32m1_t wd1 = __riscv_vfadd_vv_f32m1(d1, d2, 4); | ||||
|   const vfloat32m1_t wd2 = __riscv_vfsub_vv_f32m1(d2, d1, 4); | ||||
|   const vfloat32m1_t wd3 = __riscv_vfsub_vv_f32m1(d1, d3, 4); | ||||
|   /* GCC 14.2 (RISC-V RVV) ICE workaround: | ||||
|    * Avoid single-statement read-modify-write on MEM_REF like: | ||||
|    *   *input_tile_val = | ||||
|    *     __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, idx, val); | ||||
|    * This triggers an ICE during GIMPLE lower (gsi_replace / riscv_gimple_fold_builtin) | ||||
|    * with -march=rv64gcv. Use a temporary then write back. | ||||
|    * Do NOT refactor into the single-statement form. Clang is unaffected. | ||||
|    */ | ||||
|   vfloat32m1x4_t tmp_input_tile_val = *input_tile_val; | ||||
|   tmp_input_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_input_tile_val, 0, wd0); | ||||
|   tmp_input_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_input_tile_val, 1, wd1); | ||||
|   tmp_input_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_input_tile_val, 2, wd2); | ||||
|   tmp_input_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_input_tile_val, 3, wd3); | ||||
|   *input_tile_val = tmp_input_tile_val; | ||||
|  | ||||
|   *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 0, wd0); | ||||
|   *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 1, wd1); | ||||
|   *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 2, wd2); | ||||
|   *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 3, wd3); | ||||
| } | ||||
|  | ||||
| inline void winograd_f2k3_output_transform_inplace__rvv( | ||||
| @ -286,15 +277,9 @@ inline void winograd_f2k3_output_transform_inplace__rvv( | ||||
|   const vfloat32m1_t wm0 = __riscv_vfadd_vv_f32m1(m0_plus_m1, m2, 4); | ||||
|   const vfloat32m1_t m1_sub_m2 = __riscv_vfsub_vv_f32m1(m1, m2, 4); | ||||
|   const vfloat32m1_t wm1 = __riscv_vfsub_vv_f32m1(m1_sub_m2, m3, 4); | ||||
|   /* GCC 14.2 (RISC-V RVV) ICE workaround — see note above. | ||||
|    * Keep the temporary + write-back pattern to avoid ICE. | ||||
|    * Do NOT rewrite into: | ||||
|    *   *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, idx, val); | ||||
|    */ | ||||
|   vfloat32m1x4_t tmp_output_tile_val = *input_tile_val; | ||||
|   tmp_output_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_output_tile_val, 0, wm0); | ||||
|   tmp_output_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_output_tile_val, 1, wm1); | ||||
|   *input_tile_val = tmp_output_tile_val; | ||||
|  | ||||
|   *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 0, wm0); | ||||
|   *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 1, wm1); | ||||
| } | ||||
|  | ||||
| inline vfloat32m1_t | ||||
| @ -315,17 +300,11 @@ inline void winograd_f2k3_kernel_transform__rvv( | ||||
|   const vfloat32m1_t const_half = __riscv_vfmv_v_f_f32m1(0.5f, 4); | ||||
|   const vfloat32m1_t g0_plus_g2 = __riscv_vfadd_vv_f32m1(g0, g2, 4); | ||||
|   vfloat32m1_t half_g0_plus_g2 =  __riscv_vfmul_vv_f32m1(const_half, g0_plus_g2, 4); | ||||
|   /* GCC 14.2 (RISC-V RVV) ICE workaround — see note above. | ||||
|    * Keep the temporary + write-back pattern to avoid ICE. | ||||
|    * Do NOT rewrite into: | ||||
|    *   *transform = __riscv_vset_v_f32m1_f32m1x4(*transform, idx, val); | ||||
|    */ | ||||
|   vfloat32m1x4_t tmp_transform = *transform; | ||||
|   tmp_transform = __riscv_vset_v_f32m1_f32m1x4(tmp_transform, 0, g0); | ||||
|   tmp_transform = __riscv_vset_v_f32m1_f32m1x4(tmp_transform, 1, vmuladdq_f32(half_g0_plus_g2, const_half, g1)); | ||||
|   tmp_transform = __riscv_vset_v_f32m1_f32m1x4(tmp_transform, 2, vmulsubq_f32(half_g0_plus_g2, const_half, g1)); | ||||
|   tmp_transform = __riscv_vset_v_f32m1_f32m1x4(tmp_transform, 3, g2); | ||||
|   *transform = tmp_transform; | ||||
|  | ||||
|   *transform = __riscv_vset_v_f32m1_f32m1x4(*transform, 0, g0); | ||||
|   *transform = __riscv_vset_v_f32m1_f32m1x4(*transform, 1, vmuladdq_f32(half_g0_plus_g2, const_half, g1)); | ||||
|   *transform = __riscv_vset_v_f32m1_f32m1x4(*transform, 2, vmulsubq_f32(half_g0_plus_g2, const_half, g1)); | ||||
|   *transform = __riscv_vset_v_f32m1_f32m1x4(*transform, 3, g2); | ||||
| } | ||||
|  | ||||
| inline vfloat32m1x4_t v4f_transpose4x4__rvv(const vfloat32m1x4_t m) { | ||||
|  | ||||
| @ -120,7 +120,7 @@ static void pow_tensor_scalar_kernel( | ||||
|   } else if (dtype == ScalarType::Half) { | ||||
|     [&]() { | ||||
|       using scalar_t = | ||||
|           c10::impl::ScalarTypeToCPPTypeT<ScalarType::Half>; | ||||
|           decltype(c10::impl::ScalarTypeToCPPType<ScalarType::Half>::t); | ||||
|       const auto exp = exp_scalar.to<scalar_t>(); | ||||
|       using Vec = Vectorized<scalar_t>; | ||||
|       cpu_kernel_vec(iter, | ||||
|  | ||||
| @ -272,110 +272,28 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa | ||||
|   } | ||||
| } | ||||
|  | ||||
| /* | ||||
|  * Checks whether DISABLE_ADDMM_CUDA_LT is set. | ||||
|  * Additionally, for ROCM we test whether the architecture supports the Lt. | ||||
|  */ | ||||
| static bool isGloballyDisabledAddmmCudaLt(const at::Device& device) { | ||||
|   // When hipBLASLt is not supported on the architecture, return true | ||||
|   #ifdef USE_ROCM | ||||
|   static const std::vector<std::string> archs = { | ||||
| static bool getDisableAddmmCudaLt() { | ||||
|     static const auto env_value = c10::utils::get_env("DISABLE_ADDMM_CUDA_LT"); | ||||
|     if (env_value == "1") { | ||||
|       return true; | ||||
|     } | ||||
|     return false; | ||||
| } | ||||
|  | ||||
| #ifdef USE_ROCM | ||||
| static bool isSupportedHipLtROCmArch(int index) { | ||||
|     static const std::vector<std::string> archs = { | ||||
|         "gfx90a", "gfx942", | ||||
|     #if ROCM_VERSION >= 60300 | ||||
| #if ROCM_VERSION >= 60300 | ||||
|         "gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908", | ||||
|     #endif | ||||
|     #if ROCM_VERSION >= 70000 | ||||
| #endif | ||||
| #if ROCM_VERSION >= 70000 | ||||
|         "gfx950", "gfx1150", "gfx1151" | ||||
|     #endif | ||||
|   }; | ||||
|   const auto is_hipblas_lt_arch_supported = at::detail::getCUDAHooks().isGPUArch(archs, device.index()); | ||||
|   if (!is_hipblas_lt_arch_supported) { | ||||
|     return true; | ||||
|   } | ||||
|   #endif | ||||
|  | ||||
|   // Check whether it is disabled in the env | ||||
|   static const auto is_addmm_cuda_lt_disabled = c10::utils::get_env("DISABLE_ADDMM_CUDA_LT"); | ||||
|   if (is_addmm_cuda_lt_disabled == "1") { | ||||
|     return true; | ||||
|   } | ||||
|  | ||||
|   return false; | ||||
| } | ||||
|  | ||||
| /* | ||||
|  * Check whether for the given input we want to enable the Lt interface | ||||
|  */ | ||||
| static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha) { | ||||
|   // Implies a 2D bias, which we currently do not send through Lt. | ||||
|   // TODO: this check is done before the col-major input preparation, | ||||
|   // so this condition can be relaxed in cases where a col-major | ||||
|   // copy of result is needed. | ||||
|   if (result.is_same(self)) { | ||||
|     return false; | ||||
|   } | ||||
|  | ||||
|   #if defined(USE_ROCM) && ROCM_VERSION == 60400 | ||||
|   // hipblaslt TT fp32 regression on ROCm 6.4, cannot use | ||||
|   const auto args = cublasCommonArgs(mat1, mat2, result); | ||||
|   if (args.transa == 't' && args.transb == 't') { | ||||
|     return false; | ||||
|   } | ||||
|   #endif | ||||
|  | ||||
|   const auto mat1_sizes = mat1.sizes(); | ||||
|   const auto mat2_sizes = mat2.sizes(); | ||||
|   #if defined(CUDA_VERSION) || defined(USE_ROCM) | ||||
|   const auto scalar_type = mat1.scalar_type(); | ||||
|   return (beta.toComplexDouble() == 1.0 | ||||
|     // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] | ||||
|     // is to use lt interface only when self is bias. | ||||
|     && self.dim() == 1 && self.sizes()[0] == mat2_sizes[1] && self.is_contiguous() | ||||
|     && result.dim() == 2 && result.is_contiguous() | ||||
|     && ( // some dtype restrictions | ||||
|       #ifndef USE_ROCM | ||||
|       scalar_type == at::ScalarType::Double || | ||||
|       #endif | ||||
|       scalar_type == at::ScalarType::Float || | ||||
|       scalar_type == at::ScalarType::Half || | ||||
|       scalar_type == at::ScalarType::BFloat16 | ||||
|     ) | ||||
|     && ( // some shape/stride restrictions | ||||
|       // Strangely, if mat2 has only 1 row or column, we get | ||||
|       // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. | ||||
|       // NOTE: extension to mat1 because mat1/mat2 can be swapped based off | ||||
|       // their row-/col-majorness. | ||||
|       mat1_sizes[0] > 1 && mat1_sizes[1] > 1 && | ||||
|       mat2_sizes[0] > 1 && mat2_sizes[1] > 1 | ||||
|       // The last condition is to skip 16b transA and non-trans-B having | ||||
|       // leading dim >> rows when they are sliced from a large tensor | ||||
|       // see fbcode/caffe2/test/test_linalg.py:test_corner_cases_of_cublasltmatmul | ||||
|       #if !(defined(CUDA_VERSION) && CUDA_VERSION >= 12010 || defined(USE_ROCM)) | ||||
|       // Related to avoiding the leading stride >> leading dim problematic case | ||||
|       // with 16b dtypes described above. For such dtypes we only allow inputs | ||||
|       // which are either row- or col-major (i.e. non-overlapping, compact memory layout). | ||||
|       // In that case the leading stride will be equal to the outer dim len. | ||||
|       // Why do we catch this case here? The following `prepare_matrix_for_cublas` method | ||||
|       // does not modify inputs as long as there is a stride of length 1 | ||||
|       // and the leading stride is at least max(1, other dim length), so we might | ||||
|       // end up with contiguous cols but not rows (i.e. holes between different rows) | ||||
|       // and vice versa. | ||||
|       && mat2_sizes[0] < 65535 * 32 && mat2_sizes[1] < 65535 * 32 && | ||||
|       mat1_sizes[0] < 65535 * 32 && mat1_sizes[1] < 65535 * 32 | ||||
|       && ( | ||||
|         // filter by dtype | ||||
|         (scalar_type != at::ScalarType::Half && scalar_type != at::ScalarType::BFloat16) || | ||||
|         // check mat1/mat2 is row-/col-major | ||||
|         (mat1.is_non_overlapping_and_dense() && mat2.is_non_overlapping_and_dense()) | ||||
|       ) | ||||
|       #endif | ||||
|     ) | ||||
|   ); | ||||
|   #endif | ||||
|  | ||||
|   // no compliance by default | ||||
|   return false; | ||||
| #endif | ||||
|     }; | ||||
|     return at::detail::getCUDAHooks().isGPUArch(archs, index); | ||||
| } | ||||
| #endif | ||||
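For reference, the heuristics above gate the fused-epilogue (cublasLt/hipblasLt) path of `addmm`, which computes

$$
\mathrm{result} \;=\; \beta \cdot \mathrm{self} \;+\; \alpha \cdot (\mathrm{mat1} \times \mathrm{mat2}).
$$

The Lt path is only attempted when $\beta = 1$ and `self` is a contiguous 1-D bias of length `mat2_sizes[1]`, so the bias add (and optionally the activation) can be folded into the GEMM epilogue; anything else falls back to a plain GEMM, with `self` copied into `result` beforehand when $\beta \neq 0$.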
|  | ||||
| template <typename scalar_t> | ||||
| void launchTunableGemmAndBias(cublasCommonArgs &args, const Scalar& alpha, const scalar_t* bias, cuda::blas::GEMMAndBiasActivationEpilogue activation) { | ||||
| @ -417,70 +335,7 @@ void launchTunableGemmAndBias(cublasCommonArgs &args, const Scalar& alpha, const | ||||
|   } | ||||
| } | ||||
|  | ||||
| template <typename scalar_t, typename res_scalar_t = scalar_t> | ||||
| bool launchGemmAndBiasCublasLt( | ||||
|     // args contains result which is modified | ||||
|     cublasCommonArgs& args, | ||||
|     const Tensor& self, | ||||
|     const Scalar& alpha, | ||||
|     Activation activation = Activation::None | ||||
| ) { | ||||
|   const auto* self_ptr = self.const_data_ptr<scalar_t>(); | ||||
|  | ||||
|   const auto tuning_ctx = at::cuda::tunable::getTuningContext(); | ||||
|   if (tuning_ctx->IsTunableOpEnabled()) { | ||||
|     // TODO: maybe also return some success state? | ||||
|     launchTunableGemmAndBias<scalar_t>( | ||||
|       args, alpha, self_ptr, activation_to_gemm_and_blas_arg(activation) | ||||
|     ); | ||||
|     return true; | ||||
|   } | ||||
|  | ||||
|   return at::cuda::blas::gemm_and_bias<scalar_t, res_scalar_t>( | ||||
|     args.transa == 't', | ||||
|     args.transb == 't', | ||||
|     args.m, | ||||
|     args.n, | ||||
|     args.k, | ||||
|     alpha.to<at::opmath_type<scalar_t>>(), | ||||
|     args.mata->const_data_ptr<scalar_t>(), | ||||
|     args.lda, | ||||
|     args.matb->const_data_ptr<scalar_t>(), | ||||
|     args.ldb, | ||||
|     self_ptr, | ||||
|     args.result->data_ptr<res_scalar_t>(), | ||||
|     args.result_ld, | ||||
|     activation_to_gemm_and_blas_arg(activation) | ||||
|   ); | ||||
| } | ||||
|  | ||||
| template <typename scalar_t, typename res_scalar_t = scalar_t> | ||||
| bool launchGemmCublas( | ||||
|     // args contains result which is modified | ||||
|     cublasCommonArgs& args, | ||||
|     const Scalar& alpha, | ||||
|     const Scalar& beta | ||||
| ) { | ||||
|   at::cuda::blas::gemm<scalar_t, res_scalar_t>( | ||||
|     args.transa, | ||||
|     args.transb, | ||||
|     args.m, | ||||
|     args.n, | ||||
|     args.k, | ||||
|     alpha.to<at::opmath_type<scalar_t>>(), | ||||
|     args.mata->const_data_ptr<scalar_t>(), | ||||
|     args.lda, | ||||
|     args.matb->const_data_ptr<scalar_t>(), | ||||
|     args.ldb, | ||||
|     beta.to<at::opmath_type<scalar_t>>(), | ||||
|     args.result->data_ptr<res_scalar_t>(), | ||||
|     args.result_ld | ||||
|   ); | ||||
|   return true; // success! | ||||
| } | ||||
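The two helpers above are combined by `addmm_out_cuda_impl` further down in this diff; a condensed sketch of that control flow is below. The wrapper name `run_addmm_gemm_path` is illustrative only, and the real code recurses into `addmm_out_cuda_impl` on Lt failure so that the non-Lt bias copy also happens.

```cpp
// Sketch only: try the Lt path first (when not disabled), plain GEMM as the fallback.
// In the actual patch the fallback is a recursive call to
// addmm_out_cuda_impl(..., /*disable_addmm_cuda_lt_override=*/true).
template <typename scalar_t>
void run_addmm_gemm_path(cublasCommonArgs& args, const at::Tensor& self,
                         const at::Scalar& alpha, const at::Scalar& beta,
                         Activation activation, bool disable_addmm_cuda_lt) {
  bool lt_success = false;
  if (!disable_addmm_cuda_lt) {
    lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, self, alpha, activation);
  }
  if (disable_addmm_cuda_lt || !lt_success) {
    launchGemmCublas<scalar_t>(args, alpha, beta);
  }
}
```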
|  | ||||
| Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, Activation activation=Activation::None, bool disable_addmm_cuda_lt_override=false) { | ||||
|   // Shape checks { | ||||
|   // Make sure to keep addmm_cuda below in sync with this code; it | ||||
|   // preflights a check to try to avoid actually needing to call | ||||
|   // expand(). | ||||
| @ -490,62 +345,105 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma | ||||
|     "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() | ||||
|   ) | ||||
|  | ||||
|   if (result.is_same(self)) { | ||||
|     TORCH_CHECK(result.dim() == 2, "tensors must be 2-D"); | ||||
|     TORCH_CHECK(self.sizes()[0] == mat1.sizes()[0], "self dim 0 must match mat1 dim 0"); | ||||
|     TORCH_CHECK(self.sizes()[1] == mat2.sizes()[1], "self dim 1 must match mat2 dim 1"); | ||||
|   } | ||||
|   // } Shape checks | ||||
|  | ||||
|   // NOLINTNEXTLINE(*c-array*) | ||||
|   TensorArg targs[]{{result, "out", 0}, {self, "self", 1}, {mat1, "mat1", 2}, {mat2, "mat2", 3}}; | ||||
|   checkAllSameGPU(__func__, targs); | ||||
|  | ||||
|   // Handle whether to use the Lt interface { | ||||
|   static bool persistent_disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()); | ||||
|   IntArrayRef mat1_sizes = mat1.sizes(); | ||||
|   IntArrayRef mat2_sizes = mat2.sizes(); | ||||
|   IntArrayRef self__sizes; | ||||
|   bool useLtInterface = false; | ||||
| #if defined(USE_ROCM) | ||||
|   // When hipBLASLt is not supported on the architecture, | ||||
|   // disable_addmm_cuda_lt will always be set to true | ||||
|   static bool disable_addmm_cuda_lt = | ||||
|     !isSupportedHipLtROCmArch(self.device().index()) || getDisableAddmmCudaLt(); | ||||
| #else | ||||
|   static bool disable_addmm_cuda_lt = getDisableAddmmCudaLt(); | ||||
| #endif | ||||
|   // if the lt path fails, we recurse back into this function and force the lt path off | ||||
|   // we cannot update the variable disable_addmm_cuda_lt above since it is static and the change would be permanent | ||||
|   bool disable_addmm_cuda_lt = persistent_disable_addmm_cuda_lt || disable_addmm_cuda_lt_override; | ||||
|   #ifdef USE_ROCM | ||||
|   // Conditioned on the device index, which is not persistent | ||||
|   disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt; | ||||
|   #endif | ||||
|   // Condition on the input | ||||
|   disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha) || disable_addmm_cuda_lt; | ||||
|   // } | ||||
|  | ||||
|   bool disable_addmm_cuda_lt_final = disable_addmm_cuda_lt || disable_addmm_cuda_lt_override; | ||||
| #if defined(USE_ROCM) && ROCM_VERSION == 60400 | ||||
|   // hipblaslt TT fp32 regression on ROCm 6.4, cannot use | ||||
|   cublasCommonArgs _args(mat1, mat2, result); | ||||
|   if (_args.transa == 't' && _args.transb == 't') { | ||||
|     disable_addmm_cuda_lt_final = true; | ||||
|   } | ||||
| #endif | ||||
|   at::ScalarType scalar_type = mat1.scalar_type(); | ||||
|   bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float; | ||||
|   c10::MaybeOwned<Tensor> self_; | ||||
|   if (&result != &self) { | ||||
| #if defined(CUDA_VERSION) || defined(USE_ROCM) | ||||
|     // Strangely, if mat2 has only 1 row or column, we get | ||||
|     // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. | ||||
|     // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] | ||||
|     // is to use lt interface only when self is bias. | ||||
|     // for cuda 11.4, cublasLtMatmul is activated | ||||
|     // the last two conditions is to skip 16b transA and non-trans-B having | ||||
|     // leading dim >> rows when they are sliced from a large tensor | ||||
|     // see fbcode/caffe2/test/test_linalg.py:test_corner_cases_of_cublasltmatmul | ||||
|     if (!disable_addmm_cuda_lt_final) { | ||||
|       useLtInterface = beta.toComplexDouble() == 1.0 && self.dim() == 1 && | ||||
|           result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] && | ||||
|           self.is_contiguous() && result.is_contiguous() && | ||||
| #ifdef USE_ROCM | ||||
|           (scalar_type == at::ScalarType::Float || | ||||
|            scalar_type == at::ScalarType::Half || | ||||
|            scalar_type == at::ScalarType::BFloat16) && | ||||
| #else | ||||
|           (scalar_type == at::ScalarType::Double || | ||||
|            scalar_type == at::ScalarType::Float || | ||||
|            scalar_type == at::ScalarType::Half || | ||||
|            scalar_type == at::ScalarType::BFloat16) && | ||||
| #endif | ||||
| #if (defined(CUDA_VERSION) && CUDA_VERSION >= 12010 || defined(USE_ROCM)) | ||||
|           mat2_sizes[0] > 1 && mat2_sizes[1] > 1; | ||||
| #else | ||||
|           mat2_sizes[0] > 1 && mat2_sizes[1] > 1 && | ||||
|           mat2_sizes[0] < 65535 * 32 && mat2_sizes[1] < 65535 * 32 && | ||||
|           mat1_sizes[0] < 65535 * 32 && mat1_sizes[1] < 65535 * 32 && | ||||
|           // avoid leading dim >> rows bugs | ||||
|           ((mat1.strides()[0] == 1 && mat1.strides()[1] == mat1_sizes[0]) || | ||||
|            (mat1.strides()[1] == 1 && mat1.strides()[0] == mat1_sizes[1]) || | ||||
|            (scalar_type != at::ScalarType::Half && | ||||
|             scalar_type != at::ScalarType::BFloat16)) && | ||||
|           ((mat2.strides()[0] == 1 && mat2.strides()[1] == mat2_sizes[0]) || | ||||
|            (mat2.strides()[1] == 1 && mat2.strides()[0] == mat2_sizes[1]) || | ||||
|            (scalar_type != at::ScalarType::Half && | ||||
|             scalar_type != at::ScalarType::BFloat16)); | ||||
| #endif | ||||
|     } | ||||
| #endif | ||||
|     if (!useLtInterface) { | ||||
|       self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); | ||||
|     } | ||||
|     self__sizes = self_->sizes(); | ||||
|   } else { | ||||
|     self_ = c10::MaybeOwned<Tensor>::borrowed(self); | ||||
|     self__sizes = self_->sizes(); | ||||
|     TORCH_CHECK(result.dim() == 2, "tensors must be 2-D"); | ||||
|     TORCH_CHECK(self__sizes[0] == mat1_sizes[0], "self_ dim 0 must match mat1 dim 0"); | ||||
|     TORCH_CHECK(self__sizes[1] == mat2_sizes[1], "self_ dim 1 must match mat2 dim 1"); | ||||
|   } | ||||
|  | ||||
|   // Handle result/self shapes | ||||
|   if (!result.is_same(self)) { | ||||
|     at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]}); | ||||
|  | ||||
|     const auto self_maybe_expanded = [&]() -> c10::MaybeOwned<Tensor> { | ||||
|       if (disable_addmm_cuda_lt) { | ||||
|         // When in the non-Lt path we expand self even before | ||||
|         // the beta != 0.0 check to make sure that | ||||
|         // test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_* | ||||
|         // runs green. | ||||
|         return expand_size(self, result.sizes(), "addmm"); | ||||
|       } | ||||
|       // copy next, should broadcast | ||||
|       return c10::MaybeOwned<Tensor>::borrowed(self); | ||||
|     }(); | ||||
|     // We copy bias when in the non-Lt path | ||||
|     if (beta.toComplexDouble() != 0.0 && disable_addmm_cuda_lt) { | ||||
|       // NOTE: self should broadcast over result | ||||
|       at::native::copy_(result, *self_maybe_expanded); | ||||
|   if (&result != &self) { | ||||
|     at::native::resize_output(result, {mat1_sizes[0], mat2_sizes[1]}); | ||||
|     if (beta.toComplexDouble() != 0.0 && !useLtInterface) { | ||||
|       at::native::copy_(result, *self_); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   // Short circuit on empty result | ||||
|   if (result.numel() == 0) { | ||||
|  | ||||
|   IntArrayRef result_sizes = result.sizes(); | ||||
|   if ((result_sizes[0] == 0) || (result_sizes[1] == 0)) { | ||||
|     return result; | ||||
|   } | ||||
|  | ||||
|   // Short circuit if the reduction dim is empty | ||||
|   if (mat1.sizes()[1] == 0) { | ||||
|   cublasCommonArgs args(mat1, mat2, result); | ||||
|  | ||||
|   if (mat1.numel() == 0) { | ||||
|     // By definition, when beta==0, values in self should be ignored. nans and infs | ||||
|     // should not propagate | ||||
|     if (beta.toComplexDouble() == 0.) { | ||||
| @ -557,64 +455,158 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma | ||||
|         result, | ||||
|         self.expand(result.sizes()), | ||||
|         at::native::scalar_tensor( | ||||
|           beta, | ||||
|           self.scalar_type(), | ||||
|           std::nullopt /* layout */, | ||||
|           at::kCPU, | ||||
|           std::nullopt /* pin_memory */ | ||||
|         ) | ||||
|     ); | ||||
|             beta, | ||||
|             self.scalar_type(), | ||||
|             std::nullopt /* layout */, | ||||
|             at::kCPU, | ||||
|             std::nullopt /* pin_memory */)); | ||||
|   } | ||||
|  | ||||
|   cublasCommonArgs args(mat1, mat2, result); | ||||
|   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); | ||||
|  | ||||
|   // The Lt path | ||||
|   if (!disable_addmm_cuda_lt) { | ||||
|     bool lt_success = false; | ||||
|   if (useLtInterface) { | ||||
| #if defined(USE_ROCM) | ||||
|     bool okay = true; | ||||
|     if (is_float_output_with_half_input) { | ||||
|       #ifdef USE_ROCM | ||||
|       TORCH_CHECK(false, "float output with half input is not enabled for ROCm"); | ||||
|       #else | ||||
|       if (at::cuda::tunable::getTuningContext()->IsTunableOpEnabled()) { | ||||
|        TORCH_CHECK(false, "Tunable GEMM is not supported for float output with reduced float input"); | ||||
|       } | ||||
|       AT_DISPATCH_REDUCED_FLOATING_TYPES( | ||||
|         scalar_type, | ||||
|         "addmm_cuda_lt", | ||||
|         [&] { | ||||
|           lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, self, alpha, activation); | ||||
|         } | ||||
|       ); | ||||
|       #endif | ||||
|     } else { | ||||
|       // !is_float_output_with_half_input | ||||
|       AT_DISPATCH_FLOATING_TYPES_AND2( | ||||
|         at::ScalarType::Half, | ||||
|         at::ScalarType::BFloat16, | ||||
|         scalar_type, | ||||
|         "addmm_cuda_lt", | ||||
|         [&] { | ||||
|           lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, self, alpha, activation); | ||||
|         auto tuning_ctx = at::cuda::tunable::getTuningContext(); | ||||
|         if (tuning_ctx->IsTunableOpEnabled()) { | ||||
|           launchTunableGemmAndBias<scalar_t>( | ||||
|               args, | ||||
|               alpha, | ||||
|               (&result != &self) ? self.const_data_ptr<scalar_t>() : nullptr, | ||||
|               activation_to_gemm_and_blas_arg(activation)); | ||||
|         } else { | ||||
|           okay = at::cuda::blas::gemm_and_bias<scalar_t>( | ||||
|             args.transa == 't', | ||||
|             args.transb == 't', | ||||
|             args.m, | ||||
|             args.n, | ||||
|             args.k, | ||||
|             alpha.to<at::opmath_type<scalar_t>>(), | ||||
|             args.mata->const_data_ptr<scalar_t>(), | ||||
|             args.lda, | ||||
|             args.matb->const_data_ptr<scalar_t>(), | ||||
|             args.ldb, | ||||
|             // This condition is needed for mm case on ROCm for hipblasLt path. | ||||
|             // Passing the bias ptr as null to avoid accuracy issues for mm case. | ||||
|             (&result != &self) ? self.const_data_ptr<scalar_t>() : nullptr, | ||||
|             args.result->data_ptr<scalar_t>(), | ||||
|             args.result_ld, | ||||
|             activation_to_gemm_and_blas_arg(activation) | ||||
|           ); | ||||
|         } | ||||
|       ); | ||||
|     } // end is_float_output_with_half_input | ||||
|  | ||||
|     if (!lt_success) { | ||||
|     // lt path failed; recurse but disable lt path | ||||
|       }); | ||||
|     } | ||||
|     if (!okay) { | ||||
|       // lt path failed; recurse but disable lt path | ||||
|       return addmm_out_cuda_impl(result, self, mat1, mat2, beta, alpha, activation, true); | ||||
|     } | ||||
|     // end Lt path | ||||
|   } else { | ||||
|     // No Lt, we use a GEMM instead | ||||
| #else | ||||
|     auto activation_epilogue = activation_to_gemm_and_blas_arg(activation); | ||||
|     bool okay = true; | ||||
|     if (is_float_output_with_half_input) { | ||||
|       AT_DISPATCH_REDUCED_FLOATING_TYPES( | ||||
|         scalar_type, | ||||
|         "addmm_cuda_lt", | ||||
|         [&] { | ||||
|         auto tuning_ctx = at::cuda::tunable::getTuningContext(); | ||||
|         if (tuning_ctx->IsTunableOpEnabled()) { | ||||
|           TORCH_CHECK(false, "Tunable GEMM is not supported for float output with reduced float input"); | ||||
|         } | ||||
|         else { | ||||
|           okay = at::cuda::blas::gemm_and_bias<scalar_t, float>( | ||||
|               args.transa == 't', | ||||
|               args.transb == 't', | ||||
|               args.m, | ||||
|               args.n, | ||||
|               args.k, | ||||
|               alpha.to<at::opmath_type<scalar_t>>(), | ||||
|               args.mata->const_data_ptr<scalar_t>(), | ||||
|               args.lda, | ||||
|               args.matb->const_data_ptr<scalar_t>(), | ||||
|               args.ldb, | ||||
|               self.const_data_ptr<scalar_t>(), | ||||
|               args.result->data_ptr<float>(), | ||||
|               args.result_ld, | ||||
|               activation_epilogue | ||||
|           ); | ||||
|         }}); | ||||
|     } else { | ||||
|       AT_DISPATCH_FLOATING_TYPES_AND2( | ||||
|         at::ScalarType::Half, | ||||
|         at::ScalarType::BFloat16, | ||||
|         scalar_type, | ||||
|         "addmm_cuda_lt", | ||||
|         [&] { | ||||
|         auto tuning_ctx = at::cuda::tunable::getTuningContext(); | ||||
|         if (tuning_ctx->IsTunableOpEnabled()) { | ||||
|           launchTunableGemmAndBias<scalar_t>( | ||||
|               args, | ||||
|               alpha, | ||||
|               self.const_data_ptr<scalar_t>(), | ||||
|               activation_epilogue); | ||||
|         } | ||||
|         else { | ||||
|           okay = at::cuda::blas::gemm_and_bias<scalar_t>( | ||||
|               args.transa == 't', | ||||
|               args.transb == 't', | ||||
|               args.m, | ||||
|               args.n, | ||||
|               args.k, | ||||
|               alpha.to<at::opmath_type<scalar_t>>(), | ||||
|               args.mata->const_data_ptr<scalar_t>(), | ||||
|               args.lda, | ||||
|               args.matb->const_data_ptr<scalar_t>(), | ||||
|               args.ldb, | ||||
|               self.const_data_ptr<scalar_t>(), | ||||
|               args.result->data_ptr<scalar_t>(), | ||||
|               args.result_ld, | ||||
|               activation_epilogue | ||||
|           ); | ||||
|       }}); | ||||
|     } | ||||
|     if (!okay) { | ||||
|       // lt path failed; recurse but disable lt path | ||||
|       return addmm_out_cuda_impl(result, self, mat1, mat2, beta, alpha, activation, true); | ||||
|     } | ||||
| #endif | ||||
|   } else | ||||
|   { | ||||
|     if (is_float_output_with_half_input) { | ||||
|       AT_DISPATCH_REDUCED_FLOATING_TYPES( | ||||
|         scalar_type, | ||||
|         "addmm_cuda", | ||||
|         [&] { | ||||
|           launchGemmCublas<scalar_t, float>(args, alpha, beta); | ||||
|         } | ||||
|       ); | ||||
|           using opmath_t = at::opmath_type<scalar_t>; | ||||
|           opmath_t alpha_val = alpha.to<opmath_t>(); | ||||
|           opmath_t beta_val = beta.to<opmath_t>(); | ||||
|           const scalar_t* mat1_ptr = args.mata->const_data_ptr<scalar_t>(); | ||||
|           const scalar_t* mat2_ptr = args.matb->const_data_ptr<scalar_t>(); | ||||
|  | ||||
|           float* result_ptr = args.result->mutable_data_ptr<float>(); | ||||
|           at::cuda::blas::gemm<scalar_t, float>( | ||||
|               args.transa, | ||||
|               args.transb, | ||||
|               args.m, | ||||
|               args.n, | ||||
|               args.k, | ||||
|               alpha_val, | ||||
|               mat1_ptr, | ||||
|               args.lda, | ||||
|               mat2_ptr, | ||||
|               args.ldb, | ||||
|               beta_val, | ||||
|               result_ptr, | ||||
|               args.result_ld); | ||||
|         }); | ||||
|     } else { | ||||
|       AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( | ||||
|         at::ScalarType::Half, | ||||
| @ -622,12 +614,28 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma | ||||
|         scalar_type, | ||||
|         "addmm_cuda", | ||||
|         [&] { | ||||
|           launchGemmCublas<scalar_t>(args, alpha, beta); | ||||
|         } | ||||
|       ); | ||||
|           using opmath_t = at::opmath_type<scalar_t>; | ||||
|           opmath_t alpha_val = alpha.to<opmath_t>(); | ||||
|           opmath_t beta_val = beta.to<opmath_t>(); | ||||
|           const scalar_t* mat1_ptr = args.mata->const_data_ptr<scalar_t>(); | ||||
|           const scalar_t* mat2_ptr = args.matb->const_data_ptr<scalar_t>(); | ||||
|           scalar_t* result_ptr = args.result->mutable_data_ptr<scalar_t>(); | ||||
|           at::cuda::blas::gemm<scalar_t>( | ||||
|               args.transa, | ||||
|               args.transb, | ||||
|               args.m, | ||||
|               args.n, | ||||
|               args.k, | ||||
|               alpha_val, | ||||
|               mat1_ptr, | ||||
|               args.lda, | ||||
|               mat2_ptr, | ||||
|               args.ldb, | ||||
|               beta_val, | ||||
|               result_ptr, | ||||
|               args.result_ld); | ||||
|         }); | ||||
|     } | ||||
|  | ||||
|     // Apply epilogue | ||||
|     switch (activation) { | ||||
|       case Activation::RELU: | ||||
|         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) | ||||
| @ -639,14 +647,14 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma | ||||
|         break; | ||||
|       default: break; | ||||
|     } | ||||
|   } // end GEMM path | ||||
|   } | ||||
|  | ||||
| // Preprocessor gate here needs to match the inverse of the check | ||||
| // gating activation_to_gemm_and_blas_arg above; here we are manually | ||||
| // performing a post-GELU because we weren't able to use the GELU | ||||
| // epilogue above. | ||||
| #if !defined(CUDA_VERSION) && !defined(USE_ROCM) | ||||
|   if (!disable_addmm_cuda_lt && activation == Activation::GELU) { | ||||
|   if (useLtInterface && activation == Activation::GELU) { | ||||
|     at::gelu_(const_cast<Tensor&>(*args.result), "tanh"); | ||||
|   } | ||||
| #endif | ||||
|  | ||||
| @ -856,13 +856,9 @@ struct type_specialized_kernel_launcher { | ||||
|       out_calc_t output_offset_calculator, | ||||
|       loader_t loader, | ||||
|       storer_t storer) { | ||||
|     constexpr ScalarType sret_t = rt_binary_specializations[arg_index][0]; | ||||
|     constexpr ScalarType sarg0_t = rt_binary_specializations[arg_index][1]; | ||||
|     constexpr ScalarType sarg1_t = rt_binary_specializations[arg_index][2]; | ||||
|     if (ret_t == sret_t && arg0_t == sarg0_t && arg1_t == sarg1_t) { | ||||
|       using cret_t = c10::impl::ScalarTypeToCPPTypeT<sret_t>; | ||||
|       using carg0_t = c10::impl::ScalarTypeToCPPTypeT<sarg0_t>; | ||||
|       using carg1_t = c10::impl::ScalarTypeToCPPTypeT<sarg1_t>; | ||||
|     if (ret_t == rt_binary_specializations[arg_index][0] && | ||||
|         arg0_t == rt_binary_specializations[arg_index][1] && | ||||
|         arg1_t == rt_binary_specializations[arg_index][2]) | ||||
|       launch_vectorized_templated_kernel< | ||||
|           func_t, | ||||
|           array_t, | ||||
| @ -870,9 +866,12 @@ struct type_specialized_kernel_launcher { | ||||
|           out_calc_t, | ||||
|           loader_t, | ||||
|           storer_t, | ||||
|           cret_t, | ||||
|           carg0_t, | ||||
|           carg1_t>( | ||||
|           decltype(c10::impl::ScalarTypeToCPPType< | ||||
|                    rt_binary_specializations[arg_index][0]>::t), | ||||
|           decltype(c10::impl::ScalarTypeToCPPType< | ||||
|                    rt_binary_specializations[arg_index][1]>::t), | ||||
|           decltype(c10::impl::ScalarTypeToCPPType< | ||||
|                    rt_binary_specializations[arg_index][2]>::t)>( | ||||
|           numel, | ||||
|           f, | ||||
|           data, | ||||
| @ -880,7 +879,6 @@ struct type_specialized_kernel_launcher { | ||||
|           output_offset_calculator, | ||||
|           loader, | ||||
|           storer); | ||||
|     } | ||||
|   } | ||||
| }; | ||||
|  | ||||
|  | ||||
| @ -1,17 +1,18 @@ | ||||
| #pragma once | ||||
|  | ||||
| #include <ATen/OpMathType.h> | ||||
| #include <ATen/cuda/detail/OffsetCalculator.cuh> | ||||
| #include <ATen/detail/FunctionTraits.h> | ||||
| #include <ATen/native/TensorIterator.h> | ||||
| #include <ATen/native/TensorIteratorDynamicCasting.h> | ||||
| #include <ATen/cuda/detail/OffsetCalculator.cuh> | ||||
| #include <ATen/OpMathType.h> | ||||
| #include <ATen/native/cuda/thread_constants.h> | ||||
|  | ||||
| #include <thrust/tuple.h> | ||||
|  | ||||
| #include <ATen/native/cuda/MemoryAccess.cuh> | ||||
|  | ||||
| #include <tuple> | ||||
|  | ||||
|  | ||||
|  | ||||
| namespace at::native { | ||||
|  | ||||
| template<int N> | ||||
| @ -61,11 +62,7 @@ __device__ inline void elementwise_kernel_helper(func_t f, policy_t policy) { | ||||
|   #pragma unroll | ||||
|   for (int i = 0; i < elems_per_thread; i++) { | ||||
|     if (policy.check_inbounds(i)) { | ||||
| #if defined(__HIP__) | ||||
|       results[i] = c10::guts::apply(f, args[i]); | ||||
| #else | ||||
|       results[i] = std::apply(f, args[i]); | ||||
| #endif | ||||
|     } | ||||
|   } | ||||
|  | ||||
|  | ||||
| @ -23,7 +23,7 @@ namespace at::native { | ||||
|  | ||||
| // The maximum number of threads in a block | ||||
| #if defined(USE_ROCM) | ||||
| constexpr int MAX_BLOCK_SIZE = 1024; | ||||
| constexpr int MAX_BLOCK_SIZE = 256; | ||||
| #else | ||||
| constexpr int MAX_BLOCK_SIZE = 512; | ||||
| #endif | ||||
| @ -33,7 +33,7 @@ constexpr unsigned MAX_GRID_SIZE = 65535u; | ||||
| // Number of threads in a block given an input size up to MAX_BLOCK_SIZE | ||||
| static int getNumThreads(int nElem) { | ||||
| #if defined(USE_ROCM) | ||||
|   int threadSizes[5] = { 64, 128, 256, 512, MAX_BLOCK_SIZE }; | ||||
|   int threadSizes[5] = { 16, 32, 64, 128, MAX_BLOCK_SIZE }; | ||||
| #else | ||||
|   int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE }; | ||||
| #endif | ||||
| @ -115,23 +115,9 @@ __device__ scalar_t reduce(Op op, PTA tensor, int plane) { | ||||
|   // first the reductions each thread does separately | ||||
|   scalar_t sum = static_cast<scalar_t>(0); | ||||
|   for (int batch = threadIdx.y; batch < tensor.size(0); batch += blockDim.y) { | ||||
| #if defined(USE_ROCM) | ||||
|     constexpr int UNRL = 4; // load deserialize (unroll) factor | ||||
|     scalar_t tmp[UNRL]; | ||||
|     for (int x = threadIdx.x; x < tensor.size(2); x += blockDim.x*UNRL) { | ||||
| #pragma unroll | ||||
|       for (int u = 0; u < UNRL; u++) | ||||
|         tmp[u] = op(batch, plane, std::min((int)tensor.size(2)-1, (int)(x+u*blockDim.x))); | ||||
| #pragma unroll | ||||
|       for (int u = 0; u < UNRL; u++) | ||||
|         if (x+u*blockDim.x < tensor.size(2)) | ||||
|           sum += tmp[u]; | ||||
|     } | ||||
| #else | ||||
|     for (int x = threadIdx.x; x < tensor.size(2); x += blockDim.x) { | ||||
|       sum += op(batch, plane, x); | ||||
|     } | ||||
| #endif | ||||
|   } | ||||
|   __shared__ scalar_t shared[C10_WARP_SIZE]; | ||||
|   SumReduceOp<scalar_t> reduce_op; | ||||
| @ -306,22 +292,6 @@ __global__ void batch_norm_collect_statistics_kernel( | ||||
|   stat_accscalar_t var_n = 0; | ||||
|   int n = 0; | ||||
|   for (int batch = threadIdx.y; batch < input.size(0); batch += blockDim.y) { | ||||
| #if defined(USE_ROCM) | ||||
|     constexpr int UNRL = 4; | ||||
|     stat_accscalar_t v_[UNRL]; | ||||
|     for (int x = threadIdx.x; x < input.size(2); x += blockDim.x*UNRL) { | ||||
|       for (int u = 0; u < UNRL; u++) | ||||
|         v_[u] = input[batch][plane][min(x+u*blockDim.x, input.size(2)-1)]; | ||||
|       for (int u = 0; u < UNRL; u++) { | ||||
|         if (x+u*blockDim.x < input.size(2)) { | ||||
|           stat_accscalar_t d1 = v_[u] - avg; | ||||
|           n++; | ||||
|           avg += d1 / n; | ||||
|           var_n += d1 * (v_[u] - avg); | ||||
|         } | ||||
|       } | ||||
|     } | ||||
| #else | ||||
|     for (int x = threadIdx.x; x < input.size(2); x += blockDim.x) { | ||||
|       stat_accscalar_t v = input[batch][plane][x]; | ||||
|       stat_accscalar_t d1 = v - avg; | ||||
| @ -329,7 +299,6 @@ __global__ void batch_norm_collect_statistics_kernel( | ||||
|       avg += d1 / n; | ||||
|       var_n += d1 * (v - avg); | ||||
|     } | ||||
| #endif | ||||
|   } | ||||
|  | ||||
|   // first warpSum to get one value per thread to | ||||
|  | ||||
| @ -92,16 +92,6 @@ inline thrust::pair<int64_t, int64_t>  get_index_mapping2d( | ||||
|     output_offset + output_y * output_dim_x + output_x); | ||||
| } | ||||
|  | ||||
| __device__ __forceinline__ int64_t reflect_index(int64_t x, int64_t len) { | ||||
|   const int64_t two = (len - 1) * 2; | ||||
|   if (two <= 0) { | ||||
|     return 0; | ||||
|   } | ||||
|   int64_t m = x % two; | ||||
|   if (m < 0) m += two; | ||||
|   return (m < len) ? m : (two - m); | ||||
| } | ||||
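A quick numeric check of the removed helper, assuming `len == 5` (so the reflection period `two == 8`):

```cpp
// reflect_index folds an out-of-range coordinate back into [0, len)
// by reflecting at both edges without repeating the border element:
//   reflect_index(-2, 5) == 2   // -2 mod 8 -> 6; 6 >= 5, so 8 - 6 = 2
//   reflect_index( 0, 5) == 0
//   reflect_index( 4, 5) == 4   // interior points map to themselves
//   reflect_index( 6, 5) == 2   // 6 >= 5, so 8 - 6 = 2
//   reflect_index( 9, 5) == 1   // 9 mod 8 = 1, already in range
```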
|  | ||||
| template<typename scalar_t> | ||||
| __global__ void reflection_pad1d_out_kernel( | ||||
|     const scalar_t * input, scalar_t * output, | ||||
| @ -116,28 +106,6 @@ __global__ void reflection_pad1d_out_kernel( | ||||
|   } | ||||
| } | ||||
|  | ||||
| template <typename scalar_t> | ||||
| __global__ void reflection_pad1d_flat( | ||||
|     const scalar_t* __restrict__ input, | ||||
|     scalar_t* __restrict__ output, | ||||
|     int64_t input_w, int64_t pad_l, int64_t pad_r, | ||||
|     int64_t out_w, int64_t plane_count) { | ||||
|  | ||||
|   const int64_t bx = blockDim.x; | ||||
|   const int64_t tx = threadIdx.x; | ||||
|  | ||||
|   const int64_t total = plane_count * out_w; | ||||
|   const int64_t grid_stride = static_cast<int64_t>(bx) * gridDim.x; | ||||
|   int64_t linear = static_cast<int64_t>(blockIdx.x) * bx + tx; | ||||
|  | ||||
|   for (; linear < total; linear += grid_stride) { | ||||
|     const int64_t plane = linear / out_w; | ||||
|     const int64_t x = linear - plane * out_w; | ||||
|     const int64_t j = reflect_index(x - pad_l, input_w); | ||||
|     output[plane * out_w + x] = input[plane * input_w + j]; | ||||
|   } | ||||
| } | ||||
|  | ||||
| template <typename scalar_t> | ||||
| __global__ void reflection_pad1d_backward_out_kernel( | ||||
|     scalar_t * grad_input, const scalar_t * grad_output, | ||||
| @ -742,44 +710,25 @@ TORCH_IMPL_FUNC(reflection_pad1d_out_cuda) | ||||
|   int64_t input_w = input_.size(dim_w); | ||||
|   int64_t output_w = input_w + pad_l + pad_r; | ||||
|  | ||||
|   dim3 block_size(output_w > 256 ? 256 : output_w); | ||||
|   dim3 grid_size((int)::ceil(output_w / 256.0), nplane, nbatch); | ||||
|  | ||||
|   Tensor input = input_.contiguous(); | ||||
|  | ||||
|   const int block_x = static_cast<int>(std::min<int64_t>(256, std::max<int64_t>(1, output_w))); | ||||
|   const cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); | ||||
|   const int max_x = prop->maxGridSize[0]; | ||||
|   const int max_y = prop->maxGridSize[1]; | ||||
|   const int max_z = prop->maxGridSize[2]; | ||||
|  | ||||
|   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, input.scalar_type(), "reflection_pad1d_out", [&] { | ||||
|     auto stream = at::cuda::getCurrentCUDAStream(); | ||||
|  | ||||
|     const int64_t gx = at::ceil_div(output_w, static_cast<int64_t>(block_x)); | ||||
|  | ||||
|     const bool fits3d = (nplane <= max_y) && (nbatch <= max_z) && (gx <= max_x); | ||||
|  | ||||
|     if (fits3d) { | ||||
|       dim3 block(block_x, 1, 1); | ||||
|       dim3 grid(gx, static_cast<unsigned>(nplane), static_cast<unsigned>(nbatch)); | ||||
|       reflection_pad1d_out_kernel<scalar_t><<<grid, block, 0, stream>>>( | ||||
|           input.const_data_ptr<scalar_t>(), | ||||
|           output.mutable_data_ptr<scalar_t>(), | ||||
|           input_w, pad_l, pad_r); | ||||
|     } else { | ||||
|       dim3 block(block_x, 1, 1); | ||||
|       const int64_t plane_count = nplane * nbatch; | ||||
|       const int64_t total_blocks = at::ceil_div(plane_count * output_w, static_cast<int64_t>(block_x)); | ||||
|       const int grid_x = static_cast<int>(std::min<int64_t>(max_x, std::max<int64_t>(1, total_blocks))); | ||||
|       dim3 grid(grid_x, 1, 1); | ||||
|  | ||||
|       reflection_pad1d_flat<scalar_t><<<grid, block, 0, stream>>>( | ||||
|           input.const_data_ptr<scalar_t>(), | ||||
|           output.mutable_data_ptr<scalar_t>(), | ||||
|           input_w, pad_l, pad_r, output_w, plane_count); | ||||
|     } | ||||
|  | ||||
|     C10_CUDA_KERNEL_LAUNCH_CHECK(); | ||||
|   }); | ||||
|   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( | ||||
|       kHalf, kBFloat16, input.scalar_type(), "reflection_pad1d_out_template", [&] { | ||||
|         reflection_pad1d_out_kernel<<< | ||||
|             grid_size, | ||||
|             block_size, | ||||
|             0, | ||||
|             at::cuda::getCurrentCUDAStream()>>>( | ||||
|             input.const_data_ptr<scalar_t>(), | ||||
|             output.mutable_data_ptr<scalar_t>(), | ||||
|             input_w, | ||||
|             pad_l, | ||||
|             pad_r); | ||||
|         C10_CUDA_KERNEL_LAUNCH_CHECK(); | ||||
|       }); | ||||
| } | ||||
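As a rough sanity check of the launch math (numbers are illustrative, not from the patch): with `output_w = 1000` and a 256-thread block,

$$
gx \;=\; \left\lceil \frac{1000}{256} \right\rceil \;=\; 4,
$$

with one grid y-slot per plane and one z-slot per batch; the `reflection_pad1d_flat` fallback in the removed variant is only needed when `nplane` or `nbatch` exceeds the device limits reported by `maxGridSize` (typically 65535 in y and z).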
|  | ||||
| TORCH_IMPL_FUNC(reflection_pad1d_backward_out_cuda)(const Tensor& grad_output_, | ||||
|  | ||||
| @ -43,12 +43,6 @@ std::tuple<Tensor&, Tensor&> kthvalue_out_impl_cuda( | ||||
|   TORCH_CHECK(k >= 1 && k <= slicesize, | ||||
|               "kthvalue(): selected number k out of range for dimension ", dim); | ||||
|  | ||||
|   TORCH_CHECK( | ||||
|       slicesize <= std::numeric_limits<int32_t>::max(), | ||||
|       "kthvalue(): dimension ", dim, " is too large (", slicesize, | ||||
|       "). The current CUDA implementation supports dimension sizes up to ", | ||||
|       std::numeric_limits<int32_t>::max()); | ||||
|  | ||||
|   at::assert_no_overlap(self, values); | ||||
|  | ||||
|   _reduction_with_indices_allocate_or_resize_output( | ||||
| @ -169,6 +163,10 @@ std::tuple<Tensor&, Tensor&> kthvalue_out_cuda( | ||||
|     bool keepdim, | ||||
|     Tensor& values, | ||||
|     Tensor& indices) { | ||||
|   // See note [Writing Nondeterministic Operations] | ||||
|   // If there are duplicate elements of the kth value, the procedure for choosing which | ||||
|   // of the duplicates to use for the indices output is nondeterministic. | ||||
|   at::globalContext().alertNotDeterministic("kthvalue CUDA"); | ||||
|   auto result = [&]() { | ||||
|     NoNamesGuard guard; | ||||
|     // `kthvalue_out_impl_cuda` expects contiguous in input `self`. | ||||
|  | ||||
| @ -65,34 +65,25 @@ __global__ void gatherKthValue( | ||||
|       &kValue); | ||||
|  | ||||
|   // Find the index of the k-th highest element | ||||
|   __shared__ int32_t minIndexFound; | ||||
|  | ||||
|   if (threadIdx.x == 0) { | ||||
|       minIndexFound = static_cast<int32_t>(inputSliceSize); | ||||
|   } | ||||
|   __syncthreads(); | ||||
|   index_t kValueIndex = 0; | ||||
|   bool foundKValue = false; | ||||
|  | ||||
|   for (index_t i = threadIdx.x; i < inputSliceSize; i += blockDim.x) { | ||||
|       // Early exit based on best-so-far | ||||
|       if (i >= minIndexFound) { | ||||
|           break; | ||||
|       } | ||||
|  | ||||
|       scalar_t v = doLdg(&inputSliceStart[i * inputWithinSliceStride]); | ||||
|       bool isKValue = | ||||
|           ((v == kValue) || (at::_isnan(v) && at::_isnan(kValue))); | ||||
|  | ||||
|       if (isKValue) { | ||||
|           atomicMin(&minIndexFound, static_cast<int32_t>(i)); | ||||
|           break; | ||||
|       } | ||||
|     bool inRange = (i < inputSliceSize); | ||||
|     scalar_t v = inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) | ||||
|                          : static_cast<scalar_t>(0); | ||||
|     bool isKValue = inRange && | ||||
|         ((v == kValue) || (at::_isnan(v) && at::_isnan(kValue))); | ||||
|     if (isKValue) { | ||||
|       kValueIndex = i; | ||||
|       foundKValue = true; | ||||
|       break; | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   __syncthreads(); | ||||
|  | ||||
|   if (threadIdx.x == 0) { | ||||
|       indicesSliceStart[0] = static_cast<index_t>(minIndexFound); | ||||
|       kthValueSliceStart[0] = kValue; | ||||
|   if (foundKValue) { | ||||
|     kthValueSliceStart[0] = kValue; | ||||
|     indicesSliceStart[0] = kValueIndex; | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -5,6 +5,7 @@ | ||||
| #include <ATen/Dispatch.h> | ||||
| #include <ATen/MemoryOverlap.h> | ||||
| #include <ATen/native/Resize.h> | ||||
| #include <ATen/native/TriangularOpsUtils.h> | ||||
|  | ||||
| #ifndef AT_PER_OPERATOR_HEADERS | ||||
| #include <ATen/Functions.h> | ||||
| @ -110,6 +111,8 @@ __global__ void triu_tril_kernel( | ||||
|  | ||||
| template <bool upper> | ||||
| void triu_tril_cuda_template(const Tensor& result, const Tensor& self, int64_t k, const char* name) { | ||||
|   checkTrilTriuMemoryOverlap(result, self); | ||||
|  | ||||
|   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( | ||||
|       at::ScalarType::ComplexHalf, | ||||
|       at::ScalarType::Half, | ||||
|  | ||||
| @ -127,29 +127,6 @@ __global__ void upsample_bilinear2d_nhwc_out_frame( | ||||
|   } | ||||
| } | ||||
|  | ||||
| #ifdef USE_ROCM | ||||
| // Helper function to compute output pixel range that can contribute to input pixel | ||||
| template <typename accscalar_t> | ||||
| __device__ __forceinline__ void compute_output_range( | ||||
|     int input_pos, | ||||
|     accscalar_t scale, | ||||
|     int output_size, | ||||
|     bool align_corners, | ||||
|     int& min_output, | ||||
|     int& max_output) { | ||||
|   accscalar_t lo, hi; | ||||
|   if (align_corners) { | ||||
|       lo = static_cast<accscalar_t>(input_pos - 1) / scale; | ||||
|       hi = static_cast<accscalar_t>(input_pos + 1) / scale; | ||||
|   } else { | ||||
|       lo = (input_pos - static_cast<accscalar_t>(0.5)) / scale - static_cast<accscalar_t>(0.5); | ||||
|       hi = (input_pos + static_cast<accscalar_t>(1.5)) / scale - static_cast<accscalar_t>(0.5); | ||||
|   } | ||||
|   min_output = max(0, static_cast<int>(std::ceil(lo))); | ||||
|   max_output = min(output_size - 1, static_cast<int>(std::floor(hi))); | ||||
| } | ||||
| #endif | ||||
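The removed helper's bounds follow from inverting the forward source-index mapping used by bilinear upsampling; a short derivation for the `align_corners == false` case, with $s$ the `rheight`/`rwidth` scale, $o$ an output coordinate and $i$ an input coordinate:

$$
i_{\mathrm{src}}(o) = (o + 0.5)\,s - 0.5, \qquad
o \text{ can touch } i \iff i - 1 \le i_{\mathrm{src}}(o) < i + 1,
$$
$$
\Rightarrow\quad \frac{i - 0.5}{s} - 0.5 \;\le\; o \;<\; \frac{i + 1.5}{s} - 0.5,
$$

which is exactly the `lo`/`hi` pair the helper clamps to $[0, \text{output\_size}-1]$ via `ceil`/`floor` (the `align_corners == true` branch is the same inversion applied to $i_{\mathrm{src}}(o) = o\,s$).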
|  | ||||
| // Backward (adjoint) operation 1 <- 2 (accumulates) | ||||
| template <typename scalar_t, typename accscalar_t> | ||||
| C10_LAUNCH_BOUNDS_1(1024) | ||||
| @ -164,74 +141,8 @@ __global__ void upsample_bilinear2d_backward_out_frame( | ||||
|     const bool align_corners, | ||||
|     scalar_t* __restrict__ idata, | ||||
|     const scalar_t* __restrict__ odata) { | ||||
|   // In C++, integer multiplication, like in standard arithmetic, is generally commutative. | ||||
|   const size_t i_numel = nc * width1 * height1; | ||||
| #ifdef USE_ROCM | ||||
|   for (size_t index = blockDim.x * blockIdx.x + threadIdx.x; index < i_numel; | ||||
|        index += blockDim.x * gridDim.x) { | ||||
|     // Decode input pixel coordinates | ||||
|     size_t index_temp = index; | ||||
|     const int w1 = index_temp % width1; | ||||
|     index_temp /= width1; | ||||
|     const int h1 = index_temp % height1; | ||||
|     const size_t nc_idx = index_temp / height1; | ||||
|  | ||||
|     accscalar_t grad_sum = 0; | ||||
|  | ||||
|     // Find range of output pixels that could interpolate from this input pixel | ||||
|     int h2_min, h2_max, w2_min, w2_max; | ||||
|     compute_output_range<accscalar_t>(h1, rheight, height2, align_corners, h2_min, h2_max); | ||||
|     compute_output_range<accscalar_t>(w1, rwidth, width2, align_corners, w2_min, w2_max); | ||||
|  | ||||
|     // Iterate over potential output pixels | ||||
|     for (int h2 = h2_min; h2 <= h2_max; h2++) { | ||||
|       for (int w2 = w2_min; w2 <= w2_max; w2++) { | ||||
|         // Compute source coordinates for this output pixel | ||||
|         const accscalar_t h1r = area_pixel_compute_source_index<accscalar_t>( | ||||
|             rheight, h2, align_corners, /*cubic=*/false); | ||||
|         const int h1_base = (int)h1r; | ||||
|         const int h1p = (h1_base < height1 - 1) ? 1 : 0; | ||||
|         const accscalar_t h1lambda = h1r - h1_base; | ||||
|         const accscalar_t h0lambda = static_cast<accscalar_t>(1) - h1lambda; | ||||
|  | ||||
|         const accscalar_t w1r = area_pixel_compute_source_index<accscalar_t>( | ||||
|             rwidth, w2, align_corners, /*cubic=*/false); | ||||
|         const int w1_base = (int)w1r; | ||||
|         const int w1p = (w1_base < width1 - 1) ? 1 : 0; | ||||
|         const accscalar_t w1lambda = w1r - w1_base; | ||||
|         const accscalar_t w0lambda = static_cast<accscalar_t>(1) - w1lambda; | ||||
|  | ||||
|         // Check if our input pixel participates in this interpolation and accumulate all weights | ||||
|         // At boundaries, h1p=0 or w1p=0 causes some sampling positions to collapse | ||||
|         // to the same pixel, so we need to accumulate weights from all matching positions | ||||
|         accscalar_t weight = 0; | ||||
|  | ||||
|         // Check all four interpolation positions and accumulate weights | ||||
|         if (h1 == h1_base && w1 == w1_base) { | ||||
|           weight += h0lambda * w0lambda;  // top-left | ||||
|         } | ||||
|         if (h1 == h1_base && w1 == w1_base + w1p) { | ||||
|           weight += h0lambda * w1lambda;  // top-right (may be same as top-left if w1p=0) | ||||
|         } | ||||
|         if (h1 == h1_base + h1p && w1 == w1_base) { | ||||
|           weight += h1lambda * w0lambda;  // bottom-left (may be same as top-left if h1p=0) | ||||
|         } | ||||
|         if (h1 == h1_base + h1p && w1 == w1_base + w1p) { | ||||
|           weight += h1lambda * w1lambda;  // bottom-right (may collapse to other positions) | ||||
|         } | ||||
|  | ||||
|         if (weight > 0) { | ||||
|           const size_t output_idx = nc_idx * height2 * width2 + h2 * width2 + w2; | ||||
|           grad_sum += weight * static_cast<accscalar_t>(odata[output_idx]); | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|  | ||||
|     // Write accumulated gradient (no atomics needed) | ||||
|     idata[index] = static_cast<scalar_t>(grad_sum); | ||||
|   } | ||||
| #else | ||||
|   const size_t o_numel = nc * width2 * height2; | ||||
|   const size_t i_numel = nc * width1 * height1; | ||||
|   for (size_t index = blockDim.x * blockIdx.x + threadIdx.x; index < o_numel; | ||||
|        index += blockDim.x * gridDim.x) { | ||||
|     size_t index_temp = index; | ||||
| @ -280,7 +191,6 @@ __global__ void upsample_bilinear2d_backward_out_frame( | ||||
|         static_cast<scalar_t>(h1lambda * w1lambda * d2val), | ||||
|         true); | ||||
|   } | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template <typename scalar_t, typename accscalar_t> | ||||
| @ -477,6 +387,7 @@ static void upsample_bilinear2d_backward_out_cuda_template( | ||||
|   // threads are not covering the whole input tensor. | ||||
|   grad_input.zero_(); | ||||
|  | ||||
|   const size_t num_kernels = nbatch * channels * output_height * output_width; | ||||
|   const int num_threads = std::min( | ||||
|       at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); | ||||
|   cudaStream_t stream = at::cuda::getCurrentCUDAStream(); | ||||
| @ -486,12 +397,6 @@ static void upsample_bilinear2d_backward_out_cuda_template( | ||||
|     return; | ||||
|   } | ||||
|  | ||||
| #ifdef USE_ROCM | ||||
|   constexpr bool use_input = true; | ||||
| #else | ||||
|   constexpr bool use_input = false; | ||||
| #endif | ||||
|  | ||||
|   AT_DISPATCH_FLOATING_TYPES_AND2( | ||||
|       at::ScalarType::Half, at::ScalarType::BFloat16, | ||||
|       grad_output_.scalar_type(), "upsample_bilinear2d_backward_out_frame", [&] { | ||||
| @ -509,8 +414,6 @@ static void upsample_bilinear2d_backward_out_cuda_template( | ||||
|       const accscalar_t rwidth = area_pixel_compute_scale<accscalar_t>( | ||||
|           input_width, output_width, align_corners, scales_w); | ||||
|  | ||||
|       const size_t num_kernels = nbatch * channels * output_height * output_width; | ||||
|  | ||||
|       upsample_bilinear2d_backward_nhwc_out_frame<scalar_t, accscalar_t> | ||||
|           <<<ceil_div(num_kernels, static_cast<size_t>(num_threads)), num_threads, 0, stream>>>( | ||||
|               input_height, | ||||
| @ -541,8 +444,6 @@ static void upsample_bilinear2d_backward_out_cuda_template( | ||||
|       const accscalar_t rwidth = area_pixel_compute_scale<accscalar_t>( | ||||
|           input_width, output_width, align_corners, scales_w); | ||||
|  | ||||
|       const size_t num_kernels = nbatch * channels * (use_input ? input_height * input_width : output_height * output_width); | ||||
|  | ||||
|       upsample_bilinear2d_backward_out_frame<scalar_t, accscalar_t> | ||||
|           <<<ceil_div(num_kernels, static_cast<size_t>(num_threads)), | ||||
|              num_threads, | ||||
|  | ||||
| @ -52,7 +52,7 @@ struct FusedAdagradMathFunctor { | ||||
|   using opmath_t = at::opmath_type<scalar_t>; | ||||
|  | ||||
|   C10_DEVICE __forceinline__ void operator()( | ||||
|       int64_t chunk_size, | ||||
|       int chunk_size, | ||||
|       FusedOptimizerTensorListMetadata<3>& tl, | ||||
|       const float* lr_ptr, | ||||
|       const double& lr, | ||||
| @ -133,4 +133,4 @@ struct FusedAdagradMathFunctor { | ||||
|  | ||||
| } // namespace | ||||
|  | ||||
| } // namespace at::native | ||||
| } // namespace at::native | ||||
| @ -1,16 +0,0 @@ | ||||
| #pragma once | ||||
| #include <c10/metal/common.h> | ||||
|  | ||||
| template <unsigned N = c10::metal::max_ndim> | ||||
| struct OrgqrParams { | ||||
|   int32_t num_batch_dims; | ||||
|  | ||||
|   uint32_t m; | ||||
|   uint32_t n; | ||||
|   uint32_t k; | ||||
|  | ||||
|   ::c10::metal::array<uint32_t, N> A_strides; | ||||
|   ::c10::metal::array<uint32_t, N> tau_strides; | ||||
|   ::c10::metal::array<uint32_t, N> H_strides; | ||||
|   ::c10::metal::array<uint32_t, N> H_sizes; | ||||
| }; | ||||
| @ -1,4 +1,3 @@ | ||||
| #include <ATen/native/mps/kernels/LinearAlgebra.h> | ||||
| #include <c10/metal/utils.h> | ||||
| #include <metal_array> | ||||
| #include <metal_simdgroup> | ||||
| @ -641,164 +640,6 @@ kernel void applyPivots( | ||||
|   } | ||||
| } | ||||
|  | ||||
| template <typename T> | ||||
| static T bool_to_float(bool b) { | ||||
|   return static_cast<T>(b); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| half2 bool_to_float(bool b) { | ||||
|   return half2(b ? 1 : 0, 0); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| float2 bool_to_float(bool b) { | ||||
|   return float2(b ? 1 : 0, 0); | ||||
| } | ||||
|  | ||||
| template <typename T> | ||||
| static T calc_H_irc( | ||||
|     device T* A, | ||||
|     uint32_t A_stride_r, | ||||
|     uint32_t A_stride_c, | ||||
|     constant T* tau, | ||||
|     uint32_t tau_stride, | ||||
|     uint32_t r, | ||||
|     uint32_t c, | ||||
|     uint32_t i) { | ||||
|   T I_val = bool_to_float<T>(r == c); | ||||
|   T tau_val = tau[i * tau_stride]; | ||||
|  | ||||
|   T A_ci = c10::metal::conj(A[c * A_stride_r + i * A_stride_c]); | ||||
|   T A_ri = A[r * A_stride_r + i * A_stride_c]; | ||||
|  | ||||
|   T c_eq_i = bool_to_float<T>(c == i); | ||||
|   T r_eq_i = bool_to_float<T>(r == i); | ||||
|  | ||||
|   T A_ci_ = (c > i) ? A_ci : c_eq_i; | ||||
|   T A_ri_ = (r > i) ? A_ri : r_eq_i; | ||||
|  | ||||
|   return I_val - c10::metal::mul(tau_val, c10::metal::mul(A_ci_, A_ri_)); | ||||
| } | ||||
|  | ||||
| // Calculate (A @ B)[r, c], the element in the r-th row and c-th column of the | ||||
| // result of matrix multiplying A and B together. A and B must be size m-by-m | ||||
| // and have the same strides. The formula for this operation, written in Python | ||||
| // syntax, is: | ||||
| //   (A @ B)[r, c] = A[r, :].dot(B[:, c]) | ||||
| template <typename T> | ||||
| static T calc_matmul_rc( | ||||
|     device T* A, | ||||
|     device T* B, | ||||
|     uint32_t stride_r, | ||||
|     uint32_t stride_c, | ||||
|     uint32_t m, | ||||
|     uint32_t r, | ||||
|     uint32_t c) { | ||||
|   T AB_rc = 0; | ||||
|   auto A_row_offset = r * stride_r; | ||||
|   auto B_col_offset = c * stride_c; | ||||
|  | ||||
|   uint32_t A_col_offset = 0; | ||||
|   uint32_t B_row_offset = 0; | ||||
|  | ||||
|   for (uint32_t j = 0; j < m; | ||||
|        j++, A_col_offset += stride_c, B_row_offset += stride_r) { | ||||
|     AB_rc += c10::metal::mul( | ||||
|         A[A_row_offset + A_col_offset], B[B_row_offset + B_col_offset]); | ||||
|   } | ||||
|   return AB_rc; | ||||
| } | ||||
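Spelled out, the element computed by `calc_matmul_rc` is the standard inner-product form of the product:

$$
(AB)_{rc} \;=\; \sum_{j=0}^{m-1} A_{rj}\, B_{jc},
$$

with `stride_r`/`stride_c` used to step along row $r$ of `A` and column $c$ of `B`.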
|  | ||||
| template <typename T> | ||||
| kernel void orgqr( | ||||
|     device T* A [[buffer(0)]], | ||||
|     constant T* tau [[buffer(1)]], | ||||
|     device T* H [[buffer(2)]], | ||||
|     device T* H_prod [[buffer(3)]], | ||||
|     constant OrgqrParams<>& params [[buffer(4)]], | ||||
|     uint tid [[thread_position_in_grid]]) { | ||||
|   constant auto& A_strides = params.A_strides; | ||||
|   constant auto& tau_strides = params.tau_strides; | ||||
|   constant auto& H_strides = params.H_strides; | ||||
|   constant auto& H_sizes = params.H_sizes; | ||||
|  | ||||
|   auto num_batch_dims = params.num_batch_dims; | ||||
|   auto m = params.m; | ||||
|   auto n = params.n; | ||||
|   auto k = params.k; | ||||
|  | ||||
|   auto m2 = m * m; | ||||
|   auto batch_idx = tid / m2; | ||||
|  | ||||
|   // Find the matrices for this thread's batch index | ||||
|   uint32_t A_offset = 0; | ||||
|   uint32_t tau_offset = 0; | ||||
|   uint32_t H_offset = 0; | ||||
|  | ||||
|   for (auto dim = num_batch_dims - 1; dim >= 0; dim--) { | ||||
|     auto dim_size = H_sizes[dim]; | ||||
|     auto dim_idx = batch_idx % dim_size; | ||||
|  | ||||
|     A_offset += dim_idx * A_strides[dim]; | ||||
|     tau_offset += dim_idx * tau_strides[dim]; | ||||
|     H_offset += dim_idx * H_strides[dim]; | ||||
|  | ||||
|     batch_idx /= dim_size; | ||||
|   } | ||||
|  | ||||
|   A += A_offset; | ||||
|   tau += tau_offset; | ||||
|   H += H_offset; | ||||
|   H_prod += H_offset; | ||||
|  | ||||
|   auto matrix_idx = tid % m2; | ||||
|   auto r = matrix_idx / m; | ||||
|   auto c = matrix_idx % m; | ||||
|   auto A_stride_r = A_strides[num_batch_dims]; | ||||
|   auto A_stride_c = A_strides[num_batch_dims + 1]; | ||||
|   auto tau_stride = tau_strides[num_batch_dims]; | ||||
|   auto H_stride_r = H_strides[num_batch_dims]; | ||||
|   auto H_stride_c = H_strides[num_batch_dims + 1]; | ||||
|  | ||||
|   // Find the element of H and H_prod that this thread will calculate | ||||
|   device T* H_elem_ptr = H + (r * H_stride_r + c * H_stride_c); | ||||
|   device T* H_prod_elem_ptr = H_prod + (r * H_stride_r + c * H_stride_c); | ||||
|  | ||||
|   for (uint32_t i = 0; i < k; i++) { | ||||
|     // Calculate and write H_i | ||||
|  | ||||
|     T H_irc = calc_H_irc(A, A_stride_r, A_stride_c, tau, tau_stride, r, c, i); | ||||
|  | ||||
|     // Calculate element [r, c] of prod(H_0, ..., H_i) | ||||
|     if (i == 0) { | ||||
|       *H_prod_elem_ptr = H_irc; | ||||
|     } else { | ||||
|       *H_elem_ptr = H_irc; | ||||
|  | ||||
|       // Need this sync because the below matmul requires all threads to finish | ||||
|       // writing their entries to `H_prod` and `H`. | ||||
|       threadgroup_barrier(mem_flags::mem_threadgroup); | ||||
|  | ||||
|       T H_prod_0_to_i_rc = | ||||
|           calc_matmul_rc(H_prod, H, H_stride_r, H_stride_c, m, r, c); | ||||
|  | ||||
|       // Need this sync because the above matmul uses the current values in | ||||
|       // `H_prod`, and we don't want to overwrite those until all threads are | ||||
|       // finished using them. | ||||
|       threadgroup_barrier(mem_flags::mem_threadgroup); | ||||
|  | ||||
|       *H_prod_elem_ptr = H_prod_0_to_i_rc; | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   device T* A_elem_ptr = A + (r * A_stride_r + c * A_stride_c); | ||||
|  | ||||
|   if (c < n) { | ||||
|     *A_elem_ptr = *H_prod_elem_ptr; | ||||
|   } | ||||
| } | ||||
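For context, a brief restatement of what this kernel accumulates per batch element (standard geqrf/orgqr semantics; column $i$ of `A` holds the Householder vector $v_i$ below the diagonal, with an implicit 1 at position $i$ and zeros above, exactly as `calc_H_irc` reconstructs it):

$$
H_i \;=\; I - \tau_i\, v_i v_i^{H}, \qquad
Q \;=\; H_0 H_1 \cdots H_{k-1},
$$

and the first $n$ columns of the running product are copied back into `A` by the final `if (c < n)` store.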
|  | ||||
| #define INSTANTIATE_MM_OPS(DTYPE)                                           \ | ||||
|   template [[host_name("matmul_" #DTYPE)]] kernel void matmul<DTYPE>(       \ | ||||
|       constant DTYPE * mat1Data [[buffer(0)]],                              \ | ||||
| @ -838,19 +679,3 @@ INSTANTIATE_MM_OPS(int); | ||||
| INSTANTIATE_MM_OPS(short); | ||||
| INSTANTIATE_MM_OPS(char); | ||||
| INSTANTIATE_MM_OPS(uchar); | ||||
|  | ||||
| #define REGISTER_ORGQR(T)                            \ | ||||
|   template [[host_name("orgqr_" #T)]]                \ | ||||
|   kernel void orgqr<T>(                              \ | ||||
|       device T * A [[buffer(0)]],                    \ | ||||
|       constant T * tau [[buffer(1)]],                \ | ||||
|       device T * H [[buffer(2)]],                    \ | ||||
|       device T * H_prod [[buffer(3)]],               \ | ||||
|       constant OrgqrParams<> & params [[buffer(4)]], \ | ||||
|       uint tid [[thread_position_in_grid]]); | ||||
|  | ||||
| REGISTER_ORGQR(float); | ||||
| REGISTER_ORGQR(half); | ||||
| REGISTER_ORGQR(bfloat); | ||||
| REGISTER_ORGQR(float2); | ||||
| REGISTER_ORGQR(half2); | ||||
|  | ||||
| @ -5,21 +5,6 @@ | ||||
| using namespace metal; | ||||
| using namespace c10::metal; | ||||
|  | ||||
| struct angle_functor { | ||||
|   template <typename T, enable_if_t<is_complex_v<T>, bool> = true> | ||||
|   inline T operator()(const T x) { | ||||
|     return T(atan2(x.y, x.x), 0); | ||||
|   } | ||||
|   template <typename T, enable_if_t<is_scalar_floating_point_v<T>, bool> = true> | ||||
|   inline T operator()(const T x) { | ||||
|     return T(isnan(x) ? x : x < 0 ? M_PI_F : 0.0); | ||||
|   } | ||||
|   template <typename T, enable_if_t<is_scalar_integral_v<T>, bool> = true> | ||||
|   inline float operator()(const T x) { | ||||
|     return x < 0 ? M_PI_F : 0.0; | ||||
|   } | ||||
| }; | ||||
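The removed functor is the usual complex argument, restricted to the real axis for non-complex inputs; as a brief restatement:

$$
\operatorname{angle}(z) = \operatorname{atan2}(\operatorname{Im} z,\, \operatorname{Re} z), \qquad
\operatorname{angle}(x) = \begin{cases} \pi & x < 0 \\ 0 & x \ge 0 \end{cases}
\quad (x \text{ real; NaN inputs are passed through unchanged}).
$$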
|  | ||||
| // Implement exp wrapper for both real and complex types | ||||
| template <typename T, enable_if_t<is_scalar_floating_point_v<T>, bool> = true> | ||||
| inline T exp_(const T x) { | ||||
| @ -560,7 +545,6 @@ REGISTER_UNARY_OP(abs, float, float); | ||||
| REGISTER_UNARY_OP(abs, half, half); | ||||
|  | ||||
| #define INSTANTIATE_UNARY_KERNELS2(DTYPE0, DTYPE1) \ | ||||
|   REGISTER_UNARY_OP(angle, DTYPE1, DTYPE0);        \ | ||||
|   REGISTER_UNARY_OP(erf, DTYPE1, DTYPE0);          \ | ||||
|   REGISTER_UNARY_OP(erfc, DTYPE1, DTYPE0);         \ | ||||
|   REGISTER_UNARY_OP(erfinv, DTYPE1, DTYPE0);       \ | ||||
| @ -599,7 +583,6 @@ INSTANTIATE_UNARY_KERNELS2(float, int); | ||||
| INSTANTIATE_UNARY_KERNELS2(float, long); | ||||
|  | ||||
| #define INSTANTIATE_UNARY_KERNELS_VEC2(DTYPE)     \ | ||||
|   REGISTER_UNARY_OP(angle, DTYPE##2, DTYPE##2);   \ | ||||
|   REGISTER_UNARY_OP(neg, DTYPE##2, DTYPE##2);     \ | ||||
|   REGISTER_UNARY_OP(exp, DTYPE##2, DTYPE##2);     \ | ||||
|   REGISTER_UNARY_OP(expm1, DTYPE##2, DTYPE##2);   \ | ||||
|  | ||||
| @ -92,8 +92,13 @@ static std::tuple<Tensor, Tensor> sdpa_general_mps(const Tensor& query, | ||||
|           } | ||||
|  | ||||
|           // upcasting to float32 if needed to improve precision when multiplying by the scale factor | ||||
|           maskedMM = castMPSTensor(mpsGraph, maskedMM, MPSDataTypeFloat32); | ||||
|           if ([maskedMM dataType] != MPSDataTypeFloat32) { | ||||
|             maskedMM = [mpsGraph castTensor:maskedMM toType:MPSDataTypeFloat32 name:nil]; | ||||
|           } | ||||
|           maskedMM = [mpsGraph multiplicationWithPrimaryTensor:maskedMM secondaryTensor:scaleTensor name:nil]; | ||||
|           if ([maskedMM dataType] != qTensor.dataType) { | ||||
|             maskedMM = [mpsGraph castTensor:maskedMM toType:qTensor.dataType name:nil]; | ||||
|           } | ||||
|  | ||||
|           if (is_causal) { | ||||
|             auto causalMask = [mpsGraph constantWithScalar:1.0f | ||||
| @ -107,9 +112,7 @@ static std::tuple<Tensor, Tensor> sdpa_general_mps(const Tensor& query, | ||||
|                                                       name:nil]; | ||||
|           } else if (attn_mask) { | ||||
|             graph->maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, *attn_mask); | ||||
|             maskedMM = [mpsGraph additionWithPrimaryTensor:maskedMM | ||||
|                                            secondaryTensor:castMPSTensor(mpsGraph, graph->maskTensor, maskedMM.dataType) | ||||
|                                                       name:nil]; | ||||
|             maskedMM = [mpsGraph additionWithPrimaryTensor:maskedMM secondaryTensor:graph->maskTensor name:nil]; | ||||
|           } | ||||
|  | ||||
|           // Account for case where all values were masked causing division by 0 in softmax (issue:#156707) | ||||
| @ -130,8 +133,8 @@ static std::tuple<Tensor, Tensor> sdpa_general_mps(const Tensor& query, | ||||
|           graph->qTensor = qTensor; | ||||
|           graph->kTensor = kTensor; | ||||
|           graph->vTensor = vTensor; | ||||
|           graph->outputTensor = castMPSTensor(mpsGraph, output, qTensor.dataType); | ||||
|           graph->attnTensor = castMPSTensor(mpsGraph, sm, qTensor.dataType); | ||||
|           graph->outputTensor = output; | ||||
|           graph->attnTensor = sm; | ||||
|         }); | ||||
|     auto qPlaceholder = Placeholder(cachedGraph->qTensor, query); | ||||
|     auto kPlaceholder = Placeholder(cachedGraph->kTensor, key); | ||||
|  | ||||
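A rough Python restatement of the casting pattern in the hunk above (upcast the raw scores to float32 for the scale multiply, then cast back to the query dtype); this is illustrative only, not the MPSGraph code:

```python
import torch

def scaled_scores(q, k, scale):
    scores = q @ k.transpose(-2, -1)
    if scores.dtype != torch.float32:
        scores = scores.to(torch.float32)   # upcast to improve precision of the scale multiply
    scores = scores * scale
    return scores.to(q.dtype)               # cast back to the query dtype for the rest of SDPA
```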
| @ -8,9 +8,6 @@ | ||||
| #include <ATen/native/Resize.h> | ||||
| #include <ATen/native/mps/MPSGraphSequoiaOps.h> | ||||
| #include <ATen/native/mps/OperationUtils.h> | ||||
| #include <ATen/native/mps/kernels/LinearAlgebra.h> | ||||
|  | ||||
| #include <fmt/format.h> | ||||
|  | ||||
| #ifndef AT_PER_OPERATOR_HEADERS | ||||
| #include <ATen/Functions.h> | ||||
| @ -31,7 +28,6 @@ | ||||
| #include <ATen/ops/linalg_solve_triangular_native.h> | ||||
| #include <ATen/ops/lu_unpack_native.h> | ||||
| #include <ATen/ops/mm_native.h> | ||||
| #include <ATen/ops/orgqr_native.h> | ||||
| #include <ATen/ops/slice.h> | ||||
| #include <ATen/ops/stack.h> | ||||
| #include <ATen/ops/triangular_solve_native.h> | ||||
| @ -342,8 +338,6 @@ static void linalg_lu_factor_ex_out_mps_impl(const Tensor& A, | ||||
|           ". See https://developer.apple.com/documentation/metalperformanceshaders/mpsmatrixdecompositionstatus for details."); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   map_mps_decomposition_error_code_to_blas(info); | ||||
| } | ||||
|  | ||||
| static void linalg_solve_out_mps_impl(const Tensor& A, | ||||
| @ -1239,69 +1233,6 @@ static void cholesky_stub_impl(const Tensor& out, const Tensor& info, bool upper | ||||
|   } | ||||
| } | ||||
|  | ||||
| static Tensor& orgqr_stub_impl(Tensor& self, const Tensor& tau) { | ||||
|   if (self.numel() == 0) { | ||||
|     return self; | ||||
|   } | ||||
|  | ||||
|   auto m = self.size(-2); | ||||
|   auto n = self.size(-1); | ||||
|   auto k = tau.size(-1); | ||||
|  | ||||
|   if (tau.numel() == 0) { | ||||
|     auto I = eye(m, self.scalar_type(), std::nullopt, self.device()); | ||||
|     return self.copy_(I.slice(-1, 0, n)); | ||||
|   } | ||||
|  | ||||
|   auto num_batch_dims = self.dim() - 2; | ||||
|   auto batch_sizes = self.sizes().slice(0, num_batch_dims); | ||||
|  | ||||
|   std::vector<int64_t> H_sizes(num_batch_dims + 2); | ||||
|   for (auto dim : c10::irange(num_batch_dims)) { | ||||
|     H_sizes[dim] = self.size(dim); | ||||
|   } | ||||
|   H_sizes[num_batch_dims] = m; | ||||
|   H_sizes[num_batch_dims + 1] = m; | ||||
|  | ||||
|   auto H = at::empty(H_sizes, self.options().memory_format(MemoryFormat::Contiguous)); | ||||
|   auto H_prod = at::empty_like(H); | ||||
|  | ||||
|   OrgqrParams params; | ||||
|  | ||||
|   params.num_batch_dims = num_batch_dims; | ||||
|   params.m = m; | ||||
|   params.n = n; | ||||
|   params.k = k; | ||||
|  | ||||
|   for (const auto dim : c10::irange(self.dim())) { | ||||
|     params.A_strides[dim] = self.stride(dim); | ||||
|  | ||||
|     if (dim < tau.dim()) { | ||||
|       params.tau_strides[dim] = tau.stride(dim); | ||||
|     } | ||||
|  | ||||
|     params.H_strides[dim] = H.stride(dim); | ||||
|     params.H_sizes[dim] = H.size(dim); | ||||
|   } | ||||
|  | ||||
|   auto num_threads = H.numel(); | ||||
|   MPSStream* stream = getCurrentMPSStream(); | ||||
|  | ||||
|   dispatch_sync_with_rethrow(stream->queue(), ^() { | ||||
|     @autoreleasepool { | ||||
|       id<MTLComputeCommandEncoder> compute_encoder = stream->commandEncoder(); | ||||
|       auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("orgqr_{}", scalarToMetalTypeString(self))); | ||||
|       getMPSProfiler().beginProfileKernel(pipeline_state, "orgqr", {self, tau}); | ||||
|       [compute_encoder setComputePipelineState:pipeline_state]; | ||||
|       mtl_setArgs(compute_encoder, self, tau, H, H_prod, params); | ||||
|       mtl_dispatch1DJob(compute_encoder, pipeline_state, num_threads); | ||||
|       getMPSProfiler().endProfileKernel(pipeline_state); | ||||
|     } | ||||
|   }); | ||||
|  | ||||
|   return self; | ||||
| } | ||||
|  | ||||
| } // namespace mps | ||||
|  | ||||
| Tensor addr_mps(const Tensor& self, const Tensor& vec1, const Tensor& vec2, const Scalar& beta, const Scalar& alpha) { | ||||
| @ -1517,6 +1448,20 @@ TORCH_IMPL_FUNC(_linalg_solve_ex_out_mps) | ||||
|   mps::linalg_solve_out_mps_impl(A, B, left, check_errors, result, LU, pivots, info); | ||||
| } | ||||
|  | ||||
| std::tuple<Tensor&, Tensor&> linalg_lu_factor_out_mps(const Tensor& A, bool pivot, Tensor& LU, Tensor& pivots) { | ||||
|   Tensor info = at::empty({}, A.options().dtype(kInt)); | ||||
|   mps::linalg_lu_factor_ex_out_mps_impl(A, pivot, LU, pivots, info, false); | ||||
|   return std::tie(LU, pivots); | ||||
| } | ||||
|  | ||||
| std::tuple<Tensor, Tensor> linalg_lu_factor_mps(const Tensor& A, bool pivot) { | ||||
|   Tensor LU = at::empty({0}, A.options()); | ||||
|   Tensor pivots = at::empty({0}, A.options().dtype(kInt)); | ||||
|   Tensor info = at::empty({}, A.options().dtype(kInt)); | ||||
|   mps::linalg_lu_factor_ex_out_mps_impl(A, pivot, LU, pivots, info, false); | ||||
|   return std::make_tuple(std::move(LU), std::move(pivots)); | ||||
| } | ||||
|  | ||||
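A small usage sketch of the factorization these wrappers expose (any device; on MPS they route to the lu_factor_ex implementation above), assuming nothing beyond the public torch.linalg API:

```python
import torch

A = torch.randn(4, 4)
LU, pivots = torch.linalg.lu_factor(A)   # same result as lu_factor_ex, without the info tensor
P, L, U = torch.lu_unpack(LU, pivots)
print(torch.allclose(P @ L @ U, A, atol=1e-5))
```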
| TORCH_IMPL_FUNC(lu_unpack_out_mps) | ||||
| (const Tensor& LU_data, | ||||
|  const Tensor& LU_pivots, | ||||
| @ -1538,6 +1483,4 @@ TORCH_IMPL_FUNC(linalg_inv_ex_out_mps)(const Tensor& A, bool check_errors, const | ||||
| } | ||||
|  | ||||
| REGISTER_DISPATCH(cholesky_stub, mps::cholesky_stub_impl) | ||||
| REGISTER_DISPATCH(orgqr_stub, mps::orgqr_stub_impl); | ||||
|  | ||||
| } // namespace at::native | ||||
|  | ||||
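For context, the removed orgqr path materializes each Householder reflector and accumulates their product, which is what torch.linalg.householder_product computes. A rough, unbatched reference sketch under that reading (illustrative only, not the kernel's actual implementation):

```python
import torch

def householder_product_reference(A, tau):
    """Accumulate Q = H_1 @ H_2 @ ... @ H_k with H_i = I - tau_i * v_i v_i^T."""
    m, n = A.shape
    Q = torch.eye(m, dtype=A.dtype, device=A.device)
    for i in range(tau.numel()):
        v = A[:, i].clone()
        v[:i] = 0          # reflector is zero above the diagonal
        v[i] = 1           # implicit unit diagonal entry
        H = torch.eye(m, dtype=A.dtype, device=A.device) - tau[i] * torch.outer(v, v.conj())
        Q = Q @ H
    return Q[:, :n]

A = torch.randn(5, 3, dtype=torch.float64)
reflectors, tau = torch.geqrf(A)
Q_ref = householder_product_reference(reflectors, tau)
Q = torch.linalg.householder_product(reflectors, tau)
print(torch.allclose(Q, Q_ref))
```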
| @ -34,7 +34,6 @@ REGISTER_UNARY_TI_DISPATCH(sinc); | ||||
| REGISTER_UNARY_TI_DISPATCH(sinh); | ||||
| REGISTER_UNARY_TI_DISPATCH(cosh); | ||||
| REGISTER_UNARY_TI_DISPATCH(tanh); | ||||
| REGISTER_UNARY_TI_DISPATCH(angle); | ||||
| REGISTER_UNARY_TI_DISPATCH(abs); | ||||
| REGISTER_UNARY_TI_DISPATCH(sin); | ||||
| REGISTER_UNARY_TI_DISPATCH(cos); | ||||
|  | ||||
| @ -12,6 +12,7 @@ | ||||
| #include <ATen/ops/_copy_from_and_resize.h> | ||||
| #include <ATen/ops/acos_native.h> | ||||
| #include <ATen/ops/acosh_native.h> | ||||
| #include <ATen/ops/angle_native.h> | ||||
| #include <ATen/ops/asin_native.h> | ||||
| #include <ATen/ops/asinh_native.h> | ||||
| #include <ATen/ops/atan_native.h> | ||||
| @ -203,6 +204,23 @@ Tensor& logical_not_out_mps(const Tensor& self, Tensor& output) { | ||||
|   return output; | ||||
| } | ||||
|  | ||||
| Tensor& angle_out_mps(const Tensor& self, Tensor& output) { | ||||
|   mps::unary_op(self, output, "angle_out_mps", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { | ||||
|     auto realPart = [mpsGraph realPartOfTensor:inputTensor name:nil]; | ||||
|     auto imagPart = [mpsGraph imaginaryPartOfTensor:inputTensor name:nil]; | ||||
|     return [mpsGraph atan2WithPrimaryTensor:imagPart secondaryTensor:realPart name:nil]; | ||||
|   }); | ||||
|   return output; | ||||
| } | ||||
|  | ||||
| Tensor angle_mps(const Tensor& self) { | ||||
|   const auto float_type = c10::isIntegralType(self.scalar_type(), /*includeBool=*/true) | ||||
|       ? c10::typeMetaToScalarType(c10::get_default_dtype()) | ||||
|       : c10::toRealValueType(self.scalar_type()); | ||||
|   Tensor result = at::empty({0}, self.options().dtype(float_type)); | ||||
|   return angle_out_mps(self, result); | ||||
| } | ||||
|  | ||||
| TORCH_IMPL_FUNC(frac_out_mps)(const Tensor& self, const Tensor& output) { | ||||
|   TORCH_CHECK(isFloatingType(self.scalar_type()), "frac_out_mps is only implemented for floating types"); | ||||
|   mps::unary_op(self, output, "frac_out_mps", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { | ||||
|  | ||||
| @ -403,14 +403,16 @@ | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   dispatch: | ||||
|     CPU, CUDA, MPS: angle | ||||
|     CPU, CUDA: angle | ||||
|     MPS: angle_mps | ||||
|     SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: angle_sparse_csr | ||||
|   tags: pointwise | ||||
|  | ||||
| - func: angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   dispatch: | ||||
|     CPU, CUDA, MPS: angle_out | ||||
|     CPU, CUDA: angle_out | ||||
|     MPS: angle_out_mps | ||||
|     SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: angle_sparse_csr_out | ||||
|   tags: pointwise | ||||
|  | ||||
| @ -14155,10 +14157,16 @@ | ||||
| - func: linalg_lu_factor(Tensor A, *, bool pivot=True) -> (Tensor LU, Tensor pivots) | ||||
|   python_module: linalg | ||||
|   variants: function | ||||
|   dispatch: | ||||
|     CompositeImplicitAutograd: linalg_lu_factor | ||||
|     MPS: linalg_lu_factor_mps | ||||
|  | ||||
| - func: linalg_lu_factor.out(Tensor A, *, bool pivot=True, Tensor(a!) LU, Tensor(b!) pivots) -> (Tensor(a!) LU, Tensor(b!) pivots) | ||||
|   python_module: linalg | ||||
|   variants: function | ||||
|   dispatch: | ||||
|     CompositeImplicitAutograd: linalg_lu_factor_out | ||||
|     MPS: linalg_lu_factor_out_mps | ||||
|  | ||||
| - func: linalg_lu_factor_ex(Tensor A, *, bool pivot=True, bool check_errors=False) -> (Tensor LU, Tensor pivots, Tensor info) | ||||
|   python_module: linalg | ||||
| @ -14360,12 +14368,12 @@ | ||||
|   python_module: linalg | ||||
|   variants: function | ||||
|   dispatch: | ||||
|     CPU, CUDA, MPS: linalg_householder_product | ||||
|     CPU, CUDA: linalg_householder_product | ||||
|  | ||||
| - func: linalg_householder_product.out(Tensor input, Tensor tau, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   python_module: linalg | ||||
|   dispatch: | ||||
|     CPU, CUDA, MPS: linalg_householder_product_out | ||||
|     CPU, CUDA: linalg_householder_product_out | ||||
|  | ||||
| - func: linalg_inv_ex(Tensor A, *, bool check_errors=False) -> (Tensor inverse, Tensor info) | ||||
|   python_module: linalg | ||||
|  | ||||
| @ -40,7 +40,15 @@ | ||||
| #include <thrust/iterator/discard_iterator.h> | ||||
|  | ||||
|  | ||||
| #if defined(__CUDACC__) && (defined(CUSPARSE_VERSION) || (defined(USE_ROCM) && ROCM_VERSION >= 60300)) | ||||
| #define IS_CUSPARSE11_AVAILABLE() 1 | ||||
| #else | ||||
| #define IS_CUSPARSE11_AVAILABLE() 0 | ||||
| #endif | ||||
|  | ||||
| #if IS_CUSPARSE11_AVAILABLE() | ||||
| #include <library_types.h> | ||||
| #endif | ||||
|  | ||||
| namespace at::native { | ||||
|  | ||||
| @ -95,9 +103,17 @@ struct csrMatrixRef { | ||||
|   int nnz_{0}; | ||||
|   std::vector<int> size_{}; | ||||
|  | ||||
|   cusparseSpMatDescr_t description_{0}; | ||||
|   #if IS_CUSPARSE11_AVAILABLE() | ||||
|     cusparseSpMatDescr_t description_{0}; | ||||
|   #else | ||||
|     cusparseMatDescr_t description_{0}; | ||||
|   #endif | ||||
|  | ||||
|   csrMatrixRef() = default; | ||||
|   csrMatrixRef() { | ||||
|     #if !IS_CUSPARSE11_AVAILABLE() | ||||
|       create_general_description_(description_); | ||||
|     #endif | ||||
|   } | ||||
|  | ||||
|   csrMatrixRef( | ||||
|       int* csr_indices, | ||||
| @ -110,6 +126,7 @@ struct csrMatrixRef { | ||||
|         csr_values_{csr_values}, | ||||
|         nnz_{nnz}, | ||||
|         size_{size} { | ||||
|     #if IS_CUSPARSE11_AVAILABLE() | ||||
|       cudaDataType cuda_data_type = at::cuda::getCudaDataType<scalar_t>(); | ||||
|       TORCH_CUDASPARSE_CHECK(cusparseCreateCsr( | ||||
|         &description_, | ||||
| @ -123,10 +140,17 @@ struct csrMatrixRef { | ||||
|         CUSPARSE_INDEX_32I, | ||||
|         CUSPARSE_INDEX_BASE_ZERO, | ||||
|         cuda_data_type)); | ||||
|     #else | ||||
|       create_general_description_(description_); | ||||
|     #endif | ||||
|   } | ||||
|  | ||||
|   ~csrMatrixRef() { | ||||
|     cusparseDestroySpMat(description_); | ||||
|     #if IS_CUSPARSE11_AVAILABLE() | ||||
|       cusparseDestroySpMat(description_); | ||||
|     #else | ||||
|       cusparseDestroyMatDescr(description_); | ||||
|     #endif | ||||
|   } | ||||
|  | ||||
|   int size(int index) const { | ||||
| @ -172,6 +196,8 @@ struct csrOutput { | ||||
|   } | ||||
| }; | ||||
|  | ||||
| #if IS_CUSPARSE11_AVAILABLE() | ||||
|  | ||||
| // RAII guard that supports the cuSparse 11 API for the `A @ B` operation | ||||
| // This generic template exists because, with cuSparse, `scalar_t` can be either double or float | ||||
| template <class scalar_t> | ||||
| @ -370,6 +396,284 @@ template struct CusparseMatrixMultiplyOp<float>; | ||||
|  | ||||
| template struct CusparseMatrixMultiplyOp<double>; | ||||
|  | ||||
| #else // if not IS_CUSPARSE11_AVAILABLE() | ||||
|  | ||||
| using DcsrMatrixRef = csrMatrixRef<double>; | ||||
| using ScsrMatrixRef = csrMatrixRef<float>; | ||||
|  | ||||
| // RAII guard that supports the cuSparse 10 API for the `A @ B` operation | ||||
| // This generic template exists because, with cuSparse, `scalar_t` can be either double or float | ||||
| template <class scalar_t> | ||||
| struct CusparseMatrixMultiplyOp { | ||||
|   csrOutput operator()( | ||||
|       const csrMatrixRef<scalar_t>& lhs, | ||||
|       const csrMatrixRef<scalar_t>& rhs, | ||||
|       Tensor &output_values, | ||||
|       Tensor &output_indices) | ||||
|   { | ||||
|     static_assert(false&&sizeof(scalar_t), "cusparse csr sparse-sparse MM only supports data type of float and double."); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| // Specialization for `A @ B` operation for double values with cuSparse | ||||
| template<> struct CusparseMatrixMultiplyOp<double> { | ||||
|   csrgemm2Info_t gemm2Info_; | ||||
|  | ||||
|   CusparseMatrixMultiplyOp() { | ||||
|     TORCH_CUDASPARSE_CHECK(cusparseCreateCsrgemm2Info(&gemm2Info_)); | ||||
|   } | ||||
|   ~CusparseMatrixMultiplyOp() { | ||||
|     cusparseDestroyCsrgemm2Info(gemm2Info_); | ||||
|   } | ||||
|  | ||||
|   csrOutput operator ()( | ||||
|       const DcsrMatrixRef& lhs, | ||||
|       const DcsrMatrixRef& rhs, | ||||
|       Tensor &output_values, | ||||
|       Tensor &output_indices) { | ||||
|     double alpha = 1.0; | ||||
|     DcsrMatrixRef empty; | ||||
|     return Dgemm2(lhs, rhs, empty, &alpha, nullptr, output_values, output_indices); | ||||
|   } | ||||
|  | ||||
|   csrOutput Dgemm2( | ||||
|       const DcsrMatrixRef& A, | ||||
|       const DcsrMatrixRef& B, | ||||
|       const DcsrMatrixRef& C, | ||||
|       const double* alpha, | ||||
|       const double* beta, | ||||
|       Tensor &output_values, | ||||
|       Tensor &output_indices) { | ||||
|     void* buffer_{nullptr}; | ||||
|     cusparseHandle_t cusparseHandle_ = at::cuda::getCurrentCUDASparseHandle(); | ||||
|     TORCH_CUDASPARSE_CHECK(cusparseSetPointerMode(cusparseHandle_, CUSPARSE_POINTER_MODE_HOST)); | ||||
|  | ||||
|     csrOutput out({A.size(0), B.size(1)}); | ||||
|     int innerSize = confirm_mult_size(A.size_, B.size_); | ||||
|     out.csr_pointers_ = at::empty({out.size(0) + 1}, output_indices.options().dtype(kInt)); | ||||
|  | ||||
|     // Compute needed buffer size | ||||
|     size_t new_buffer_sz; | ||||
|     TORCH_CUDASPARSE_CHECK(cusparseDcsrgemm2_bufferSizeExt( | ||||
|         cusparseHandle_, | ||||
|         out.size(0), | ||||
|         out.size(1), | ||||
|         innerSize, | ||||
|         alpha, | ||||
|         A.description_, | ||||
|         A.nnz_, | ||||
|         A.csr_pointers_, | ||||
|         A.csr_indices_, | ||||
|         B.description_, | ||||
|         B.nnz_, | ||||
|         B.csr_pointers_, | ||||
|         B.csr_indices_, | ||||
|         beta, | ||||
|         C.description_, | ||||
|         C.nnz_, | ||||
|         C.csr_pointers_, | ||||
|         C.csr_indices_, | ||||
|         gemm2Info_, | ||||
|         &new_buffer_sz)); | ||||
|  | ||||
|     // (Re)allocate buffer if needed | ||||
|     auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); | ||||
|     at::DataPtr data_ptr = allocator.allocate(new_buffer_sz); | ||||
|     buffer_ = data_ptr.get(); | ||||
|  | ||||
|     // Find the resulting non-zero pattern. | ||||
|     TORCH_CUDASPARSE_CHECK(cusparseXcsrgemm2Nnz( | ||||
|         cusparseHandle_, | ||||
|         out.size(0), | ||||
|         out.size(1), | ||||
|         innerSize, | ||||
|         A.description_, | ||||
|         A.nnz_, | ||||
|         A.csr_pointers_, | ||||
|         A.csr_indices_, | ||||
|         B.description_, | ||||
|         B.nnz_, | ||||
|         B.csr_pointers_, | ||||
|         B.csr_indices_, | ||||
|         C.description_, | ||||
|         C.nnz_, | ||||
|         C.csr_pointers_, | ||||
|         C.csr_indices_, | ||||
|         out.description_, | ||||
|         out.csr_pointers_.data_ptr<int>(), | ||||
|         &out.nnz_, | ||||
|         gemm2Info_, | ||||
|         buffer_)); | ||||
|  | ||||
|     out.csr_indices_ = at::empty({out.nnz_}, output_indices.options().dtype(kInt)); | ||||
|     out.csr_values_ = at::empty({out.nnz_}, output_values.options()); | ||||
|  | ||||
|     // Perform the gemm2 operation for doubles | ||||
|     // out = alpha ∗ A ∗ B + beta ∗ C | ||||
|     TORCH_CUDASPARSE_CHECK(cusparseDcsrgemm2( | ||||
|         cusparseHandle_, | ||||
|         out.size(0), | ||||
|         out.size(1), | ||||
|         innerSize, | ||||
|         alpha, | ||||
|         A.description_, | ||||
|         A.nnz_, | ||||
|         A.csr_values_, | ||||
|         A.csr_pointers_, | ||||
|         A.csr_indices_, | ||||
|         B.description_, | ||||
|         B.nnz_, | ||||
|         B.csr_values_, | ||||
|         B.csr_pointers_, | ||||
|         B.csr_indices_, | ||||
|         beta, | ||||
|         C.description_, | ||||
|         C.nnz_, | ||||
|         C.csr_values_, | ||||
|         C.csr_pointers_, | ||||
|         C.csr_indices_, | ||||
|         out.description_, | ||||
|         out.csr_values_.data_ptr<double>(), | ||||
|         out.csr_pointers_.data_ptr<int>(), | ||||
|         out.csr_indices_.data_ptr<int>(), | ||||
|         gemm2Info_, | ||||
|         buffer_)); | ||||
|     return out; | ||||
|   } | ||||
| }; | ||||
|  | ||||
| // Specialization for `A @ B` operation for float values with cuSparse | ||||
| template<> struct CusparseMatrixMultiplyOp<float> { | ||||
|   csrgemm2Info_t gemm2Info_; | ||||
|  | ||||
|   CusparseMatrixMultiplyOp() { | ||||
|     TORCH_CUDASPARSE_CHECK(cusparseCreateCsrgemm2Info(&gemm2Info_)); | ||||
|  | ||||
|   } | ||||
|   ~CusparseMatrixMultiplyOp() { | ||||
|     cusparseDestroyCsrgemm2Info(gemm2Info_); | ||||
|   } | ||||
|   csrOutput operator()( | ||||
|       const ScsrMatrixRef& lhs, | ||||
|       const ScsrMatrixRef& rhs, | ||||
|       Tensor &output_values, | ||||
|       Tensor &output_indices) { | ||||
|     float alpha = 1.0; | ||||
|     ScsrMatrixRef empty; | ||||
|     return Sgemm2(lhs, rhs, empty, &alpha, nullptr, output_values, output_indices); | ||||
|   } | ||||
|  | ||||
|   csrOutput Sgemm2( | ||||
|       const ScsrMatrixRef& A, | ||||
|       const ScsrMatrixRef& B, | ||||
|       const ScsrMatrixRef& C, | ||||
|       const float* alpha, | ||||
|       const float* beta, | ||||
|       Tensor &output_values, | ||||
|       Tensor &output_indices) { | ||||
|     void* buffer_{nullptr}; | ||||
|     cusparseHandle_t cusparseHandle_ = at::cuda::getCurrentCUDASparseHandle(); | ||||
|     TORCH_CUDASPARSE_CHECK(cusparseSetPointerMode(cusparseHandle_, CUSPARSE_POINTER_MODE_HOST)); | ||||
|  | ||||
|     csrOutput out({A.size(0), B.size(1)}); | ||||
|  | ||||
|     int innerSize = confirm_mult_size(A.size_, B.size_); | ||||
|  | ||||
|     out.csr_pointers_ = at::empty({out.size(0) + 1}, output_indices.options().dtype(kInt)); | ||||
|  | ||||
|     // Compute needed buffer size | ||||
|     size_t new_buffer_sz; | ||||
|     TORCH_CUDASPARSE_CHECK(cusparseScsrgemm2_bufferSizeExt( | ||||
|         cusparseHandle_, | ||||
|         out.size(0), | ||||
|         out.size(1), | ||||
|         innerSize, | ||||
|         alpha, | ||||
|         A.description_, | ||||
|         A.nnz_, | ||||
|         A.csr_pointers_, | ||||
|         A.csr_indices_, | ||||
|         B.description_, | ||||
|         B.nnz_, | ||||
|         B.csr_pointers_, | ||||
|         B.csr_indices_, | ||||
|         beta, | ||||
|         C.description_, | ||||
|         C.nnz_, | ||||
|         C.csr_pointers_, | ||||
|         C.csr_indices_, | ||||
|         gemm2Info_, | ||||
|         &new_buffer_sz)); | ||||
|  | ||||
|     auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); | ||||
|     at::DataPtr data_ptr = allocator.allocate(new_buffer_sz); | ||||
|     buffer_ = data_ptr.get(); | ||||
|  | ||||
|     // Find the resulting non-zero pattern. | ||||
|     TORCH_CUDASPARSE_CHECK(cusparseXcsrgemm2Nnz( | ||||
|         cusparseHandle_, | ||||
|         out.size(0), | ||||
|         out.size(1), | ||||
|         innerSize, | ||||
|         A.description_, | ||||
|         A.nnz_, | ||||
|         A.csr_pointers_, | ||||
|         A.csr_indices_, | ||||
|         B.description_, | ||||
|         B.nnz_, | ||||
|         B.csr_pointers_, | ||||
|         B.csr_indices_, | ||||
|         C.description_, | ||||
|         C.nnz_, | ||||
|         C.csr_pointers_, | ||||
|         C.csr_indices_, | ||||
|         out.description_, | ||||
|         out.csr_pointers_.data_ptr<int>(), | ||||
|         &out.nnz_, | ||||
|         gemm2Info_, | ||||
|         buffer_)); | ||||
|  | ||||
|     out.csr_indices_ = at::empty({out.nnz_}, output_indices.options().dtype(kInt)); | ||||
|     out.csr_values_ = at::empty({out.nnz_}, output_values.options()); | ||||
|  | ||||
|     // Perform the gemm2 operation for floats | ||||
|     // out = alpha ∗ A ∗ B + beta ∗ C | ||||
|     TORCH_CUDASPARSE_CHECK(cusparseScsrgemm2( | ||||
|         cusparseHandle_, | ||||
|         out.size(0), | ||||
|         out.size(1), | ||||
|         innerSize, | ||||
|         alpha, | ||||
|         A.description_, | ||||
|         A.nnz_, | ||||
|         A.csr_values_, | ||||
|         A.csr_pointers_, | ||||
|         A.csr_indices_, | ||||
|         B.description_, | ||||
|         B.nnz_, | ||||
|         B.csr_values_, | ||||
|         B.csr_pointers_, | ||||
|         B.csr_indices_, | ||||
|         beta, | ||||
|         C.description_, | ||||
|         C.nnz_, | ||||
|         C.csr_values_, | ||||
|         C.csr_pointers_, | ||||
|         C.csr_indices_, | ||||
|         out.description_, | ||||
|         out.csr_values_.data_ptr<float>(), | ||||
|         out.csr_pointers_.data_ptr<int>(), | ||||
|         out.csr_indices_.data_ptr<int>(), | ||||
|         gemm2Info_, | ||||
|         buffer_)); | ||||
|     return out; | ||||
|   } | ||||
| }; | ||||
|  | ||||
|  | ||||
|  | ||||
| #endif // IS_CUSPARSE11_AVAILABLE() | ||||
|  | ||||
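As a reference for what the csrgemm2 path computes (out = alpha * A @ B + beta * C over CSR buffers), here is a rough SciPy sketch; the specializations above pass an empty C and no beta, so effectively only the alpha * A @ B term is produced (illustrative only):

```python
import scipy.sparse as sp

A = sp.random(4, 5, density=0.4, format="csr")
B = sp.random(5, 3, density=0.4, format="csr")
alpha = 1.0

out = alpha * (A @ B)                       # csrgemm2 with C empty / beta unset
print(out.indptr, out.indices, out.data)    # analogues of csr_pointers_, csr_indices_, csr_values_
```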
| template <typename scalar_t> | ||||
| void sparse_sparse_matmul_cuda_kernel( | ||||
|     Tensor& result, | ||||
| @ -511,15 +815,19 @@ Tensor sparse_sparse_matmul_cuda(const Tensor& mat1_, const Tensor& mat2_) { | ||||
|   auto output = at::native::empty_like(mat1_); | ||||
|   output.sparse_resize_and_clear_({mat1_.size(0), mat2_.size(1)}, mat1_.sparse_dim(), 0); | ||||
|  | ||||
| #if !defined(USE_ROCM) | ||||
| #if IS_CUSPARSE11_AVAILABLE() && !defined(USE_ROCM) | ||||
|   AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, mat1_.scalar_type(), "sparse_matmul", [&] { | ||||
|       sparse_sparse_matmul_cuda_kernel<scalar_t>(output, mat1_.coalesce(), mat2_.coalesce()); | ||||
|   }); | ||||
| #else | ||||
| #elif IS_CUSPARSE11_AVAILABLE() && defined(USE_ROCM) | ||||
|   // ROCm does not support half and bfloat16 types for sparse_matmul | ||||
|   AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(mat1_.scalar_type(), "sparse_matmul", [&] { | ||||
|       sparse_sparse_matmul_cuda_kernel<scalar_t>(output, mat1_.coalesce(), mat2_.coalesce()); | ||||
|   }); | ||||
| #else | ||||
|   AT_DISPATCH_FLOATING_TYPES(mat1_.scalar_type(), "sparse_matmul", [&] { | ||||
|     sparse_sparse_matmul_cuda_kernel<scalar_t>(output, mat1_.coalesce(), mat2_.coalesce()); | ||||
|   }); | ||||
| #endif | ||||
|   return output; | ||||
| } | ||||
|  | ||||
| @ -33,7 +33,7 @@ using namespace mps; | ||||
| #ifndef PYTORCH_JIT_COMPILE_SHADERS | ||||
| static auto& lib = MetalShaderLibrary::getBundledLibrary(); | ||||
| #else | ||||
| #include <ATen/native/mps/SparseTensorMath_metallib.h> | ||||
| #include <ATen/native/mps/Mul_metallib.h> | ||||
| #endif | ||||
|  | ||||
| static Tensor& s_addmm_out_sparse_dense_mps( | ||||
| @ -369,7 +369,12 @@ static SparseTensor& mul_out_dense_sparse_mps( | ||||
|   } | ||||
|  | ||||
|   if (scalar_like) { | ||||
|     auto out_vals = values.mul(dense.to(values.options())); | ||||
|     auto scalar = dense; | ||||
|     if (dense.numel() == 1 && dense.dim() > 0) { | ||||
|       scalar = dense.view({}); | ||||
|     } | ||||
|     scalar = scalar.to(values.options()); | ||||
|     auto out_vals = values.mul(scalar); | ||||
|     if (out.scalar_type() != commonDtype) { | ||||
|       out_vals = out_vals.to(out.scalar_type()); | ||||
|     } | ||||
| @ -503,14 +508,14 @@ SparseTensor& mul_out_sparse_mps(const Tensor& t_, const Tensor& src_, SparseTen | ||||
|   const auto device = r_.device(); | ||||
|   auto stream = getCurrentMPSStream(); | ||||
|  | ||||
|   auto lhs_indices = lhs._indices().contiguous(); | ||||
|   auto rhs_indices = rhs._indices().contiguous(); | ||||
|   auto lhs_values  = lhs._values().to(commonDtype).contiguous(); | ||||
|   auto rhs_values  = rhs._values().to(commonDtype).contiguous(); | ||||
|   auto lhs_indices = lhs._indices(); | ||||
|   auto rhs_indices = rhs._indices(); | ||||
|   auto lhs_values  = lhs._values().to(commonDtype); | ||||
|   auto rhs_values  = rhs._values().to(commonDtype); | ||||
|  | ||||
|   // Flatten sparse indices to keys | ||||
|   auto lhs_keys = flatten_indices(lhs_indices, lhs.sizes().slice(0, ndim_i)); | ||||
|   auto rhs_keys = flatten_indices(rhs_indices, rhs.sizes().slice(0, ndim_i)); | ||||
|   auto lhs_keys = flatten_indices(lhs_indices, lhs.sizes()); | ||||
|   auto rhs_keys = flatten_indices(rhs_indices, rhs.sizes()); | ||||
|  | ||||
|   // Intersect sorted keys (search the shorter in the longer) | ||||
|   const bool A_is_lhs = (lhs_nnz <= rhs_nnz); | ||||
| @ -541,54 +546,35 @@ SparseTensor& mul_out_sparse_mps(const Tensor& t_, const Tensor& src_, SparseTen | ||||
|   auto out_indices = at::empty({ndim_i, static_cast<int64_t>(M)}, at::device(device).dtype(at::kLong)); | ||||
|   auto lhs_match = outA_idx.narrow(0, 0, M); | ||||
|   auto rhs_match = outB_idx.narrow(0, 0, M); | ||||
|   auto dense_sizes_vec = lhs.sizes().slice(ndim_i).vec(); | ||||
|   int64_t cols64 = 1; | ||||
|   for (auto s : dense_sizes_vec) cols64 *= s; | ||||
|   const uint32_t cols = static_cast<uint32_t>(std::max<int64_t>(cols64, 1)); | ||||
|  | ||||
|   auto to2d = [&](Tensor t, int64_t nnz) -> Tensor { | ||||
|     const int64_t t_cols = t.numel() / nnz; | ||||
|     if (t_cols == cols64) { | ||||
|       return t.view({nnz, cols64}); | ||||
|     } | ||||
|     return t.view({nnz, 1}).expand({nnz, cols64}).contiguous(); | ||||
|   }; | ||||
|  | ||||
|   // make both sides 2d [nnz, cols] buffers so the kernel can index it | ||||
|   auto lhs_vals2d = to2d(lhs_values, lhs_nnz); | ||||
|   auto rhs_vals2d = to2d(rhs_values, rhs_nnz); | ||||
|  | ||||
|   std::vector<int64_t> out_val_sizes; | ||||
|   out_val_sizes.reserve(1 + dense_sizes_vec.size()); | ||||
|   out_val_sizes.push_back(static_cast<int64_t>(M)); | ||||
|   out_val_sizes.insert(out_val_sizes.end(), dense_sizes_vec.begin(), dense_sizes_vec.end()); | ||||
|   auto out_val_sizes = lhs_values.sizes().vec(); | ||||
|   out_val_sizes[0] = static_cast<int64_t>(M); | ||||
|   auto out_values = at::empty(out_val_sizes, lhs_values.options()); | ||||
|  | ||||
|   if (M > 0) { | ||||
|     dispatch_sync_with_rethrow(stream->queue(), ^() { | ||||
|       @autoreleasepool { | ||||
|         auto pso = lib.getPipelineStateForFunc( | ||||
|             "fused_gather_mul_kernel_" + mps::scalarToMetalTypeString(lhs_values)); | ||||
|         auto enc = stream->commandEncoder(); | ||||
|         [enc setComputePipelineState:pso]; | ||||
|   const uint32_t cols = static_cast<uint32_t>( | ||||
|       lhs_values.numel() / std::max<int64_t>(1, lhs_nnz)); | ||||
|  | ||||
|         const uint32_t tew = pso.threadExecutionWidth; | ||||
|         const uint32_t gridW = std::max<uint32_t>(cols, 1u); | ||||
|         const uint32_t tgW = std::min(gridW, tew); | ||||
|         MTLSize grid = MTLSizeMake(gridW, 1, M); | ||||
|         MTLSize tgs  = MTLSizeMake(tgW, 1, 1); | ||||
|   dispatch_sync_with_rethrow(stream->queue(), ^() { | ||||
|     @autoreleasepool { | ||||
|       auto pso = lib.getPipelineStateForFunc( | ||||
|           "fused_gather_mul_kernel_" + mps::scalarToMetalTypeString(lhs_values)); | ||||
|       auto enc = stream->commandEncoder(); | ||||
|       [enc setComputePipelineState:pso]; | ||||
|  | ||||
|         mtl_setArgs(enc, | ||||
|                     lhs_vals2d, rhs_vals2d, | ||||
|                     lhs_match, rhs_match, | ||||
|                     lhs_indices, out_indices, | ||||
|                     out_values, | ||||
|                     std::array<uint32_t, 2>{static_cast<uint32_t>(ndim_i), static_cast<uint32_t>(lhs_nnz)}, | ||||
|                     std::array<uint32_t, 2>{M, cols}); | ||||
|         [enc dispatchThreads:grid threadsPerThreadgroup:tgs]; | ||||
|       } | ||||
|     }); | ||||
|   } | ||||
|       const uint32_t tew  = pso.threadExecutionWidth; | ||||
|       uint32_t tgW = std::min(cols, tew); | ||||
|       MTLSize grid = MTLSizeMake(cols, 1, M); | ||||
|       MTLSize tgs  = MTLSizeMake(tgW, 1, 1); | ||||
|  | ||||
|       mtl_setArgs(enc, | ||||
|                   lhs_values, rhs_values, | ||||
|                   lhs_match, rhs_match, | ||||
|                   lhs_indices, out_indices, | ||||
|                   out_values, | ||||
|                   std::array<uint32_t, 2>{static_cast<uint32_t>(ndim_i), static_cast<uint32_t>(lhs_nnz)}, | ||||
|                   std::array<uint32_t, 2>{M, cols}); | ||||
|       [enc dispatchThreads:grid threadsPerThreadgroup:tgs]; | ||||
|     } | ||||
|   }); | ||||
|  | ||||
|   if (r_.scalar_type() != commonDtype) { | ||||
|     out_values = out_values.to(r_.scalar_type()); | ||||
|  | ||||
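The hunk above intersects the two coalesced index sets by flattening them into scalar keys and then gathers and multiplies the matching value rows. A rough 2-D PyTorch sketch of that strategy (illustrative only, not the Metal kernel):

```python
import torch

def sparse_mul_reference(a, b):
    """Sketch of sparse*sparse elementwise mul via flattened-key intersection (2-D COO)."""
    a, b = a.coalesce(), b.coalesce()
    n_cols = a.size(1)
    a_keys = a.indices()[0] * n_cols + a.indices()[1]   # row-major flattened keys, sorted after coalesce
    b_keys = b.indices()[0] * n_cols + b.indices()[1]
    pos = torch.searchsorted(b_keys, a_keys)             # candidate match position for each lhs key
    pos = pos.clamp(max=b_keys.numel() - 1)
    match = b_keys[pos] == a_keys                        # keep only keys present on both sides
    out_indices = a.indices()[:, match]
    out_values = a.values()[match] * b.values()[pos[match]]
    return torch.sparse_coo_tensor(out_indices, out_values, a.shape)

a = torch.randn(3, 4).relu().to_sparse()
b = torch.randn(3, 4).relu().to_sparse()
torch.testing.assert_close((a * b).to_dense(), sparse_mul_reference(a, b).to_dense())
```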
| @ -62,6 +62,7 @@ kernel void build_row_ptr_from_sorted_rows_by_batch( | ||||
|  | ||||
| template <typename T> | ||||
| kernel void spmm_bmm_coo_rows_grouped( | ||||
|     device const long*   rows      [[buffer(0)]], | ||||
|     device const long*   cols      [[buffer(1)]], | ||||
|     device const T*      vals      [[buffer(2)]], | ||||
|     device const T*      dense     [[buffer(3)]], | ||||
| @ -72,6 +73,7 @@ kernel void spmm_bmm_coo_rows_grouped( | ||||
|     uint3                ltid      [[thread_position_in_threadgroup]], | ||||
|     uint3                tptg      [[threads_per_threadgroup]]) | ||||
| { | ||||
|   const uint B = dims.x; | ||||
|   const uint I = dims.y; | ||||
|   const uint J = dims.z; | ||||
|   const uint K = dims.w; | ||||
| @ -195,9 +197,9 @@ kernel void fused_gather_mul_kernel( | ||||
|     const ulong offR = (ulong)iR * (ulong)view_cols + (ulong)col; | ||||
|     const ulong offO = (ulong)k  * (ulong)view_cols + (ulong)col; | ||||
|  | ||||
|     const auto a = static_cast<accum_t<T>>(lhs_vals[offL]); | ||||
|     const auto b = static_cast<accum_t<T>>(rhs_vals[offR]); | ||||
|     out_vals[offO] = static_cast<T>(mul(a, b)); | ||||
|     const float a = (float)lhs_vals[offL]; | ||||
|     const float b = (float)rhs_vals[offR]; | ||||
|     out_vals[offO] = (T)(a * b); | ||||
|   } | ||||
|  | ||||
|   // One thread per match copies the indices column | ||||
| @ -319,6 +321,7 @@ INSTANTIATE_FOR_FLOAT_TYPES(INSTANTIATE_FUSED_GATHER_MUL); | ||||
| #define INSTANTIATE_SPMM_BMM_COO_ROWS_GROUPED(DTYPE)                         \ | ||||
|   template [[host_name("spmm_bmm_coo_rows_grouped_" #DTYPE)]] kernel void    \ | ||||
|   spmm_bmm_coo_rows_grouped<DTYPE>(                                          \ | ||||
|       device const long*   rows      [[buffer(0)]],                          \ | ||||
|       device const long*   cols      [[buffer(1)]],                          \ | ||||
|       device const DTYPE*  vals      [[buffer(2)]],                          \ | ||||
|       device const DTYPE*  dense     [[buffer(3)]],                          \ | ||||
| @ -202,6 +202,7 @@ supported: | ||||
|   - select_backward | ||||
|   - _trilinear | ||||
|   - linalg_pinv.atol_rtol_tensor | ||||
|   - svd | ||||
|   - logsumexp.out | ||||
| symint: | ||||
|   - empty.memory_format | ||||
|  | ||||
| @ -58,7 +58,8 @@ def list_benchmarks(): | ||||
|  | ||||
| def run_benchmark( | ||||
|     benchmark_name: str, | ||||
|     script_args, | ||||
|     should_visualize: bool = False, | ||||
|     compile_mode: str = "max-autotune-no-cudagraphs", | ||||
| ): | ||||
|     """Run a specific benchmark.""" | ||||
|     if benchmark_name not in BENCHMARK_REGISTRY: | ||||
| @ -67,29 +68,29 @@ def run_benchmark( | ||||
|         return False | ||||
|  | ||||
|     print(f"Running benchmark: {benchmark_name}") | ||||
|     print(f"Torch compile mode: {script_args.compile_mode}") | ||||
|     print(f"Torch compile mode: {compile_mode}") | ||||
|     print("=" * 60) | ||||
|  | ||||
|     benchmark_class = BENCHMARK_REGISTRY[benchmark_name] | ||||
|     benchmark = benchmark_class(script_args) | ||||
|     benchmark = benchmark_class(compile_mode) | ||||
|     benchmark.benchmark() | ||||
|     if script_args.visualize: | ||||
|     if should_visualize: | ||||
|         benchmark.visualize() | ||||
|  | ||||
|     return True | ||||
|  | ||||
|  | ||||
| def run_all_benchmarks(script_args): | ||||
| def run_all_benchmarks(should_visualize: bool = False, compile_mode: str = "default"): | ||||
|     """Run all available benchmarks.""" | ||||
|     print("Running all benchmarks...") | ||||
|     print(f"Torch compile mode: {script_args.compile_mode}") | ||||
|     print(f"Torch compile mode: {compile_mode}") | ||||
|     print("=" * 60) | ||||
|  | ||||
|     for name, cls in BENCHMARK_REGISTRY.items(): | ||||
|         print(f"\n{'=' * 20} {name.upper()} {'=' * 20}") | ||||
|         benchmark = cls(script_args) | ||||
|         benchmark = cls(compile_mode) | ||||
|         benchmark.benchmark() | ||||
|         if script_args.visualize: | ||||
|         if should_visualize: | ||||
|             benchmark.visualize() | ||||
|         print() | ||||
|  | ||||
| @ -136,19 +137,6 @@ Examples: | ||||
|         help="Torch compile mode to use (default: default)", | ||||
|     ) | ||||
|  | ||||
|     parser.add_argument( | ||||
|         "--tolerance", | ||||
|         type=float, | ||||
|         default=None, | ||||
|         help="Tolerance for the accuracy check", | ||||
|     ) | ||||
|  | ||||
|     parser.add_argument( | ||||
|         "--exit-on-accuracy-failure", | ||||
|         action="store_true", | ||||
|         help="Whether to exit with an error message for accuracy failure", | ||||
|     ) | ||||
|  | ||||
|     args = parser.parse_args() | ||||
|  | ||||
|     # Handle list option | ||||
| @ -158,7 +146,7 @@ Examples: | ||||
|  | ||||
|     # Handle all option | ||||
|     if args.all: | ||||
|         run_all_benchmarks(args) | ||||
|         run_all_benchmarks(args.visualize, args.compile_mode) | ||||
|         return | ||||
|  | ||||
|     # Handle specific benchmarks | ||||
| @ -169,7 +157,7 @@ Examples: | ||||
|         sys.exit(1) | ||||
|  | ||||
|     for benchmark_name in args.benchmarks: | ||||
|         run_benchmark(benchmark_name, args) | ||||
|         run_benchmark(benchmark_name, args.visualize, args.compile_mode) | ||||
|         print()  # Add spacing between benchmarks | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -9,8 +9,8 @@ import torch.nn.functional as F | ||||
|  | ||||
|  | ||||
| class CrossEntropyForward(BenchmarkKernel): | ||||
|     def __init__(self, script_args): | ||||
|         super().__init__(script_args) | ||||
|     def __init__(self, compile_mode: str = "max-autotune-no-cudagraphs"): | ||||
|         super().__init__(compile_mode) | ||||
|         self.available_backends = ["eager", "compiled", "quack", "liger"] | ||||
|  | ||||
|     def get_shapes(self) -> tuple[tuple[int, ...], ...]: | ||||
| @ -106,8 +106,8 @@ class CrossEntropyForward(BenchmarkKernel): | ||||
|  | ||||
|  | ||||
| class CrossEntropyBackward(BenchmarkKernel): | ||||
|     def __init__(self, script_args): | ||||
|         super().__init__(script_args) | ||||
|     def __init__(self, compile_mode: str = "max-autotune-no-cudagraphs"): | ||||
|         super().__init__(compile_mode) | ||||
|         self.available_backends = ["eager", "compiled", "quack", "liger"] | ||||
|  | ||||
|     def get_shapes(self) -> tuple[tuple[int, ...], ...]: | ||||
| @ -194,8 +194,8 @@ class CrossEntropyBackward(BenchmarkKernel): | ||||
|  | ||||
|  | ||||
| class SoftmaxForward(BenchmarkKernel): | ||||
|     def __init__(self, script_args): | ||||
|         super().__init__(script_args) | ||||
|     def __init__(self, compile_mode: str = "max-autotune-no-cudagraphs"): | ||||
|         super().__init__(compile_mode) | ||||
|         self.available_backends = ["eager", "compiled", "quack", "liger"] | ||||
|  | ||||
|     def get_shapes(self) -> tuple[tuple[int, ...], ...]: | ||||
| @ -259,8 +259,8 @@ class SoftmaxForward(BenchmarkKernel): | ||||
|  | ||||
|  | ||||
| class SoftmaxBackward(BenchmarkKernel): | ||||
|     def __init__(self, script_args): | ||||
|         super().__init__(script_args) | ||||
|     def __init__(self, compile_mode: str = "max-autotune-no-cudagraphs"): | ||||
|         super().__init__(compile_mode) | ||||
|         self.available_backends = ["eager", "compiled", "quack", "liger"] | ||||
|  | ||||
|     def get_shapes(self) -> tuple[tuple[int, ...], ...]: | ||||
| @ -329,8 +329,8 @@ class SoftmaxBackward(BenchmarkKernel): | ||||
|  | ||||
|  | ||||
| class RMSNormForward(BenchmarkKernel): | ||||
|     def __init__(self, script_args): | ||||
|         super().__init__(script_args) | ||||
|     def __init__(self, compile_mode: str = "max-autotune-no-cudagraphs"): | ||||
|         super().__init__(compile_mode) | ||||
|         self.available_backends = ["eager", "compiled", "quack", "liger"] | ||||
|  | ||||
|     def get_shapes(self) -> tuple[tuple[int, ...], ...]: | ||||
| @ -383,22 +383,7 @@ class RMSNormForward(BenchmarkKernel): | ||||
|         from quack.rmsnorm import _rmsnorm_fwd | ||||
|  | ||||
|         x, w = args | ||||
|         y = torch.empty_like(x) | ||||
|  | ||||
|         def quack_fwd(): | ||||
|             _rmsnorm_fwd( | ||||
|                 x, | ||||
|                 w, | ||||
|                 out=y, | ||||
|                 bias=None, | ||||
|                 rstd=None, | ||||
|                 residual=None, | ||||
|                 residual_out=None, | ||||
|                 eps=1e-6, | ||||
|             ) | ||||
|             return y | ||||
|  | ||||
|         return quack_fwd | ||||
|         return lambda: _rmsnorm_fwd(x, w, eps=1e-6) | ||||
|  | ||||
|     def liger(self, args, kwargs) -> Any: | ||||
|         from liger_kernel.transformers.rms_norm import LigerRMSNorm | ||||
| @ -419,14 +404,9 @@ class RMSNormForward(BenchmarkKernel): | ||||
|  | ||||
|  | ||||
| class RMSNormBackward(BenchmarkKernel): | ||||
|     def __init__(self, script_args): | ||||
|         super().__init__(script_args) | ||||
|         self.available_backends = [ | ||||
|             "eager", | ||||
|             "compiled", | ||||
|             "quack", | ||||
|             "liger", | ||||
|         ] | ||||
|     def __init__(self, compile_mode: str = "max-autotune-no-cudagraphs"): | ||||
|         super().__init__(compile_mode) | ||||
|         self.available_backends = ["eager", "compiled", "quack", "liger"] | ||||
|  | ||||
|     def get_shapes(self) -> tuple[tuple[int, ...], ...]: | ||||
|         # TODO: OOM for (32768, 65536) on h100 | ||||
| @ -474,11 +454,8 @@ class RMSNormBackward(BenchmarkKernel): | ||||
|             y, [x, w], grad_outputs=dy, retain_graph=True | ||||
|         ) | ||||
|  | ||||
|     def compute_rstd(self, x, eps): | ||||
|         return torch.rsqrt(torch.mean(x.float().square(), dim=-1, keepdim=True) + eps) | ||||
|  | ||||
|     def quack(self, args, kwargs=None) -> Any: | ||||
|         from quack.rmsnorm import _get_sm_count, _rmsnorm_bwd | ||||
|         from quack.rmsnorm import _rmsnorm_backward | ||||
|  | ||||
|         ( | ||||
|             x, | ||||
| @ -486,40 +463,15 @@ class RMSNormBackward(BenchmarkKernel): | ||||
|             dy, | ||||
|         ) = args | ||||
|         M, N = x.shape | ||||
|  | ||||
|         rstd = self.compute_rstd(x, eps=1e-6) | ||||
|         dx = torch.empty_like(x) | ||||
|         sm_count = _get_sm_count(x.size(1), x.device) | ||||
|         dw_partial = torch.empty( | ||||
|             sm_count, x.size(1), device=x.device, dtype=torch.float32 | ||||
|         ) | ||||
|  | ||||
|         def quack_bwd(): | ||||
|             _rmsnorm_bwd( | ||||
|                 x, | ||||
|                 w, | ||||
|                 dy, | ||||
|                 rstd, | ||||
|                 dx, | ||||
|                 dw_partial, | ||||
|                 db_partial=None, | ||||
|                 dresidual_out=None, | ||||
|                 dresidual=None, | ||||
|                 sm_count=sm_count, | ||||
|             ) | ||||
|             dw = dw_partial.sum(dim=0).to(w.dtype) | ||||
|             return dx, dw | ||||
|  | ||||
|         return quack_bwd | ||||
|         rstd = torch.randn(M, device="cuda", dtype=torch.float32) | ||||
|         return lambda: _rmsnorm_backward(x, w, dy, rstd) | ||||
|  | ||||
|     def liger(self, args, kwargs=None) -> Any: | ||||
|         from liger_kernel.transformers.rms_norm import LigerRMSNorm | ||||
|  | ||||
|         x, w, dy = args | ||||
|         M, N = x.shape | ||||
|         liger_rmsnorm = LigerRMSNorm( | ||||
|             hidden_size=N, eps=1e-6, casting_mode="gemma" | ||||
|         ).cuda() | ||||
|         liger_rmsnorm = LigerRMSNorm(hidden_size=N, eps=1e-6).cuda() | ||||
|         liger_rmsnorm.weight.data.copy_(w) | ||||
|         y = liger_rmsnorm(x) | ||||
|         return lambda: torch.autograd.grad( | ||||
| @ -537,8 +489,8 @@ class RMSNormBackward(BenchmarkKernel): | ||||
|  | ||||
|  | ||||
| class LayerNormForward(BenchmarkKernel): | ||||
|     def __init__(self, script_args): | ||||
|         super().__init__(script_args) | ||||
|     def __init__(self, compile_mode: str = "max-autotune-no-cudagraphs"): | ||||
|         super().__init__(compile_mode) | ||||
|         self.available_backends = ["eager", "compiled", "quack", "liger"] | ||||
|  | ||||
|     def get_shapes(self) -> tuple[tuple[int, ...], ...]: | ||||
| @ -611,8 +563,8 @@ class LayerNormForward(BenchmarkKernel): | ||||
|  | ||||
|  | ||||
| class LayerNormBackward(BenchmarkKernel): | ||||
|     def __init__(self, script_args): | ||||
|         super().__init__(script_args) | ||||
|     def __init__(self, compile_mode: str = "max-autotune-no-cudagraphs"): | ||||
|         super().__init__(compile_mode) | ||||
|         self.available_backends = ["eager", "compiled", "liger"] | ||||
|  | ||||
|     def get_shapes(self) -> tuple[tuple[int, ...], ...]: | ||||
| @ -662,31 +614,20 @@ class LayerNormBackward(BenchmarkKernel): | ||||
|             y, [x, w], grad_outputs=dy, retain_graph=True | ||||
|         ) | ||||
|  | ||||
|     def compute_mean_rstd(self, x, eps): | ||||
|         x = x.float() | ||||
|  | ||||
|         var, mean = torch.var_mean(x, dim=-1, keepdim=True, correction=0) | ||||
|         rstd = torch.rsqrt(var + eps) | ||||
|         return mean, rstd | ||||
|  | ||||
|     def liger(self, args, kwargs) -> Any: | ||||
|         """ | ||||
|         Call layer_norm_backward directly rather than calling | ||||
|         liger_kernel.transformers.layer_norm.LigerLayerNorm and | ||||
|         torch.autograd.grad. | ||||
|  | ||||
|         The latter approach saves mean/rstd in x.dtype, which can fail | ||||
|         the accuracy check. We call layer_norm_backward with fp32 mean | ||||
|         and rstd instead. | ||||
|         """ | ||||
|         from liger_kernel.ops.layer_norm import layer_norm_backward | ||||
|         from liger_kernel.transformers.layer_norm import LigerLayerNorm | ||||
|  | ||||
|         x, w, dy = args | ||||
|         eps = 1e-6 | ||||
|         mean, rstd = self.compute_mean_rstd(x, eps) | ||||
|         M, N = x.shape | ||||
|  | ||||
|         return lambda: layer_norm_backward(dy, x, w, None, mean, rstd)[0:2] | ||||
|         liger_layernorm = LigerLayerNorm(hidden_size=N, eps=1e-6).cuda() | ||||
|         liger_layernorm.weight.data.copy_(w) | ||||
|         liger_layernorm.bias.data.copy_( | ||||
|             torch.zeros(N, device="cuda", dtype=torch.float32) | ||||
|         ) | ||||
|         y = liger_layernorm(x) | ||||
|         return lambda: torch.autograd.grad( | ||||
|             y, [x, liger_layernorm.weight], grad_outputs=dy, retain_graph=True | ||||
|         ) | ||||
|  | ||||
|     def benchmark(self): | ||||
|         for M, N in self.get_shapes(): | ||||
|  | ||||
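For reference, the fp32 statistics that the removed compute_rstd and compute_mean_rstd helpers produced are just the standard normalization formulas; this sketch restates the deleted code, nothing new:

```python
import torch

def rmsnorm_rstd(x, eps=1e-6):
    # inverse RMS computed in fp32, as the removed RMSNormBackward.compute_rstd did
    return torch.rsqrt(torch.mean(x.float().square(), dim=-1, keepdim=True) + eps)

def layernorm_mean_rstd(x, eps=1e-6):
    # fp32 mean and inverse std, as the removed LayerNormBackward.compute_mean_rstd did
    var, mean = torch.var_mean(x.float(), dim=-1, keepdim=True, correction=0)
    return mean, torch.rsqrt(var + eps)
```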
| @ -1,5 +1,4 @@ | ||||
| import os | ||||
| import sys | ||||
| from collections import defaultdict | ||||
| from collections.abc import Callable | ||||
| from dataclasses import dataclass | ||||
| @ -44,11 +43,10 @@ class Performance: | ||||
|  | ||||
|  | ||||
| class BenchmarkKernel: | ||||
|     def __init__(self, script_args): | ||||
|         self.script_args = script_args | ||||
|     def __init__(self, compile_mode: str = "max-autotune-no-cudagraphs"): | ||||
|         self.name = self.__class__.__name__ | ||||
|         self.available_backends: list[str] = [] | ||||
|         self.compile_mode: str = script_args.compile_mode | ||||
|         self.compile_mode: str = compile_mode | ||||
|  | ||||
|         # mapping from backend to list of performance results | ||||
|         self.profiling_results: defaultdict[str, list[Performance]] = defaultdict(list) | ||||
| @ -108,21 +106,14 @@ class BenchmarkKernel: | ||||
|             args_ref, kwargs_ref = self.clone_inputs(args, kwargs) | ||||
|             res[backend] = getattr(self, backend)(args_ref, kwargs_ref)() | ||||
|         gold = res["eager"] | ||||
|  | ||||
|         tol = {} | ||||
|         if self.script_args.tolerance: | ||||
|             tol = { | ||||
|                 "atol": self.script_args.tolerance, | ||||
|                 "rtol": self.script_args.tolerance, | ||||
|             } | ||||
|         for backend in self.available_backends: | ||||
|             if backend == "eager": | ||||
|                 continue | ||||
|             try: | ||||
|                 torch.testing.assert_close(res[backend], gold, **tol) | ||||
|                 torch.testing.assert_close(res[backend], gold) | ||||
|                 for t, gold_t in zip(res[backend], gold): | ||||
|                     if t.requires_grad: | ||||
|                         torch.testing.assert_close(t.grad, gold_t.grad, **tol) | ||||
|                         torch.testing.assert_close(t.grad, gold_t.grad) | ||||
|                 print( | ||||
|                     f"Accuracy check \033[92m✓ succeed\033[0m for {backend} backend on {self.name} kernel" | ||||
|                 ) | ||||
| @ -130,9 +121,6 @@ class BenchmarkKernel: | ||||
|                 print( | ||||
|                     f"Accuracy check \033[91m✗ failed\033[0m for {backend} backend on {self.name} kernel. Error {e}" | ||||
|                 ) | ||||
|                 if self.script_args.exit_on_accuracy_failure: | ||||
|                     print("Exit right away since --exit-on-accuracy-failure is set") | ||||
|                     sys.exit(1) | ||||
|  | ||||
|     def benchmark_single_shape( | ||||
|         self, args, kwargs=None, should_check_accuracy=True, setting: str = "" | ||||
|  | ||||
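The removed --tolerance plumbing above mapped a single user-supplied value onto both atol and rtol of torch.testing.assert_close. A minimal sketch of that call pattern, with hypothetical tensors:

```python
import torch

actual = torch.randn(8)
expected = actual + 1e-3 * torch.randn(8)

# dtype-aware default tolerances (what the check uses after this change)
try:
    torch.testing.assert_close(actual, expected)
except AssertionError:
    print("failed with default tolerances")

# one explicit tolerance applied to both atol and rtol (what --tolerance did)
torch.testing.assert_close(actual, expected, atol=1e-2, rtol=1e-2)
```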
| @ -1,8 +1,8 @@ | ||||
| add_loop_eager,compile_time_instruction_count,3184000000,0.1 | ||||
| add_loop_eager,compile_time_instruction_count,3070000000,0.1 | ||||
|  | ||||
|  | ||||
|  | ||||
| add_loop_eager_dynamic,compile_time_instruction_count,4595000000,0.1 | ||||
| add_loop_eager_dynamic,compile_time_instruction_count,4432000000,0.1 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -18,7 +18,7 @@ add_loop_inductor_gpu,compile_time_instruction_count,26800000000,0.1 | ||||
|  | ||||
|  | ||||
|  | ||||
| basic_modules_ListOfLinears_eager,compile_time_instruction_count,1096000000,0.1 | ||||
| basic_modules_ListOfLinears_eager,compile_time_instruction_count,1048000000,0.1 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -26,7 +26,7 @@ basic_modules_ListOfLinears_inductor,compile_time_instruction_count,15240000000, | ||||
|  | ||||
|  | ||||
|  | ||||
| basic_modules_ListOfLinears_inductor_gpu_force_shape_pad,compile_time_instruction_count,17720000000,0.1 | ||||
| basic_modules_ListOfLinears_inductor_gpu_force_shape_pad,compile_time_instruction_count,17020000000,0.1 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -34,11 +34,11 @@ basic_modules_ListOfLinears_inductor_gpu,compile_time_instruction_count,11090000 | ||||
|  | ||||
|  | ||||
|  | ||||
| update_hint_regression,compile_time_instruction_count,1645000000,0.1 | ||||
| update_hint_regression,compile_time_instruction_count,1719000000,0.1 | ||||
|  | ||||
|  | ||||
|  | ||||
| sum_floordiv_regression,compile_time_instruction_count,3813000000,0.1 | ||||
| sum_floordiv_regression,compile_time_instruction_count,3686995725,0.1 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -50,31 +50,31 @@ symint_sum_loop,compile_time_instruction_count,4299000000,0.1 | ||||
|  | ||||
|  | ||||
|  | ||||
| aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,1793000000,0.1 | ||||
| aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,1869000000,0.1 | ||||
|  | ||||
|  | ||||
|  | ||||
| aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,5120000000,0.1 | ||||
| aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,5281000000,0.1 | ||||
|  | ||||
|  | ||||
|  | ||||
| aotdispatcher_partitioner_cpu,compile_time_instruction_count,7936000000,0.1 | ||||
| aotdispatcher_partitioner_cpu,compile_time_instruction_count,8333000000,0.1 | ||||
|  | ||||
|  | ||||
|  | ||||
| aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1848000000,0.1 | ||||
| aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1909000000,0.1 | ||||
|  | ||||
|  | ||||
|  | ||||
| aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3152000000,0.1 | ||||
| aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3442000000,0.1 | ||||
|  | ||||
|  | ||||
|  | ||||
| aotdispatcher_training_subclass_cpu,compile_time_instruction_count,8301000000,0.1 | ||||
| aotdispatcher_training_subclass_cpu,compile_time_instruction_count,9239000000,0.1 | ||||
|  | ||||
|  | ||||
|  | ||||
| mm_loop_inductor_gpu,compile_time_instruction_count,4958000000,0.1 | ||||
| mm_loop_inductor_gpu,compile_time_instruction_count,4820968837,0.1 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -82,8 +82,8 @@ mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,9051000000,0.1 | ||||
|  | ||||
|  | ||||
|  | ||||
| basic_NestedModule_eager,compile_time_instruction_count,9990000000,0.1 | ||||
| basic_NestedModule_eager,compile_time_instruction_count,9554000000,0.1 | ||||
|  | ||||
|  | ||||
|  | ||||
| basic_InlineMod_eager,compile_time_instruction_count,8126000000,0.1 | ||||
| basic_InlineMod_eager,compile_time_instruction_count,7618000000,0.1 | ||||
|  | ||||
|  | ||||
| @ -43,7 +43,6 @@ tolerance: | ||||
|     - doctr_reco_predictor | ||||
|     - drq | ||||
|     - phlippe_resnet | ||||
|     - pytorch_CycleGAN_and_pix2pix | ||||
|  | ||||
|   higher_bf16: | ||||
|     - doctr_reco_predictor | ||||
|  | ||||
| @ -44,101 +44,21 @@ PyTorch,div_,div__M1_N1_K1_cpu_dtype_onetorch.float32_dtype_twotorch.float32,sho | ||||
| PyTorch,div_,div__M64_N64_K64_cpu_dtype_onetorch.float32_dtype_twotorch.float32,short,False,59.241161,0.000000 | ||||
| PyTorch,div_,div__M64_N64_K128_cpu_dtype_onetorch.float32_dtype_twotorch.float32,short,False,59.852816,0.000000 | ||||
| PyTorch,add,"add_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.float32",short,False,57.006677,0.000000 | ||||
| PyTorch,add,"add_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.bfloat16",short,False,88.167000,0.000000 | ||||
| PyTorch,add,"add_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.float64",short,False,57.519000,0.000000 | ||||
| PyTorch,sub,"sub_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.float32",short,False,55.606088,0.000000 | ||||
| PyTorch,sub,"sub_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.bfloat16",short,False,86.551000,0.000000 | ||||
| PyTorch,sub,"sub_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.float64",short,False,57.864088,0.000000 | ||||
| PyTorch,div,"div_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.float32",short,False,58.529255,0.000000 | ||||
| PyTorch,div,"div_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.bfloat16",short,False,71.641000,0.000000 | ||||
| PyTorch,div,"div_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.float64",short,False,83.073000,0.000000 | ||||
| PyTorch,mul,"mul_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.float32",short,False,54.645077,0.000000 | ||||
| PyTorch,mul,"mul_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.bfloat16",short,False,67.570000,0.000000 | ||||
| PyTorch,mul,"mul_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.float64",short,False,57.895000,0.000000 | ||||
| PyTorch,add,add_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,4.397014,0.000000 | ||||
| PyTorch,add,add_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,7.739000,0.000000 | ||||
| PyTorch,add,add_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,7.786000,0.000000 | ||||
| PyTorch,add,add_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,1.911000,0.000000 | ||||
| PyTorch,add,add_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,59.243500,0.000000 | ||||
| PyTorch,add,add_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,105.066000,0.000000 | ||||
| PyTorch,add,add_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,106.076000,0.000000 | ||||
| PyTorch,add,add_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,47.225000,0.000000 | ||||
| PyTorch,add,add_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,57.947691,0.000000 | ||||
| PyTorch,add,add_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,107.291000,0.000000 | ||||
| PyTorch,add,add_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,107.224000,0.000000 | ||||
| PyTorch,add,add_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,47.912000,0.000000 | ||||
| PyTorch,sub,sub_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,1.925851,0.000000 | ||||
| PyTorch,sub,sub_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,8.024000,0.000000 | ||||
| PyTorch,sub,sub_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,8.069000,0.000000 | ||||
| PyTorch,sub,sub_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,1.938000,0.000000 | ||||
| PyTorch,sub,sub_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,57.308320,0.000000 | ||||
| PyTorch,sub,sub_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,107.091000,0.000000 | ||||
| PyTorch,sub,sub_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,108.710000,0.000000 | ||||
| PyTorch,sub,sub_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,47.502000,0.000000 | ||||
| PyTorch,sub,sub_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,57.787743,0.000000 | ||||
| PyTorch,sub,sub_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,108.863000,0.000000 | ||||
| PyTorch,sub,sub_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,108.939000,0.000000 | ||||
| PyTorch,sub,sub_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,47.603000,0.000000 | ||||
| PyTorch,div,div_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,7.978539,0.000000 | ||||
| PyTorch,div,div_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,8.741000,0.000000 | ||||
| PyTorch,div,div_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,8.757000,0.000000 | ||||
| PyTorch,div,div_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,8.774000,0.000000 | ||||
| PyTorch,div,div_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,159.754860,0.000000 | ||||
| PyTorch,div,div_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,165.552000,0.000000 | ||||
| PyTorch,div,div_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,165.755000,0.000000 | ||||
| PyTorch,div,div_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,165.714000,0.000000 | ||||
| PyTorch,div,div_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,165.360235,0.000000 | ||||
| PyTorch,div,div_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,168.376000,0.000000 | ||||
| PyTorch,div,div_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,169.604000,0.000000 | ||||
| PyTorch,div,div_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,168.428000,0.000000 | ||||
| PyTorch,mul,mul_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,3.928136,0.000000 | ||||
| PyTorch,mul,mul_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,7.402000,0.000000 | ||||
| PyTorch,mul,mul_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,7.567000,0.000000 | ||||
| PyTorch,mul,mul_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,4.020000,0.000000 | ||||
| PyTorch,mul,mul_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,56.413499,0.000000 | ||||
| PyTorch,mul,mul_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,104.638000,0.000000 | ||||
| PyTorch,mul,mul_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,104.335000,0.000000 | ||||
| PyTorch,mul,mul_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,48.612000,0.000000 | ||||
| PyTorch,mul,mul_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,55.925090,0.000000 | ||||
| PyTorch,mul,mul_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,106.110000,0.000000 | ||||
| PyTorch,mul,mul_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,106.389000,0.000000 | ||||
| PyTorch,mul,mul_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,48.195000,0.000000 | ||||
| PyTorch,asr,asr_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,1.989000,0.000000 | ||||
| PyTorch,asr,asr_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,7.999000,0.000000 | ||||
| PyTorch,asr,asr_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,7.939000,0.000000 | ||||
| PyTorch,asr,asr_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,1.980000,0.000000 | ||||
| PyTorch,asr,asr_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,54.408000,0.000000 | ||||
| PyTorch,asr,asr_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,105.647000,0.000000 | ||||
| PyTorch,asr,asr_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,106.476000,0.000000 | ||||
| PyTorch,asr,asr_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,48.784000,0.000000 | ||||
| PyTorch,asr,asr_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,55.583000,0.000000 | ||||
| PyTorch,asr,asr_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,108.083000,0.000000 | ||||
| PyTorch,asr,asr_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,107.663000,0.000000 | ||||
| PyTorch,asr,asr_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,48.283000,0.000000 | ||||
| PyTorch,lsl,lsl_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,1.986000,0.000000 | ||||
| PyTorch,lsl,lsl_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,7.676000,0.000000 | ||||
| PyTorch,lsl,lsl_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,7.618000,0.000000 | ||||
| PyTorch,lsl,lsl_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,1.982000,0.000000 | ||||
| PyTorch,lsl,lsl_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,54.698000,0.000000 | ||||
| PyTorch,lsl,lsl_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,105.899000,0.000000 | ||||
| PyTorch,lsl,lsl_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,106.741000,0.000000 | ||||
| PyTorch,lsl,lsl_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,51.182000,0.000000 | ||||
| PyTorch,lsl,lsl_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,55.290000,0.000000 | ||||
| PyTorch,lsl,lsl_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,107.744000,0.000000 | ||||
| PyTorch,lsl,lsl_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,107.820000,0.000000 | ||||
| PyTorch,lsl,lsl_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,51.298000,0.000000 | ||||
| PyTorch,xor,xor_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,1.988000,0.000000 | ||||
| PyTorch,xor,xor_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,7.689000,0.000000 | ||||
| PyTorch,xor,xor_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,7.695000,0.000000 | ||||
| PyTorch,xor,xor_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,1.978000,0.000000 | ||||
| PyTorch,xor,xor_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,54.934000,0.000000 | ||||
| PyTorch,xor,xor_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,105.217000,0.000000 | ||||
| PyTorch,xor,xor_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,104.215000,0.000000 | ||||
| PyTorch,xor,xor_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,47.115000,0.000000 | ||||
| PyTorch,xor,xor_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,55.974000,0.000000 | ||||
| PyTorch,xor,xor_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,106.828000,0.000000 | ||||
| PyTorch,xor,xor_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,106.879000,0.000000 | ||||
| PyTorch,xor,xor_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,48.197000,0.000000 | ||||
| PyTorch,logical_and,"logical_and_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.bool",short,False,78.404254,0.000000 | ||||
| PyTorch,logical_and,logical_and_M1_N1_K1_cpu_dtype_onetorch.bool_dtype_twotorch.bool,short,False,5.354032,0.000000 | ||||
| PyTorch,logical_and,logical_and_M64_N64_K64_cpu_dtype_onetorch.bool_dtype_twotorch.bool,short,False,54.072783,0.000000 | ||||
| @ -151,9 +71,6 @@ PyTorch,baddbmm,baddbmm_B2_M1_N8_K2_cpu_dtypetorch.float32,short,False,6.631313, | ||||
| PyTorch,baddbmm,baddbmm_B2_M1_N8_K2_cpu_dtypetorch.bfloat16,short,False,6.476986,0.000000 | ||||
| PyTorch,baddbmm,baddbmm_B128_M64_N32_K64_cpu_dtypetorch.float32,short,False,266.065131,0.000000 | ||||
| PyTorch,baddbmm,baddbmm_B128_M64_N32_K64_cpu_dtypetorch.bfloat16,short,False,295.503063,0.000000 | ||||
| PyTorch,all,all_M1_N1_K1_cpu,short,False,5.773000,0.000000 | ||||
| PyTorch,all,all_M64_N64_K64_cpu,short,False,89.427000,0.000000 | ||||
| PyTorch,all,all_M64_N64_K128_cpu,short,False,120.119000,0.000000 | ||||
| PyTorch,cat,"cat_sizes(1,1,1)_N2_dim0_cpu",short,False,4.301950,0.000000 | ||||
| PyTorch,cat,"cat_sizes(512,512,2)_N2_dim1_cpu",short,False,99.093415,0.000000 | ||||
| PyTorch,cat,"cat_sizes(128,1024,2)_N2_dim1_cpu",short,False,96.771578,0.000000 | ||||
|  | ||||
| 
 | 
| @ -580,9 +580,6 @@ class BenchmarkRunner: | ||||
|                 else "unknown" | ||||
|             ) | ||||
|  | ||||
|             # Extract operator name from test_name | ||||
|             operator_name = test_name.split("_")[0] | ||||
|  | ||||
|             # Create the record | ||||
|             @dataclass | ||||
|             class BenchmarkInfo: | ||||
| @ -596,7 +593,6 @@ class BenchmarkRunner: | ||||
|                 name: str | ||||
|                 type: str | ||||
|                 origins: list[str] | ||||
|                 extra_info: dict[str, Any] | ||||
|  | ||||
|             @dataclass | ||||
|             class MetricInfo: | ||||
| @ -622,14 +618,10 @@ class BenchmarkRunner: | ||||
|                         "device": device, | ||||
|                         "arch": device_arch, | ||||
|                         "use_compile": use_compile, | ||||
|                         "operator_name": operator_name, | ||||
|                     }, | ||||
|                 ), | ||||
|                 model=ModelInfo( | ||||
|                     name=test_name, | ||||
|                     type="micro-benchmark", | ||||
|                     origins=["pytorch"], | ||||
|                     extra_info={"operator_name": operator_name}, | ||||
|                     name=test_name, type="micro-benchmark", origins=["pytorch"] | ||||
|                 ), | ||||
|                 metric=MetricInfo( | ||||
|                     name="latency", | ||||
|  | ||||
| @ -25,7 +25,7 @@ binary_configs_broadcast = op_bench.config_list( | ||||
|     ], | ||||
|     cross_product_configs={ | ||||
|         "device": ["cpu"], | ||||
|         "dtype": [torch.float, torch.bfloat16, torch.float64], | ||||
|         "dtype": [torch.float], | ||||
|     }, | ||||
|     tags=["short"], | ||||
| ) | ||||
| @ -71,8 +71,8 @@ binary_short_configs = op_bench.config_list( | ||||
|     ], | ||||
|     cross_product_configs={ | ||||
|         "device": ["cpu", "cuda"], | ||||
|         "dtype_one": [torch.int32, torch.uint8], | ||||
|         "dtype_two": [torch.int32, torch.uint8], | ||||
|         "dtype_one": [torch.int32], | ||||
|         "dtype_two": [torch.int32], | ||||
|     }, | ||||
|     tags=["short"], | ||||
| ) | ||||
| @ -82,8 +82,8 @@ binary_long_configs = op_bench.cross_product_configs( | ||||
|     N=[32, 64], | ||||
|     K=[256, 512], | ||||
|     device=["cpu", "cuda"], | ||||
|     dtype_one=[torch.int8, torch.int32, torch.uint8], | ||||
|     dtype_two=[torch.int8, torch.int32, torch.uint8], | ||||
|     dtype_one=[torch.int8, torch.int32], | ||||
|     dtype_two=[torch.int8, torch.int32], | ||||
|     tags=["long"], | ||||
| ) | ||||
|  | ||||
|  | ||||
										
											
File diff suppressed because it is too large
							| @ -176,8 +176,8 @@ THIRD_PARTY_LIBS = { | ||||
|     "omp": ["//xplat/third-party/linker_lib:omp", "//third_party:no-op"], | ||||
|     "pocketfft": ["//third-party/pocket_fft:pocketfft", "//third_party:pocketfft_header"], | ||||
|     "psimd": ["//xplat/third-party/psimd:psimd", "//third_party:psimd"], | ||||
|     "pthreadpool": ["fbsource//xplat/third-party/pthreadpool:pthreadpool", "//third_party:pthreadpool"], | ||||
|     "pthreadpool_header": ["fbsource//xplat/third-party/pthreadpool:pthreadpool_header", "//third_party:pthreadpool_header"], | ||||
|     "pthreadpool": ["//xplat/third-party/pthreadpool:pthreadpool", "//third_party:pthreadpool"], | ||||
|     "pthreadpool_header": ["//xplat/third-party/pthreadpool:pthreadpool_header", "//third_party:pthreadpool_header"], | ||||
|     "moodycamel": ["//third-party/moodycamel:moodycamel", "//third_party:moodycamel"], | ||||
|     "pyyaml": ["//third-party/pypi/pyyaml:pyyaml", "//third_party:pyyaml"], | ||||
|     "rt": ["//xplat/third-party/linker_lib:rt", "//third_party:rt"], | ||||
| @ -1729,10 +1729,8 @@ def define_buck_targets( | ||||
|             "torch/csrc/jit/backends/backend_debug_info.cpp", | ||||
|             "torch/csrc/jit/backends/backend_interface.cpp", | ||||
|         ], | ||||
|         compiler_flags = get_pt_compiler_flags() + select({ | ||||
|             "DEFAULT": [], | ||||
|             "ovr_config//os:android": c2_fbandroid_xplat_compiler_flags | ||||
|         }), | ||||
|         compiler_flags = get_pt_compiler_flags(), | ||||
|         fbandroid_compiler_flags = c2_fbandroid_xplat_compiler_flags, | ||||
|         # @lint-ignore BUCKLINT link_whole | ||||
|         link_whole = True, | ||||
|         linker_flags = get_no_as_needed_linker_flag(), | ||||
| @ -2025,9 +2023,6 @@ def define_buck_targets( | ||||
|                 "ovr_config//os:android-x86_64": [ | ||||
|                     "-mssse3", | ||||
|                 ], | ||||
|             }) + select({ | ||||
|                 "DEFAULT": [], | ||||
|                 "ovr_config//os:android": c2_fbandroid_xplat_compiler_flags, | ||||
|             }), | ||||
|             exported_preprocessor_flags = get_aten_preprocessor_flags(), | ||||
|             exported_deps = [ | ||||
|  | ||||
| @ -855,7 +855,6 @@ libtorch_python_cuda_core_sources = [ | ||||
|     "torch/csrc/cuda/Stream.cpp", | ||||
|     "torch/csrc/cuda/Graph.cpp", | ||||
|     "torch/csrc/cuda/MemPool.cpp", | ||||
|     "torch/csrc/cuda/GreenContext.cpp", | ||||
|     "torch/csrc/cuda/shared/cudart.cpp", | ||||
|     "torch/csrc/cuda/shared/nvtx.cpp", | ||||
|     "torch/csrc/cuda/utils.cpp", | ||||
|  | ||||
| @ -9,7 +9,6 @@ | ||||
|  | ||||
| #include <c10/core/Device.h> | ||||
| #include <c10/core/DeviceType.h> | ||||
| #include <c10/core/alignment.h> | ||||
| #include <c10/macros/Export.h> | ||||
| #include <c10/macros/Macros.h> | ||||
| #include <c10/util/Exception.h> | ||||
|  | ||||
| @ -13,17 +13,7 @@ | ||||
| namespace c10::CachingAllocator { | ||||
|  | ||||
| // "large" allocations may be packed in 20 MiB blocks | ||||
| constexpr size_t kLargeBuffer = 20971520; | ||||
| // "small" allocations are packed in 2 MiB blocks | ||||
| constexpr size_t kSmallBuffer = 2097152; | ||||
| // all sizes are rounded to at least 512 bytes | ||||
| constexpr size_t kMinBlockSize = 512; | ||||
| // largest "small" allocation is 1 MiB | ||||
| constexpr size_t kSmallSize = 1048576; | ||||
| // allocations between 1 and 10 MiB may use kLargeBuffer | ||||
| constexpr size_t kMinLargeAlloc = 10485760; | ||||
| // round up large allocations to 2 MiB | ||||
| constexpr size_t kRoundLarge = 2097152; | ||||
| const size_t kLargeBuffer = 20971520; | ||||
|  | ||||
| // A utility class for tokenizing allocator configuration strings into discrete | ||||
| // parts. For example, the config string: | ||||
|  | ||||
| @ -223,7 +223,7 @@ inline DispatchKey backendToDispatchKey(Backend b) { | ||||
|     case Backend::PrivateUse1: | ||||
|       return DispatchKey::PrivateUse1; | ||||
|     default: | ||||
|       TORCH_CHECK(false, "Unknown backend"); | ||||
|       throw std::runtime_error("Unknown backend"); | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -52,9 +52,7 @@ constexpr DispatchKeySet math_dispatch_keyset = backend_dispatch_keyset | | ||||
|     // where we would like to support composite implicit kernels but not | ||||
|     // explicit kernels therefore we manually add the key to the | ||||
|     // math_dispatch_keyset | ||||
|     DispatchKeySet{DispatchKey::NestedTensor} | | ||||
|     // Functionalize should always reuse CompositeImplicit decomps. | ||||
|     DispatchKeySet{DispatchKey::Functionalize}; | ||||
|     DispatchKeySet{DispatchKey::NestedTensor}; | ||||
|  | ||||
| constexpr DispatchKeySet nested_dispatch_keyset = | ||||
|     DispatchKeySet( | ||||
|  | ||||
| @ -336,7 +336,7 @@ class C10_API Scalar { | ||||
|     } else if (isBoolean()) { | ||||
|       return ScalarType::Bool; | ||||
|     } else { | ||||
|       TORCH_CHECK(false, "Unknown scalar type."); | ||||
|       throw std::runtime_error("Unknown scalar type."); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|  | ||||
| @ -228,7 +228,7 @@ std::pair<std::string, std::string> getDtypeNames(c10::ScalarType scalarType) { | ||||
|     case c10::ScalarType::Float4_e2m1fn_x2: | ||||
|       return std::make_pair("float4_e2m1fn_x2", ""); | ||||
|     default: | ||||
|       TORCH_CHECK(false, "Unimplemented scalar type"); | ||||
|       throw std::runtime_error("Unimplemented scalar type"); | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -52,6 +52,19 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType) | ||||
| AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CONSTANT) | ||||
| #undef DEFINE_CONSTANT | ||||
|  | ||||
| inline const char* toString(ScalarType t) { | ||||
| #define DEFINE_CASE(_, name) \ | ||||
|   case ScalarType::name:     \ | ||||
|     return #name; | ||||
|  | ||||
|   switch (t) { | ||||
|     AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CASE) | ||||
|     default: | ||||
|       return "UNKNOWN_SCALAR"; | ||||
|   } | ||||
| #undef DEFINE_CASE | ||||
| } | ||||
|  | ||||
| inline size_t elementSize(ScalarType t) { | ||||
| #define CASE_ELEMENTSIZE_CASE(ctype, name) \ | ||||
|   case ScalarType::name:                   \ | ||||
| @ -137,6 +150,22 @@ inline ScalarType toQIntType(ScalarType t) { | ||||
|   } | ||||
| } | ||||
|  | ||||
| inline ScalarType toUnderlying(ScalarType t) { | ||||
|   switch (t) { | ||||
|     case ScalarType::QUInt8: | ||||
|     case ScalarType::QUInt4x2: | ||||
|       [[fallthrough]]; | ||||
|     case ScalarType::QUInt2x4: | ||||
|       return ScalarType::Byte; | ||||
|     case ScalarType::QInt8: | ||||
|       return ScalarType::Char; | ||||
|     case ScalarType::QInt32: | ||||
|       return ScalarType::Int; | ||||
|     default: | ||||
|       return t; | ||||
|   } | ||||
| } | ||||
|  | ||||
| inline bool isSignedType(ScalarType t) { | ||||
| #define CASE_ISSIGNED(name)     \ | ||||
|   case ScalarType::name:        \ | ||||
| @ -279,6 +308,12 @@ inline bool canCast(const ScalarType from, const ScalarType to) { | ||||
|  | ||||
| C10_API ScalarType promoteTypes(ScalarType a, ScalarType b); | ||||
|  | ||||
| inline std::ostream& operator<<( | ||||
|     std::ostream& stream, | ||||
|     at::ScalarType scalar_type) { | ||||
|   return stream << toString(scalar_type); | ||||
| } | ||||
|  | ||||
| // Returns a pair of strings representing the names for each dtype. | ||||
| // The returned pair is (name, legacy_name_if_applicable) | ||||
| C10_API std::pair<std::string, std::string> getDtypeNames( | ||||
|  | ||||
| @ -1,7 +1,6 @@ | ||||
| #pragma once | ||||
|  | ||||
| #include <cstddef> | ||||
| #include <new> | ||||
|  | ||||
| namespace c10 { | ||||
|  | ||||
| @ -19,12 +18,4 @@ constexpr size_t gPagesize = 4096; | ||||
| // since the default thp pagesize is 2MB, enable thp only | ||||
| // for buffers of size 2MB or larger to avoid memory bloating | ||||
| constexpr size_t gAlloc_threshold_thp = static_cast<size_t>(2) * 1024 * 1024; | ||||
|  | ||||
| // Cache line size used to avoid false sharing between threads. Falls back to 64 | ||||
| // bytes if C++17 feature is unavailable. | ||||
| #ifdef __cpp_lib_hardware_interference_size | ||||
| using std::hardware_destructive_interference_size; | ||||
| #else | ||||
| constexpr std::size_t hardware_destructive_interference_size = 64; | ||||
| #endif | ||||
| } // namespace c10 | ||||
|  | ||||
| @ -87,7 +87,9 @@ bool ThreadPool::inThreadPool() const { | ||||
| } | ||||
|  | ||||
| void ThreadPool::run(std::function<void()> func) { | ||||
|   TORCH_CHECK(threads_.size() > 0, "No threads to run a task"); | ||||
|   if (threads_.empty()) { | ||||
|     throw std::runtime_error("No threads to run a task"); | ||||
|   } | ||||
|   std::unique_lock<std::mutex> lock(mutex_); | ||||
|  | ||||
|   // Set task and signal condition variable so that a worker thread will | ||||
|  | ||||
| @ -131,6 +131,15 @@ namespace Native { | ||||
|  *                  notifyCaptureDestroy. | ||||
|  */ | ||||
|  | ||||
| constexpr size_t kMinBlockSize = | ||||
|     512; // all sizes are rounded to at least 512 bytes | ||||
| constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 MiB | ||||
| constexpr size_t kSmallBuffer = | ||||
|     2097152; // "small" allocations are packed in 2 MiB blocks | ||||
| constexpr size_t kMinLargeAlloc = | ||||
|     10485760; // allocations between 1 and 10 MiB may use kLargeBuffer | ||||
| constexpr size_t kRoundLarge = 2097152; // round up large allocations to 2 MiB | ||||
|  | ||||
| static char SHAREABLE_HANDLE_VERSION = 2; | ||||
| enum ShareableHandleType : char { | ||||
|   SHAREABLE_CUDA_MALLOC = 'c', | ||||
| @ -932,7 +941,7 @@ class EventPool { | ||||
|  | ||||
|  private: | ||||
|   struct PerDevicePool { | ||||
|     alignas(hardware_destructive_interference_size) std::mutex mutex_; | ||||
|     alignas(64) std::mutex mutex_; | ||||
|     std::vector<std::unique_ptr<cudaEvent_t>> event_pool_; | ||||
|   }; | ||||
|   std::vector<PerDevicePool> pools_; | ||||
| @ -3749,6 +3758,11 @@ static void uncached_delete(void* ptr) { | ||||
| static void local_raw_delete(void* ptr); | ||||
| thread_local std::stack<std::string> DeviceCachingAllocator::compile_context; | ||||
| thread_local std::string DeviceCachingAllocator::user_metadata; | ||||
| #ifdef __cpp_lib_hardware_interference_size | ||||
| using std::hardware_destructive_interference_size; | ||||
| #else | ||||
| static constexpr std::size_t hardware_destructive_interference_size = 64; | ||||
| #endif | ||||
|  | ||||
| class NativeCachingAllocator : public CUDAAllocator { | ||||
|  private: | ||||
| @ -4469,10 +4483,7 @@ struct BackendStaticInitializer { | ||||
|         if (key == "backend") { | ||||
|           tokenizer.checkToken(++i, ":"); | ||||
|           i++; // Move to the value after the colon | ||||
|           // break up token to trick hipify | ||||
|           if (tokenizer[i] == | ||||
|                   "c" | ||||
|                   "udaMallocAsync" | ||||
|           if (tokenizer[i] == "cudaMallocAsync" | ||||
| #ifdef USE_ROCM | ||||
|               // convenience for ROCm users to allow either CUDA or HIP env var | ||||
|               || tokenizer[i] == "hipMallocAsync" | ||||
|  | ||||
| @ -913,9 +913,7 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator { | ||||
|     } | ||||
|   } | ||||
|   std::string name() override { | ||||
|     // break up token to trick hipify | ||||
|     return "c" | ||||
|            "udaMallocAsync"; | ||||
|     return "cudaMallocAsync"; | ||||
|   } | ||||
|   void copy_data(void* dest, const void* src, std::size_t count) const final { | ||||
|     C10_CUDA_CHECK( | ||||
|  | ||||
| @ -51,17 +51,6 @@ | ||||
|  | ||||
| #if defined(CUDA_VERSION) && (CUDA_VERSION >= 12030) | ||||
| #define C10_LIBCUDA_DRIVER_API_OPTIONAL(_) \ | ||||
|   _(cuCtxFromGreenCtx, 12080)              \ | ||||
|   _(cuCtxGetCurrent, 12080)                \ | ||||
|   _(cuCtxPopCurrent, 12080)                \ | ||||
|   _(cuCtxPushCurrent, 12080)               \ | ||||
|   _(cuCtxSetCurrent, 12080)                \ | ||||
|   _(cuGreenCtxCreate, 12080)               \ | ||||
|   _(cuGreenCtxDestroy, 12080)              \ | ||||
|   _(cuDevSmResourceSplitByCount, 12080)    \ | ||||
|   _(cuDeviceGet, 12080)                    \ | ||||
|   _(cuDeviceGetDevResource, 12080)         \ | ||||
|   _(cuDevResourceGenerateDesc, 12080)      \ | ||||
|   _(cuMulticastAddDevice, 12030)           \ | ||||
|   _(cuMulticastBindMem, 12030)             \ | ||||
|   _(cuMulticastCreate, 12030)              \ | ||||
|  | ||||
| @ -328,21 +328,6 @@ struct pair { | ||||
|   T2 second; | ||||
| }; | ||||
|  | ||||
| template <typename T> | ||||
| static T conj(T a) { | ||||
|   return a; | ||||
| } | ||||
|  | ||||
| template <> | ||||
| half2 conj(half2 a) { | ||||
|   return half2(a.x, -a.y); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| float2 conj(float2 a) { | ||||
|   return float2(a.x, -a.y); | ||||
| } | ||||
|  | ||||
| #define INSTANTIATE_FOR_ALL_TYPES(MACRO) \ | ||||
|   MACRO(float);                          \ | ||||
|   MACRO(half);                           \ | ||||
|  | ||||
| @ -45,7 +45,14 @@ constexpr bool is_pod_v = is_pod<T>::value; | ||||
|  | ||||
| namespace guts { | ||||
|  | ||||
| #if defined(__HIP__) | ||||
| #if defined(__cpp_lib_apply) && !defined(__CUDA_ARCH__) && !defined(__HIP__) | ||||
|  | ||||
| template <class F, class Tuple> | ||||
| C10_HOST_DEVICE inline constexpr decltype(auto) apply(F&& f, Tuple&& t) { | ||||
|   return std::apply(std::forward<F>(f), std::forward<Tuple>(t)); | ||||
| } | ||||
|  | ||||
| #else | ||||
|  | ||||
| // Implementation from http://en.cppreference.com/w/cpp/utility/apply (but | ||||
| // modified) | ||||
|  | ||||
Some files were not shown because too many files have changed in this diff