Compare commits

298 Commits

Author SHA1 Message Date
e44ca7305f vllm setup
Signed-off-by: Yang Wang <elainewy@meta.com>
2025-07-21 17:40:32 -07:00
2bb684304d Fix the typos in the right nav by pulling the latest theme (#158746)
This will fix broken links in the right nav.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158746
Approved by: https://github.com/malfet
2025-07-21 22:51:07 +00:00
f09a484b81 Remove is_arvr_mode() from xnnpack.buck.bzl (#158682)
Summary:
**Changes**
*   Deleted function import from build definition utilities
    *   Removed `load("//tools/build_defs:fbsource_utils.bzl", "is_arvr_mode")`
*   Replaced is_arvr_mode() function calls with direct references to configuration flags
    *  Changed from `is_arvr_mode()` to `"ovr_config//build_mode:arvr_mode"`
*   Changed conditional expressions to Buck `select()` statements

Test Plan:
Check if CI passes

Rollback Plan:

Differential Revision: D78520947

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158682
Approved by: https://github.com/malfet
2025-07-21 22:49:26 +00:00
feaa02f9ad Revert "[build] pin setuptools>=77 to enable PEP 639 (#158104)"
This reverts commit a78fb63dbdf98a1db219095293de1a11005e0390.

Reverted https://github.com/pytorch/pytorch/pull/158104 on behalf of https://github.com/malfet due to It still breaks inductor-perf-nightly, see https://github.com/pytorch/pytorch/actions/runs/16425364208/job/46417088208, I'm going to dismiss all previous reviews ([comment](https://github.com/pytorch/pytorch/pull/158104#issuecomment-3099706457))
2025-07-21 22:46:53 +00:00
b3c868d603 [vllm]Add vllm.txt for pinned commit (#158754)
It seems nightly.yml won't auto-generate the txt file when it does not exist, so this adds the file with the latest merged commit from vLLM:

[vllm commit](https://github.com/vllm-project/vllm/commits/main)

Error:
https://github.com/pytorch/pytorch/actions/runs/16405915719/job/46351847504
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158754
Approved by: https://github.com/huydhn
2025-07-21 22:41:07 +00:00
cab28330f8 Setup TorchBench in Docker (#158613)
This reduces the time spent setting up TorchBench on A100/H100 by another half an hour.

### Testing

* H100 benchmark https://github.com/pytorch/pytorch/actions/runs/16396172453.  Once this is done, I will review the results on [HUD](https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Fri%2C%2011%20Jul%202025%2023%3A01%3A24%20GMT&stopTime=Fri%2C%2018%20Jul%202025%2023%3A01%3A24%20GMT&granularity=hour&mode=inference&dtype=bfloat16&deviceName=cuda%20(h100)&lBranch=gh/huydhn/6/head&lCommit=14a38c719b29a19f518239b5edb084838ac5d2fb&rBranch=main&rCommit=0a99b026d6bd0f67dc2c0a20fe3228ddc4144854) to confirm that all models are there
* A100 benchmark https://github.com/pytorch/pytorch/actions/runs/16396173932

Signed-off-by: Huy Do <huydhn@gmail.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158613
Approved by: https://github.com/janeyx99
2025-07-21 22:34:08 +00:00
4366610f5a [c10d] block_current_stream: correctness fixes (#158757)
This fixes a number of issues that were present in https://github.com/pytorch/pytorch/pull/156883 as pointed out by @ngimel

Test plan:

Expanded tests to cover use after free behavior + non-default stream

```
pytest test/distributed/test_c10d_pypg.py -v -k block_current_stream
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158757
Approved by: https://github.com/ngimel
2025-07-21 22:23:44 +00:00
dd0adc9386 [SymmMem] Add NVSHMEM broadcast support into Triton (#158514)
Adds broadcast collective operation for distributing data from root PE to all other PEs in NVSHMEM Triton kernels.

Tests: `python test/distributed/test_nvshmem_triton.py -k test_triton_broadcast`
<details>
<summary> Quick debug print for sanity check </summary>

```markdown
============================================================
[Rank 0] Starting broadcast test with world_size=2
============================================================
[Rank 0] Configuration:
  - nelems: 4
  - dtype: torch.int64, element_size: 8 bytes
  - nelems_bytes: 32
============================================================
[Rank 1] Starting broadcast test with world_size=2
============================================================
[Rank 1] Configuration:
  - nelems: 4
  - dtype: torch.int64, element_size: 8 bytes
  - nelems_bytes: 32
[Rank 1] Non-root source data: [-1, -1, -1, -1]
[Rank 0] Root source data: [100, 101, 102, 103]
[Rank 1] Initial destination: [-999, -999, -999, -999]
[Rank 0] Initial destination: [-999, -999, -999, -999]
[Rank 0] Executing broadcast operation...
[Rank 1] Executing broadcast operation...
[Rank 0] Broadcast operation completed
/data/users/suryasub/pytorch/torch/distributed/distributed_c10d.py:4809: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
  warnings.warn(  # warn only once
[Rank 1] Broadcast operation completed
/data/users/suryasub/pytorch/torch/distributed/distributed_c10d.py:4809: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
  warnings.warn(  # warn only once
[Rank 1] Results after broadcast:
[Rank 0] Results after broadcast:
[Rank 1] Destination buffer: [100, 101, 102, 103]
[Rank 1] Expected: [100, 101, 102, 103]
[Rank 0] Destination buffer: [100, 101, 102, 103]
[Rank 0] Expected: [100, 101, 102, 103]
[Rank 1] Match: ✓
[Rank 0] Match: ✓
[Rank 1] ============================================================
[Rank 1] Broadcast test PASSED ✓
[Rank 1] Summary: Root PE 0 broadcasted [100, 101, 102, 103] to all PEs
[Rank 1] ============================================================
[Rank 0] ============================================================
[Rank 0] Broadcast test PASSED ✓
[Rank 0] Summary: Root PE 0 broadcasted [100, 101, 102, 103] to all PEs
[Rank 0] ============================================================
```

</details>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158514
Approved by: https://github.com/fduwjj, https://github.com/mandroid6
ghstack dependencies: #158511, #158512, #158513
2025-07-21 22:23:26 +00:00
734826d88e Revert "[AOTI] windows package load dev (#158671)"
This reverts commit d42c40976727fed4c9908d4194f26917d0a3da66.

Reverted https://github.com/pytorch/pytorch/pull/158671 on behalf of https://github.com/ZainRizvi due to Sorry but this is breaking internally. @angelayi can you please help them validate the fixes internally? You can follow the instructions here: https://fburl.com/fixing-ghfirst-reverts ([comment](https://github.com/pytorch/pytorch/pull/158671#issuecomment-3099570374))
2025-07-21 22:20:46 +00:00
5a56e6a72b Revert "[AOTI] fix extract file failed on Windows. (#158702)"
This reverts commit 7cc1a9546c135f8e7635e0d38aa2bba797f8907d.

Reverted https://github.com/pytorch/pytorch/pull/158702 on behalf of https://github.com/ZainRizvi due to Sorry but I had to revert this PR in order to revert https://github.com/pytorch/pytorch/pull/158671 ([comment](https://github.com/pytorch/pytorch/pull/158702#issuecomment-3099556215))
2025-07-21 22:18:19 +00:00
e8af168ee0 Revert "[AOTI] normalize path and process model files. (#158705)"
This reverts commit ff0da08f4bc5ee135b495926cd58a36a1c0e1a5b.

Reverted https://github.com/pytorch/pytorch/pull/158705 on behalf of https://github.com/ZainRizvi due to Sorry but I had to revert this PR in order to revert https://github.com/pytorch/pytorch/pull/158671 ([comment](https://github.com/pytorch/pytorch/pull/158705#issuecomment-3099532516))
2025-07-21 22:16:03 +00:00
97d7dc197f Revert "[AOTI] Convert C-struct zip handling to RAII container (#158687)"
This reverts commit 8ed5e1844c77d952bcea89ca7d0225d876fec4e8.

Reverted https://github.com/pytorch/pytorch/pull/158687 on behalf of https://github.com/ZainRizvi due to Sorry but I had to revert this PR in order to revert https://github.com/pytorch/pytorch/pull/158671 ([comment](https://github.com/pytorch/pytorch/pull/158687#issuecomment-3099515618))
2025-07-21 22:13:26 +00:00
9498d95b9c [Dynamo][BetterEngineering] Type trace_rules.py (#158679)
As part of Better Engineering week, we would like to improve our type support to improve the dev experience in Dynamo.

This PR adds strict typing support to a core file, `trace_rules.py`
Running
```
mypy torch/_dynamo/trace_rules.py   --linecount-report /tmp/coverage_log
```
| -------- | Lines Unannotated | Lines Total | % lines covered | Funcs Unannotated | Funcs Total | % funcs covered |
| -------- | ------- | -------- | ------- | ------- | ------- | ------- |
| Main  |  2564 | 3997 | 64.15% | 34 | 53 | 64.15% |
| This PR | 4022 | 4022 | 100.00% | 53 | 53 | 100.00% |
| Delta    | +1458 | +25 | +35.85% | +19 | 0 | +35.85% |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158679
Approved by: https://github.com/williamwen42
2025-07-21 22:12:59 +00:00
0e46f54286 [ROCm][CI] update HIP patch for 6.4.1 (#158651)
The patch is intended to fix hipGraph capture for some MIOpen kernels.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158651
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-07-21 22:09:36 +00:00
216ba6e5f2 Fix MaskedTensor to device ignored mask (#151205)
Fixes #147140

## Changes

- Add a `to` implementation in `MaskedTensor` so that the `mask` is also moved to the target device

## Test Result

```python
In [1]: import torch
   ...: from torch.masked import as_masked_tensor
   ...: data = torch.tensor([1,2,3])
   ...: mask = torch.tensor([True,False,True])
   ...: mt = as_masked_tensor(data, mask).to('cuda')
   ...: mt.get_data().device, mt.get_mask().device
/home/zong/code/pytorch/torch/masked/maskedtensor/core.py:247: UserWarning: The PyTorch API of MaskedTensors is in prototype stage and will change in the near future. Please open a Github issue for features requests and see our documentation on the torch.masked module for further information about the project.
  return MaskedTensor(data, mask)
/home/zong/code/pytorch/torch/masked/maskedtensor/_ops_refs.py:354: UserWarning: The PyTorch API of MaskedTensors is in prototype stage and will change in the near future. Please open a Github issue for features requests and see our documentation on the torch.masked module for further information about the project.
  return MaskedTensor(new_data, _maybe_get_mask(args[0]))
Out[1]: (device(type='cuda', index=0), device(type='cuda', index=0))

In [2]: mt.sum(dim=0)
/home/zong/code/pytorch/torch/masked/maskedtensor/core.py:247: UserWarning: The PyTorch API of MaskedTensors is in prototype stage and will change in the near future. Please open a Github issue for features requests and see our documentation on the torch.masked module for further information about the project.
  return MaskedTensor(data, mask)
Out[2]: MaskedTensor(4, True)

```

```bash
pytest test/test_maskedtensor.py -vv
```

![image](https://github.com/user-attachments/assets/640b809c-b4f0-4aca-a09e-04049017a745)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151205
Approved by: https://github.com/ezyang
2025-07-21 21:44:49 +00:00
c774180e59 Bump requests from 2.32.2 to 2.32.4 in /tools/build/bazel (#158006)
Bumps [requests](https://github.com/psf/requests) from 2.32.2 to 2.32.4.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a href="https://github.com/psf/requests/releases">requests's releases</a>.</em></p>
<blockquote>
<h2>v2.32.4</h2>
<h2>2.32.4 (2025-06-10)</h2>
<p><strong>Security</strong></p>
<ul>
<li>CVE-2024-47081 Fixed an issue where a maliciously crafted URL and trusted
environment will retrieve credentials for the wrong hostname/machine from a
netrc file. (<a href="https://redirect.github.com/psf/requests/issues/6965">#6965</a>)</li>
</ul>
<p><strong>Improvements</strong></p>
<ul>
<li>Numerous documentation improvements</li>
</ul>
<p><strong>Deprecations</strong></p>
<ul>
<li>Added support for pypy 3.11 for Linux and macOS. (<a href="https://redirect.github.com/psf/requests/issues/6926">#6926</a>)</li>
<li>Dropped support for pypy 3.9 following its end of support. (<a href="https://redirect.github.com/psf/requests/issues/6926">#6926</a>)</li>
</ul>
<h2>v2.32.3</h2>
<h2>2.32.3 (2024-05-29)</h2>
<p><strong>Bugfixes</strong></p>
<ul>
<li>Fixed bug breaking the ability to specify custom SSLContexts in sub-classes of
HTTPAdapter. (<a href="https://redirect.github.com/psf/requests/issues/6716">#6716</a>)</li>
<li>Fixed issue where Requests started failing to run on Python versions compiled
without the <code>ssl</code> module. (<a href="https://redirect.github.com/psf/requests/issues/6724">#6724</a>)</li>
</ul>
</blockquote>
</details>
<details>
<summary>Changelog</summary>
<p><em>Sourced from <a href="https://github.com/psf/requests/blob/main/HISTORY.md">requests's changelog</a>.</em></p>
<blockquote>
<h2>2.32.4 (2025-06-10)</h2>
<p><strong>Security</strong></p>
<ul>
<li>CVE-2024-47081 Fixed an issue where a maliciously crafted URL and trusted
environment will retrieve credentials for the wrong hostname/machine from a
netrc file.</li>
</ul>
<p><strong>Improvements</strong></p>
<ul>
<li>Numerous documentation improvements</li>
</ul>
<p><strong>Deprecations</strong></p>
<ul>
<li>Added support for pypy 3.11 for Linux and macOS.</li>
<li>Dropped support for pypy 3.9 following its end of support.</li>
</ul>
<h2>2.32.3 (2024-05-29)</h2>
<p><strong>Bugfixes</strong></p>
<ul>
<li>Fixed bug breaking the ability to specify custom SSLContexts in sub-classes of
HTTPAdapter. (<a href="https://redirect.github.com/psf/requests/issues/6716">#6716</a>)</li>
<li>Fixed issue where Requests started failing to run on Python versions compiled
without the <code>ssl</code> module. (<a href="https://redirect.github.com/psf/requests/issues/6724">#6724</a>)</li>
</ul>
</blockquote>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a href="021dc729f0"><code>021dc72</code></a> Polish up release tooling for last manual release</li>
<li><a href="821770e822"><code>821770e</code></a> Bump version and add release notes for v2.32.4</li>
<li><a href="59f8aa2adf"><code>59f8aa2</code></a> Add netrc file search information to authentication documentation (<a href="https://redirect.github.com/psf/requests/issues/6876">#6876</a>)</li>
<li><a href="5b4b64c346"><code>5b4b64c</code></a> Add more tests to prevent regression of CVE 2024 47081</li>
<li><a href="7bc45877a8"><code>7bc4587</code></a> Add new test to check netrc auth leak (<a href="https://redirect.github.com/psf/requests/issues/6962">#6962</a>)</li>
<li><a href="96ba401c12"><code>96ba401</code></a> Only use hostname to do netrc lookup instead of netloc</li>
<li><a href="7341690e84"><code>7341690</code></a> Merge pull request <a href="https://redirect.github.com/psf/requests/issues/6951">#6951</a> from tswast/patch-1</li>
<li><a href="6716d7c9f2"><code>6716d7c</code></a> remove links</li>
<li><a href="a7e1c745dc"><code>a7e1c74</code></a> Update docs/conf.py</li>
<li><a href="c799b8167a"><code>c799b81</code></a> docs: fix dead links to kenreitz.org</li>
<li>Additional commits viewable in <a href="https://github.com/psf/requests/compare/v2.32.2...v2.32.4">compare view</a></li>
</ul>
</details>
<br />

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=requests&package-manager=pip&previous-version=2.32.2&new-version=2.32.4)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/pytorch/pytorch/network/alerts).

</details>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158006
Approved by: https://github.com/Skylion007

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-07-21 21:35:38 +00:00
a991e285ae [AOTI] Add more default options to compile_standalone (#158560)
Summary: When compiling for standalone, make embed_kernel_binary and emit_multi_arch_kernel default to True, and add a default name for model_name_for_generated_files to make the generated cpp project easier to understand. Also improved the weights object file naming to be more readable.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158560
Approved by: https://github.com/yushangdi
2025-07-21 21:16:48 +00:00
9e0473b566 removed zero dim cpu logic from fake_tensor.py (#147501)
Fixes #144748
In #144748, an inconsistency between eager mode and inductor mode was reported as a bug.
The root cause is that fake_tensor.py's find-common-device method, 0b0da81021/torch/_subclasses/fake_tensor.py (L833), takes zero-dim CPU tensors into account, but the device check in adaption.h doesn't.

This fix adds a list of ops that bypass the zero-dim-CPU-tensor check, to align with eager mode.
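
As a hedged illustration of the eager-mode rule being matched here (the example below is mine, not from the PR): a zero-dim CPU tensor is allowed to mix with a CUDA tensor in eager, and the result stays on CUDA.

```python
import torch

if torch.cuda.is_available():
    gpu = torch.randn(4, device="cuda")
    scalar_cpu = torch.tensor(2.0)   # zero-dim CPU tensor
    out = gpu * scalar_cpu           # allowed in eager; no device-mismatch error
    assert out.device.type == "cuda"
```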

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147501
Approved by: https://github.com/ezyang
2025-07-21 21:11:10 +00:00
5e17932c22 [DCP] Add support for ShardedTensor to PgTransport (#158573)
Add support for ShardedTensors when PGTransport is used to send/recv checkpoints.

Test is pulled from https://github.com/pytorch/pytorch/pull/157963

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158573
Approved by: https://github.com/meetv18
2025-07-21 21:04:23 +00:00
6b0526a2c4 ban fusion of large amount of reads (#158667)
This is a reland attempt of https://github.com/pytorch/pytorch/pull/157563, but instead of introducing the `realize_acc_reads_size_threshold` config with a default value, we set it to `None` for now to unblock an internal use case. We will dig into the issue and harden the logic in later PRs.
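
A minimal sketch of how the knob might be set once a default is chosen, assuming it is exposed like other Inductor options under `torch._inductor.config` (the exact path is not confirmed here):

```python
import torch._inductor.config as inductor_config

# None (the current value per this PR) disables the fusion ban;
# a byte-size threshold would re-enable it.
inductor_config.realize_acc_reads_size_threshold = None
```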

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158667
Approved by: https://github.com/yf225
2025-07-21 21:00:40 +00:00
bc379aebe2 Revert "Still run TritonBundler with BundledAOTAutogradCache, save autotune results (#158048)"
This reverts commit 8e57cdb746b4ab28865fdf01532f87b0d21700e9.

Reverted https://github.com/pytorch/pytorch/pull/158048 on behalf of https://github.com/jeffdaily due to rocm failures due to unit test introduced in this PR, but no pre-merge signal available ([comment](https://github.com/pytorch/pytorch/pull/158048#issuecomment-3098746624))
2025-07-21 20:45:21 +00:00
b1a0c34dd3 [pt2 event logging] add configurable prefix (#157678)
Summary:
# Why

make experiments easier to find

# What

- dynamo config to provide a prefix
- use the prefix when sending data to scuba through the self.id_ field

Test Plan:
```
# code edited to set the prefix as `coconutruben-02`
buck2 run mode/opt scripts/coconutruben/torchmm:experiment 2>&1 | tee /tmp/epx040
```

on scuba

```
| autotune_dtypes | autotune_offset | autotune_shape | autotune_strides | event | run_id |
| -----| -----| -----| -----| -----| ----- |
| "torch.float16, torch.float16" | "0, 0" | "4096x3008, 3008x2048" | "[3008, 1], [2048, 1]" | "mm_template_autotuning" | "coconutruben-02-e6bdccc5-6dcf-4d68-9a04-b34f2c6d94fd" |
| "torch.float16, torch.float16" | "0, 0" | "4096x3008, 3008x2048" | "[3008, 1], [2048, 1]" | "mm_template_autotuning" | "coconutruben-02-14165153-5842-4eaa-9e6c-3b0cbc016375" |

```

Rollback Plan:

Differential Revision: D77837550

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157678
Approved by: https://github.com/stashuk-olek
2025-07-21 20:41:03 +00:00
851e953f68 ci: Only run lint jobs on relevant files (#158773)
Conditionally run lint jobs on relevant files. This is mainly targeted at clang-tidy, since it takes a long time, but it also includes mypy, since that saves an additional 4 minutes of runtime.

Signed-off-by: Eli Uriegas <eliuriegas@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158773
Approved by: https://github.com/malfet
2025-07-21 20:21:34 +00:00
b66f429827 Fix torch.randint, torch.mul param missing description (#158731)
The wrong separator caused the param description to be truncated.

- Change the separator between a param and its description
- Remove quotes so `torch.dtype` is displayed as a reference to the class
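
A small sketch of the docstring convention assumed here (an illustrative stub, not the actual patch): each argument line uses `name (type): description`, so the renderer keeps the description attached to the parameter.

```python
def randint_stub(high, size, dtype=None):
    """Return random integers in ``[0, high)`` (illustrative stub).

    Args:
        high (int): the exclusive upper bound for the random integers.
        size (tuple): the shape of the output tensor.
        dtype (torch.dtype, optional): the desired data type of the result.
    """
```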

## Test Result

### Before

<img width="1092" height="784" alt="image" src="https://github.com/user-attachments/assets/e8d96b26-07e9-40ff-9392-fa6665d4bbe4" />
<img width="1111" height="457" alt="image" src="https://github.com/user-attachments/assets/a3c2e333-f861-4aeb-b4fb-05c8d880ae81" />

### After

<img width="897" height="820" alt="image" src="https://github.com/user-attachments/assets/d1b5cefa-717a-4223-84b0-4346b7eecf44" />
<img width="872" height="409" alt="image" src="https://github.com/user-attachments/assets/96223c37-cd9d-4656-9e55-032d09cbe5c1" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158731
Approved by: https://github.com/ngimel
2025-07-21 20:17:27 +00:00
ea5b06ed5b [Dynamo][BetterEngineering] Type side_effects.py (#158605)
As part of Better Engineering week, we would like to improve our type support to improve the dev experience in Dynamo.

This PR adds strict typing support to a core file, `side_effects.py`
Running
```
mypy torch/_dynamo/side_effects.py   --linecount-report /tmp/coverage_log
```
| -------- | Lines Unannotated | Lines Total | % lines covered | Funcs Unannotated | Funcs Total | % funcs covered |
| -------- | ------- | -------- | ------- | ------- | ------- | ------- |
| Main  |  365 | 1166 | 31.30% | 16 | 51 | 31.37% |
| This PR | 1185 | 1185 | 100.00% | 51 | 51 | 100.00% |
| Delta    | +820 | +19 | +68.70% | +35 | 0 | +68.63% |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158605
Approved by: https://github.com/StrongerXi
2025-07-21 19:34:14 +00:00
25fbf09d5f Use more fine-grained locks in sym mem kernels (#158523)
Summary: Use only acquire at the beginning of the kernel, and only release at the end

Test Plan:
Existing tests

Rollback Plan:

Differential Revision: D78458020

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158523
Approved by: https://github.com/drisspg, https://github.com/kwen2501
2025-07-21 19:23:47 +00:00
22920c9138 Grab bag of (mostly) typing improvements (#158075)
Collects some scattershot improvements made while attempting to enable training for AOTInductor. Non-typing changes are:

1. Swapping a few custom searches for the output node in an FX graph for calling `graph.output_node()` (see the sketch after this list).
2. Removing two unused parameters from `torch.export._unlift._unlift`.
3. Switching handles to constants in `cpp_wrapper_cpu` to use C++ references for memory efficiency.
4. Cleaning out unused, unexported imports from `torch/export/__init__.py`, and adding one missing export to `__all__`.
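
A small sketch of change 1 above, assuming `Graph.output_node()` behaves as the commit describes; the function and variable names here are illustrative.

```python
import torch
from torch import fx

def f(x):
    return x + 1

gm = fx.symbolic_trace(f)

# Old style: manually search for the output node
out = next(n for n in gm.graph.nodes if n.op == "output")

# New style referenced above
out2 = gm.graph.output_node()
assert out is out2
```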

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158075
Approved by: https://github.com/Skylion007
2025-07-21 19:17:01 +00:00
ad2dec1997 [SymmMem] Add NVSHMEM alltoall support into Triton (#158513)
Implements collective alltoall operation for NVSHMEM Triton kernels. Enables data exchange where each PE sends unique data to every other PE in the team.

Tests: `python test/distributed/test_nvshmem_triton.py -k test_triton_alltoall`

<details>
<summary>Quick debug print for sanity check</summary>

```markdown
============================================================
[Rank 0] Starting alltoall test with world_size=2
============================================================
[Rank 0] Configuration:
  - nelems_per_pe: 2
  - dtype: torch.int64, element_size: 8 bytes
  - nelems_bytes: 16
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/ibrc/ibrc.cpp:1653: NULL value get_device_list failed
/dvs/p4/build/sw/rel/gpgpu/toolkit/r12.8/main_nvshmem/src/modules/transport/ibrc/ibrc.cpp:1653: NULL value get_device_list failed
[Rank 0] Preparing source data:
[Rank 1] Preparing source data:
  - Data for PE 0: [0, 0] (indices 0-1)
  - Data for PE 1: [1, 1] (indices 2-3)
[Rank 0] Complete source buffer: [0, 0, 1, 1]
  - Data for PE 0: [100, 100] (indices 0-1)
  - Data for PE 1: [101, 101] (indices 2-3)
[Rank 1] Complete source buffer: [100, 100, 101, 101]
[Rank 1] Initial destination buffer: [-1, -1, -1, -1]
[Rank 0] Initial destination buffer: [-1, -1, -1, -1]
/data/users/suryasub/pytorch/torch/distributed/distributed_c10d.py:4809: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
  warnings.warn(  # warn only once
/data/users/suryasub/pytorch/torch/distributed/distributed_c10d.py:4809: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
  warnings.warn(  # warn only once
[rank0]:[W716 15:30:06.215666766 ProcessGroupNCCL.cpp:5064] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
[rank1]:[W716 15:30:06.215752786 ProcessGroupNCCL.cpp:5064] [PG ID 0 PG GUID 0 Rank 1]  using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
NCCL version 2.27.5+cuda12.4
[Rank 1] Executing alltoall operation...
[Rank 0] Executing alltoall operation...
[Rank 1] alltoall operation completed
/data/users/suryasub/pytorch/torch/distributed/distributed_c10d.py:4809: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
  warnings.warn(  # warn only once
[Rank 0] alltoall operation completed
/data/users/suryasub/pytorch/torch/distributed/distributed_c10d.py:4809: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
  warnings.warn(  # warn only once
[Rank 0] Results after alltoall:
[Rank 1] Results after alltoall:[Rank 0] Destination buffer: [0, 0, 100, 100]
[Rank 0] Verifying results:
  - From PE 0 (indices 0-1):
    Expected: [0, 0]
    Actual:   [0, 0]
[Rank 1] Destination buffer: [1, 1, 101, 101]
[Rank 1] Verifying results:
  - From PE 0 (indices 0-1):
    Expected: [1, 1]
    Actual:   [1, 1]
    Match:    ✓
    Match:    ✓
  - From PE 1 (indices 2-3):
    Expected: [100, 100]
  - From PE 1 (indices 2-3):
    Expected: [101, 101]
    Actual:   [100, 100]
    Actual:   [101, 101]
    Match:    ✓
    Match:    ✓
[Rank 0] ============================================================
[Rank 0] Summary: ALL TESTS PASSED ✓
[Rank 0] Data flow explanation:
  - Each rank sends 2 elements to every other rank
[Rank 1] ============================================================
[Rank 1] Summary: ALL TESTS PASSED ✓
  - Rank 0 sent: [0, 0, 1, 1]
[Rank 1] Data flow explanation:
  - Each rank sends 2 elements to every other rank
  - Rank 0 received: [0, 0, 100, 100]
  - My data for PE 0 (0) went to PE 0's buffer
  - I received PE 0's data for me (0)
  - My data for PE 1 (1) went to PE 1's buffer
  - Rank 1 sent: [100, 100, 101, 101]
  - I received PE 1's data for me (100)
[Rank 0] ============================================================
  - Rank 1 received: [1, 1, 101, 101]
  - My data for PE 0 (100) went to PE 0's buffer
  - I received PE 0's data for me (1)
  - My data for PE 1 (101) went to PE 1's buffer
  - I received PE 1's data for me (101)
[Rank 1] ============================================================
```

</details>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158513
Approved by: https://github.com/fduwjj, https://github.com/mandroid6
ghstack dependencies: #158511, #158512
2025-07-21 19:14:47 +00:00
662dd7db5b [cutlass backend] cache maybe_append_choices (#156781)
This PR attempts to cache:
* codegen for the cutlass backend for the same kernel, even if the runtime params are different.

From some profiling, most of the time is spent in render, so we only target caching that part for now.

The output of render is `code`, and we are able to cache that easily. Also, I have to cache size_args, since it depends on `kernel.get_dynamic_shape_args()`, which depends on the state of self when we call render.

make_key is doing most of the work here: We are hashing on input node layouts, output node layout and op.configuration_name() (this is what hash(op) would do anyway).
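
A generic sketch of the caching idea (not the actual Inductor code): key the expensive render step on the pieces that determine its output, and reuse the generated code across calls that share them.

```python
_render_cache: dict[tuple, str] = {}

def cached_render(input_layouts, output_layout, op_name, render_fn):
    # Key mirrors make_key as described above: input node layouts,
    # output node layout, and the op's configuration name.
    key = (tuple(input_layouts), output_layout, op_name)
    if key not in _render_cache:
        _render_cache[key] = render_fn()  # expensive codegen happens only once
    return _render_cache[key]

code1 = cached_render(["row", "row"], "row", "gemm_128x128", lambda: "kernel body")
code2 = cached_render(["row", "row"], "row", "gemm_128x128", lambda: "never called")
assert code1 == code2
```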

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156781
Approved by: https://github.com/ColinPeppler
2025-07-21 19:02:39 +00:00
72db0a98a3 Revert "[DTensor] Assert DTensorSpec has valid placements (#158133)"
This reverts commit 1839e8d04b81ee6eda0cff6fbfc218a7a600f6f7.

Reverted https://github.com/pytorch/pytorch/pull/158133 on behalf of https://github.com/ZainRizvi due to Sorry but this is breaking internally. See D78496151 for details. To validate your fixes internally, you can follow the instructions here: https://fburl.com/fixing-ghfirst-reverts ([comment](https://github.com/pytorch/pytorch/pull/158133#issuecomment-3097994857))
2025-07-21 18:54:07 +00:00
8ed5e1844c [AOTI] Convert C-struct zip handling to RAII container (#158687)
Attempts to fix a memory leak reported in #158614 by wrapping manually managed MiniZ C-structs in an RAII container. I have been unable to reproduce the reported leak, but this seems like the most likely candidate.

Fixes #158614 (hopefully)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158687
Approved by: https://github.com/desertfire
2025-07-21 18:53:14 +00:00
393fecb2cc [Optimus][Unit test] clean up the unit test (#158696)
Summary: We should only patch the specific pattern(s) for each unit test.

Test Plan:
```
buck2 test 'fbcode//mode/dev-nosan' fbcode//caffe2/test/inductor:group_batch_fusion
```

Buck UI: https://www.internalfb.com/buck2/f8d37674-91c4-4244-90fa-f24fc3f91e4b
Test UI: https://www.internalfb.com/intern/testinfra/testrun/2533275088644915
Network: Up: 100KiB  Down: 233KiB  (reSessionID-92039f44-bc6f-4e78-87b1-93bca1bd1c66)
Analyzing targets. Remaining     0/296
Executing actions. Remaining     0/20196                                                                    5.8s exec time total
Command: test.     Finished 2 local, 2 cache (50% hit)                                                      4.6s exec time cached (79%)
Time elapsed: 3:55.1s
Tests finished: Pass 13. Fail 0. Fatal 0. Skip 0. Build failure 0

Rollback Plan:

Differential Revision: D78598127

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158696
Approved by: https://github.com/Skylion007, https://github.com/masnesral
2025-07-21 18:05:09 +00:00
9285b8245c [BE][testing] fix test_cat_max_autotune_triton (#158589)
Summary: This test often fails internally -- it looks like autotuning sometimes chooses not to do the epilogue tuning. Turning off `benchmark_epilogue_fusion` seems to fix it.
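
A minimal sketch of the knob being disabled in a test, assuming it is the top-level Inductor config flag of the same name (the test name here is illustrative):

```python
import torch._inductor.config as inductor_config

@inductor_config.patch(benchmark_epilogue_fusion=False)
def test_cat_max_autotune_triton_stub():
    ...  # body of the real test goes here
```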

Test Plan:
`buck test '@fbcode//mode/opt' fbcode//caffe2/test/inductor:max_autotune -- --exact 'caffe2/test/inductor:max_autotune - test_cat_max_autotune_triton (caffe2.test.inductor.test_max_autotune.TestMaxAutotune)' --run-disabled`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158589
Approved by: https://github.com/eellison
2025-07-21 18:02:18 +00:00
637e75433c [BE] always use uv pip if possible in pip_init.py for lintrunner init (#157199)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157199
Approved by: https://github.com/ezyang, https://github.com/ZainRizvi
2025-07-21 17:56:05 +00:00
a78fb63dbd [build] pin setuptools>=77 to enable PEP 639 (#158104)
For reference here is the link PEP 639: [peps.python.org/pep-0639](https://peps.python.org/pep-0639/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158104
Approved by: https://github.com/rgommers, https://github.com/Skylion007, https://github.com/atalman
2025-07-21 17:46:40 +00:00
7205458b85 [Easy] Show some clear error when torch.ops.load_library fails. (#157524)
**Background**:

```Shell
torch       2.5.1+cpu
torchvision 0.20.1
```

```Python
import torch
import torchvision

Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/local/anaconda3/envs/test/lib/python3.10/site-packages/torchvision/__init__.py", line 10, in <module>
    from torchvision import _meta_registrations, datasets, io, models, ops, transforms, utils  # usort:skip
  File "/usr/local/anaconda3/envs/test/lib/python3.10/site-packages/torchvision/_meta_registrations.py", line 164, in <module>
    def meta_nms(dets, scores, iou_threshold):
  File "/usr/local/anaconda3/envs/test/lib/python3.10/site-packages/torch/library.py", line 795, in register
    use_lib._register_fake(op_name, func, _stacklevel=stacklevel + 1)
  File "/usr/local/anaconda3/envs/test/lib/python3.10/site-packages/torch/library.py", line 184, in _register_fake
    handle = entry.fake_impl.register(func_to_register, source)
  File "/usr/local/anaconda3/envs/test/lib/python3.10/site-packages/torch/_library/fake_impl.py", line 31, in register
    if torch._C._dispatch_has_kernel_for_dispatch_key(self.qualname, "Meta"):
RuntimeError: operator torchvision::nms does not exist
```

**Cause**:

torchvision's .so file lacks some symbol definitions because those symbols come from CUDA, but the current environment has neither CUDA nor a GPU. The error message above is very confusing.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157524
Approved by: https://github.com/ezyang
2025-07-21 17:32:31 +00:00
35f1b4ad9e Revert "Fused RMSNorm implementation (#153666)"
This reverts commit 15ef4f28df0a14e9f0d55a57a4e2db415a303be7.

Reverted https://github.com/pytorch/pytorch/pull/153666 on behalf of https://github.com/ZainRizvi due to Sorry but this is breaking tests internally. @albanD can you please help land this change?You can follow the instructions here: https://fburl.com/fixing-ghfirst-reverts.  See D78599667 for more info ([comment](https://github.com/pytorch/pytorch/pull/153666#issuecomment-3097690935))
2025-07-21 17:31:42 +00:00
cbe1cb7018 [CMake] Move xpu flag to xpu.cmake (#158542)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158542
Approved by: https://github.com/gujinghui, https://github.com/ezyang
2025-07-21 17:19:59 +00:00
9894d43b6c [AOTI] explicit aoti wrapper functions for Windows. (#158713)
On Windows, we need explicit declarations for the exported APIs, because the package loader calls these APIs via GetProcAddress.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158713
Approved by: https://github.com/desertfire
2025-07-21 15:59:44 +00:00
f168cf49a8 [BE] Always use python 3.9 for pre-push hook's lintrunner (#158693)
A follow up to https://github.com/pytorch/pytorch/pull/158389

Sets up the pre-push lintrunner to always use python 3.9
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158693
Approved by: https://github.com/atalman
2025-07-21 15:19:27 +00:00
393377d215 Revert "[CI] update flake8 and mypy lint dependencies (#158720)"
This reverts commit a527e816935957a164d74dd7c5069310b2857695.

Reverted https://github.com/pytorch/pytorch/pull/158720 on behalf of https://github.com/malfet due to This broke lint, see 8e57cdb746/1 ([comment](https://github.com/pytorch/pytorch/pull/158720#issuecomment-3096893256))
2025-07-21 13:58:50 +00:00
8e57cdb746 Still run TritonBundler with BundledAOTAutogradCache, save autotune results (#158048)
When running BundledAOTAutogradCache with precompile, we still need to run Triton bundling so that the precompiled CompiledFxGraph has Triton CUDA kernels. We also pre-save the autotune results in the precompile artifact.

It would be even better to pre-trim the CUDA kernels on save and apply them, which we can work on later.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158048
Approved by: https://github.com/zhxchen17
2025-07-21 13:35:46 +00:00
d5a29fc58a De-abstract premature generalization with InductorWrapper (#158528)
See the docblock on InductorWrapper for the distinction. This will matter in a later refactor PR where I will change the signature for one of these but not the other.

Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158528
Approved by: https://github.com/jamesjwu
ghstack dependencies: #158449
2025-07-21 13:27:07 +00:00
979fae761c Rename modules in AOTAutograd (#158449)
Fixes https://github.com/pytorch/pytorch/issues/158382

```
renamed:    torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py -> torch/_functorch/_aot_autograd/graph_capture.py
renamed:    torch/_functorch/_aot_autograd/traced_function_transforms.py -> torch/_functorch/_aot_autograd/graph_capture_wrappers.py
renamed:    torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py -> torch/_functorch/_aot_autograd/graph_compile.py
```

Everything else is ONLY import changes. I did not rename any functions
even if we probably should have.

Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158449
Approved by: https://github.com/jamesjwu
2025-07-21 13:27:07 +00:00
1eb6b2089f [Inductor] Set the default value of min_chunk_size to 512 (#150762)
Change the default value of min_chunk_size from 4096 to 512 to allow more for loops to be parallelized.
I tested the Inductor benchmark with this PR on CPU, and saw ~10% improvement in torchbench geomean speedup, and no change in huggingface/timm_models. There are about 15 torchbench models with different degrees of performance improvement, among which functorch_dp_cifar10, opacus_cifar10, hf_Reformer, and pyhpc_turbulent_kinetic_energy have more than 50% performance improvement.
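
A hedged sketch of overriding the knob per run, assuming it is exposed under `torch._inductor.config.cpp` (the exact path is not confirmed here):

```python
import torch._inductor.config as inductor_config

# Revert to the previous default if a particular workload regresses with 512.
inductor_config.cpp.min_chunk_size = 4096
```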

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150762
Approved by: https://github.com/leslie-fang-intel, https://github.com/jansel
2025-07-21 12:46:05 +00:00
bbc32d680f [SymmMem] Add NVSHMEM sync_all support into Triton (#158512)
Adds `sync_all()` function for local store visibility synchronization in NVSHMEM Triton kernels. Provides memory ordering for local operations without remote completion guarantees.

Tests: `python test/distributed/test_nvshmem_triton.py -k test_triton_sync`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158512
Approved by: https://github.com/fduwjj
ghstack dependencies: #158511
2025-07-21 10:27:59 +00:00
a527e81693 [CI] update flake8 and mypy lint dependencies (#158720)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158720
Approved by: https://github.com/Skylion007
2025-07-21 09:24:29 +00:00
1c6328a588 [EZ][BE] Fix compilation warning in Pooling.metal (#158729)
This one
```
Compiling /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/Pooling.metal to Pooling_30.air
/Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/Pooling.metal:172:1: warning: non-void function does not return a value in all control paths [-Wreturn-type]
}
^
1 warning generated.
```
Although functionally one is never supposed to hit this code path, it's still better not to emit the warning.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158729
Approved by: https://github.com/Skylion007
2025-07-21 04:34:14 +00:00
70b4a8880b [SymmMem] Add NVSHMEM barrier_all, my_pe, n_pes support into Triton (#158511)
Adds device-side barrier synchronization and PE identification functions for NVSHMEM Triton integration. Includes `barrier_all()` for collective synchronization and `my_pe()`/`n_pes()` for PE identification within kernels.

We are launching with cooperative grid launch (for all the PRs in this stack) because the `nvshmemx_collective_launch` function must be used to launch kernels on the GPU when the kernels use NVSHMEM synchronization or collective APIs, and `nvshmemx_collective_launch` essentially boils down to a CUDA cooperative group launch.

Tests: `python test/distributed/test_nvshmem_triton.py -k test_triton_barrier`

Also tested that if you remove the barrier, you get an assertion error/race conditions.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158511
Approved by: https://github.com/fduwjj
2025-07-21 02:37:33 +00:00
5e1232871b Revert "[build] pin setuptools>=77 to enable PEP 639 (#158104)"
This reverts commit a4ec381302f8acd279033707b182bed30ffd2091.

Reverted https://github.com/pytorch/pytorch/pull/158104 on behalf of https://github.com/malfet due to This break inductor-perf-nighly-macos by failing to build torchvision, see https://github.com/pytorch/pytorch/issues/158728 ([comment](https://github.com/pytorch/pytorch/pull/158104#issuecomment-3095048940))
2025-07-21 02:24:11 +00:00
ff0da08f4b [AOTI] normalize path and process model files. (#158705)
Continuation of https://github.com/pytorch/pytorch/pull/158702: split `zip_filename_str` from the real file path.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158705
Approved by: https://github.com/desertfire
2025-07-21 01:08:59 +00:00
2cdafab0bd [BE] Raise ValueError from torch.cat meta func (#158249)
Followup after https://github.com/pytorch/pytorch/pull/155460

From [Python documentation](https://docs.python.org/3/library/exceptions.html#ValueError):
> Raised when an operation or function receives an argument that has the right type but an inappropriate value, and the situation is not described by a more precise exception such as IndexError.

Raise [`TypeError`](https://docs.python.org/3/library/exceptions.html#TypeError) when input-output types are incompatible with each other
> Raised when an operation or function is applied to an object of inappropriate type. The associated value is a string giving details about the type mismatch.

> This exception may be raised by user code to indicate that an attempted operation on an object is not supported, and is not meant to be. If an object is meant to support a given operation but has not yet provided an implementation, [NotImplementedError](https://docs.python.org/3/library/exceptions.html#NotImplementedError) is the proper exception to raise.
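
An illustrative plain-Python sketch of the distinction described above (not torch's code): the right type with a bad value gets `ValueError`, while the wrong type gets `TypeError`.

```python
def cat_like(tensors):
    if not isinstance(tensors, (list, tuple)):
        raise TypeError(f"expected a sequence of tensors, got {type(tensors).__name__}")
    if len(tensors) == 0:
        raise ValueError("expected a non-empty sequence of tensors")
    # ... concatenate ...
```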

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158249
Approved by: https://github.com/jbschlosser, https://github.com/Skylion007, https://github.com/albanD
2025-07-20 23:49:18 +00:00
4b02bd76d3 DCP safetensors test fix (#158685)
https://github.com/pytorch/pytorch/pull/158069 removed the consolidated output path argument without updating the test. Reported by a user here https://github.com/pytorch/pytorch/pull/156705#issuecomment-3090748034.
Adding back the logic from the original PR https://github.com/pytorch/pytorch/pull/158069 and fixing the test.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158685
Approved by: https://github.com/teja-rao
2025-07-20 22:52:54 +00:00
2e038793ef [inductor][templates] Finalize all registered hooks (#157270)
This refactor ensures all registered template hooks have been finalised before accessing the code object of the template. In `simd.SimdScheduling.codegen_template` the template hooks are finalised manually with `template.finalize_hook(hook_name)` calls, so it is the responsibility of the caller to finalise all the template hooks. This PR adds:
- `RenderPartial.finalize_remaining`, a function that can be called at the end to finalise the remaining active hooks after a selection of hooks have been finalised manually.
- A test with a custom template implementation that registers custom hooks that the scheduler needs to finalise. This test should fail if the scheduler does not finalise the registered custom hook.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157270
Approved by: https://github.com/eellison
2025-07-20 22:07:32 +00:00
5e149a6482 Add deprecation warning (#158203)
Summary: export_for_training exists because we couldn't migrate internal usages of export to the final IR. Now that we have completed the migration, we should deprecate and delete this API.

Test Plan:
CI

Rollback Plan:

Differential Revision: D78240836

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158203
Approved by: https://github.com/JacobSzwejbka
2025-07-20 17:02:01 +00:00
badf002014 [Reland] Add warning about removed sm50 and sm60 arches (#158700)
Related to https://github.com/pytorch/pytorch/issues/157517

Detect when users are running a torch build with CUDA 12.8/12.9 on Maxwell or Pascal architectures.
We would like to include a reference to the issue https://github.com/pytorch/pytorch/issues/157517 and ask people to install CUDA 12.6 builds if they are running on sm50 or sm60 architectures.

Test:
```
>>> torch.cuda.get_arch_list()
['sm_70', 'sm_75', 'sm_80', 'sm_86', 'sm_90', 'sm_100', 'sm_120', 'compute_120']
>>> torch.cuda.init()
/home/atalman/.conda/envs/py312/lib/python3.12/site-packages/torch/cuda/__init__.py:263: UserWarning:
    Found <GPU Name> which is of cuda capability 5.0.
    PyTorch no longer supports this GPU because it is too old.
    The minimum cuda capability supported by this library is 7.0.

  warnings.warn(
/home/atalman/.conda/envs/py312/lib/python3.12/site-packages/torch/cuda/__init__.py:268: UserWarning:
                        Support for Maxwell and Pascal architectures is removed for CUDA 12.8+ builds.
                        Please see https://github.com/pytorch/pytorch/issues/157517
                        Please install CUDA 12.6 builds if you require Maxwell or Pascal support.
```

Please note that I reverted the original PR https://github.com/pytorch/pytorch/pull/158301 because it broke internal users. This is a reland that adds a check for a non-empty torch.cuda.get_arch_list().
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158700
Approved by: https://github.com/huydhn, https://github.com/Skylion007, https://github.com/eqy
2025-07-20 14:57:46 +00:00
4869f71170 don't set CUDA_MODULE_LOADING (#158712)
If needed, it'll be set in `_C._cuda_init()`. setenv is not threadsafe, so this can cause segfaults due to getenv/setenv races.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158712
Approved by: https://github.com/eqy
2025-07-20 01:36:26 +00:00
b4abf41425 Raise BufferError for DLPack buffer-related errors. (#150691)
This PR addresses the Array API documentation for [`__dlpack__`][1] and
[`from_dlpack`][2] by making some buffer-related errors `BufferError`
instead of `RuntimeError`, e.g. incompatible dtype, strides, or device.

[1]: https://data-apis.org/array-api/latest/API_specification/generated/array_api.array.__dlpack__.html
[2]: https://data-apis.org/array-api/latest/API_specification/generated/array_api.from_dlpack.html#from-dlpack
Pull Request resolved: https://github.com/pytorch/pytorch/pull/150691
Approved by: https://github.com/Skylion007, https://github.com/albanD
ghstack dependencies: #150216, #150217, #150218
2025-07-20 00:46:21 +00:00
a10f15718d [DLPack] Add support for missing keyword-arguments. (#150218)
This PR introduces the rest of the keyword-arguments added in DLPack
version 2023.12: `dl_device` and `copy`.

In summary, we handle these arguments in the C++ implementation of
`to_dlpack(...)` at _torch/csrc/Module.cpp_, by calling the
`maybeCopyTensor` function at _aten/src/ATen/DLConvertor.cpp_. It also
introduces the following changes:

- Add a new Python API `torchDeviceToDLDevice()`, which is simply a
  refactoring of the `getDLDevice()` function at
  _aten/src/ATen/DLConvertor.cpp_.
- Add both keyword-arguments to the `from_dlpack()` function at
  _torch/utils/dlpack.py_ and to the `Tensor.__dlpack__()` dunder
  method.
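
A hedged usage sketch of the new keyword arguments described above; the tuple form of `dl_device` follows `__dlpack_device__()` and the DLPack spec, and the exact accepted values here are assumptions rather than confirmed behavior.

```python
import torch

x = torch.arange(4)

dev = x.__dlpack_device__()                   # e.g. (kDLCPU, 0) for a CPU tensor
cap = x.__dlpack__(dl_device=dev, copy=True)  # request an explicit copy on that device
y = torch.from_dlpack(cap)
print(y)
```
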
Pull Request resolved: https://github.com/pytorch/pytorch/pull/150218
Approved by: https://github.com/albanD
ghstack dependencies: #150216, #150217
2025-07-20 00:46:20 +00:00
1d526fe78f Fix DLPack stream logic. (#150217)
This PR fixes the logic for dealing with CUDA and ROCm streams whenever
we are trying to create a DLPack capsule from a tensor.

In summary, this PR:

- Uses the legacy default stream if `tensor.__dlpack__(stream=None)` is
  called for a CUDA tensor.
- Errors if `tensor.__dlpack__(stream=2)` is called for a CUDA tensor:
  PyTorch doesn't support the per-thread default stream.
- Errors if `tensor.__dlpack__(stream=stream)`, where `stream` is 1 or
  2, is called for a CUDA tensor using ROCm.

For more details, see [the documentation][1].

[1]: https://data-apis.org/array-api/latest/API_specification/generated/array_api.array.__dlpack__.html
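
A hedged sketch of the CUDA-side rules listed above (the exact exception type is not specified here, so a broad `except` is used):

```python
import torch

if torch.cuda.is_available():
    x = torch.randn(4, device="cuda")
    cap = x.__dlpack__(stream=None)   # now maps to the legacy default stream
    try:
        x.__dlpack__(stream=2)        # per-thread default stream: rejected
    except Exception as e:
        print(type(e).__name__, e)
```
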
Pull Request resolved: https://github.com/pytorch/pytorch/pull/150217
Approved by: https://github.com/msaroufim, https://github.com/albanD
ghstack dependencies: #150216
2025-07-20 00:46:20 +00:00
b64f338da4 [DLPack] add NumPy exchange tests. (#150216)
This PR resolves an old TODO that requested NumPy DLPack exchange tests
once version 1.22 was required.
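
A small round-trip example of the exchange being tested (requires NumPy >= 1.22 for `np.from_dlpack`):

```python
import numpy as np
import torch

a = np.arange(6, dtype=np.float32)
t = torch.from_dlpack(a)       # NumPy -> PyTorch
b = np.from_dlpack(t)          # PyTorch -> NumPy
print(np.shares_memory(a, b))  # expected True: the round trip is zero-copy
```
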
Pull Request resolved: https://github.com/pytorch/pytorch/pull/150216
Approved by: https://github.com/msaroufim, https://github.com/albanD
2025-07-20 00:46:20 +00:00
a1cfe7f1df [nativert] benchmark util (#158678)
Differential Revision: D78514241

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158678
Approved by: https://github.com/SherlockNoMad, https://github.com/georgiaphillips
2025-07-20 00:28:09 +00:00
d36afac83b Build domain libraries for all workflows with TorchBench config (#158601)
They are expensive GPU runners and should not spend time building packages

Signed-off-by: Huy Do <huydhn@gmail.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158601
Approved by: https://github.com/ZainRizvi
2025-07-19 21:51:39 +00:00
7cc1a9546c [AOTI] fix extract file failed on Windows. (#158702)
Changes:
1. Rename the zip index name, and keep it out of path normalization.
2. Normalize the output path for file extraction.

Files are extracted successfully:
<img width="683" height="247" alt="image" src="https://github.com/user-attachments/assets/72dff7b9-5ec0-4523-a6ee-7768b37bbe63" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158702
Approved by: https://github.com/angelayi
2025-07-19 08:58:42 +00:00
7cc5d03dfc Document the rest of the specific optimizer module APIs (#158669)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158669
Approved by: https://github.com/albanD
ghstack dependencies: #158483
2025-07-19 07:27:15 +00:00
f73594164a [BE] document Adadelta and Adagrad APIs properly (#158483)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158483
Approved by: https://github.com/albanD
2025-07-19 07:27:15 +00:00
a9f84021fb [CI] Fixes CI for CUDA Version > 12.9 (#157385)
Compute capabilities up to and including Volta are no longer supported in CUDA versions > 12.9
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157385
Approved by: https://github.com/eqy
2025-07-19 06:51:57 +00:00
22d82222c6 GenAI Layer Benchmark (#158536)
This PR adds a GenAI layer benchmark. It compares PyTorch eager, the PyTorch compiler, Liger, and Quack.

It covers all kernels supported by [quack](https://github.com/Dao-AILab/quack?tab=readme-ov-file#kernels-) (CrossEntropy Fwd/Bwd, Softmax Fwd/Bwd, RMSNorm Fwd/Bwd, LayerNorm Fwd) and LayerNormBwd.

## Motivations

- Many OSS users have asked how to properly benchmark torch.compile-generated kernels. One common error is to compile a kernel/layer for one shape (e.g., batch size = 1) and benchmark it with another shape (e.g., batch size = 1024), which leads to bad performance. This provides a simple and clear example of proper benchmarking.
- We recently added a GenAI model benchmark (based on [vLLM](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm)). But it's usually hard to optimize models directly due to their complexity. Layer benchmarks are easier to reason about and optimize.

## Key Settings

- Avoid reusing a kernel specialized for one shape when benchmarking another shape.
```python
torch._dynamo.config.automatic_dynamic_shapes = False
# Needed since changing args to function causes recompiles
torch._dynamo.config.recompile_limit = 1000000
```

- For forward, people may mark batch size as dynamic to avoid runtime recompilation. We respect the setting in this kernel-level benchmark.
```
torch._dynamo.mark_dynamic(x, 0)
```

GPU: H100 (devvm006.dkl0)

Results: [P1874246170](https://www.internalfb.com/phabricator/paste/view/P1874246170)

Note: for numerical accuracy, we use the default tolerance of torch.testing.assert_close (i.e., for `torch.bfloat16`, use rtol `1.6e-2` and atol `1e-5`). It shows numerical issues for some backends and kernels.

Next steps are to add roofline analysis, add this to CI for regression checking, cover more GenAI kernels, and include GenAI layers for common fusion patterns.

<img width="3564" height="2368" alt="CrossEntropyBackward_bench" src="https://github.com/user-attachments/assets/7aa77ad1-83eb-41ea-a27d-50fd5b1dd6be" />
<img width="3564" height="2368" alt="CrossEntropyForward_bench" src="https://github.com/user-attachments/assets/a26ec028-3791-4a41-a12a-05e10f60e9aa" />
<img width="3564" height="2368" alt="LayerNormBackward_bench" src="https://github.com/user-attachments/assets/cc6673ed-c148-4dd2-a729-5f02e717ab3e" />
<img width="3564" height="2368" alt="LayerNormForward_bench" src="https://github.com/user-attachments/assets/f71f9f9d-7b45-4ce7-89d0-e9bce727efae" />
<img width="3564" height="2368" alt="RMSNormBackward_bench" src="https://github.com/user-attachments/assets/e012821a-b7e6-4e83-a24c-c97fa8cd37b5" />
<img width="3564" height="2368" alt="RMSNormForward_bench" src="https://github.com/user-attachments/assets/2d52ee1e-9a8c-4bd1-a180-97b93f07171d" />
<img width="3564" height="2368" alt="SoftmaxBackward_bench" src="https://github.com/user-attachments/assets/02aad056-3ce1-4b40-8cfe-adae81fd017a" />
<img width="3564" height="2368" alt="SoftmaxForward_bench" src="https://github.com/user-attachments/assets/779f6b0d-a102-4164-8300-86fff0329ddf" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158536
Approved by: https://github.com/yf225, https://github.com/eellison
2025-07-19 05:41:01 +00:00
5cde34473c Fix MakeTensor::computeStorageSize() (#158690)
For a tensor with a non-zero storage offset, the offset must be multiplied by the element size.

Add a regression test that creates a Tensor over an array of 6 elements with offset 3, which before the fix crashed with
```
C++ exception with description "setStorage: sizes [3, 3], strides [0, 1], storage offset 3, and itemsize 4 requiring a storage size of 24 are out of bounds for storage of size 15
Exception raised from checkInBoundsForStorage at /Users/nshulga/git/pytorch/pytorch/aten/src/ATen/native/Resize.h:123 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>) + 56 (0x104a9cd44 in libc10.dylib)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&) + 120 (0x104a9a05c in libc10.dylib)
frame #2: void at::native::checkInBoundsForStorage<long long>(c10::ArrayRef<long long>, c10::ArrayRef<long long>, long long, caffe2::TypeMeta const&, c10::Storage const&) + 656 (0x111dbd314 in libtorch_cpu.dylib)
frame #3: void at::native::setStrided<long long>(at::Tensor const&, c10::ArrayRef<long long>, c10::ArrayRef<long long>, long long) + 152 (0x111dcd22c in libtorch_cpu.dylib)
frame #4: at::native::as_strided_tensorimpl(at::Tensor const&, c10::ArrayRef<long long>, c10::ArrayRef<long long>, std::__1::optional<long long>) + 312 (0x111dccf98 in libtorch_cpu.dylib)
frame #5: c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (at::Tensor const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, std::__1::optional<c10::SymInt>), &at::(anonymous namespace)::(anonymous namespace)::wrapper_CPU__as_strided(at::Tensor const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, std::__1::optional<c10::SymInt>)>, at::Tensor, c10::guts::typelist::typelist<at::Tensor const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, std::__1::optional<c10::SymInt>>>, at::Tensor (at::Tensor const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, std::__1::optional<c10::SymInt>)>::call(c10::OperatorKernel*, c10::DispatchKeySet, at::Tensor const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, std::__1::optional<c10::SymInt>) + 104 (0x1129a1e94 in libtorch_cpu.dylib)
frame #6: at::_ops::as_strided::call(at::Tensor const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, std::__1::optional<c10::SymInt>) + 476 (0x112200ad0 in libtorch_cpu.dylib)
frame #7: at::Tensor::as_strided(c10::ArrayRef<long long>, c10::ArrayRef<long long>, std::__1::optional<long long>) const + 236 (0x1115db098 in libtorch_cpu.dylib)
frame #8: at::native::expand(at::Tensor const&, c10::ArrayRef<long long>, bool) + 348 (0x111dcc0d4 in libtorch_cpu.dylib)
frame #9: c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (c10::DispatchKeySet, at::Tensor const&, c10::ArrayRef<c10::SymInt>, bool), &torch::ADInplaceOrView::(anonymous namespace)::expand(c10::DispatchKeySet, at::Tensor const&, c10::ArrayRef<c10::SymInt>, bool)>, at::Tensor, c10::guts::typelist::typelist<c10::DispatchKeySet, at::Tensor const&, c10::ArrayRef<c10::SymInt>, bool>>, at::Tensor (c10::DispatchKeySet, at::Tensor const&, c10::ArrayRef<c10::SymInt>, bool)>::call(c10::OperatorKernel*, c10::DispatchKeySet, at::Tensor const&, c10::ArrayRef<c10::SymInt>, bool) + 116 (0x1157ac410 in libtorch_cpu.dylib)
frame #10: c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (c10::DispatchKeySet, at::Tensor const&, c10::ArrayRef<c10::SymInt>, bool), &torch::autograd::VariableType::(anonymous namespace)::expand(c10::DispatchKeySet, at::Tensor const&, c10::ArrayRef<c10::SymInt>, bool)>, at::Tensor, c10::guts::typelist::typelist<c10::DispatchKeySet, at::Tensor const&, c10::ArrayRef<c10::SymInt>, bool>>, at::Tensor (c10::DispatchKeySet, at::Tensor const&, c10::ArrayRef<c10::SymInt>, bool)>::call(c10::OperatorKernel*, c10::DispatchKeySet, at::Tensor const&, c10::ArrayRef<c10::SymInt>, bool) + 992 (0x114e8b010 in libtorch_cpu.dylib)
frame #11: at::_ops::expand::call(at::Tensor const&, c10::ArrayRef<c10::SymInt>, bool) + 316 (0x112743c90 in libtorch_cpu.dylib)
frame #12: at::expand_size(at::Tensor const&, c10::ArrayRef<long long>) + 164 (0x1047d82b4 in basic)
frame #13: BasicTest_TestForBlobResizeCPU_Test::TestBody() + 284 (0x1047d8048 in basic)
```
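
A minimal sketch of the storage-size arithmetic behind that check (illustrative Python, not the actual ATen code; empty tensors are ignored):
```python
def required_storage_bytes(sizes, strides, storage_offset, itemsize):
    # highest element index touched, relative to the storage offset
    max_index = sum((size - 1) * stride for size, stride in zip(sizes, strides))
    # the offset is in elements, so it must be scaled by itemsize as well
    return (storage_offset + max_index + 1) * itemsize

# sizes [3, 3], strides [0, 1], offset 3, itemsize 4  ->  (3 + 2 + 1) * 4 = 24 bytes
print(required_storage_bytes([3, 3], [0, 1], 3, 4))  # 24
```
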
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158690
Approved by: https://github.com/angelayi
2025-07-19 05:21:33 +00:00
fac0be7b9c [async-TP] Turn asserts back into silent skips (#158572)
https://github.com/pytorch/pytorch/pull/149946 modified some checks that verify whether async-TP is "applicable" to a given collective operation in a graph. Before, the pattern-matching+replacement would just be skipped, but now these are asserts that fail and raise.

This is causing concrete issues in some graphs where 2-dimensional device meshes are being used (e.g., TP + CP) but only one dimension has symm-mem enabled. See #158569.

This PR is turning these asserts back into harmless early-exits. Note that this only needed to be done for reduce-scatters, as it was already the case for all-gathers.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158572
Approved by: https://github.com/danielvegamyhre, https://github.com/atalman
2025-07-19 04:54:38 +00:00
64dabb2cf5 only fail regressions>10% on pr_time benchmarks (#158577)
Maintaining the pr_time benchmark test is hard and it often breaks right now, so we are moving to a new framework:
1. Only fail PRs with regressions >10%.
2. Post-land, monitor with the pr_time benchmarks dashboard (oncall) and update expected results frequently or on big changes
(we are supposed to already be doing this: https://www.internalfb.com/unidash/dashboard/pt2_diff_time_metrics).
3. Set up detectors/warnings that trigger on regressions and notify internally post-land:
https://www.internalfb.com/monitoring/detector/1140915271179237

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158577
Approved by: https://github.com/xmfan, https://github.com/janeyx99
2025-07-19 04:35:31 +00:00
ab557421a4 [cca] [c10d] Refactor CUDAEventCache into separate files (#158616)
Summary:
Refactored CUDAEventCache from ProcessGroupNCCL.hpp/.cpp into dedicated header and implementation files for better code organization and maintainability.

Split out CUDAEventCache into:
- New header file: CUDAEventCache.hpp
- New implementation file: CUDAEventCache.cpp
- Updated build_variables.bzl to include the new file

This change improves code maintainability, readability, and follows better code organization practices.
---
> Generated by [Confucius Code Assist (CCA)](https://www.internalfb.com/wiki/Confucius/Analect/Shared_Analects/Confucius_Code_Assist_(CCA)/)
[Session](https://www.internalfb.com/confucius?session_id=61b9029a-636b-11f0-9d9a-f1bcc55be1ce&tab=Chat), [Trace](https://www.internalfb.com/confucius?session_id=61b9029a-636b-11f0-9d9a-f1bcc55be1ce&tab=Trace)

Test Plan:
Verified build with:
```
buck build //caffe2/test/distributed:c10d
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158616
Approved by: https://github.com/fduwjj
2025-07-19 02:51:28 +00:00
90b082e207 enable_caching_generated_triton_templates=True by default (#158592)
There is some risk, but it is good to catch issues if there are any, and this is a single flag flip that is easy to revert.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158592
Approved by: https://github.com/eellison
2025-07-19 02:19:34 +00:00
a741094159 Build domain libraries on the build job (#158600)
By setting the names of the domain libraries to build via the `BUILD_ADDITIONAL_PACKAGES` environment variable, the build job will build them and make them available as artifacts in the same way as the PyTorch CI wheel. To ensure that this doesn't break CI, the test job will still build them as usual if the wheels are not there. Building dependencies like FBGEMM on the test job is bad, especially for GPU jobs, because it leaves the GPU resource idle.

Fixes https://github.com/pytorch/pytorch/issues/152024

Signed-off-by: Huy Do <huydhn@gmail.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158600
Approved by: https://github.com/yangw-dev
ghstack dependencies: #158598, #158599
2025-07-19 02:03:50 +00:00
2955acaed6 Clean up some unused build env variables (#158599)
* The build-with-debug parameter isn't needed; it isn't even passed into Docker. Debug builds are detected via the build environment name
* AWS_DEFAULT_REGION is a leftover from ARC and isn't used anywhere in .ci/pytorch nor .github

Signed-off-by: Huy Do <huydhn@gmail.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158599
Approved by: https://github.com/cyyever, https://github.com/ZainRizvi
ghstack dependencies: #158598
2025-07-19 01:59:00 +00:00
2c16eb9f3d [dynamo] Support more basic output types for nonstrict_trace (#157969)
Fixes #157397 and improves the user-facing error message for remaining
unsupported cases.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157969
Approved by: https://github.com/zou3519
2025-07-19 00:59:54 +00:00
c2c88846a9 Revert "[Easy] Show some clear error when torch.ops.load_library fails. (#157524)"
This reverts commit 555f3562541992b66a550eca8e8740884b1247f8.

Reverted https://github.com/pytorch/pytorch/pull/157524 on behalf of https://github.com/wdvr due to reverting for now to reopen the discussion ([comment](https://github.com/pytorch/pytorch/pull/157524#issuecomment-3091317252))
2025-07-19 00:45:31 +00:00
5b40f6581e Revert "Add warning about removed sm50 and sm60 arches (#158301)"
This reverts commit fb731fe371cb1b5bf95de84b19c213590526acb2.

Reverted https://github.com/pytorch/pytorch/pull/158301 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/158301#issuecomment-3091307023))
2025-07-19 00:32:04 +00:00
d42c409767 [AOTI] windows package load dev (#158671)
Changes:
1. Add a handler for file-extraction failures during Windows development.
2. Normalize more file paths.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158671
Approved by: https://github.com/angelayi
2025-07-19 00:06:40 +00:00
a3aacd6cb2 [DTensor] fix copy_ strategy (#158538)
The previous strategy directly used 'self' input strategy for 'src'
input.  The fixed strategy correctly maps the self dim to src dim
so that it works even if the src input is broadcast.

E.g. for this program, broadcasting will occur on dims 0,1,3 of self.

```
self = torch.ones((2,3,4,5))
src = torch.ones((4,1))
self.copy_(src)
```

These are the correct sharding combinations:

|   self   |     src |
|-------|------|
| Shard(0)  |   Replicate() |
| Shard(1)  |   Replicate() |
| Shard(2)  |   Shard(0) |
| Shard(3)  |   Shard(1) |
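
A minimal sketch of the trailing-dim alignment implied by the table above (an illustration only, not the actual DTensor strategy code):
```python
# Map each dim of self to the src dim it aligns with under broadcasting
# (trailing dims are aligned; leading self dims have no src counterpart).
def map_self_dim_to_src_dim(self_ndim, src_ndim):
    offset = self_ndim - src_ndim
    return {d: d - offset for d in range(self_ndim) if d - offset >= 0}

# self is 4-D (2,3,4,5), src is 2-D (4,1):
# self dims 0,1 have no src dim (src stays Replicate there);
# self dim 2 -> src dim 0 and self dim 3 -> src dim 1, matching the table above.
print(map_self_dim_to_src_dim(4, 2))  # {2: 0, 3: 1}
```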

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158538
Approved by: https://github.com/zpcore, https://github.com/XilunWu, https://github.com/wanchaol
ghstack dependencies: #158490
2025-07-18 23:44:43 +00:00
36bddcd18c [DTensor] Fix default_strategy and rename for clarity (#158490)
Fixes several bugs in the original.
- foremost, fixes a serious bug where we returned incorrect strategies
  by mixing input_specs that were frozen from
  select_strategy.strategies[0] with output_specs that varied across
  select_strategy.strategies[0..N] (e.g. we could create a nonsense
  strategy like input:Shard(0) output:Replicate() for an op like clone)
- fixes the redistribute costs: they should not actually be 0, they
  should be the cost of redistributing our single input from another
  strategy to the current strategy, in our list of output strategies
- adds a note, wondering if we should have just literally returned the
  input strategy instead of creating this new object
- Currently, using default_strategy is incorrect because it maps 'self'
  tensor's strategies directly onto 'src' tensor without accounting for
  the fact that copy_ supports broadcasting a smaller rank tensor into a
  larger one.

Separates out copy_  op from default strategy, adds missing test case,
but does not fix the underlying issue with copy_, leaves that for future
PR

Renames to `propagate_single_input_strategy` since that's more
descriptive

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158490
Approved by: https://github.com/wanchaol, https://github.com/XilunWu
2025-07-18 23:44:42 +00:00
15ef4f28df Fused RMSNorm implementation (#153666)
Relevant #72643

Benchmarked versus the unfused torch implementation and a torch.compile implementation: around a 9x speedup vs the unfused implementation on CUDA, and slightly faster than the inductor-compiled version on a 5090.
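
For reference (not part of the original PR text), the standard RMSNorm being fused computes the following, where $g$ is the learnable per-feature scale and $d$ is the size of the normalized dimension; note that the eager reference below adds `eps` to the RMS outside the square root, a minor variant of the usual definition:
```math
\mathrm{RMSNorm}(x)_i = \frac{x_i}{\sqrt{\tfrac{1}{d}\sum_{j=1}^{d} x_j^2 + \varepsilon}}\, g_i
```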

```py
import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        norm_x = x.norm(2, dim=-1, keepdim=True)
        rms_x = norm_x * torch.rsqrt(torch.tensor(x.shape[-1], dtype=x.dtype))
        x_normed = x / (rms_x + self.eps)
        return self.scale * x_normed

def benchmark_rmsnorm_cuda(input_shape, normalized_dim, num_iterations=100, warmup_iterations=10, dtype=torch.float16):
    rms_norm_layer = torch.nn.RMSNorm(normalized_dim, device='cuda', dtype=dtype)
    input_data = torch.randn(input_shape, device='cuda', dtype=dtype)

    for _ in range(warmup_iterations):
        _ = rms_norm_layer(input_data)
    torch.cuda.synchronize()

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    for _ in range(num_iterations):
        _ = rms_norm_layer(input_data)

    end_event.record()
    torch.cuda.synchronize()
    elapsed_time_ms = start_event.elapsed_time(end_event)
    avg_time_ms = elapsed_time_ms / num_iterations

    print(f"--- RMSNorm CUDA Benchmark ---")
    print(f"Input Shape: {input_shape}")
    print(f"Normalized Dimension: {normalized_dim}")
    print(f"Benchmark Iterations: {num_iterations}")
    print(f"--- Fused Implementation ---")
    print(f"Average Time per Iteration: {avg_time_ms:.4f} ms")
    print(f"Total Time for {num_iterations} Iterations: {elapsed_time_ms:.3f} ms")

    compiled_rms_norm = torch.compile(RMSNorm(dim=normalized_dim)).cuda()
    for _ in range(warmup_iterations):
        _ = compiled_rms_norm(input_data)
    torch.cuda.synchronize()

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    for _ in range(num_iterations):
        _ = compiled_rms_norm(input_data)
    end_event.record()
    torch.cuda.synchronize()
    elapsed_time_ms = start_event.elapsed_time(end_event)
    avg_time_ms = elapsed_time_ms / num_iterations

    print(f"--- TorchCompile Implementation ---")
    print(f"Average Time per Iteration: {avg_time_ms:.4f} ms")
    print(f"Total Time for {num_iterations} Iterations: {elapsed_time_ms:.3f} ms")

    print("-" * 50)

if __name__ == '__main__':
    parameter_sets = [
        {'batch_size': 16, 'sequence_length': 256, 'hidden_features': 512, 'dtype': torch.float16},
        {'batch_size': 32, 'sequence_length': 512, 'hidden_features': 768, 'dtype': torch.float16},
        {'batch_size': 64, 'sequence_length': 1024, 'hidden_features': 1024, 'dtype': torch.float16},
        {'batch_size': 32, 'sequence_length': 512, 'hidden_features': 768, 'dtype': torch.float32},
        {'batch_size': 8, 'sequence_length': 2048, 'hidden_features': 2048, 'dtype': torch.float16},
    ]

    num_benchmark_iterations = 200
    num_warmup_iterations = 20

    for params in parameter_sets:
        batch_size = params['batch_size']
        sequence_length = params['sequence_length']
        hidden_features = params['hidden_features']
        data_type = params.get('dtype', torch.float16)

        shape = (batch_size, sequence_length, hidden_features)
        norm_dim_to_normalize = hidden_features

        print(f"Benchmarking with: BS={batch_size}, SeqLen={sequence_length}, Hidden={hidden_features}, DType={data_type}")
        benchmark_rmsnorm_cuda(input_shape=shape,
                               normalized_dim=norm_dim_to_normalize,
                               num_iterations=num_benchmark_iterations,
                               warmup_iterations=num_warmup_iterations,
                               dtype=data_type)
```

Here are the triton compile tests ran on a 5090 (comparing this branch vs main)
```py
import torch
import torch.nn as nn
from torch._inductor.utils import run_and_get_code, run_fw_bw_and_get_code

torch.manual_seed(0)

device = torch.device("cuda")

for batch in range(0, 9):
    for i in range(9, 16):
        normalized_shape_arg = (2**batch, 2**i)
        input_tensor = torch.randn(2**batch, 2**i, device=device, requires_grad=True)
        weight_tensor = torch.randn(2**batch, 2**i,device=device, requires_grad=True)

        model = torch.nn.functional.rms_norm
        compiled_model = torch.compile(model)
        loss = torch.randn_like(input_tensor)

        num_iter = 5
        for j in range(num_iter):
            output = compiled_model(input_tensor, normalized_shape_arg, weight_tensor)
            output.backward(loss)

        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        start_event.record()
        num_iter = 10
        for j in range(num_iter):
            output = compiled_model(input_tensor, normalized_shape_arg, weight_tensor)
            output.backward(loss)

        end_event.record()
        torch.cuda.synchronize()

        elapsed_time_ms = start_event.elapsed_time(end_event)
        avg_time_ms = round(elapsed_time_ms / num_iter, 5)
        print(2**batch, 2**i, avg_time_ms)
```
main
```
32 512 0.1812
32 1024 0.19021
32 2048 0.18871
32 4096 0.17019
32 8192 0.21944
32 16384 0.38871
32 32768 0.83282
64 512 0.14705
64 1024 0.13987
64 2048 0.14111
64 4096 0.21699
64 8192 0.43141
64 16384 0.90652
64 32768 2.18573
128 512 0.19361
128 1024 0.1963
128 2048 0.20122
128 4096 0.38888
128 8192 0.93795
128 16384 2.23437
128 32768 5.50079
256 512 0.16722
256 1024 0.22856
256 2048 0.39421
256 4096 0.96621
256 8192 2.48746
256 16384 5.53571
256 32768 11.97932
```
current branch
```
32 512 0.16328
32 1024 0.18104
32 2048 0.15508
32 4096 0.14356
32 8192 0.20111
32 16384 0.45974
32 32768 0.94799
64 512 0.16874
64 1024 0.18701
64 2048 0.16107
64 4096 0.20152
64 8192 0.46568
64 16384 0.96599
64 32768 2.21661
128 512 0.14982
128 1024 0.15565
128 2048 0.22241
128 4096 0.46128
128 8192 0.88883
128 16384 2.3097
128 32768 5.84448
256 512 0.14346
256 1024 0.2007
256 2048 0.45927
256 4096 0.87876
256 8192 2.10571
256 16384 5.73948
256 32768 12.98581
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/153666
Approved by: https://github.com/ngimel, https://github.com/eqy, https://github.com/albanD
2025-07-18 23:24:21 +00:00
60b9b06a53 [caffe2] Fix Missing override in get_buffer of NCCLSymmetricMemory (#158597)
Summary:
Fix the error that occurs in the devarm environment when compiling with Clang:
```
caffe2/torch/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu:97:20: error: 'get_buffer' overrides a member function but is not marked 'override' [-Werror,-Winconsistent-missing-override]
97 | virtual at::Tensor get_buffer(int
| ^
caffe2/torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp:56:20: note: overridden virtual function is here
56 | virtual at::Tensor get_buffer(int rank, c10::IntArrayRef sizes, c10::ScalarType dtype, int64_t storage_offset) = 0;
| ^
1 error generated.
```

Test Plan:
See D78520305

Rollback Plan:

Differential Revision: D78517953

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158597
Approved by: https://github.com/janeyx99
2025-07-18 23:12:29 +00:00
a835dbc096 [c10d][ez] Fix error message to reflect the correct API name (#158668)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158668
Approved by: https://github.com/VieEeEw
2025-07-18 23:10:47 +00:00
f76f4abf3f Track monitor (#156907)
Track GPU memory allocation. Previously we were tracking GPU memory bandwidth; memory allocation is the metric that reflects whether the GPU is OOM or not. A UI fix is upcoming.

UI fix: https://github.com/pytorch/test-infra/pull/6878/files

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156907
Approved by: https://github.com/huydhn
2025-07-18 22:54:13 +00:00
be483a5481 setup pinned commit for vllm in pytorch ci (#158591)
Set up pinned commit for vllm in nightly
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158591
Approved by: https://github.com/seemethere, https://github.com/huydhn
2025-07-18 22:30:20 +00:00
bc7b1f5252 [AOTI] Use libstdc++ only for fbcode cpu case (#158659)
Differential Revision: D78567218

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158659
Approved by: https://github.com/kflu, https://github.com/zoranzhao
2025-07-18 22:27:10 +00:00
07c4c2a792 [dynamo][be] hide warnings without invalidating warnings cache (#158520)
I feel uneasy about touching `__warningregistry__` since it is an undocumented, private surface. The only public API hook that doesn't increment the warnings version seems to be https://docs.python.org/3/library/warnings.html#warnings.showwarning.

So we can whack-a-mole all the warning muters in compile to just not display warnings, without invalidating the warnings cache. This PR adds that for torch/_dynamo; I didn't find any warnings-version mutation in torch/_inductor.
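
For illustration, a minimal sketch of muting warnings via the `showwarning` hook mentioned above (a simplification, not the actual torch/_dynamo code):
```python
import warnings

# Swap out only the display hook; filters and __warningregistry__ (and hence the
# warnings "version") are left untouched.
_original_showwarning = warnings.showwarning

def _muted_showwarning(message, category, filename, lineno, file=None, line=None):
    pass  # drop the warning instead of printing it

warnings.showwarning = _muted_showwarning
try:
    warnings.warn("silently dropped while the hook is installed")
finally:
    warnings.showwarning = _original_showwarning
```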

There is a behavior change if someone calls a compiled graph with simplefilter("error"):
```python
# e.g. test/dynamo_expected_failures/TestAutogradFallback.test_no_autograd_kernel_inplace_mode_nothing
with warnings.catch_warnings():
    warnings.simplefilter("error")  # turns all warnings into errors
    compiled_fn()  # will throw if any of the muted warnings fire
```

FIXES https://github.com/pytorch/pytorch/issues/128427

A note for the future: the warnings module doesn't offer a thread-safe way of using it. Even regular filters have this problem; directly editing `__warningregistry__` would be very bad, and this PR mutes warnings for all threads. Someone will need to build a thread-safe warnings interface.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158520
Approved by: https://github.com/anijain2305, https://github.com/zou3519
2025-07-18 22:02:31 +00:00
89850bbc07 [Dynamo] Use proper sources for constructing dataclass defaults (#157993)
Partially fixes https://github.com/pytorch/pytorch/issues/154009

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157993
Approved by: https://github.com/williamwen42, https://github.com/anijain2305
2025-07-18 21:51:40 +00:00
3bb729df97 Revert "Fix test consolidate hf safetensors (#157386)"
This reverts commit fa1c20ae9285f7994a73d2d06025065f96b67a57.

Reverted https://github.com/pytorch/pytorch/pull/157386 on behalf of https://github.com/jithunnair-amd due to Need to revert this so we can revert PR 156705, which introduced errors on ROCm CI. These errors were not seen on CUDA CI because CUDA CI docker images do not have safetensors installed and the test silently passes ([comment](https://github.com/pytorch/pytorch/pull/157386#issuecomment-3090706074))
2025-07-18 21:00:12 +00:00
e3351b3ddf Revert "[DCP][HF] [ez]Change where sharded tensors are saved (#158069)"
This reverts commit 627ba411366bcc15019c49756d3f22fd3914bd50.

Reverted https://github.com/pytorch/pytorch/pull/158069 on behalf of https://github.com/jithunnair-amd due to Didn't remove reference to `consolidated_output_path` in test_hf_safetensor_e2e.py; CUDA runs do not surface issue because safetensors is not installed and the test silently passes ([comment](https://github.com/pytorch/pytorch/pull/158069#issuecomment-3090692336))
2025-07-18 20:54:19 +00:00
1ab1ab38a0 Use linux.12xlarge.memory to build for H100/sm_90 (#158598)
Use a bigger runner here because CUDA_ARCH 9.0 is only built for H100 or newer GPUs, so it doesn't benefit much from existing compiler cache from trunk. Also use a memory-intensive runner here because memory is usually the bottleneck

Signed-off-by: Huy Do <huydhn@gmail.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158598
Approved by: https://github.com/ZainRizvi, https://github.com/malfet
2025-07-18 20:31:56 +00:00
8b2a650572 pt2_remote_cache: Log sample for failures, and log the explicit reason we're faling. (#156874)
Summary: This allows us to start alerting on cache failures, based on scuba data

Test Plan:
Added new tests explicitly for the Remote Cache API.

Note that we have existing tests for memcache, but not for manifold AFAICT.

There are two potential wrinkles. One: we're adding a new field (and everything uses ScubaData AFAICT, so this should just work).

The other one is the implicit api contract that if the sample is None, then it will be ignored (and not crash). I believe the second one is implemented correctly (and tested). The first one is a little more nebulous, but I think won't cause any breakages.

Also manually ran a compile and made sure it didn't break - P1851504490 as well as forcing it to break and checking we didn't screw up the exception handling - P1851504243

Rollback Plan:

Differential Revision: D77054339

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156874
Approved by: https://github.com/oulgen, https://github.com/masnesral
2025-07-18 20:28:27 +00:00
ec0b538961 [inductor] Make times and repeat parameters command line args (#158590)
Summary: Small change to make the `times` and `repeat` variables controllable as command line args.

Test Plan:
Execute:
```
buck2 run <run params> <path>:inductor_benchmark -- --times=1 --repeat=1
```
Only runs once, and without passing the args it runs with default values of 10.

Rollback Plan:

Reviewed By: malfet

Differential Revision: D78458680

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158590
Approved by: https://github.com/FindHao, https://github.com/malfet
2025-07-18 20:07:55 +00:00
599f94e7b9 [AOTI] add Windows file ext to package loader. (#158578)
Add `object` and `extension` file type for Windows

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158578
Approved by: https://github.com/angelayi
2025-07-18 19:57:12 +00:00
04ac258cf6 [BE][testing] Fix test_cudacodecache.py (#158259)
Summary: According to internal test failures, looks like we're missing a check for cuda: https://fburl.com/testinfra/eznzkyha

Test Plan: `buck test`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158259
Approved by: https://github.com/exclamaforte, https://github.com/BoyuanFeng
2025-07-18 19:56:13 +00:00
1b5fdb23b9 [BE] Add pre-push hook for lintrunner to the PyTorch repo (#158389)
Adds a pre-commit hook (technically a pre-push hook) to the PyTorch repo.
**This is currently an opt-in feature**, which one can opt into by running `python scripts/setup_hooks.py` locally.

### Features
- **Run Lintrunner Before Push**: Before every `git push`, automatically runs lintrunner on your changes.
  - Really need to skip the checks? Run `git push --no-verify`
- **Consistent, Isolated Lintrunner Environment**: During pre-push, Lintrunner runs in its own virtual environment that contains all lintrunner dependencies in a consistent, isolated environment.  No more lintrunner failures because you created a new .venv. (Did you know you needed to run `lintrunner init` every time you make a new .venv?)
- **Dependencies Automatically Updated**: If .lintrunner.toml is updated, this will automatically re-run `lintrunner init` to ensure you install the latest dependencies specified

### Installation
- Run `python scripts/setup_hooks.py`. Now every `git push` will first run lintrunner.

### Additional details
- The lintrunner used by the pre-push hook runs in a special per-repo virtual environment managed by the commit-hook tool located under `$USER/.cache/pre-commit`
- Does not affect your regularly used lintrunner
  - Manual invocations of lintrunner will continue to depend on your local environment instead of the special pre-push one. If there's enough interest, we could explore consolidating them.
- Does not run `lintrunner -a` for you.
  - You still need to manually run that (can be changed later though!)
- Have staged/unstaged changes? No worries
  - This runs `git stash` before running the pre-commit hooks and pops back your changes afterwards, so only the changes actually being pushed will be tested

### Downsides
- No streaming UI updates
  - While you still get the same output from lintrunner that you're used to, the commit-hook framework doesn't show any output while lintrunner is actually running. Instead, it shows the entire output after linter has completed execution, which could be a few minutes (especially if it has to run `lintrunner init` first)
- `uv` installation is required to run the setup script. The setup script will ask users to install uv if it's not available.
  - This is required to be able to install the pre-commit package in a safe way that's available no matter what .venv you are running in.

### Opting out
- Disable hook for a single push: Run `git push --no-verify`
- Disable hook permanently: If something goes wrong and you need to wipe your setup:
  - Delete the `$USER/.cache/pre-commit` folder and the `.git/hooks/pre-push` file in your local repo.
  - You can now rerun `python scripts/setup_hooks.py` to setup your git push hook again if you want.

### Potential Future Changes
Things that could be done to make this even better if folks like these ideas:
- Automatic setup
  - Our `CONTRIBUTING.md` file tells devs to run `make setup-env`.  That could be a good entry point to hook the installation into
- Fix the console output streaming
- Make every lintrunner invocation (including manual ones) use the same repo-specific venv that the commit-hook uses.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158389
Approved by: https://github.com/seemethere
2025-07-18 19:55:35 +00:00
75e2628782 Add lower bounds for fsspec and networkx dependencies (#158565)
Fixes #156587

This sets lower bounds for fsspec and networkx in both setup.py and requirements.txt.

- fsspec>=0.8.5 (released December 15, 2020)
- networkx>=2.5.1 (released April 3, 2021)

These are the first stable versions released after Python 3.9 came out on October 5, 2020. Since Python 3.8 is no longer maintained, setting these minimums helps ensure PyTorch won't be installed alongside unexpectedly old versions of these packages.

Tested with these versions locally to make sure they don't break anything. Adding CI for lower-bound testing could be a follow up later if need.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158565
Approved by: https://github.com/janeyx99
2025-07-18 19:42:09 +00:00
79e49efadd Pull latest Sphinx theme (#158595)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158595
Approved by: https://github.com/albanD
2025-07-18 18:46:47 +00:00
b87e50db5e [BE][testing] Fix internal test failures in test/dynamo/test_unspec (#158485)
Summary: These tests fail internally because the number of underlying calls to the RNG differs by virtue of various library initializations that get pulled in with an internal build.

Test Plan:
```
buck test '@fbcode//mode/opt' fbcode//caffe2/test/dynamo:test_dynamo -- --exact 'caffe2/test/dynamo:test_dynamo - test_unspec.py::UnspecTests::test_random_object' --run-disabled
buck test '@fbcode//mode/opt' fbcode//caffe2/test/dynamo:test_dynamo -- --exact 'caffe2/test/dynamo:test_dynamo - test_unspec.py::UnspecTests::test_random_values_with_graph_break' --run-disabled
buck test '@fbcode//mode/opt' fbcode//caffe2/test/dynamo:test_dynamo -- --exact 'caffe2/test/dynamo:test_dynamo - test_unspec.py::UnspecTests::test_feed_random_values_into_graph_only' --run-disabled
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158485
Approved by: https://github.com/williamwen42
2025-07-18 18:41:03 +00:00
656885b614 [Dynamo][Better Engineering] Type devices, resume_execution and testing utils (#158593)
As part of better engineering week, we would like to improve our type support to improve the dev experience in dynamo

This PR adds strict typing support to a set of utilities in dynamo, `device_interface.py`, `resume_execution.py`, `tensor_version_ops.py`, `test_case.py`, and `test_minifier_common.py`

Running
```
mypy torch/_dynamo/device_interface.py torch/_dynamo/resume_execution.py torch/_dynamo/tensor_version_op.py torch/_dynamo/test_case.py torch/_dynamo/test_minifier_common.py  --linecount-report /tmp/coverage_log
```

| -------- | Lines Unannotated | Lines Total | % lines covered | Funcs Unannotated | Funcs Total | % funcs covered |
| -------- | ------- | -------- | ------- | ------- | ------- | ------- |
| Main  |  976 | 1672 | 58.37% | 76 | 112 | 67.86% |
| This PR | 1719 | 1719 | 100.00% | 112 | 112 | 100.00% |
| Delta    | +743 | +47 | +41.63% | +36 | 0 | +32.14% |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158593
Approved by: https://github.com/mlazos
2025-07-18 18:22:06 +00:00
6e07d6a0ff [Dynamo][Better Engineering] Add typing support for _dynamo/repro and debug_utils (#158504)
As part of better engineering week, we would like to improve our type support to improve the dev experience in dynamo

This PR adds strict typing support to an important set of utilities in dynamo, `repro/` and the base `debug_utils.py`

Running
```
mypy torch/_dynamo/repro/ torch/_dynamo/debug_utils.py --linecount-report /tmp/coverage_log
```

| -------- | Lines Unannotated | Lines Total | % lines covered | Funcs Unannotated | Funcs Total | % funcs covered |
| -------- | ------- | -------- | ------- | ------- | ------- | ------- |
| Main  |  905 | 3268 | 27.69% | 22 | 81 | 27.16% |
| This PR | 3368 | 3368 | 100.00% | 81 | 81 | 100.00% |
| Delta    | +2463 | +100 | +72.31% | +59 | 0 | +72.84% |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158504
Approved by: https://github.com/mlazos
2025-07-18 18:15:55 +00:00
b4358c5e87 [inductor] Explicitly link c10 in inductor. (#158622)
MSVC reports "unresolved external symbol" errors when compiling inductor. Explicitly link c10 in inductor.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158622
Approved by: https://github.com/desertfire

Co-authored-by: Xu Han <xu.han@outlook.com>
2025-07-18 18:00:50 +00:00
86675af3f0 Revert "[ROCm][CI] update fbgemm_gpu hash used by inductor tests (#158602)"
This reverts commit 9308261a2afb69d807ea06508bb8582b066d9ccd.

Reverted https://github.com/pytorch/pytorch/pull/158602 on behalf of https://github.com/ZainRizvi due to The lint job failure was hiding a real lint failure. See here for more details: [GH job link](https://github.com/pytorch/pytorch/actions/runs/16375911199/job/46275682191) [HUD commit link](6f73e06796) ([comment](https://github.com/pytorch/pytorch/pull/158602#issuecomment-3090209891))
2025-07-18 17:46:11 +00:00
725cdb218e Name threads in caffe2/torch/distributed/checkpoint AsyncCheckpointExecutor (#158612)
Differential Revision: D78493333

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158612
Approved by: https://github.com/d4l3k
2025-07-18 17:33:12 +00:00
8c3f84908b [aot] fix greater_than_max build fail on Windows. (#158479)
Error snapshot:
<img width="937" height="110" alt="image" src="https://github.com/user-attachments/assets/10195f84-83c4-42db-af3c-76f875a6a983" />

Reason:
`std::numeric_limits<T>::max` conflicts with the `max(a, b)` macro from windef.h. (A common workaround, presumably what the fix below applies, is to parenthesize the call as `(std::numeric_limits<T>::max)()` or to define `NOMINMAX`.)

Fix code:
<img width="488" height="269" alt="image" src="https://github.com/user-attachments/assets/3328c37b-7c89-435e-944c-4ca7c9b6c5b6" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158479
Approved by: https://github.com/desertfire
2025-07-18 17:18:10 +00:00
6f73e06796 [iter] exhaust ListIterator when unpack_var_sequence is called (#156370)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156370
Approved by: https://github.com/zou3519
ghstack dependencies: #156369
2025-07-18 16:48:27 +00:00
acffd1a297 [iter] Update some of the tests to not call pickle (#156369)
Some tests in test_iter only fail because of pickle. I'm skipping the pickle section as Dynamo doesn't support it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156369
Approved by: https://github.com/zou3519
2025-07-18 16:48:27 +00:00
bf4aa78279 Revert "[DTensor] Fix default_strategy and rename for clarity (#158490)"
This reverts commit d8b084312b54e97bdbaf6a178fe2fc628a23243b.

Reverted https://github.com/pytorch/pytorch/pull/158490 on behalf of https://github.com/clee2000 due to broke lint? [GH job link](https://github.com/pytorch/pytorch/actions/runs/16361950974/job/46231492581) [HUD commit link](d8b084312b) ([comment](https://github.com/pytorch/pytorch/pull/158490#issuecomment-3090042448))
2025-07-18 16:45:32 +00:00
50f33a6fca Revert "[DTensor] fix copy_ strategy (#158538)"
This reverts commit 7b05bdd925f0f4b49e68662f9761fabaa27f2faf.

Reverted https://github.com/pytorch/pytorch/pull/158538 on behalf of https://github.com/clee2000 due to broke lint? [GH job link](https://github.com/pytorch/pytorch/actions/runs/16361950974/job/46231492581) [HUD commit link](d8b084312b) ([comment](https://github.com/pytorch/pytorch/pull/158490#issuecomment-3090042448))
2025-07-18 16:45:32 +00:00
35df895d05 [AOTI] package loader normalize path separator (#158630)
Add `normalize_path_separator` to simplify Windows path handling.

This solution is working well on `torch/_inductor/cpp_builder.py`: a00cd8cf25/torch/_inductor/cpp_builder.py (L406-L409)

Let's copy it to package loader.
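
A minimal sketch of that helper (modeled on the linked cpp_builder.py version; simplified, not a verbatim copy):
```python
import sys

_IS_WINDOWS = sys.platform == "win32"

def normalize_path_separator(orig_path: str) -> str:
    # Windows accepts forward slashes, so normalizing avoids escaping/quoting issues
    if _IS_WINDOWS:
        return orig_path.replace("\\", "/")
    return orig_path
```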

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158630
Approved by: https://github.com/angelayi
2025-07-18 15:55:24 +00:00
193b29ee0c [BE][EZ] Minor doc fixes (#158574)
[BE] Minor doc fixes
2025-07-18 10:34:55 -05:00
036eb1f65d [precompile] Filter out ID_MATCH family of guards with caching_precompile. (#158368)
Summary: For cases like caching_precompile, we almost always want to drop ID_MATCH-type guards since they will block serialization. This diff adds this behavior when the global flag is toggled on, so that ID_MATCH guards are excluded from compilation and serialization.

Test Plan:
test_dynamo -- -k test_id_match_with_config

Rollback Plan:

Differential Revision: D78363609

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158368
Approved by: https://github.com/jamesjwu
2025-07-18 14:47:11 +00:00
e882c761dd Add STD_TORCH_CHECK to headeronly (#158377)
Differential Revision: [D78366519](https://our.internmc.facebook.com/intern/diff/D78366519/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158377
Approved by: https://github.com/albanD
2025-07-18 14:35:20 +00:00
0eae6b68f4 Unify torch.tensor and torch.ops.aten.scalar_tensor behavior (#158537)
Fixes #158376

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158537
Approved by: https://github.com/atalman
2025-07-18 14:05:52 +00:00
a4ec381302 [build] pin setuptools>=77 to enable PEP 639 (#158104)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158104
Approved by: https://github.com/rgommers, https://github.com/Skylion007, https://github.com/atalman
2025-07-18 11:49:54 +00:00
27af877f84 [ATen][CUDA][SDPA] Flash Attention: Refactor sm version checks (#158558)
The architecture version checks are unnecessarily fine-grained in PyTorch. Considering that PyTorch's Flash Attention works on all `sm_80+` machines, it makes more sense to just check against the lower bound.
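
A minimal sketch of the lower-bound check (illustrative Python only, not the actual ATen C++ guard):
```python
def flash_attention_arch_supported(major: int, minor: int) -> bool:
    # Flash Attention works on all sm_80+ devices, so a lower-bound check is enough,
    # rather than enumerating sm_80, sm_86, sm_89, sm_90, ...
    return (major, minor) >= (8, 0)
```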

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158558
Approved by: https://github.com/eqy
2025-07-18 09:59:41 +00:00
7b05bdd925 [DTensor] fix copy_ strategy (#158538)
The previous strategy directly used 'self' input strategy for 'src'
input.  The fixed strategy correctly maps the self dim to src dim
so that it works even if the src input is broadcast.

E.g. for this program, broadcasting will occur on dims 0,1,3 of self.

```
self = torch.ones((2,3,4,5))
src = torch.ones((4,1))
self.copy_(src)
```

These are the correct sharding combinations:

|   self   |     src |
|-------|------|
| Shard(0)  |   Replicate() |
| Shard(1)  |   Replicate() |
| Shard(2)  |   Shard(0) |
| Shard(3)  |   Shard(1) |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158538
Approved by: https://github.com/zpcore, https://github.com/XilunWu, https://github.com/wanchaol
ghstack dependencies: #158495, #158490
2025-07-18 09:59:37 +00:00
ead80f3202 Fix s390x CI: ensure that all python dependencies are installed when building pytorch for tests on s390x (#158552)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158552
Approved by: https://github.com/huydhn
2025-07-18 09:13:41 +00:00
32aade9d8d Revert "Support DeepSeek-style blockwise scaling scaled-mm for fp8 on Hopper+ (#158037)"
This reverts commit 39ac189808c61588f3594dbc2fc1d69bb6194c47.

Reverted https://github.com/pytorch/pytorch/pull/158037 on behalf of https://github.com/jithunnair-amd due to Ignored ROCm failures while ROCm was unstable, but HUD clearly shows this PR introduced failures on trunk ([comment](https://github.com/pytorch/pytorch/pull/158037#issuecomment-3087982975))
2025-07-18 07:47:46 +00:00
be896d6b41 Revert "Forward-fix unused variables warning/error (#158549)"
This reverts commit eeda1a75ace75ce8a6763050fb91d236a6d3287b.

Reverted https://github.com/pytorch/pytorch/pull/158549 on behalf of https://github.com/jithunnair-amd due to Sorry, need to revert this first, so we can revert PR 158037, which broke ROCm CI ([comment](https://github.com/pytorch/pytorch/pull/158549#issuecomment-3087942475))
2025-07-18 07:44:14 +00:00
a3396a9b85 [hop] set capture_scalar_outputs=True by default for compiled hops (#158480)
We want to do it for two reasons:
1. It's tedious for users to manually turn on capture_scalar_outputs=True when compiling map and scan with inductor, where we decompose them into while_loop and use the idx tensor.item() to select a slice of the output buffer and write into it. This PR turns the flag on by default (see the sketch below).
2. A graph break caused by capture_scalar_outputs=False would cause the hop to fail, so we should turn it on by default so that the error message is more meaningful.
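
For context, a minimal sketch of what previously had to be set by hand (this is the existing dynamo config flag):
```python
import torch._dynamo

# Previously users compiling map/scan through inductor had to set this manually;
# the PR described above flips it on by default for compiled HOPs.
torch._dynamo.config.capture_scalar_outputs = True
```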

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158480
Approved by: https://github.com/zou3519
2025-07-18 07:16:50 +00:00
fda3f3b2ec [while_loop] fix constant tensor used as carried inputs (#158381)
Address the second part of #158366, where torch.tensor(0) is treated as a constant tensor and its .item() gets specialized to 0, which causes a silent specialization. The fix is to unspecialize the constant carries and make them non-constant.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158381
Approved by: https://github.com/zou3519
2025-07-18 07:08:11 +00:00
a00cd8cf25 Add a way to disable compile for debugging flex-attention (#158534)
Finally got around to doing this; this flag lets us do:

```Python

#!/usr/bin/env python3
"""
FlexAttention Debug: Using breakpoints and unwrap
"""

import torch
import torch.nn.attention.flex_attention as fa

unwrap = torch._C._functorch.get_unwrapped

def score_mod(score, batch, head, q_idx, kv_idx):
    # Set breakpoint here to debug
    breakpoint()

    # In debugger, unwrap to see actual tensor values:
    # >>> actual_score = unwrap(unwrap(unwrap(unwrap(score))))
    # >>> actual_batch = unwrap(batch)
    # >>> actual_head = unwrap(head)
    # >>> actual_q_idx = unwrap(q_idx)
    # >>> actual_kv_idx = unwrap(kv_idx)
    # >>> print(actual_score)
    # >>> print(f"q_idx: {actual_q_idx}, kv_idx: {actual_kv_idx}")

    return torch.where(q_idx >= kv_idx, score, torch.tensor(float('-inf')))

def main():
    # Enable debug mode
    fa._FLEX_ATTENTION_DISABLE_COMPILE_DEBUG = True

    # Small example
    B, H, S, D = 1, 2, 4, 8
    q = torch.randn(B, H, S, D)
    k = torch.randn(B, H, S, D)
    v = torch.randn(B, H, S, D)

    # Run - will hit breakpoint
    output = fa.flex_attention(q, k, v, score_mod=score_mod)

    # Disable debug mode
    fa._FLEX_ATTENTION_DISABLE_COMPILE_DEBUG = False

if __name__ == "__main__":
    main()

```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158534
Approved by: https://github.com/Chillee, https://github.com/zou3519
2025-07-18 05:33:45 +00:00
eb73650723 [BE] Make PyObjectSlot use a global PyInterpreter and remove (#158427)
This PR is a bit more involved but effectively works to drastically simplify PyObjectSlot and PyInterpreter.
1) For PyObjectSlot we now use a global PyInterpreter, since there is only one. From here we change all of the call sites to rely on this assumption.
2) We also remove the "tags" of the PyInterpreter by deprecating `PyInterpreterStatus`.

For the reviewer: sadly it seems like `functorch/csrc/dim/dim.cpp` needed to get linted, so there is an unreadable number of changes there. Fortunately, the only actual change in the file is the following, which just removes `getPyInterpreter()` from the `check_pyobj` call.

```
 mpy::handle handle_from_tensor(Arena& A, TensorRef t) {
-    // fast case: tensor is live in python
-    std::optional<PyObject*> mb_obj =
-        t->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(getPyInterpreter(), /*ignore_hermetic_tls=*/false);
-    if (mb_obj.has_value() && !t->unsafeGetTensorImpl()->pyobj_slot()->owns_pyobj()) {
-        return *mb_obj;
-    }
-    return A.autorelease(mpy::object::checked_steal(THPVariable_Wrap(*t)));
-}
-}
+  // fast case: tensor is live in python
+  std::optional<PyObject*> mb_obj =
+      t->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
+          /*ignore_hermetic_tls=*/false);
+  if (mb_obj.has_value() &&
+      !t->unsafeGetTensorImpl()->pyobj_slot()->owns_pyobj()) {
+    return *mb_obj;
+  }
+  return A.autorelease(mpy::object::checked_steal(THPVariable_Wrap(*t)));
+}
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158427
Approved by: https://github.com/albanD
2025-07-18 05:23:00 +00:00
9308261a2a [ROCm][CI] update fbgemm_gpu hash used by inductor tests (#158602)
fbgemm_gpu build started failing with asmjit errors.  Moving to latest tip of fbgemm for inductor tests resolves the build failures.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158602
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-07-18 05:02:31 +00:00
9a7c2f1f64 Revert "Add torch compile force disable caches alias (#158072)"
This reverts commit 2ecf083b7247f265a03ec296ba9d7b795f035118.

Reverted https://github.com/pytorch/pytorch/pull/158072 on behalf of https://github.com/jeffdaily due to fails on rocm, signal ignored while rocm was unstable ([comment](https://github.com/pytorch/pytorch/pull/158072#issuecomment-3086740829))
2025-07-18 04:58:24 +00:00
d8b084312b [DTensor] Fix default_strategy and rename for clarity (#158490)
Fixes several bugs in the original.
- foremost, fixes a serious bug where we returned incorrect strategies
  by mixing input_specs that were frozen from
  select_strategy.strategies[0] with output_specs that varied across
  select_strategy.strategies[0..N] (e.g. we could create a nonsense
  strategy like input:Shard(0) output:Replicate() for an op like clone)
- fixes the redistribute costs: they should not actually be 0, they
  should be the cost of redistributing our single input from another
  strategy to the current strategy, in our list of output strategies
- adds a note, wondering if we should have just literally returned the
  input strategy instead of creating this new object
- Currently, using default_strategy is incorrect because it maps 'self'
  tensor's strategies directly onto 'src' tensor without accounting for
  the fact that copy_ supports broadcasting a smaller rank tensor into a
  larger one.

Separates out copy_  op from default strategy, adds missing test case,
but does not fix the underlying issue with copy_, leaves that for future
PR

Renames to `propagate_single_input_strategy` since that's more
descriptive

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158490
Approved by: https://github.com/wanchaol, https://github.com/XilunWu
ghstack dependencies: #158495
2025-07-18 04:09:32 +00:00
1e86fa2e5b Add stack trace to Inductor IR nodes if inductor.config.trace.provenance_tracing=True (#158576)
Summary:
- Split `create_mapping` to `create_mapping_pre_post_grad_nodes` and  ` create_node_mapping_kernel_to_post_grad`
- Store a mapping from pre_grad graph node names to stack traces in `_inductor_pre_grad_node_stack_trace`
- Add `stack_traces` member to ir.Node and add it to the string representation of ir.Node
- When we create an IR node, if `inductor.config.trace.provenance_tracing=True`, we populate `stack_traces` from `origins`. The nodes in `origins` are post-grad graph nodes. If a node has `node.stack_trace`, we store that stack trace directly; this is particularly important for backward graph nodes because they don't have a mapping to pre-grad graph nodes. If a node doesn't have `.stack_trace` (such as `linear` -> `addmm` nodes), we use the stack traces of the pre-grad graph nodes that it maps to (see the sketch after this list).
  - A post-grad graph node might not have a stack trace if it corresponds to multiple pre-grad graph nodes, e.g. [GroupLinearFusion](a00442421a/torch/_inductor/fx_passes/group_batch_fusion.py (L299))
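
A minimal sketch of the fallback logic in the bullet above (illustrative only; names like `pre_grad_nodes` are hypothetical, not actual torch/_inductor attributes):
```python
def collect_stack_traces(origins, pre_grad_node_stack_trace):
    # origins: post-grad FX nodes attached to the IR node
    # pre_grad_node_stack_trace: dict of pre-grad node name -> stack trace
    traces = []
    for node in origins:
        if getattr(node, "stack_trace", None):
            traces.append(node.stack_trace)  # e.g. backward nodes carry their own trace
        else:
            # fall back to the pre-grad node(s) this post-grad node maps to (if any)
            for name in getattr(node, "pre_grad_nodes", []):
                if name in pre_grad_node_stack_trace:
                    traces.append(pre_grad_node_stack_trace[name])
    return traces
```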

Example:

```
scheduling ExternKernelOut(
  python_kernel_name='extern_kernels.mm',
  name=buf0,
  layout=FixedLayout('cuda:0', torch.float32, size=[8, 16], stride=[16, 1]),
  inputs=[InputBuffer(name='arg2_1', layout=FixedLayout('cuda:0', torch.float32, size=[8, 10], stride=[10, 1])), ReinterpretView(
    StorageBox(
      ConstantBuffer(name='fc1_weight', layout=FixedLayout('cuda:0', torch.float32, size=[16, 10], stride=[10, 1]))
    ),
    FixedLayout('cuda:0', torch.float32, size=[10, 16], stride=[1, 10]),
    origins=OrderedSet([mm_default_1]),
    stack_traces = {,
    File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/7b4b7a52e15abb17/scripts/shangdiy/__aot__/aot#link-tree/scripts/shangdiy/aot.py", line 29, in forward,
        x = self.fc1(x),
      File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/7b4b7a52e15abb17/scripts/shangdiy/__aot__/aot#link-tree/torch/nn/modules/linear.py", line 125, in forward,
        return F.linear(input, self.weight, self.bias),
    }
  )],
  constant_args=(),
  kwargs={},
  output_view=None,
  python_kernel_name=extern_kernels.mm,
  cpp_kernel_name=at::mm_out,
  ordered_kwargs_for_cpp_kernel=(),
  op_overload=None,
  arg_properties=[{}, {}],
  allarg_properties={},
  kwarg_properties=None,
  unbacked_bindings={},
  mutation_outputs=[],
  origin_node=mm_default_1,
  origins=OrderedSet([mm_default_1]),
  stack_traces = {,
  File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/7b4b7a52e15abb17/scripts/shangdiy/__aot__/aot#link-tree/scripts/shangdiy/aot.py", line 29, in forward,
      x = self.fc1(x),
    File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/7b4b7a52e15abb17/scripts/shangdiy/__aot__/aot#link-tree/torch/nn/modules/linear.py", line 125, in forward,
      return F.linear(input, self.weight, self.bias),
  }
)
```

Test Plan:
```
buck2 run mode/dev-nosan fbcode//caffe2/test/inductor:provenance_tracing
```

Rollback Plan:

Differential Revision: D78365534

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158576
Approved by: https://github.com/angelayi
2025-07-18 04:05:17 +00:00
86dbc0ef67 [NativeRT] Remove makeProxyExecutor from ModelRunner interface (#158587)
Summary: makeProxyExecutor shouldn't be exposed to ModelRunner Interface.

Test Plan:
CI

Rollback Plan:

Differential Revision: D78501011

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158587
Approved by: https://github.com/yiming0416, https://github.com/henryoier
2025-07-18 03:20:40 +00:00
89d842fec5 Make torch.distributed.breakpoint() set a long timeout (#158481)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158481
Approved by: https://github.com/d4l3k
ghstack dependencies: #158469
2025-07-18 02:18:43 +00:00
ce4554352b Shunt fx_interpreter graphmodule print on error into tlparse (#158469)
Include both the error stacktrace and the graphmodule in a new
structured trace artifact.  Log the shortened version to the console,
and also log a hint to look at the tlparse for more.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158469
Approved by: https://github.com/ezyang
2025-07-18 02:18:43 +00:00
583138d170 [Dynamo][Better Engineering] Add typing for comptime, cache, and convert_frame (#158379)
As part of better engineering week, we would like to improve our type support to improve the dev experience in dynamo

This PR adds strict typing support to a critical tracing point for dynamo, primarily for`comptime.py` but also `cache_size.py` and `convert_frame.py`.

Running
```
mypy torch/_dynamo/comptime.py torch/_dynamo/cache_size.py torch/_dynamo/convert_frame.py --linecount-report /tmp/coverage_log
```

| -------- | Lines Unannotated | Lines Total | % lines covered | Funcs Unannotated | Funcs Total | % funcs covered |
| -------- | ------- | -------- | ------- | ------- | ------- | ------- |
| Main  |  1837 | 2215 | 82.93% | 45 | 82 | 54.88% |
| This PR | 2230 | 2230 | 100.00% | 82 | 82 | 100.00% |
| Delta    | +393 | +15 | +17.07% | +37 | 0 | +45.12% |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158379
Approved by: https://github.com/mlazos
2025-07-18 02:11:57 +00:00
6fd6fc418d [B200] Fix flex-attention heuristic for test_tma_with_customer_kernel_options_cuda (#158494)
Otherwise fails with
```
torch._inductor.exc.InductorError: RuntimeError: No valid triton configs. OutOfMemoryError: out of resource: triton_tem_fused__to_copy_ones_sort_sum_zeros_2 Required: 264224 Hardware limit: 232448 Reducing block sizes or `num_stages` may help.
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158494
Approved by: https://github.com/drisspg
2025-07-18 02:03:49 +00:00
ddbecdfb66 [DTensor] Document redistribute_costs (#158495)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158495
Approved by: https://github.com/zpcore, https://github.com/XilunWu
2025-07-18 01:43:38 +00:00
ef38edb284 Add stride check for attn_mask on non-cpu device (#158424)
Fixes #158374

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158424
Approved by: https://github.com/Valentine233, https://github.com/drisspg, https://github.com/atalman
2025-07-18 01:10:58 +00:00
6673ac746c Fix test linalg for MKL upgrading (#158312)
Fixes #158054

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158312
Approved by: https://github.com/albanD
2025-07-18 01:08:33 +00:00
7b72e5b3ad Fix Pandas version mismatch upon reinstalling numpy (#158584)
If you reinstall numpy after having installed pandas, it will error out sometimes if the versions are different enough (see below snippet). This change forces pandas to be reinstalled when installing numpy. It doesn't work in a separate pip call, because then pip takes the version of numpy requested by pandas as the one to install, undoing the command in the first place.
```
(numpy_pandas) [gabeferns@devvm2497.eag0 ~/pt-envs/at (exclamaforte/just-gemm-model)]$ pip list
Package            Version
------------------ -----------
attrs              25.3.0
build              1.2.2.post1
certifi            2025.7.14
charset-normalizer 3.4.2
cmake              4.0.3
exceptiongroup     1.3.0
expecttest         0.3.0
filelock           3.18.0
fsspec             2025.5.1
hypothesis         6.135.32
idna               3.10
importlib_metadata 8.7.0
Jinja2             3.1.6
lintrunner         0.12.7
MarkupSafe         2.1.5
mpmath             1.3.0
networkx           3.2.1
ninja              1.11.1.4
opt-einsum         3.3.0
optree             0.16.0
packaging          25.0
pip                25.1
psutil             7.0.0
pyproject_hooks    1.2.0
python-dateutil    2.9.0.post0
pytz               2025.2
PyYAML             6.0.2
requests           2.32.4
setuptools         78.1.1
six                1.17.0
sortedcontainers   2.4.0
sympy              1.14.0
tomli              2.2.1
typing_extensions  4.14.0
tzdata             2025.2
urllib3            2.5.0
uv                 0.7.21
wheel              0.45.1
zipp               3.23.0
(numpy_pandas) [gabeferns@devvm2497.eag0 ~/pt-envs/at (exclamaforte/just-gemm-model)]$ pip install numpy==1.22.4
Collecting numpy==1.22.4
  Using cached numpy-1.22.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.0 kB)
Using cached numpy-1.22.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
Installing collected packages: numpy
Successfully installed numpy-1.22.4
(numpy_pandas) [gabeferns@devvm2497.eag0 ~/pt-envs/at (exclamaforte/just-gemm-model)]$ pip install pandas==2.0.3
Collecting pandas==2.0.3
  Using cached pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Requirement already satisfied: python-dateutil>=2.8.2 in /home/gabeferns/.conda/envs/numpy_pandas/lib/python3.9/site-packages (from pandas==2.0.3) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /home/gabeferns/.conda/envs/numpy_pandas/lib/python3.9/site-packages (from pandas==2.0.3) (2025.2)
Requirement already satisfied: tzdata>=2022.1 in /home/gabeferns/.conda/envs/numpy_pandas/lib/python3.9/site-packages (from pandas==2.0.3) (2025.2)
Requirement already satisfied: numpy>=1.20.3 in /home/gabeferns/.conda/envs/numpy_pandas/lib/python3.9/site-packages (from pandas==2.0.3) (1.22.4)
Requirement already satisfied: six>=1.5 in /home/gabeferns/.conda/envs/numpy_pandas/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas==2.0.3) (1.17.0)
Using cached pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
Installing collected packages: pandas
Successfully installed pandas-2.0.3
(numpy_pandas) [gabeferns@devvm2497.eag0 ~/pt-envs/at (exclamaforte/just-gemm-model)]$ pip install --pre numpy==2.0.2
Collecting numpy==2.0.2
  Using cached numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Using cached numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.5 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.22.4
    Uninstalling numpy-1.22.4:
      Successfully uninstalled numpy-1.22.4
Successfully installed numpy-2.0.2
(numpy_pandas) [gabeferns@devvm2497.eag0 ~/pt-envs/at (exclamaforte/just-gemm-model)]$ python
Python 3.9.23 (main, Jun  5 2025, 13:40:20)
[GCC 11.2.0] :: Anaconda, Inc. on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import pandas
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/home/gabeferns/.conda/envs/numpy_pandas/lib/python3.9/site-packages/pandas/__init__.py", line 22, in <module>
    from pandas.compat import is_numpy_dev as _is_numpy_dev  # pyright: ignore # noqa:F401
  File "/home/gabeferns/.conda/envs/numpy_pandas/lib/python3.9/site-packages/pandas/compat/__init__.py", line 25, in <module>
    from pandas.compat.numpy import (
  File "/home/gabeferns/.conda/envs/numpy_pandas/lib/python3.9/site-packages/pandas/compat/numpy/__init__.py", line 4, in <module>
    from pandas.util.version import Version
  File "/home/gabeferns/.conda/envs/numpy_pandas/lib/python3.9/site-packages/pandas/util/__init__.py", line 2, in <module>
    from pandas.util._decorators import (  # noqa:F401
  File "/home/gabeferns/.conda/envs/numpy_pandas/lib/python3.9/site-packages/pandas/util/_decorators.py", line 14, in <module>
    from pandas._libs.properties import cache_readonly
  File "/home/gabeferns/.conda/envs/numpy_pandas/lib/python3.9/site-packages/pandas/_libs/__init__.py", line 13, in <module>
    from pandas._libs.interval import Interval
  File "pandas/_libs/interval.pyx", line 1, in init pandas._libs.interval
ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject
```
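For reference, a minimal sketch of the single-call idea described above (the real change lives in the CI requirements scripts; the helper name here is illustrative):

```python
# A minimal sketch (not the exact CI change): reinstall numpy and pandas in a
# single pip invocation so the resolver sees both pins at once, instead of
# letting a later pandas install drag numpy back to an older version.
import subprocess
import sys

def reinstall_numpy_and_pandas(numpy_spec: str, pandas_spec: str) -> None:
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", numpy_spec, pandas_spec]
    )

if __name__ == "__main__":
    reinstall_numpy_and_pandas("numpy==2.0.2", "pandas==2.0.3")
```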
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158584
Approved by: https://github.com/huydhn
2025-07-18 00:14:16 +00:00
33c9b414aa [CI][MPS] Enable test_indexing on MPS (#158582)
- Skip `test_index_put_accumulate_large_tensor_mps` as it crashes with
```
/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Types/MPSNDArray.mm:829: failed assertion `[MPSNDArray initWithDevice:descriptor:isTextureBacked:] Error: NDArray dimension length > INT_MAX'
```
while running `torch.ones([2**31+5], dtype=torch.int8, device='mps')`

- Adjust types for `test_index_put_src_datatype` as index_put on MPS is not implemented for complex (yet)
- Adjust `test_index` to avoid using DoubleTensors for MPS

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158582
Approved by: https://github.com/dcci, https://github.com/Skylion007, https://github.com/manuelcandales
2025-07-17 23:33:52 +00:00
b0e325c2c8 [Dynamo][Better Engineering] Add type coverage to decorators (#158509)
As part of better engineering week, we would like to improve our type support to improve the dev experience in dynamo

This PR adds strict typing support to an important file in dynamo, `decorators.py`

NOTE: The remaining untyped functions are due to a conflict with `__init__.py` in compiler, so we can't type them at this time

Running
```
mypy torch/_dynamo/decorators.py --linecount-report /tmp/coverage_log
```

| -------- | Lines Unannotated | Lines Total | % lines covered | Funcs Unannotated | Funcs Total | % funcs covered |
| -------- | ------- | -------- | ------- | ------- | ------- | ------- |
| Main  |  209 | 908 | 23.02% | 9 | 39 | 23.08% |
| This PR | 870 | 943 | 100.00% | 36 | 39 | 100.00% |
| Delta    | +661 | +35 | +76.98% | +27 | 0 | +76.92% |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158509
Approved by: https://github.com/williamwen42
2025-07-17 23:31:26 +00:00
f63988ae00 [BE]Clean up old APIs in AOTI c shim (#158400)
Summary:
The shims for aten ops are now generated by torchgen. But there are some still old APIs in `aoti_torch/c/shim.h`

This diff moves the old to-be-deprecated APIs for aten ops to a separate header file `shim_deprecated.h`

The to-be-deprecated APIs are determined by comparing APIs in `shim.h` and ops in `fallback_ops.py`

Test Plan:
CI

Rollback Plan:

Differential Revision: D78378373

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158400
Approved by: https://github.com/jingsh, https://github.com/desertfire
2025-07-17 23:24:50 +00:00
2df2e3bb51 [ROCm][CI] Last known good HIP patch (#158596)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158596
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-07-17 22:52:16 +00:00
0ecfb93a0b Avoid globally modifying torch.testing._internal.common_methods_invocations.wrapper_set_seed (#158548)
Test modules that depend on the original definition of `wrapper_set_seed` are inadvertently affected if they import from test_torchinductor_opinfo.py. Additionally, running `pytest test_torchinductor_opinfo.py test_other_module.py` in the same process may change the behaviour of `test_other_module.py` if its tests depend on `wrapper_set_seed`.
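A minimal sketch of one way to scope such an override (assumed pattern using `mock.patch.object`; not the actual patch):

```python
# A minimal sketch (assumed pattern, not the exact patch) of overriding
# wrapper_set_seed only for the duration of a test run, instead of mutating
# the shared attribute on common_methods_invocations globally.
from unittest import mock

import torch
import torch.testing._internal.common_methods_invocations as cmi

def seeded_wrapper(op, *args, **kwargs):
    torch.manual_seed(42)  # fixed seed for reproducible opinfo samples
    return op(*args, **kwargs)

with mock.patch.object(cmi, "wrapper_set_seed", seeded_wrapper):
    pass  # run the opinfo-based tests that rely on the overridden seeding here
```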

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158548
Approved by: https://github.com/janeyx99
2025-07-17 22:31:59 +00:00
74f4cf4bd5 Add missing <vector> in c10/util/WaitCounter.h (#158354)
It seems that `#include <vector>` is being pulled in indirectly, but it is being used directly, so it is best to explicitly include it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158354
Approved by: https://github.com/janeyx99
2025-07-17 22:23:05 +00:00
cyy
1b91954b9f Suppress volatile type error (#158435)
Fixes
```
/var/lib/jenkins/workspace/torch/csrc/dynamo/guards.cpp:5320:10:
error: compound assignment to object of volatile-qualified type 'volatile char' is deprecated [-Werror,-Wdeprecated-volatile]
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158435
Approved by: https://github.com/janeyx99
2025-07-17 22:21:04 +00:00
41b2c4d119 Reduce random reads for offset metadata when calling torch.load under FakeTensorMode (#157931)
We already test the `_get_offset` functionality via the TORCH_SERIALIZATION_DEBUG flag that is set in CI, so I didn't add more testing specifically for FakeTensor

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157931
Approved by: https://github.com/albanD
2025-07-17 22:17:52 +00:00
af6624023e [dynamo] Skip training flag check if already guarding on nn modules (#158492)
This might help some legacy models that still have
inline_inbuilt_nn_modules False for some reason.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158492
Approved by: https://github.com/StrongerXi
2025-07-17 21:42:19 +00:00
a00442421a [CI][TD] Enable TD on all test configs (#158163)
I think the main one that was missing is dynamo_wrapped

There's also slow and inductor, but the filter later for workflows stops TD from running on those anyways

dynamo_wrapped is the second longest job for pull right now

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158163
Approved by: https://github.com/huydhn, https://github.com/ZainRizvi
2025-07-17 21:05:25 +00:00
ced5cf042d Revert "Cleanup old caffe2 scripts (#158475)"
This reverts commit 94d7f0c1ef9a4cb4db0eb5d6b1ffc55941cbeab1.

Reverted https://github.com/pytorch/pytorch/pull/158475 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/158475#issuecomment-3085447409))
2025-07-17 20:58:34 +00:00
1b88da1cac [MPS] Improve performance of max_pool3d (#157875)
To check how the changes from this PR affect performance, I wrote a script here: 55ef32a127/max_pool_mps/perf.py.

Before this PR, I get this:

```
===================
max_pool3d
===================
0: 0.013105 ms, max_pool3d, (3, 2, 2, 2), {'kernel_size': 2}
1: 0.038003 ms, max_pool3d, (3, 10, 10, 10), {'kernel_size': 5}
2: 0.212963 ms, max_pool3d, (3, 100, 100, 100), {'kernel_size': 5}
3: 1.224645 ms, max_pool3d, (3, 200, 200, 200), {'kernel_size': 5}
4: 7.317867 ms, max_pool3d, (10, 10, 100, 100, 100), {'kernel_size': 4, 'padding': 1}
5: 34.679233 ms, max_pool3d, (10, 10, 100, 100, 100), {'kernel_size': 50, 'padding': 20}
6: 34.626383 ms, max_pool3d, (10, 10, 100, 100, 100), {'kernel_size': 50, 'padding': 20, 'dilation': 1}
7: 44.835892 ms, max_pool3d, (10, 10, 100, 100, 100), {'kernel_size': 50, 'padding': 20, 'dilation': 1, 'stride': 40}
8: 0.083579 ms, max_pool3d, (10, 10, 10, 10, 10), {'kernel_size': 2}
9: 0.936575 ms, max_pool3d, (10, 10, 30, 30, 30), {'kernel_size': 2}
10: 5.329883 ms, max_pool3d, (10, 10, 50, 50, 50), {'kernel_size': 2}
11: 11.713617 ms, max_pool3d, (10, 10, 70, 70, 70), {'kernel_size': 2}
12: 25.450454 ms, max_pool3d, (10, 10, 90, 90, 90), {'kernel_size': 2}
13: 0.058375 ms, max_pool3d, (10, 10, 10, 10, 10), {'kernel_size': 2, 'dilation': 2}
14: 3.757558 ms, max_pool3d, (10, 10, 50, 50, 50), {'kernel_size': 2, 'dilation': 2}
15: 33.451588 ms, max_pool3d, (10, 10, 100, 100, 100), {'kernel_size': 2, 'dilation': 2}
```

After this PR, I get this:

```
===================
max_pool3d
===================
0: 0.007202 ms, max_pool3d, (3, 2, 2, 2), {'kernel_size': 2}
1: 0.018596 ms, max_pool3d, (3, 10, 10, 10), {'kernel_size': 5}
2: 0.130717 ms, max_pool3d, (3, 100, 100, 100), {'kernel_size': 5}
3: 0.966795 ms, max_pool3d, (3, 200, 200, 200), {'kernel_size': 5}
4: 4.095804 ms, max_pool3d, (10, 10, 100, 100, 100), {'kernel_size': 4, 'padding': 1}
5: 12.833446 ms, max_pool3d, (10, 10, 100, 100, 100), {'kernel_size': 50, 'padding': 20}
6: 12.859346 ms, max_pool3d, (10, 10, 100, 100, 100), {'kernel_size': 50, 'padding': 20, 'dilation': 1}
7: 14.080529 ms, max_pool3d, (10, 10, 100, 100, 100), {'kernel_size': 50, 'padding': 20, 'dilation': 1, 'stride': 40}
8: 0.029283 ms, max_pool3d, (10, 10, 10, 10, 10), {'kernel_size': 2}
9: 0.175700 ms, max_pool3d, (10, 10, 30, 30, 30), {'kernel_size': 2}
10: 0.742750 ms, max_pool3d, (10, 10, 50, 50, 50), {'kernel_size': 2}
11: 1.939596 ms, max_pool3d, (10, 10, 70, 70, 70), {'kernel_size': 2}
12: 4.074821 ms, max_pool3d, (10, 10, 90, 90, 90), {'kernel_size': 2}
13: 0.028425 ms, max_pool3d, (10, 10, 10, 10, 10), {'kernel_size': 2, 'dilation': 2}
14: 0.384375 ms, max_pool3d, (10, 10, 50, 50, 50), {'kernel_size': 2, 'dilation': 2}
15: 2.623346 ms, max_pool3d, (10, 10, 100, 100, 100), {'kernel_size': 2, 'dilation': 2}
```

Every case is improved.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157875
Approved by: https://github.com/malfet
2025-07-17 20:34:12 +00:00
66c9bc5062 [export] Add runnable code to export docs (#158506)
Preview: https://docs-preview.pytorch.org/pytorch/pytorch/158506/export.html

Yay I can add runnable code to export docs now
Also moved export API reference to a different file.

With these changes, we can start to consolidate the [export tutorial](https://docs.pytorch.org/tutorials/intermediate/torch_export_tutorial.html) with the docs on pytorch docs. We just need to move the section on DDE and 0/1 specialization, and then I think we can delete the export tutorial.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158506
Approved by: https://github.com/pianpwk, https://github.com/svekars
2025-07-17 20:15:22 +00:00
80ac73c057 [ca] reset between tests (#158418)
CA reset is much faster than dynamo reset, so it's probably okay to run it every time. I'm not sure if this will fix the flaky autograd tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158418
Approved by: https://github.com/jansel
2025-07-17 20:14:29 +00:00
eeb0783fe6 [simple_fsdp][inductor_collectives] rewrite reorder_collectives, sink_waits_iterative (#158062)
Differential Revision: [D78159013](https://our.internmc.facebook.com/intern/diff/D78159013)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158062
Approved by: https://github.com/wconstab
2025-07-17 20:04:42 +00:00
ef256ad17b Make Inductor imports TYPE_CHECKING only (#158524)
Signed-off-by: Edward Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158524
Approved by: https://github.com/cyyever, https://github.com/albanD
2025-07-17 19:55:19 +00:00
fd51bcdd21 check if USE_ROCM is defined (#158571)
Summary:
check if USE_ROCM is defined

D78424375 broke some builds: see T231304402

Test Plan:
rerunning failed builds

Rollback Plan:

Reviewed By: Camyll

Differential Revision: D78493019

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158571
Approved by: https://github.com/huydhn, https://github.com/malfet
2025-07-17 19:48:26 +00:00
7ebbf2cae7 Revert "[PT2][fusion] ban fusions with large accumulated reads (#157563)" (#158550)
This reverts commit 8554c8007ddaa8029e7e01bb1af12f358bf597c2 #157563 due to causing a few breakages on ROCm

Reverted expected_results.csv to 26807dcf277feb2d99ab88d7b6da526488baea93

> @xuanzhang816 Sorry, but I have to revert this PR yet again because it clearly reintroduced failures on ROCm after the remerge: f4d8bc46c7/2
and the failures are still showing up on tip-of-tree on HUD

Context
https://github.com/pytorch/pytorch/pull/157563#issuecomment-3083350857

Needs to be relanded in non bc-breaking way, or sanity checked for correctness.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158550
Approved by: https://github.com/jithunnair-amd, https://github.com/jeffdaily
2025-07-17 19:47:41 +00:00
8dcebaa7b0 [AOTI] add WIN32 implement for create_temp_dir (#158570)
Add a Windows implementation of `create_temp_dir`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158570
Approved by: https://github.com/angelayi
2025-07-17 19:22:59 +00:00
7e34f9c292 Add torch._C._log_api_usage_once to datapipes (mapper) (#155489)
This is to get a better understanding of how datapipes is used right now.
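For illustration, a minimal sketch of where such a logging call typically goes (the pipe class name and log key below are hypothetical):

```python
# Illustrative sketch (hypothetical pipe name and log key) of adding an
# API-usage log call once, in the datapipe constructor.
import torch
from torch.utils.data import IterDataPipe

class LoggedMapperPipe(IterDataPipe):
    def __init__(self, source_datapipe, fn):
        torch._C._log_api_usage_once("python.data_pipes.map")  # hypothetical key
        self.source_datapipe = source_datapipe
        self.fn = fn

    def __iter__(self):
        for item in self.source_datapipe:
            yield self.fn(item)
```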

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155489
Approved by: https://github.com/ramanishsingh
2025-07-17 19:01:49 +00:00
25f4d7e482 Use new type statement to fix public API of types (#158487)
Since the `type` statement breaks older Python versions, this tries to find equivalent behavior without the `type` statement mechanics.
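A small illustration of the compatibility concern (the alias below is purely illustrative, not the actual public API in question):

```python
# Sketch of the issue: the PEP 695 `type` statement requires Python 3.12+,
# while a TypeAlias annotation also works on older interpreters (3.10+).
from typing import TypeAlias, Union

# Python 3.12+ only:
#     type IntOrStr = Union[int, str]

# Equivalent spelling that older Python versions accept:
IntOrStr: TypeAlias = Union[int, str]
```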
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158487
Approved by: https://github.com/andrewor14
2025-07-17 18:46:44 +00:00
ad223a6c5f Add FP8 Types (#158430)
Summary: Add FP8 Types

Test Plan:
sandcastle

Rollback Plan:

Differential Revision: D78395110

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158430
Approved by: https://github.com/henryoier
2025-07-17 18:09:56 +00:00
f92a2035e4 ci: Update lint workflow to only run on changed files for PRs (#158518)
This modifies the lint workflow to use the new get-changed-files
workflow to optimize lint execution by only running on files
that have actually changed in pull requests.

This more closely mirrors the type of behavior that users
expect when running lint locally on their PRs.

This also leaves the default behavior as a fallback for when
you're not running on a pull request.

Since lint runs on the pull_request event I'm not really worried about
any type of ciflow shenanigans in this.

This also splits mypy into its own job since mypy needs to run on all-files all the time.

Signed-off-by: Eli Uriegas <eliuriegas@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158518
Approved by: https://github.com/huydhn
ghstack dependencies: #158517
2025-07-17 18:00:44 +00:00
bff69f25c2 [BE][testing] fix test/dynamo/test_repros:test_longtensor_list (#158458)
Summary: This test is failing internally because the number of underlying calls to the rng differs by virtue of various library initializations that get sucked in with an internal build.

Test Plan: `buck test '@fbcode//mode/opt' fbcode//caffe2/test/dynamo:test_dynamo -- --exact 'caffe2/test/dynamo:test_dynamo - test_repros.py::ReproTests::test_longtensor_list' --run-disabled`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158458
Approved by: https://github.com/jansel
2025-07-17 17:27:00 +00:00
6d31d38965 recovering node source from dict (#158373) (#158473)
Summary:

this diff recovers a NodeSource object from its dict representation, which is crucial for NodeSource serde.

Test Plan:
ci

Rollback Plan:

Differential Revision: D78434648

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158473
Approved by: https://github.com/angelayi
2025-07-17 17:00:19 +00:00
bfe5674e22 Revert "[cuDNN][SDPA] cuDNN SDPA refactor/cleanup, nested tensor backward, test priority bump for sm90, sm100 (#149282)"
This reverts commit 0797b2b6a80cf70a7accc3d5413186e7693d4451.

Reverted https://github.com/pytorch/pytorch/pull/149282 on behalf of https://github.com/wdvr due to reverting as discussed with @drisspg - @eqy please reach out to @drisspg for more info  ([comment](https://github.com/pytorch/pytorch/pull/149282#issuecomment-3084759671))
2025-07-17 16:55:55 +00:00
94d7f0c1ef Cleanup old caffe2 scripts (#158475)
Testing on this one is grep based: if I could find no reference to a script, I deleted it.
We can easily add any of these back if needed!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158475
Approved by: https://github.com/seemethere, https://github.com/huydhn, https://github.com/cyyever
2025-07-17 16:50:06 +00:00
23550ab735 Revert "DDE-Free select with unbacked index. (#157605)"
This reverts commit 79d7c754ab8ae0e5c3a614521632d2cfbfa0fdba.

Reverted https://github.com/pytorch/pytorch/pull/157605 on behalf of https://github.com/laithsakka due to fail pr time benchmarks  ([comment](https://github.com/pytorch/pytorch/pull/157605#issuecomment-3084663020))
2025-07-17 16:20:02 +00:00
16b21fa8b2 [AOTI] skip ld and objcopy on Windows. (#158545)
Skip `ld` and `objcopy` on Windows; they are not supported there.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158545
Approved by: https://github.com/desertfire
2025-07-17 15:43:24 +00:00
2ecf083b72 Add torch compile force disable caches alias (#158072)
A bunch of people keep thinking the current alias only disables the Inductor cache because it has "inductor" in its name. Let's globalize the name.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158072
Approved by: https://github.com/ezyang
2025-07-17 15:40:36 +00:00
813c76b98d Revert "Unify torch.tensor and torch.ops.aten.scalar_tensor behavior (#158537)"
This reverts commit 58c7cf9ede6311da5533dbcaf238a912176a6a85.

Reverted https://github.com/pytorch/pytorch/pull/158537 on behalf of https://github.com/albanD due to This broke C++ tests ([comment](https://github.com/pytorch/pytorch/pull/158537#issuecomment-3084425920))
2025-07-17 15:06:43 +00:00
288bf54a23 Revert "Move off of deprecated API in 2.9 (#158527)"
This reverts commit 9636e2cfd3e995ef977f670ad47e8e895296d992.

Reverted https://github.com/pytorch/pytorch/pull/158527 on behalf of https://github.com/albanD due to breaks trunk ([comment](https://github.com/pytorch/pytorch/pull/158527#issuecomment-3084385585))
2025-07-17 14:55:28 +00:00
da4c7b4ced [AOTI] align signature to model_base.h (#158554)
Remove the `const` keyword and align the signature to `model_base.h` (eeda1a75ac/torch/csrc/inductor/aoti_runtime/model_base.h, L51-L53)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158554
Approved by: https://github.com/desertfire
2025-07-17 14:44:32 +00:00
a04bd11895 [AOTI] Use format_consts_to_cpp on Windows. (#158543)
`format_consts_to_asm` is not supported on Windows, so force the use of `format_consts_to_cpp` there.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158543
Approved by: https://github.com/desertfire
2025-07-17 14:40:34 +00:00
58c7cf9ede Unify torch.tensor and torch.ops.aten.scalar_tensor behavior (#158537)
Fixes #158376

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158537
Approved by: https://github.com/atalman
2025-07-17 13:39:25 +00:00
38c04415a9 [oss][hf][bug fix] Remove buggy consolidation logic (#158380)
Summary: I tried to add some logic to handle the non-row-wise sharded case more efficiently, but it has some bugs, so I am removing it for now. I will find a better algorithm for the non-row-wise sharded case that determines the maximum number of bytes we can write at a time.

Test Plan:
ensure tests pass

Rollback Plan:

Differential Revision: D78366701

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158380
Approved by: https://github.com/Saiteja64
2025-07-17 13:05:06 +00:00
7892f5a007 [inductor][triton] Update HAS_WARP_SPEC to check triton.Config params. Update Triton Hash to top of release/3.4.x stack (#158459)
Update triton commit hash to `11ec6354315768a85da41032535e3b7b99c5f706`, which is the new release/3.4.x branch in triton-lang/triton.

Also, update HAS_WARP_SPEC handling: In triton 3.4, warp spec will have a different interface: num_consumer_groups will be determined automatically by the compiler. This breaks the current Inductor integration, so for now, update HAS_WARP_SPEC to check whether triton.Config takes num_consumer_groups and num_buffers_warp_spec as parameters.
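A minimal sketch of the described check (assumed shape of the Inductor-side logic, not the exact code):

```python
# Warp spec is considered available only if triton.Config still accepts the
# two constructor parameters the current Inductor integration relies on.
import inspect

def has_warp_spec() -> bool:
    try:
        import triton
    except ImportError:
        return False
    params = inspect.signature(triton.Config.__init__).parameters
    return "num_consumer_groups" in params and "num_buffers_warp_spec" in params
```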

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158459
Approved by: https://github.com/atalman
2025-07-17 12:50:46 +00:00
d5af0eca8d [BE][3/5] fix typos in aten/ (aten/src/ATen/native/) (#157552)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157552
Approved by: https://github.com/albanD
ghstack dependencies: #156605, #157637, #157550, #157551
2025-07-17 12:08:34 +00:00
f57ef62ebc [BE][2/5] fix typos in aten/ (aten/src/ATen/native/) (#157551)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157551
Approved by: https://github.com/albanD
ghstack dependencies: #156605, #157637, #157550
2025-07-17 12:08:33 +00:00
4c8b408d16 [BE][1/5] fix typos in aten/ (#157550)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157550
Approved by: https://github.com/albanD
ghstack dependencies: #156605, #157637
2025-07-17 12:08:33 +00:00
c8d43cbc6e [BE][3/6] fix typos in test/ (#157637)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157637
Approved by: https://github.com/yewentao256, https://github.com/albanD
ghstack dependencies: #156605
2025-07-17 12:08:33 +00:00
3f8e2e91ad [BE][15/16] fix typos in torch/ (torch/distributed/tensor/) (#156605)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156605
Approved by: https://github.com/wanchaol, https://github.com/albanD
2025-07-17 12:08:33 +00:00
eeda1a75ac Forward-fix unused variables warning/error (#158549)
Introduced in https://github.com/pytorch/pytorch/pull/158037, didn't seem to trigger on PR, but trunk CI is failing in some `linux-jammy-cpu-py3.12-gcc11-inductor-*` jobs where this warning is turned into an error.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158549
Approved by: https://github.com/danthe3rd
2025-07-17 09:44:19 +00:00
f4d8bc46c7 Enable TF32 as fp32 internal precision for matmul/linear/conv (#157520)
### Description

This PR is to enable TF32 as fp32 internal precision for matmul/linear/conv in `mkldnn backend`. Since we have refined fp32 precision API in https://github.com/pytorch/pytorch/pull/125888, we can easily extend the API to support TF32 for `mkldnn backend`.

```
torch.backends.mkldnn.matmul.fp32_precision = 'tf32'
torch.backends.mkldnn.conv.fp32_precision = "tf32"
```

The related kernel updates and UT updates are done. The wrapper `bf32_on_and_off` is updated to `reduced_f32_on_and_off`, and it can run tests 3 times: one with reduced_f32 OFF, and the other two with reduced_f32 ON (`bf32` ON and `tf32` ON).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157520
Approved by: https://github.com/mingfeima, https://github.com/jansel
2025-07-17 08:57:34 +00:00
39ac189808 Support DeepSeek-style blockwise scaling scaled-mm for fp8 on Hopper+ (#158037)
cuBLAS added support for them in CUDA 12.9. It's rather easy to call into them; the hardest part is allowing the lhs and rhs operands to have different scaling types, as that changes the whole call stack.

The scaling format is still detected from the sizes of the scale tensors.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158037
Approved by: https://github.com/eqy, https://github.com/drisspg
2025-07-17 08:26:27 +00:00
d76323d417 [NativeRT] Remove normalizeDevice (#158489)
Summary:
In PyTorch, tensor.to("cuda") behaves differently from tensor.to("cuda:0").

tensor.to("cuda") will read from the thread-local DeviceGuard, aka cuda::current_device(), to infer the device index.

TBEPermute relies on this behavior to route the output tensor to the device specified by the current thread.

For this reason, we remove normalizeDevice() and disallow index-less cuda devices in Placement.

Device-to-device mapping must be done between concrete devices!
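An illustration of the behavior described above (requires a multi-GPU machine):

```python
# .to("cuda") resolves the device index from the current thread-local device,
# while .to("cuda:0") pins it explicitly.
import torch

if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    x = torch.randn(2)
    with torch.cuda.device(1):
        print(x.to("cuda").device)    # cuda:1 -- follows the current device
        print(x.to("cuda:0").device)  # cuda:0 -- explicit index always wins
```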

Test Plan:
CI

Rollback Plan:

Differential Revision: D78443109

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158489
Approved by: https://github.com/henryoier
2025-07-17 06:48:25 +00:00
04349f9ee5 [PT2]: Skip AOTI Weight Loading during Init (#158416)
Summary: AOTI already has the weights embedded in the .so file, so for the initial load there is no need to load the weights again. This allows lowered modules to have different sets of weights on different hardware.

Test Plan:
```
MODEL_TYPE=ads_mtml_offsite_cvr_oba_optout_dedicated_model
MODEL_ENTITY_ID=895279202
SNAPSHOT_ID=0
MODULE=merge

buck2 run mode/dev-nosan -c fbcode.nvcc_arch=a100,h100 -c fbcode.enable_gpu_sections=true fbcode//caffe2/torch/fb/model_transform/fx2trt/packaging:load_net_predictor -- --loadMode=Benchmark --inputNetFile=/data/users/$USER/models/${MODEL_ENTITY_ID}/${SNAPSHOT_ID}/${MODEL_ENTITY_ID}_${SNAPSHOT_ID}.predictor.disagg.gpu.${MODULE} --moduleName ${MODULE} --predictor-hardware-type 1 --submodToDevice ""  --benchmarkDontRebatchSamples=true --benchmarkNumIterations 1000
```

Rollback Plan:

Differential Revision: D78383881

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158416
Approved by: https://github.com/henryoier, https://github.com/SherlockNoMad
2025-07-17 06:47:47 +00:00
09db3a22e8 [BE] Get rid of final mentions of BUILD_SPLIT_CUDA (#158453)
BUILD_SPLIT_CUDA logic has been removed for a while

Differential Revision: [D78418191](https://our.internmc.facebook.com/intern/diff/D78418191/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158453
Approved by: https://github.com/albanD
ghstack dependencies: #158358, #158365
2025-07-17 06:47:10 +00:00
a38f433be2 [Docker builds] Move from Miniconda to Miniforge (#158370)
This is related to: https://www.anaconda.com/legal/terms/terms-of-service

Trying to fix outage with docker builds.
https://github.com/pytorch/pytorch/actions/runs/16298993712/job/46033590799

Rocm and XPU builds since they use Miniforge are not affected

```
#22 ERROR: process "/bin/sh -c bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt /opt/conda/requirements-docs.txt" did not complete successfully: exit code: 1
------
 > [base 14/42] RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt /opt/conda/requirements-docs.txt:
11.93 CondaToSNonInteractiveError: Terms of Service have not been accepted for the following channels. Please accept or remove them before proceeding:
11.93     • https://repo.anaconda.com/pkgs/main
11.93     • https://repo.anaconda.com/pkgs/r
11.93
11.93 To accept a channel's Terms of Service, run the following and replace `CHANNEL` with the channel name/URL:
11.93     ‣ conda tos accept --override-channels --channel CHANNEL
```
Hence the solution is either:
1. run `` conda tos accept --override-channels --channel defaults``, or
2. use Miniforge instead of Miniconda.

Using solution 2.

Solutions tried that don't work:
1. Using ``CONDA_ALWAYS_YES=true``

2. Using an older version of Miniconda
```
[Miniconda3-py310_25.5.1-0-Linux-x86_64.sh](https://repo.anaconda.com/miniconda/Miniconda3-py310_25.5.1-0-Linux-x86_64.sh)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158370
Approved by: https://github.com/seemethere

Co-authored-by: Eli Uriegas <1700823+seemethere@users.noreply.github.com>
2025-07-17 06:33:08 +00:00
9f37cce693 Revert "[Docker builds] Move from Miniconda to Miniforge (#158370)"
This reverts commit 0a99b026d6bd0f67dc2c0a20fe3228ddc4144854.

Reverted https://github.com/pytorch/pytorch/pull/158370 on behalf of https://github.com/laithsakka due to this fail pr time benchmarks ([comment](https://github.com/pytorch/pytorch/pull/158370#issuecomment-3082744071))
2025-07-17 06:28:49 +00:00
9636e2cfd3 Move off of deprecated API in 2.9 (#158527)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158527
Approved by: https://github.com/danielvegamyhre
2025-07-17 06:18:13 +00:00
d9426a81d2 [BE] Modify PyObjectSlot to assume only a single interpreter is in use (#158407)
This PR makes some less risky changes to PyObjectSlot, as there is a lot of stuff we do not need since there is only one interpreter. Specifically, `check_interpreter` and `has_pyobj_nonhermetic` are removed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158407
Approved by: https://github.com/albanD
ghstack dependencies: #158288, #158290, #158291
2025-07-17 05:56:26 +00:00
0b9fb91f17 [BE] Remove __reduce_deploy__ (#158291)
This PR removes the integration point torch.fx had with torch::deploy (and another minor change).

Note: This PR has some broken mypy errors, but I believe those should have been in the code base beforehand, and should be fixed in a separate PR

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158291
Approved by: https://github.com/albanD
ghstack dependencies: #158288, #158290
2025-07-17 05:56:26 +00:00
a6de309ca1 [BE] Remove torch deploy | remove torch deploy specific files (#158290)
This PR removes specific files found in pytorch which are only used for torch::deploy. This is mostly testing code and a debugger.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158290
Approved by: https://github.com/albanD
ghstack dependencies: #158288
2025-07-17 05:56:18 +00:00
1a4268b811 [BE] remove torch deploy - conditionals (#158288)
This PR is part of the work to deprecate torch::deploy in OSS. Effectively it does 3 things to get started.
1. Remove test_deploy_interaction as we no longer need to worry about this
2. Remove all torch._running_with_deploy checks and use the False path always (surfaced 1)
3. Remove `USE_DEPLOY` and switch to the default path always

Note: MyPy does fail on a bunch of things here as a bunch of older files are touched. It may be better to fix these things on a separate PR

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158288
Approved by: https://github.com/albanD
2025-07-17 05:56:07 +00:00
79d7c754ab DDE-Free select with unbacked index. (#157605)
When select has a data-dependent index, we can't tell whether the actual index should be index + size or index.
To avoid throwing a DDE, we allocate a new unbacked symbol to represent the storage offset of the
output view, and we compute its value dynamically at runtime when Inductor lowers it.
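A small illustration of the data-dependent case (eager shown; under compile/export `.item()` produces an unbacked SymInt):

```python
# The select index comes from tensor data, so whether the storage offset is
# index*stride or (index+size)*stride cannot be decided at trace time.
import torch

def f(x, idx):
    i = idx.item()         # becomes an unbacked SymInt under compile/export
    return x.select(0, i)  # offset depends on the (unknown) sign of i

x = torch.arange(12.0).reshape(4, 3)
print(f(x, torch.tensor(-1)))  # eager: selects the last row
```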

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157605
Approved by: https://github.com/ColinPeppler
2025-07-17 05:08:11 +00:00
415dfabe9b [Easy] Fix the format (#158450)
When I modified the code located in test/cpp_extensions/open_registration_extension/torch_openreg/torch_openreg,
some unrelated format errors occurred.

```Python
Lint for torch/_inductor/fx_passes/fuse_attention.py:

  Error (CODESPELL) spelling error
    Failed due to ValueError:
    /pytorch/pytorch/torch/_inductor/fx_passes/fuse_attention.py:587: differnt
    ==> different

    Please either fix the error or add the word(s) to the dictionary file.
    HINT: all-lowercase words in the dictionary can cover all case variations.

Lint for torch/fx/traceback.py:

  Error (MYPY) [assignment]
    Incompatible types in assignment (expression has type "str", variable has
    type "None")

        101  |
        102  |    def _get_action_string(self):
        103  |        if self._action_string is None:
        104  |            self._action_string = "+".join([a.name.lower() for a in self.action])
        105  |        return self._action_string
        106  |
        107  |    def print_readable(self, indent=0):

  Error (MYPY) [assignment]
    Incompatible types in assignment (expression has type "dict[str, Any]",
    variable has type "None")

        121  |        if self._dict is None:
        122  |            # Convert the object to a dictionary
        123  |            action_string = self._get_action_string()
        124  |            self._dict = {
        125  |                "name": self.name,
        126  |                "target": self.target,
        127  |                "graph_id": self.graph_id,

  Error (MYPY) [return-value]
    Incompatible return value type (got "None", expected "dict[Any, Any]")

        130  |                "from_node": [node.to_dict() for node in self.from_node],
        131  |            }
        132  |
        133  |        return self._dict
        134  |
        135  |    def __eq__(self, other: object):
        136  |        if not isinstance(other, NodeSource):
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158450
Approved by: https://github.com/Skylion007
2025-07-17 04:56:10 +00:00
8eaa9f2701 Fix mask construction when dispatching index_put to masked_fill (#158472)
Fixes #158413
Previously trailing Nones in the index were incorrectly handled as implicit broadcasting dims in the mask, whereas they should just be ignored.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158472
Approved by: https://github.com/ezyang
2025-07-17 04:21:43 +00:00
ebf83b8b77 [audio hash update] update the pinned audio hash (#158402)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158402
Approved by: https://github.com/pytorchbot
2025-07-17 04:19:06 +00:00
24b49b9881 [Fix] Rework CUDA error explanation framework to be less destructive … (#158484)
…in fbsource

Fix-forward for #158395

Added `std::string c10::cuda::get_cuda_error_help(const char* error_string)` to provide a framework for appending clarifying messages to CUDA errors.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158484
Approved by: https://github.com/aorenste
2025-07-17 03:36:47 +00:00
1839e8d04b [DTensor] Assert DTensorSpec has valid placements (#158133)
This helped identify buggy sharding rules during debugging, why not
check it in.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158133
Approved by: https://github.com/XilunWu, https://github.com/zpcore
ghstack dependencies: #158132
2025-07-17 02:32:26 +00:00
2ad5c25cfc Add unified memory APIs for torch.accelerator (#152932)
# Motivation
The following APIs will be put under torch.accelerator; a short usage sketch follows the list.
- empty_cache
- max_memory_allocated
- max_memory_reserved
- memory_allocated
- memory_reserved
- memory_stats
- reset_accumulated_memory_stats
- reset_peak_memory_stats
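
A usage sketch of the new namespace, using only the APIs listed above (assumes an accelerator backend such as CUDA or XPU is available):

```python
import torch

if torch.accelerator.is_available():
    dev = torch.accelerator.current_accelerator()
    x = torch.randn(1024, 1024, device=dev)
    print(torch.accelerator.memory_allocated())
    del x
    torch.accelerator.empty_cache()
    print(torch.accelerator.max_memory_allocated())
```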

Pull Request resolved: https://github.com/pytorch/pytorch/pull/152932
Approved by: https://github.com/albanD
ghstack dependencies: #138222
2025-07-17 01:56:01 +00:00
1179e33323 Add DeviceAllocator as the base device allocator (#138222)
# Motivation
In line with the RFC [A device-agnostic Python device memory related API design for stream-based accelerators](https://github.com/pytorch/pytorch/issues/134978), some memory-related APIs are widely used in popular repositories such as HuggingFace Accelerate, which contains [many if-else conditional code paths](https://github.com/search?q=repo%3Ahuggingface%2Faccelerate%20torch.cuda.empty_cache&type=code). We would like to introduce a generic API set under the torch.accelerator namespace to generalize these use cases.

| Device-specific memory APIs `torch.xxx.foo` | Device-agnostic memory APIs `torch.accelerator.foo` |
| --- | --- |
| `torch.xxx.empty_cache` | `torch.accelerator.empty_cache` |
| `torch.xxx.reset_peak_memory_stats` | `torch.accelerator.reset_peak_memory_stats` |
| `torch.xxx.reset_accumulated_memory_stats` | `torch.accelerator.reset_accumulated_memory_stats` |
| `torch.xxx.memory_stats` | `torch.accelerator.memory_stats` |
| `torch.xxx.memory_allocated` | `torch.accelerator.memory_allocated` |
| `torch.xxx.max_memory_allocated` | `torch.accelerator.max_memory_allocated` |
| `torch.xxx.memory_reserved` | `torch.accelerator.memory_reserved` |
| `torch.xxx.max_memory_reserved` | `torch.accelerator.max_memory_reserved` |

# Solution
This design follows a similar pattern to `HostAllocator`. We're introducing a base class `DeviceAllocator`, from which `CUDAAllocator` and `XPUAllocator` will inherit. This allows us to provide a unified call path like: `torch.accelerator.empty_cache()` -> `GetDeviceAllocator(allocator)->empty_cache()`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138222
Approved by: https://github.com/albanD, https://github.com/Camyll
2025-07-17 01:56:01 +00:00
f6d138807f Always disable ShardingPropagation cache if compiling (#156868)
Fixes #151106

Addresses issue (2) in #152963 for the DTensor sharding propagation cache being brittle under compile. The existing `_are_we_tracing` from `distributed._functional_collectives`, which mostly determines whether we are currently tracing based on the Fake Tensor dispatch mode, is reused here.
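A minimal sketch of the idea (the cached/uncached propagate callables below are placeholders, not the actual method names):

```python
# Skip the sharding-propagation cache whenever we are tracing/compiling, as
# detected by _are_we_tracing() from the functional collectives module.
from torch.distributed._functional_collectives import _are_we_tracing

def propagate(op_schema, cached_propagate, uncached_propagate):
    if _are_we_tracing():
        # Under compile, fake tensors / dynamic shapes make cached results
        # brittle, so recompute the sharding decision instead.
        return uncached_propagate(op_schema)
    return cached_propagate(op_schema)
```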

**Test Plan**:
There are already tests for DTensor + Compile with dynamic shape ([test_dtensor_dynamic](https://github.com/pytorch/pytorch/blob/main/test/distributed/tensor/test_dtensor_compile.py#L260),
[test_dynamo_dtensor_from_local_dynamic_shapes](https://github.com/pytorch/pytorch/blob/main/test/distributed/tensor/test_dtensor_compile.py#L402)) that cover the change.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156868
Approved by: https://github.com/xmfan
2025-07-17 01:33:53 +00:00
c09eba877f [Device] Add support for PrivateUse1 device type in parse_type function (#157609)
This pull request refactors the `parse_type` function in `c10/core/Device.cpp` to improve the handling of the `PrivateUse1` device type. The main change involves reordering the logic to check for the `PrivateUse1` device type earlier in the function for better clarity and efficiency.

This helps existing backends migrate to PrivateUse1 smoothly.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157609
Approved by: https://github.com/jgong5, https://github.com/albanD
2025-07-17 01:27:44 +00:00
2179afd714 [easy][guards] Add developer comment for posterity (#158471)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158471
Approved by: https://github.com/StrongerXi
2025-07-17 01:17:04 +00:00
d7e1b8b11d [dynamo] Constant fold torch.autograd._profiler_enabled (#158482)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158482
Approved by: https://github.com/williamwen42, https://github.com/StrongerXi
2025-07-17 01:07:42 +00:00
b6454a9058 [AOT_inductor] model_base.h add Windows include files. (#158477)
Add Windows include files to model_base.h.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158477
Approved by: https://github.com/desertfire, https://github.com/jansel
2025-07-17 00:57:48 +00:00
e9367a7a42 ci: Add reusable workflow to get changed files in PRs (#158517)
Signed-off-by: Eli Uriegas <eliuriegas@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158517
Approved by: https://github.com/huydhn
2025-07-17 00:57:43 +00:00
clr
e78f2ac92b inductor: Fix crash in split_cat when tensors is a Node (#157155)
If there is only one node passed to aten::cat, the argument is a single node,
rather than a list of nodes with a valid length.

Example stack
```
  File "/dev/shm/uid-99/be3468a8-seed-nspid4026546656_cgpid14993614-ns-4026546628/torch/_inductor/pattern_matcher.py", line 1115, in apply
    self.handler(match, *match.args, **match.kwargs)
  File "/dev/shm/uid-99/be3468a8-seed-nspid4026546656_cgpid14993614-ns-4026546628/torch/_inductor/fx_passes/split_cat.py", line 1786, in merge_split_cat_aten
    if len(cat_inputs) < threshold_to_cat:
torch._inductor.exc.InductorError: TypeError: object of type 'Node' has no len()
```

This has failed about 7 internal jobs in the last week, running pytorch trunk code from 06/15

I've attached a test which reproduces this issue.
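A minimal sketch of the kind of guard described above (hypothetical helper, not the exact patch):

```python
# aten.cat may receive a single fx.Node instead of a list of nodes, so
# normalize before taking len().
import torch.fx as fx

def normalize_cat_inputs(cat_inputs):
    if isinstance(cat_inputs, fx.Node):
        return [cat_inputs]
    return list(cat_inputs)
```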

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157155
Approved by: https://github.com/jansel
2025-07-17 00:57:38 +00:00
82a1ee1135 Refactor Provenance Tracking (#158399)
Summary:
As Inductor provenance tracking is getting more use cases, we want to separate the provenance-tracking gating flag from the general `trace.enabled`, so we can enable provenance tracking without all the overhead of `trace.enabled`.

- change the guard flag from `trace.enabled` to `trace.provenance_tracking`.  It is turned on by either `TORCH_COMPILE_DEBUG=1` or `INDUCTOR_PROVENANCE=1`.
- Move the provenance tracking logic and variables out of DebugContext, because DebugContext is only enabled with `trace.enabled`. Since the variables are now global variables, added `reset_provenance_globals()` context manager to reset them for each `compile_fx()` call.
- Move `set_kernel_post_grad_provenance_tracing` from `util.py` to `debug.py` so now all provenance related logic is in `debug.py`.

In the future, if we want to enable it further, we can change the provenance tracking flag to be enabled when `TORCH_TRACE` is set. I think we should do that in a separate PR, so it's easier to revert if this flag change creates any problem.
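
A usage sketch based on the flags described above (illustrative only):

```python
# Enable provenance tracking via INDUCTOR_PROVENANCE without turning on the
# full trace.enabled debugging machinery.
import os

os.environ["INDUCTOR_PROVENANCE"] = "1"  # set before importing torch

import torch

@torch.compile
def f(x):
    return torch.relu(x) + 1.0

f(torch.randn(8))
```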

See more motivation in internal Diff

Test Plan:
```
buck2 run mode/dev-nosan fbcode//caffe2/test:fx -- -r test_graph_transform_observer
buck run mode/dev-nosan  fbcode//caffe2/test:fx -- -r graph_provenance
buck2 run mode/dev-nosan fbcode//caffe2/test/inductor:provenance_tracing
```

Differential Revision: D78287976

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158399
Approved by: https://github.com/angelayi
2025-07-17 00:23:00 +00:00
306dd19216 update expected results (#158497)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158497
Approved by: https://github.com/xmfan
2025-07-17 00:02:52 +00:00
1d58476162 [PP] Add eval() API to schedule (#157795)
These change add an `eval()` API to PP schedules

## Context

Currently, you can run "Forward only" for a schedule in two ways:
1. Use a custom schedule `_ScheduleForwardOnly`
2. Do not pass in `loss_fn` in schedule constructor, and no backward computations will be executed.

However, this is still limiting because we may want to run forward through the pipeline / calculate the loss, but without backward, e.g. during validation. These changes allow for this.

```python
if self.rank == 0:
    schedule.eval(x)
elif self.rank == self.world_size - 1:
    losses = []
    schedule.eval(target=target, losses=losses)
else:
    schedule.eval()
```

TODO:
- in later PRs, we will deprecate the `_ScheduleForwardOnly`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157795
Approved by: https://github.com/wconstab
2025-07-16 23:48:45 +00:00
a4d753295e [Dynamo][Better Engineering] Add enhanced typing support to _dynamo/eval_frame.py (#158276)
As part of better engineering week, we would like to improve our type support to improve the dev experience in dynamo

This PR adds strict typing support to the main entrypoint for dynamo, `eval_frame.py`

Running
```
mypy torch/_dynamo/eval_frame.py --linecount-report /tmp/coverage_log
```

| -------- | Lines Unannotated | Lines Total | % lines covered | Funcs Unannotated | Funcs Total | % funcs covered |
| -------- | ------- | -------- | ------- | ------- | ------- | ------- |
| Main  |  623 | 2232 | 27.91% | 19 | 68 | 27.94% |
| This PR | 2285 | 2285 | 100.00% | 68 | 68 | 100.00% |
| Delta    | +1662 | +63 | +72.09% | +49 | 0 | +72.06% |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158276
Approved by: https://github.com/williamwen42

Co-authored-by: William Wen <williamwen@meta.com>
2025-07-16 23:31:10 +00:00
a9f902add0 [CUDA] Use runtime driver API for cuStreamWriteValue32 (#158295)
Reopen https://github.com/pytorch/pytorch/pull/156097

Fixes https://github.com/pytorch/pytorch/issues/154073

Reference: https://github.com/NVIDIA/Fuser/pull/4197

See PR https://github.com/pytorch/pytorch/pull/156097 and https://github.com/pytorch/pytorch/pull/154097

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158295
Approved by: https://github.com/Skylion007, https://github.com/ngimel, https://github.com/eqy, https://github.com/huydhn

Co-authored-by: Wei Wang <weiwan@nvidia.com>
2025-07-16 23:14:36 +00:00
e311886e3d Add transpose to torch/csrc/stable (#158160)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158160
Approved by: https://github.com/janeyx99
2025-07-16 22:50:57 +00:00
3cb11877aa [aoti][mps] Enable test_aot_inductor.py tests (#155598)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/155598
Approved by: https://github.com/yushangdi
2025-07-16 22:26:57 +00:00
5951fcd50a [Dynamo][Better Engineering] Support typing in codegen.py (#158386)
As part of better engineering week, we would like to improve our type support to improve the dev experience in dynamo

This PR adds strict typing support to a critical tracing point for dynamo, primarily for `codegen.py` but also `config.py`

Running
```
mypy torch/_dynamo/codegen.py torch/_dynamo/config.py --linecount-report /tmp/coverage_log
```

| -------- | Lines Unannotated | Lines Total | % lines covered | Funcs Unannotated | Funcs Total | % funcs covered |
| -------- | ------- | -------- | ------- | ------- | ------- | ------- |
| Main  |  347 | 1330 | 26.09% | 24 | 50 | 48.00% |
| This PR | 1334 | 1334 | 100.00% | 50 | 50 | 100.00% |
| Delta    | +987 | +4 | +73.91.% | +26 | 0 | +52.00% |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158386
Approved by: https://github.com/StrongerXi
2025-07-16 22:09:01 +00:00
ada44e5ba7 [Dynamo][Better Engineering] Add typing to bytecode analysis and transform (#158293)
As part of better engineering week, we would like to improve our type support to improve the dev experience in dynamo

This PR adds strict typing support to a critical tracing point for dynamo, `bytecode_transformation.py` and by extension, `bytecode_analysis.py`

Running
```
mypy torch/_dynamo/bytecode_transformation.py torch/_dynamo/bytecode_analysis.py --linecount-report /tmp/coverage_log
```

| -------- | Lines Unannotated | Lines Total | % lines covered | Funcs Unannotated | Funcs Total | % funcs covered |
| -------- | ------- | -------- | ------- | ------- | ------- | ------- |
| Main  |  1422 | 1920 | 74.06% | 73 | 93 | 78.49% |
| This PR | 1968 | 1968 | 100.00% | 93 | 93 | 100.00% |
| Delta    | +546 | +48 | +25.94% | 20 | 0 | +21.51% |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158293
Approved by: https://github.com/StrongerXi, https://github.com/Skylion007
2025-07-16 21:50:55 +00:00
9df0176408 [BE][testing] Disable test_static_cuda_launcher:test_floats internally (#158296)
Summary: it seems the check for 'Offd' vs. 'Offf' doesn't work

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158296
Approved by: https://github.com/davidberard98
2025-07-16 21:27:40 +00:00
94c746bb43 [DTensor][BE] add document to ShardingPropagator.register_op_strategy (#158362)
**Summary**
Add document to `ShardingPropagator.register_op_strategy` on how to draft
`strategy_func` and when to use `schema_info`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158362
Approved by: https://github.com/zpcore
2025-07-16 21:08:59 +00:00
473208cb18 [ez][lint] Add pr_time_benchmarks to merge conflictless csv linter (#158353)
Discovered this when looking at a PR I was trying to revert and was surprised that the PR got rid of the spaces but didn't trigger the linter. It turns out the file was following the rule but wasn't actually being checked.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158353
Approved by: https://github.com/seemethere, https://github.com/Camyll
2025-07-16 20:31:07 +00:00
fb731fe371 Add warning about removed sm50 and sm60 arches (#158301)
Related to https://github.com/pytorch/pytorch/issues/157517

Detect when users are executing a torch build with CUDA 12.8/12.9 while running on Maxwell or Pascal architectures.
We would like to include a reference to the issue https://github.com/pytorch/pytorch/issues/157517, as well as ask people to install CUDA 12.6 builds if they are running on sm50 or sm60 architectures.

Test:
```
>>> torch.cuda.get_arch_list()
['sm_70', 'sm_75', 'sm_80', 'sm_86', 'sm_90', 'sm_100', 'sm_120', 'compute_120']
>>> torch.cuda.init()
/home/atalman/.conda/envs/py312/lib/python3.12/site-packages/torch/cuda/__init__.py:263: UserWarning:
    Found <GPU Name> which is of cuda capability 5.0.
    PyTorch no longer supports this GPU because it is too old.
    The minimum cuda capability supported by this library is 7.0.

  warnings.warn(
/home/atalman/.conda/envs/py312/lib/python3.12/site-packages/torch/cuda/__init__.py:268: UserWarning:
                        Support for Maxwell and Pascal architectures is removed for CUDA 12.8+ builds.
                        Please see https://github.com/pytorch/pytorch/issues/157517
                        Please install CUDA 12.6 builds if you require Maxwell or Pascal support.
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158301
Approved by: https://github.com/nWEIdia, https://github.com/albanD
2025-07-16 20:11:18 +00:00
a9ee4250d5 [4/n] Remove references to TorchScript in PyTorch docs (#158317)
Summary: jit.rst

Test Plan:
CI

Rollback Plan:

Differential Revision: D78309840

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158317
Approved by: https://github.com/svekars, https://github.com/zhxchen17
2025-07-16 20:01:34 +00:00
14ecc03361 Revert "recovering node source from dict (#158373)"
This reverts commit 4d055982e38f59fdb2a4c9d8855e58548bc42c12.

Reverted https://github.com/pytorch/pytorch/pull/158373 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/158373#issuecomment-3080093479))
2025-07-16 19:55:21 +00:00
1cc62c2cb9 [export] Update docs (#157750)
Preview: https://docs-preview.pytorch.org/pytorch/pytorch/157750/export.html

Changes:
* Rename draft_export.md -> export.draft_export.md for consistency.
* Removed non-strict section in export, instead pointed to programming model doc.
* Extended "Expressing Dynamism" section to include Dim hints, ShapeCollection, and AdditionalInputs.
* Removed Specialization section in favor of programming model doc
* Added pt2 archive doc
* Cleaned up sidebar
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157750
Approved by: https://github.com/pianpwk
2025-07-16 19:53:12 +00:00
f58a680d09 [c10d]Prototype of remote_group_merge (#158287)
Tentative implementation of merge_remote_group per the proposal here: [docs.google.com/document/d/13R-1t_yESTvmAjcCN-wQjQQadIEu0JNIdS65uZawZzY/edit?tab=t.0#heading=h.3ctbqqopzc89](https://docs.google.com/document/d/13R-1t_yESTvmAjcCN-wQjQQadIEu0JNIdS65uZawZzY/edit?tab=t.0#heading=h.3ctbqqopzc89)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158287
Approved by: https://github.com/d4l3k
ghstack dependencies: #157716
2025-07-16 19:33:57 +00:00
944a140e90 Revert "[cuda][cupy] Improve cupy device placement when device is provided (#158320)"
This reverts commit 59f9b25f3cfc635053843372ea29ff4bf754da3f.

Reverted https://github.com/pytorch/pytorch/pull/158320 on behalf of https://github.com/wdvr due to reverting because most likely causing test/test_numba_integration.py::TestNumbaIntegration::test_from_cuda_array_interface_inferred_strides to fail ([comment](https://github.com/pytorch/pytorch/pull/158320#issuecomment-3079960616))
2025-07-16 19:15:33 +00:00
cyy
79ab84e9b8 Fix invalid formatting (#158436)
It causes errors under C++20
```
/Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/native/mps/OperationUtils.mm:330:40:
error: call to consteval function 'fmt::fstring<>::fstring<std::string, 0>' is not a constant expression
```
Indeed, the printed value is treated as a format string and it may contain special chars in some cases. While this is not true in our case, it can't be determined at compile time.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158436
Approved by: https://github.com/Skylion007
2025-07-16 18:47:09 +00:00
2b0f9b1f61 Move c10/macros/Macros.h to headeronly (#158365)
^

Differential Revision: [D78361893](https://our.internmc.facebook.com/intern/diff/D78361893/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158365
Approved by: https://github.com/swolchok
ghstack dependencies: #158358
2025-07-16 18:46:52 +00:00
b40f48d191 Move the rest of c10/macros/Export.h (#158358)
Differential Revision: [D78356975](https://our.internmc.facebook.com/intern/diff/D78356975/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158358
Approved by: https://github.com/swolchok
2025-07-16 18:46:52 +00:00
4d055982e3 recovering node source from dict (#158373)
Summary: this diff recovers a NodeSource object from its dict representation, which is crucial for NodeSource serde.

Test Plan:
ci

Rollback Plan:

Differential Revision: D78363882

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158373
Approved by: https://github.com/yushangdi
2025-07-16 18:46:09 +00:00
bc9091a524 Fix indexing with multi-dimensional boolean mask (#158369)
Fixes #71673

This fixes a bug in PyTorch indexing, that shows up when mixing multi-dimensional boolean masks with other forms of indexing. Examples:
```python
>>> import torch
>>> x = torch.ones([2, 2, 3])
>>> m = torch.tensor(((True, False), (False, False)))  # (2x2 boolean mask)

>>> x[m].shape  # this works fine (the boolean mask acts on the 2x2 subspace selecting one row)
torch.Size([1, 3])

>>> x[m, 0]  # this should produce a tensor of shape (1,)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
IndexError: The shape of the mask [2, 2] at index 1 does not match the shape of the indexed tensor [2, 3] at index 1

>>> x[m, ::2]  # this should produce a tensor of shape (1, 2)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
IndexError: The shape of the mask [2, 2] at index 1 does not match the shape of the indexed tensor [2, 1, 3] at index 1

>>> x[m, None]  # this should produce a tensor of shape (1, 1, 3)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
IndexError: The shape of the mask [2, 2] at index 1 does not match the shape of the indexed tensor [2, 1, 2, 3] at index 1
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158369
Approved by: https://github.com/ngimel
2025-07-16 18:30:57 +00:00
a26bf38927 Don't need to handle PyTrace_EXCEPTION in pyProfileFn (#154392)
According to the [document](https://python.readthedocs.io/fr/stable/c-api/init.html#c.PyTrace_EXCEPTION) and [comment](https://github.com/python/cpython/blob/3.9/Modules/_lsprof.c#L407), we don't need to handle PyTrace_EXCEPTION in pyProfileFn.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/154392
Approved by: https://github.com/sraikund16, https://github.com/cyyever
2025-07-16 18:00:11 +00:00
da05b7fb94 [cond] add _FlopCounterMode support for cond (#158067)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158067
Approved by: https://github.com/zou3519
ghstack dependencies: #158077
2025-07-16 17:26:20 +00:00
82b1c48292 [hop] add supports_higher_order_operators flag to TorchDispatchMode (#158077)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158077
Approved by: https://github.com/zou3519
2025-07-16 17:26:20 +00:00
a369350065 enable compiled autograd on CPU windows (#158432)
Compiled autograd on Windows was disabled in PR #144707 because this code cannot be compiled for CUDA on Windows. However, the code can be compiled on CPU. This PR enables it on CPU Windows builds.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158432
Approved by: https://github.com/jansel, https://github.com/xmfan

Co-authored-by: Xu Han <xu.han@outlook.com>
2025-07-16 17:22:37 +00:00
ff611d971f [ROCm] check stream graph capture status in memcpy_and_sync inline function (#158165)
Check for stream graph capture when using hipMemcpyWithStream.

Fixes https://github.com/pytorch/pytorch/issues/155684, https://github.com/pytorch/pytorch/issues/155231

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158165
Approved by: https://github.com/jeffdaily
2025-07-16 17:17:34 +00:00
4805a6ead6 [aot][XPU] switch xpu to use consts cpp build. (#158425)
The Intel compiler does not support `format_consts_to_asm`, so let's use `format_consts_to_cpp`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158425
Approved by: https://github.com/jansel
2025-07-16 16:19:33 +00:00
a8b9736737 [BE][testing] disable test_custom_op_square internally (#158367)
Summary: test is failing with `ld.lld: error: unable to find library -laoti_custom_ops`

Test Plan: `buck test '@fbcode//mode/opt' fbcode//caffe2/test/inductor:test_aot_inductor_custom_ops -- --exact 'caffe2/test/inductor:test_aot_inductor_custom_ops - test_custom_op_square_cuda (caffe2.test.inductor.test_aot_inductor_custom_ops.AOTInductorTestABICompatibleCuda)' --run-disabled`

Differential Revision: [D78364617](https://our.internmc.facebook.com/intern/diff/D78364617)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158367
Approved by: https://github.com/desertfire
2025-07-16 16:16:14 +00:00
4b11428cb5 [BE][testing] Skip test_repeated_masked_load internally (#158355)
Summary: Test is failing internally because of the import from functorch.einops. _Maybe_ there's a way to get this dependency into the TARGETS file, but the obvious things didn't work. I'm wondering whether this test is important enough to have running both in OSS and internally anyway.

Test Plan:
`buck test '@fbcode//mode/opt' fbcode//caffe2/test/inductor:cuda_repro -- --exact 'caffe2/test/inductor:cuda_repro - test_repeated_masked_load (caffe2.test.inductor.test_cuda_repro.CudaReproTests)' --run-disabled`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158355
Approved by: https://github.com/eellison
2025-07-16 16:15:44 +00:00
a04a13c449 [BE][testing] Skip test_triton_interpret internally (#158260)
Summary: Subprocesses in fbcode are tricky because of .par files. I'm thinking it's not an important enough test to get it running and skipping is fine.

Test Plan: `buck test`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158260
Approved by: https://github.com/eellison
2025-07-16 16:14:44 +00:00
a23f4471b9 [ROCm][Windows] Fix finding ROCm/HIP version (#156486)
This commit fixes Windows build issue related to trying to use rocm-core (rocm-core doesn't exist on HIP SDK)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156486
Approved by: https://github.com/jeffdaily, https://github.com/stellaraccident
2025-07-16 15:31:43 +00:00
06a67a8948 Fix sha256 for aotriton ROCm7.0 tarball (#158420)
Fixes the following issue when building PyTorch with ROCm 7.0:
```
-- verifying file...
       file='/var/lib/jenkins/pytorch/build/aotriton_external-prefix/src/aotriton-0.10b-manylinux_2_28_x86_64-rocm7.0-shared.tar.gz'
-- SHA256 hash of
    /var/lib/jenkins/pytorch/build/aotriton_external-prefix/src/aotriton-0.10b-manylinux_2_28_x86_64-rocm7.0-shared.tar.gz
  does not match expected value
    expected: '7e29c325d5bd33ba896ddb106f5d4fc7d715274dca7fe937f724fffa82017838'
      actual: '1e9b3dddf0c7fc07131c6f0f5266129e83ce2331f459fa2be8c63f4ae91b0f5b'
-- Hash mismatch, removing...
CMake Error at aotriton_external-prefix/src/aotriton_external-stamp/download-aotriton_external.cmake:163 (message):
  Each download failed!
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158420
Approved by: https://github.com/jeffdaily
2025-07-16 15:24:20 +00:00
9513b9d03f Revert "Support DeepSeek-style blockwise scaling scaled-mm for fp8 on Hopper+ (#158037)"
This reverts commit bc65253369933160a2da3fc786d027a572faf6b7.

Reverted https://github.com/pytorch/pytorch/pull/158037 on behalf of https://github.com/lw due to OSX failures are real ([comment](https://github.com/pytorch/pytorch/pull/158037#issuecomment-3079042171))
2025-07-16 15:04:10 +00:00
0b19d463d9 forward fix lint (#158448)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158448
Approved by: https://github.com/adamomainz
2025-07-16 14:55:33 +00:00
5763ec5f8d [BE] Replace lib with TORCH_INSTALL_LIB_DIR (#158235)
Their values are actually the same. Just staying in line with other `INSTALL` commands.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158235
Approved by: https://github.com/Skylion007
ghstack dependencies: #158234
2025-07-16 14:20:19 +00:00
2043f6911e [BE] Rename libnvshmem_extension to libtorch_nvshmem (#158234)
`libnvshmem_extension.so` creates the illusion that it is a shared library from NVSHMEM. In fact it is built from torch source code, for the symmetric tensor infrastructure and operations, though it leverages NVSHMEM APIs. Thus this PR renames `libnvshmem_extension.so` to `libtorch_nvshmem.so`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158234
Approved by: https://github.com/albanD
2025-07-16 14:20:19 +00:00
bc65253369 Support DeepSeek-style blockwise scaling scaled-mm for fp8 on Hopper+ (#158037)
cuBLAS added support for them in CUDA 12.9. It's rather easy to call into them; the hardest part is allowing the lhs and rhs operands to have different scaling types, as that changes the whole call stack.

The scaling format is still detected from the sizes of the scale tensors.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158037
Approved by: https://github.com/eqy, https://github.com/drisspg
2025-07-16 13:54:09 +00:00
51a708ffc6 [nativert] libtorch kernel registry (#157150)
Summary: att

Test Plan:
ci

Rollback Plan:

Differential Revision: D77451703

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157150
Approved by: https://github.com/georgiaphillips, https://github.com/henryoier
2025-07-16 12:36:55 +00:00
55d888a616 Add framework for explanations for common CUDA errors (#158395)
As popularly requested in user groups.

Test plan:
```
import torch

a = torch.randn(10000)
device = torch.device('cuda:1')
a = a.to(device)
```

Before:
```
Traceback (most recent call last):
  File "/data/users/raymo/pytorch/test/cuda.py", line 6, in <module>
    a = a.to(device)
        ^^^^^^^^^^^^
torch.AcceleratorError: CUDA error: invalid device ordinal
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
```

After:
```
Traceback (most recent call last):
  File "/data/users/raymo/pytorch/test/cuda.py", line 6, in <module>
    a = a.to(device)
        ^^^^^^^^^^^^
torch.AcceleratorError: CUDA error: invalid device ordinal
GPU device may be out of range, do you have enough GPUs?
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158395
Approved by: https://github.com/aorenste

Co-authored-by: Aaron Orenstein <aorenste@fb.com>
2025-07-16 12:31:18 +00:00
0a99b026d6 [Docker builds] Move from Miniconda to Miniforge (#158370)
This is related to: https://www.anaconda.com/legal/terms/terms-of-service

Trying to fix outage with docker builds.
https://github.com/pytorch/pytorch/actions/runs/16298993712/job/46033590799

ROCm and XPU builds are not affected since they already use Miniforge.

```
#22 ERROR: process "/bin/sh -c bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt /opt/conda/requirements-docs.txt" did not complete successfully: exit code: 1
------
 > [base 14/42] RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt /opt/conda/requirements-docs.txt:
11.93 CondaToSNonInteractiveError: Terms of Service have not been accepted for the following channels. Please accept or remove them before proceeding:
11.93     • https://repo.anaconda.com/pkgs/main
11.93     • https://repo.anaconda.com/pkgs/r
11.93
11.93 To accept a channel's Terms of Service, run the following and replace `CHANNEL` with the channel name/URL:
11.93     ‣ conda tos accept --override-channels --channel CHANNEL
```
Hence the solution is either:
1. using ``conda tos accept --override-channels --channel defaults``
2. using Miniforge instead of Miniconda.

Using solution 2.

Solutions tried that don't work:
1. Using ``CONDA_ALWAYS_YES=true``
2. Using an older version of Miniconda:
```
[Miniconda3-py310_25.5.1-0-Linux-x86_64.sh](https://repo.anaconda.com/miniconda/Miniconda3-py310_25.5.1-0-Linux-x86_64.sh)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158370
Approved by: https://github.com/seemethere

Co-authored-by: Eli Uriegas <1700823+seemethere@users.noreply.github.com>
2025-07-16 10:52:47 +00:00
ac706bfc7f disable multi kernel rocm (#158299)
Fixes https://github.com/pytorch/pytorch/issues/158274

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158299
Approved by: https://github.com/huydhn
2025-07-16 10:20:09 +00:00
9d184bda2f add device generalization support for distributed tests (#156796)
MOTIVATION
To generalize Distributed test cases for non-CUDA devices

CHANGES

- test/distributed/checkpoint/test_fsspec.py
- test/distributed/checkpoint/test_state_dict.py
- test/distributed/test_multi_threaded_pg.py

Replaced hard-coded device names with torch.accelerator.current_accelerator (see the sketch after this list)

- torch/testing/_internal/distributed/_shard/sharded_tensor/__init__.py

Added support for the hccl backend
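A small sketch of the generalization, assuming a recent PyTorch where `torch.accelerator` is available; the CPU fallback below is illustrative:
```python
import torch

# Derive the device type at runtime instead of hard-coding "cuda";
# assumes that any reported accelerator is actually usable.
acc = torch.accelerator.current_accelerator()
device_type = acc.type if acc is not None else "cpu"
x = torch.ones(4, device=device_type)
```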

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156796
Approved by: https://github.com/guangyey, https://github.com/ezyang
2025-07-16 09:37:03 +00:00
ea74fdd24a [Inductor][Triton] Update TMA Compatibility Requirements (#157881)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157881
Approved by: https://github.com/Skylion007, https://github.com/drisspg
2025-07-16 09:31:44 +00:00
e71bb021b9 Add a periodic test for older NVIDIA driver (#158300)
This is needed because of the botched landing of https://github.com/pytorch/pytorch/pull/156097, which crashed on older NVIDIA drivers `525.*`.  I add a periodic job to install `525.105.17` on CI, then run:

1. A smoke test to make sure that CUDA can be initialized
2. The whole test suite on the older driver
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158300
Approved by: https://github.com/ngimel
2025-07-16 08:18:18 +00:00
fb9a5d248f Fix torch._numpy to match NumPy when empty ellipsis causes advanced indexing separation (#158297)
Fixes #141563

In NumPy, an ellipsis always acts as a separator between advanced indices, even when the ellipsis doesn't actually match any dimensions. In PyTorch an empty ellipsis doesn't cause a separation. This leads to differing behavior between NumPy and PyTorch in this edge case.

This difference in behavior leads to a bug when using torch.compile:
```python
>>> import numpy as np
>>> f = lambda x: x[:,(0,1),...,(0,1)].shape
>>> a = np.ones((3, 4, 5))
>>> f(a)
(2, 3)
>>> torch.compile(f)(a)
(3, 2)
```
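A plain-NumPy sketch of the separator rule described above: an ellipsis, even one matching zero dimensions, separates the two advanced indices and moves the broadcast advanced dims to the front:
```python
import numpy as np

a = np.ones((3, 4, 5))
print(a[:, (0, 1), ..., (0, 1)].shape)  # (2, 3): separated advanced indices
print(a[:, (0, 1), (0, 1)].shape)       # (3, 2): adjacent advanced indices
```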

Similarly to #157676, this PR doesn't change PyTorch's behavior, but it fixes the translation layer, ensuring torch._numpy compatibility with NumPy. I am marking this PR as fixing #141563, even though PyTorch behavior isn't modified.

Notice that there are still some other bugs in PyTorch's advanced indexing that need to be fixed (mainly regarding proper accounting of dimensions when multidimensional boolean masks are present). But those need to be fixed at the ATen operator level. Examples:
- #71673
- #107699
- #158125

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158297
Approved by: https://github.com/soumith
2025-07-16 08:11:53 +00:00
ddf502c988 [AOTI] add -lstdc++ into aoti link cmd for Meta internal (#158325)
Differential Revision: D78123716

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158325
Approved by: https://github.com/desertfire
2025-07-16 07:55:08 +00:00
555f356254 [Easy] Show some clear error when torch.ops.load_library fails. (#157524)
**Background**:

```Shell
torch       2.5.1+cpu
torchvision 0.20.1
```

```Python
import torch
import torchvision

Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/local/anaconda3/envs/test/lib/python3.10/site-packages/torchvision/__init__.py", line 10, in <module>
    from torchvision import _meta_registrations, datasets, io, models, ops, transforms, utils  # usort:skip
  File "/usr/local/anaconda3/envs/test/lib/python3.10/site-packages/torchvision/_meta_registrations.py", line 164, in <module>
    def meta_nms(dets, scores, iou_threshold):
  File "/usr/local/anaconda3/envs/test/lib/python3.10/site-packages/torch/library.py", line 795, in register
    use_lib._register_fake(op_name, func, _stacklevel=stacklevel + 1)
  File "/usr/local/anaconda3/envs/test/lib/python3.10/site-packages/torch/library.py", line 184, in _register_fake
    handle = entry.fake_impl.register(func_to_register, source)
  File "/usr/local/anaconda3/envs/test/lib/python3.10/site-packages/torch/_library/fake_impl.py", line 31, in register
    if torch._C._dispatch_has_kernel_for_dispatch_key(self.qualname, "Meta"):
RuntimeError: operator torchvision::nms does not exist
```

**Cause**:

```
torchvision's .so file lacks some symbol definitions because those symbols come from CUDA, but the current environment has neither CUDA nor a GPU. The above error message is very confusing.
```
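A hedged sketch (not the PR's actual change) of how a caller could attach such a hint around `torch.ops.load_library`; the wrapper name is made up for illustration:
```python
import torch

def load_library_with_hint(path: str) -> None:
    # Wrap the load so that a dlopen failure carries a hint about the
    # most common cause: a CUDA-built extension on a CPU-only install.
    try:
        torch.ops.load_library(path)
    except OSError as e:
        raise OSError(
            f"Failed to load {path}: {e}\n"
            "If the error mentions undefined symbols, the extension was "
            "likely built against a CUDA-enabled torch that is not installed."
        ) from e
```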
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157524
Approved by: https://github.com/ezyang
2025-07-16 07:33:22 +00:00
59f9b25f3c [cuda][cupy] Improve cupy device placement when device is provided (#158320)
This is an improvement over https://github.com/pytorch/pytorch/pull/132595 . That PR improves the case where `device` is not given. This PR tries to improve the case where `device` is given but the first step of auto-inferring the device from `cudaPointerGetAttributes` can be wrong (undesired). See https://github.com/pytorch/pytorch/issues/158316 for more details on when this can happen.

I think this is a reasonable improvement, as people expect `torch.as_tensor` + cupy should be zero-copy as much as possible. However, it does change some behaviors, because previously it might incur a device-to-device copy.

I will leave it to pytorch developers to see if the improvement is worthwhile.
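A hedged illustration of the intended usage, assuming CuPy is installed and at least two GPUs are visible; the device index is only for the example:
```python
import cupy as cp
import torch

with cp.cuda.Device(1):
    a = cp.arange(10)

# With an explicit device, the conversion is expected to stay zero-copy on
# that device instead of relying first on pointer-attribute inference.
t = torch.as_tensor(a, device="cuda:1")
```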

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158320
Approved by: https://github.com/ezyang
2025-07-16 07:12:36 +00:00
fedbd1a48e Enable ROCm 7.0 Alpha docker builds for PyTorch CI (#158390)
This PR adds ROCm 7.0 alpha docker builds to start testing latest ROCm in PyTorch CI and enable new MI350x hardware.

Highlights:
* Stop building `pytorch-linux-jammy-rocm-n-1-py3` docker images, as they're not currently used in any CI workflows
* Add `pytorch-linux-noble-rocm-alpha-py3` docker images that will use ROCm alpha (newer than latest official release) builds

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158390
Approved by: https://github.com/jithunnair-amd, https://github.com/jeffdaily
2025-07-16 06:09:37 +00:00
5484890539 Add better typing to available kernel options for flex attention (#158383)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158383
Approved by: https://github.com/joydddd, https://github.com/BoyuanFeng
2025-07-16 06:06:29 +00:00
61a7b09ef3 [BE][Easy] split build system requirements.txt to a separate file (#158111)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158111
Approved by: https://github.com/ezyang
2025-07-16 05:03:30 +00:00
e92e3eaf4e [Profiler] the doc of _ExperimentalConfig is incorrectly truncated by commas (#156586)
Hi team,

Please help review this trivial fix.

Without this change:

``` python
>>> import torch
>>> print(torch._C._profiler._ExperimentalConfig.__init__.__doc__)
__init__(self: torch._C._profiler._ExperimentalConfig, profiler_metrics: list[str] = [], profiler_measure_per_kernel: bool = False, verbose: bool = False, performance_events: list[str] = [], enable_cuda_sync_events: bool = False, adjust_profiler_step: bool = False, disable_external_correlation: bool = False, profile_all_threads: bool = False, capture_overload_names: bool = False) -> None

    capture_overload_names (bool) : whether to include ATen overload names in the profile
```

With this change:

```python
>>> import torch
>>> print(torch._C._profiler._ExperimentalConfig.__init__.__doc__)
__init__(self: torch._C._profiler._ExperimentalConfig, profiler_metrics: list[str] = [], profiler_measure_per_kernel: bool = False, verbose: bool = False, performance_events: list[str] = [], enable_cuda_sync_events: bool = False, adjust_profiler_step: bool = False, disable_external_correlation: bool = False, profile_all_threads: bool = False, capture_overload_names: bool = False) -> None

An experimental config for Kineto features. Please note thatbackward compatibility is not guaranteed.
    profiler_metrics : a list of CUPTI profiler metrics used
       to measure GPU performance events.
       If this list contains values Kineto runs in CUPTI profiler mode
    profiler_measure_per_kernel (bool) : whether to profile metrics per kernel
       or for the entire measurement duration.
    verbose (bool) : whether the trace file has `Call stack` field or not.
    performance_events : a list of profiler events to be used for measurement.
    enable_cuda_sync_events : for CUDA profiling mode, enable adding CUDA synchronization events
       that expose CUDA device, stream and event synchronization activities. This feature is new
       and currently disabled by default.
    adjust_profiler_step (bool) : whether to adjust the profiler step to
       match the parent python event duration. This feature is new and currently disabled by default.
    disable_external_correlation (bool) : whether to disable external correlation
    profile_all_threads (bool) : whether to profile all threads
    capture_overload_names (bool) : whether to include ATen overload names in the profile

```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156586
Approved by: https://github.com/sraikund16, https://github.com/cyyever
2025-07-16 04:10:49 +00:00
0a9d450168 [DTensor] implement histc (#158298)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158298
Approved by: https://github.com/zpcore, https://github.com/XilunWu
2025-07-16 04:10:32 +00:00
e265b719bd Extract out prepare_aot_module_simplified for use in next PR (#158319)
Also a small amount of extra code cleanup.

Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158319
Approved by: https://github.com/jingsh
ghstack dependencies: #158149, #158150, #158173, #158176, #158213, #158251
2025-07-16 03:59:41 +00:00
7637c9718a Move functions from torch._functorch.aot_autograd that are not frontend functions to frontend_utils (#158251)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158251
Approved by: https://github.com/jamesjwu
ghstack dependencies: #158149, #158150, #158173, #158176, #158213
2025-07-16 03:59:41 +00:00
49d0332cef Introduce stages to aot_dispatch (#158213)
The starting point for this refactor is that I need access to the fully
general joint graph representation in an export-like interface, but I
then subsequently need a way to feed this joint graph into the rest of
the compilation pipeline so I can get an actual callable that I can run
once I've finished modifying it.  Previously, people had added export
capabilities to AOTAutograd by having an export flag that toggled what
exactly the functions return and triggering aot_dispatch to go to a
different "export" implementation, but I've found this difficult to
understand, and it has led to a bit of duplicate code for the export path.

So the idea here is to reorganize the structure of the function calls in AOTAutograd. Here, it is helpful to first describe how things used to work:

* Start with aot_autograd.py top level functions like aot_function, _aot_export_function and aot_module_simplified. These call:
  * create_aot_dispatcher_function. This does a bunch of stuff (forward metadata collection) and adds many context managers. This calls:
    * One of aot_dispatch_base, aot_dispatch_export or aot_dispatch_autograd, which:
      * Call aot_dispatch_autograd_graph or aot_dispatch_base_graph to actually do the graph capture
      * Do some base/export/autograd specific post-processing on the graph

Notice that the pattern of nested function invocations means there is no way to easily get the graph capture result from the autograd case; furthermore, the export path is "bolted on" to force the entire chain of functions to have a different return result than normal, with no way to *resume* the rest of the post-processing to actually get a callable.

Here is the new structure:

* Start with aot_autograd.py top level functions like aot_function, _aot_export_function and aot_module_simplified. These now orchestrate this top level flow:
  * Start a context manager (stack); this stateful context block takes care of all of the nested context managers which originally necessitated the nested call structure
  * Call create_aot_state to do initial setup and setup all the context managers on stack. These context managers do NOT exit upon return of this.
  * Call aot_stage1_graph_capture to do the graph capture
  * Call aot_stage2_compile or aot_stage2_export depending on what postprocessing you want

With this new structure, it's now possible (although not done in this PR) to return the graph after aot_stage1_graph_capture and do something with it, before running aot_stage2_compile to finish the job.
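A schematic, self-contained sketch of that call structure; the stage names come from the description above, while the stubs below only mimic the control flow and are not the real implementations:
```python
from contextlib import ExitStack

def create_aot_state(fn, args, stack):    # stub: setup, metadata, context managers
    return {"fn": fn, "args": args, "stack": stack}

def aot_stage1_graph_capture(state):      # stub: joint graph capture
    return (state["fn"], state["args"])

def aot_stage2_compile(state, graph):     # stub: finish compilation to a callable
    fn, args = graph
    return lambda: fn(*args)

def aot_stage2_export(state, graph):      # stub: hand back the captured graph
    return graph

def aot_function_schematic(fn, args, *, export=False):
    with ExitStack() as stack:
        state = create_aot_state(fn, args, stack)
        graph = aot_stage1_graph_capture(state)
        if export:
            return aot_stage2_export(state, graph)
        return aot_stage2_compile(state, graph)

print(aot_function_schematic(lambda x: x + 1, (41,))())  # 42
```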

Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158213
Approved by: https://github.com/jamesjwu
ghstack dependencies: #158149, #158150, #158173, #158176
2025-07-16 03:59:32 +00:00
84dec060b7 Hoist choose_dispatcher to top level, remove unnecessary returns (#158176)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158176
Approved by: https://github.com/jamesjwu
ghstack dependencies: #158149, #158150, #158173
2025-07-16 03:56:25 +00:00
5b0df2565e Pipeline _create_aot_dispatcher_function (#158173)
Two main things of note:

- Review this diff without whitespace changes
- To ensure that context managers correctly propagate to later pipeline
  stages, I am using the ExitStack trick: there is an ExitStack which is
  in scope for the entire pipeline, and inside of the individual
  pipeline stages we push context managers onto this stack when we want
  them to survive into the next pipeline stage.  This is not obviously
  what the best final form of the code is, but
  create_aot_dispatcher_function is called from multiple locations so I
  can't just inline the context managers into the call site (see the sketch after this list).
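A minimal, self-contained illustration of the ExitStack trick: a context manager entered inside an early stage stays active for later stages because it is pushed onto a stack owned by the caller:
```python
from contextlib import ExitStack, contextmanager

@contextmanager
def stage_cm(name):
    print(f"enter {name}")
    yield
    print(f"exit {name}")

def stage1(stack):
    # Survives past stage1's return because it lives on the caller's stack.
    stack.enter_context(stage_cm("stage1"))

def stage2():
    print("stage2 runs while stage1's context is still active")

with ExitStack() as stack:
    stage1(stack)
    stage2()
# "exit stage1" prints only here, when the ExitStack unwinds.
```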

Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158173
Approved by: https://github.com/jamesjwu, https://github.com/wconstab
ghstack dependencies: #158149, #158150
2025-07-16 03:56:25 +00:00
0cb36e2d62 cache dict and string rep for better perf (#158372)
Summary: NodeSource should not be updated after creation, so it is better to cache its dict and string representation for better perf.
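A hedged sketch of the caching idea (NodeSource's real fields differ; this only shows the pattern): since the object is immutable after construction, its dict and string forms can be computed once and reused:
```python
from functools import cached_property

class NodeSourceSketch:
    def __init__(self, name: str, lineno: int):
        self._name = name
        self._lineno = lineno

    @cached_property
    def as_dict(self) -> dict:
        # Computed once on first access, then served from the cache.
        return {"name": self._name, "lineno": self._lineno}

    @cached_property
    def as_str(self) -> str:
        return f"NodeSource({self._name}:{self._lineno})"
```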

Test Plan:
ci

Rollback Plan:

Reviewed By: yushangdi

Differential Revision: D78298501

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158372
Approved by: https://github.com/yushangdi
2025-07-16 02:15:32 +00:00
584a0510b3 [inductor] fix windows path for fresh cache. (#158324)
Use `normalize_path_separator` for the Windows path.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158324
Approved by: https://github.com/jansel
2025-07-16 01:54:35 +00:00
9768d393fa add sfdp pattern (#155792)
Add an sfdp pattern for MBartForCausalLM/PLBartForCausalLM in transformers==4.44.2.
This improves the inference performance of these models.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155792
Approved by: https://github.com/Valentine233, https://github.com/jansel
2025-07-16 01:52:05 +00:00
900fba4c07 Update warning of TF32 (#158209)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158209
Approved by: https://github.com/jansel
2025-07-16 01:28:50 +00:00
03852ddc22 Revert "[ROCm] logsumexp on ROCm needs scaling back to natural base. (#156903)"
This reverts commit 1ea9cde598ead20194dbb6c5cb26e74e36e6ad55.

Reverted https://github.com/pytorch/pytorch/pull/156903 on behalf of https://github.com/atalman due to Breaks torchao and torchtitan nightly builds ([comment](https://github.com/pytorch/pytorch/pull/156903#issuecomment-3076423488))
2025-07-16 01:28:46 +00:00
8554c8007d [PT2][fusion] ban fusions with large accumulated reads (#157563)
**Problem:**
Fusion can accumulate a large amount of reads, which leads to a significant increase in peak memory utilization. Imagine we have the following code snippet
```
total = torch.rand(N, N)
for _ in range(r):
    x = torch.rand(N, N)
    total = total + x
```
The default execution is memory efficient as only two tensors of size N-by-N are in memory at any given time. However, with fusion, the additions are fused into a single operation and the execution becomes something like:
```
x_1 = torch.rand(N, N)
x_2 =  torch.rand(N, N)
...
x_r = torch.rand(N, N)
total = x_1 + x_2 + ... + x_r
```
Though this is run-time efficient, in the case of large `N` and/or large `r`, this is not memory efficient.

[internal only] see [post](https://fb.workplace.com/groups/1075192433118967/permalink/1703374333634104/) for additional details

**Solution:**
Our proposed solution is to ban fusions in cases where a large amount of reads is accumulated. This is in addition to some existing logic during torch compile.
* During lowering (i.e., `ir.py`), the config `realize_acc_reads_threshold`, which defaults to 8, controls _the number of_ buffers that can be accumulated for a single operator. However, this is oblivious to the size of the buffers. Hence, we additionally introduce a config `realize_acc_reads_size_threshold` to control _the total size_ of the buffers that can be accumulated (a usage sketch follows this list).
* During scheduling (i.e., `scheduler.py`), additional fusion will be performed and thus we also need to capture such patterns there. The decisions are implemented under `choices.py`.
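A hedged usage sketch: `realize_acc_reads_threshold` is the existing count-based knob (default 8 per the description); the value given to the new `realize_acc_reads_size_threshold` is illustrative only:
```python
import torch
import torch._inductor.config as inductor_config

inductor_config.realize_acc_reads_threshold = 8            # number of buffers
inductor_config.realize_acc_reads_size_threshold = 2**30   # bytes, illustrative

@torch.compile
def accumulate(total, n, size):
    # Without the thresholds, all n random buffers could be kept alive by a
    # single fused addition; with them, the accumulation is broken up.
    for _ in range(n):
        total = total + torch.rand(size, size)
    return total

out = accumulate(torch.rand(64, 64), 16, 64)
```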

**Results:**
For a small example similar to the one in the test case (but with larger `N` and a higher number of loop repeats), the memory snapshots before and after are shown below. Note the snapshot on the right is zoomed out so that the y-axes of the two snapshots match.

<img width="1328" alt="image" src="https://github.com/user-attachments/assets/670b5961-8454-4379-ae0f-62d4e7946c64" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157563
Approved by: https://github.com/jansel, https://github.com/mlazos
2025-07-16 01:05:25 +00:00
651b4a68f2 [hop][dynamo] track run-ahead sym variables in side effects (#158273)
Before the PR, for code like this:
```
        class Example2(torch.nn.Module):
            def forward(self, x, trigger, target):
                return torch.cond(
                    trigger == 1,
                    lambda: x + target,
                    lambda: x * target,
                    (),
                )

        m = Example2()
        x = torch.randn(2)
        trigger = 0
        target = 2
        args = (x, trigger, target)
        ep = torch.export.export(
            m, args, dynamic_shapes=(None, Dim.DYNAMIC, Dim.DYNAMIC)
        )
```
dynamo will wrap "target" (i.e. a symInt) twice, once when we speculate the first lambda and find target is a symint and decides to wrap it up, creating a new SymNodeVariable and a placeholder input to the top-level graph.

The second time happens when we speculate the second lambda. Tensors are de-duplicated by checking tracked side effects to make sure objects with the same id (though different sources) are mapped to the same TensorVariable. For symints, two things are missing:
1. it's not in the _can_lift_attrs_to_input list (the change in builder.py)
2. it's not tracked by runahead_side_effects, so when speculate_subgraph finishes, they're discarded (the change in side_effects.py)

Note: the auto-lifting mechanism for HOPs happens at the proxy level when we trace the subgraph, which is after the SymNodeVariables are created (they're created when realizing the args and binding them to the subgraph). At that time, the builder has created two unique SymNodeVariables for the same symint, so the auto lifting in HOPs cannot de-dup them.

Differential Revision: [D78298163](https://our.internmc.facebook.com/intern/diff/D78298163)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158273
Approved by: https://github.com/avikchaudhuri, https://github.com/zou3519
2025-07-15 23:48:20 +00:00
144965ca9a [BE][S538760] get rid of TORCH_CHECK_.* and CHECK macros (#158269)
Summary: a failed CHECK will crit, causing the program to exit, which is quite dangerous

Test Plan:
CI

Rollback Plan:

Differential Revision: D78050595

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158269
Approved by: https://github.com/SherlockNoMad, https://github.com/henryoier
2025-07-15 22:04:12 +00:00
ee0992871c Add test for user-managed weights with load_state_dict (#157496)
Summary:
Adds a unit test to verify that when 'user_managed=True' is passed to 'update_constant_buffer', the compiled AOTI model properly shares parameter storage with the eager model.

The test specifically covers the following:
1. Passes model weights to the AOTI model with 'user_managed=True'.
2. Updates the eager model weights using 'load_state_dict()', which performs an in-place update.
3. Asserts that the compiled AOTI model reflects the updated weights, confirming shared memory behavior.

Fixes: #157474

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157496
Approved by: https://github.com/desertfire
2025-07-15 21:17:24 +00:00
05dfd312cf [3/n] Remove references to TorchScript in PyTorch docs (#158315)
Summary:
- cpp_index.rst
- fx.md
- jit_builtin_functions.rst
- jit_python_reference.md
- jit_unsupported.md

- cpu_threading
- large_scale_deployment

Test Plan:
CI

Rollback Plan:

Differential Revision: D78309320

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158315
Approved by: https://github.com/svekars, https://github.com/zhxchen17
2025-07-15 21:14:18 +00:00
abeae997a3 Use brew suggested miniconda install command (#158347)
Use ```brew install --cask miniconda``` as specified by https://formulae.brew.sh/cask/miniconda

Forward fix After: https://github.com/pytorch/pytorch/pull/156898#issuecomment-3074207175

Seeing in CI:
```
Run if [[ -n "$REINSTALL_BREW_MINICONDA" ]]; then
==> Caveats
Please run the following to setup your shell:
  conda init "$(basename "${SHELL}")"

Alternatively, manually add the following to your shell init:
  eval "$(conda "shell.$(basename "${SHELL}")" hook)"

==> Downloading https://repo.anaconda.com/miniconda/Miniconda3-py313_25.5.1-0-MacOSX-arm64.sh
Already downloaded: /Users/ec2-user/Library/Caches/Homebrew/downloads/2e356e8b147647692e4da77ce4c0c14eefee65ec86f29cc7e8c21a26ac9397ca--Miniconda3-py313_25.5.1-0-MacOSX-arm64.sh
==> Installing Cask miniconda
==> Running installer script 'Miniconda3-py313_25.5.1-0-MacOSX-arm64.sh'
PREFIX=/opt/homebrew/Caskroom/miniconda/base
Unpacking payload ...
entry_point.py:256: DeprecationWarning: Python 3.14 will, by default, filter extracted tar archives and reject files or modify their metadata. Use the filter argument to control this behavior.
entry_point.py:256: DeprecationWarning: Python 3.14 will, by default, filter extracted tar archives and reject files or modify their metadata. Use the filter argument to control this behavior.

Installing base environment...

Preparing transaction: ...working... done
Executing transaction: ...working...
done
entry_point.py:256: DeprecationWarning: Python 3.14 will, by default, filter extracted tar archives and reject files or modify their metadata. Use the filter argument to control this behavior.
installation finished.
==> Linking Binary 'conda' to '/opt/homebrew/bin/conda'
🍺  miniconda was successfully installed!
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158347
Approved by: https://github.com/seemethere
2025-07-15 21:08:25 +00:00
3f83e3eeca [ONNX] Remove legacy registration and dispatcher (#158283)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158283
Approved by: https://github.com/Skylion007, https://github.com/justinchuby
ghstack dependencies: #158258, #158262, #158282
2025-07-15 21:00:49 +00:00
0640cfa38c [2/n] Remove references to TorchScript in PyTorch docs (#158306)
Summary: Removed jit_language_reference.md

Test Plan:
CI

Rollback Plan:

Differential Revision: D78308133

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158306
Approved by: https://github.com/svekars, https://github.com/zhxchen17
2025-07-15 20:57:23 +00:00
e4c17d5e1c [ONNX] Remove fx_onnx_interpreter.py (#158282)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158282
Approved by: https://github.com/Skylion007, https://github.com/justinchuby
ghstack dependencies: #158258, #158262
2025-07-15 20:46:06 +00:00
cc0faeb80f [dynamo][guards] Instruction count for guard eval for development work (#158214)
It's turned off by default. The code is even hidden behind a define preprocessor flag. It will be used only for development work.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158214
Approved by: https://github.com/StrongerXi
ghstack dependencies: #158215
2025-07-15 20:29:23 +00:00
205241a0d5 [ONNX] Remove legacy dynamo graph extractor (#158262)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158262
Approved by: https://github.com/justinchuby
ghstack dependencies: #158258
2025-07-15 20:21:49 +00:00
19625daf88 [1/n] Remove references to TorchScript in PyTorch docs (#158305)
Summary: Removed jit_language_reference_v2.md

Test Plan:
CI

Rollback Plan:

Differential Revision: D78308009

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158305
Approved by: https://github.com/jingsh, https://github.com/svekars
2025-07-15 20:16:53 +00:00
dbf7d421da [BE][testing] fix aot_inductor_package internally (#158270)
Summary: We have internal test failures for several aot_inductor_package tests. It looks like we're translating args like:
```
-Wl,--script=/home/slarsen/local/fbsource2/buck-out/v2/gen/fbcode/7ce8f48f92bc4ee6/caffe2/test/inductor/__aot_inductor_package__/aot_inductor_package#link-tree/torch/_inductor/script.ld
```

To:
```
-Wl,--script=/home/slarsen/local/fbsource2/buck-out/v2/gen/fbcode/7ce8f48f92bc4ee6/caffe2/test/inductor/__aot_inductor_package__/aot_inductor_package#link-tree/torch/_inductor//tmp/jZMktZ/tmpsqoxb_cq/data/aotinductor/model/script.ld
```

This PR changes to strings like:
```
-Wl,--script=/tmp/jZMktZ/tmpsqoxb_cq/data/aotinductor/model/script.ld
```

Test Plan: `buck test '@fbcode//mode/opt' fbcode//caffe2/test/inductor:aot_inductor_package --run-disabled`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158270
Approved by: https://github.com/desertfire
2025-07-15 20:15:18 +00:00
b86d5cef68 [dynamo][tensor] Skip HASATTR attribute on tensor guards (#158215)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158215
Approved by: https://github.com/StrongerXi
2025-07-15 20:10:47 +00:00
30587195d3 Migrate c10/macros/cmake_macros.h.in to torch/headeronly (#158035)
Summary: As above, also changes a bunch of the build files to be better

Test Plan:
internal and external CI

did run buck2 build fbcode//caffe2:torch and it succeeded

Rollback Plan:

Reviewed By: swolchok

Differential Revision: D78016591

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158035
Approved by: https://github.com/swolchok
2025-07-15 19:52:59 +00:00
250ae2531c Fix types in graphs.py (#158192)
Added type annotations for torch/cuda/graphs.py

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158192
Approved by: https://github.com/oulgen
2025-07-15 19:49:38 +00:00
011026205a make node source hashable (#158322)
Summary: as title

Test Plan:
ci

Rollback Plan:

Reviewed By: yushangdi

Differential Revision: D78296410

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158322
Approved by: https://github.com/yushangdi
2025-07-15 19:31:00 +00:00
4657a84bc5 [Optimus][fp8_activation_quantization] Only log when there's some node to be quantized (#158129)
Summary:
We add an extra check on whether any node has been marked as should-quantize; otherwise we skip the quantization and the tlparse log.

Rollback Plan:

Differential Revision: D78173788

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158129
Approved by: https://github.com/Skylion007, https://github.com/avicizhu
2025-07-15 19:22:26 +00:00
5606c516fd [ONNX] Remove legacy Dort (#158258)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158258
Approved by: https://github.com/justinchuby, https://github.com/malfet
2025-07-15 19:14:06 +00:00
7afb834f93 Inline dispatch_and_compile into its call site. (#158150)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158150
Approved by: https://github.com/jamesjwu, https://github.com/wconstab
ghstack dependencies: #158149
2025-07-15 19:08:55 +00:00
148789ddd8 Avoid AOTAutogradCache.load in stack trace on cache miss path (#158149)
The general context for the upcoming stack of commits is that I am attempting
to "pipeline" AOTAutograd.  Instead of having function f call function g,
which is the next "stage" of compilation, f should return its outputs,
which are then piped to g for the next stage.  This will make it easier to
implement early exit / pipeline resume without forcing a callback structure,
which is good for export-style use cases.  It also reduces the size of our
stack traces, which makes tools like Perfetto happy.
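A tiny, generic illustration of the pipelining idea (the stage names are made up): each stage returns its output instead of calling the next one, so the caller can stop early, inspect intermediates, or resume later:
```python
def stage_f(x):
    return x + 1              # new style: just return the result

def stage_g(y):
    return y * 2

# Old style would be: def stage_f(x): return stage_g(x + 1)  (deep stacks)
intermediate = stage_f(3)     # early-exit point: stop or inspect here
result = stage_g(intermediate)
print(result)                 # 8
```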

Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158149
Approved by: https://github.com/jamesjwu
2025-07-15 19:08:55 +00:00
3beb915004 Update CODEOWNERS for dataloading (#158348)
Adding Scott

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158348
Approved by: https://github.com/scotts, https://github.com/janeyx99
2025-07-15 19:06:18 +00:00
cf3247b74a Standalone compile API in _Exporter (#158139)
Given a `package: _ExportPackage`, users can get a ready-to-use workspace in `tmp_dir` by calling:
```python
package._compiled_and_package(
                tmp_dir + "/pt2_pacakge_name.pt2", True, package_example_inputs = True
            )
```

`tmp_dir` will contains:
- `main.cpp` (an example cpp file that creates the models; if package_example_inputs is True, it will also load the example inputs and run the models)
- `CMakeLists.txt`
- `pt2_pacakge_name/` (this is where the models are)
- `pt2_pacakge_name.pt2`
- `inputs.pt` files if package_example_inputs is True

Remaining TODOs
- support loading constants/weights
- the `package_example_inputs = True` option only supports a list of Tensors for now
- eventually we should remove the `torch` dependency, and use `SlimTensor`/`StableIValue` instead.

Test Plan:
```
python test/inductor/test_aot_inductor_package.py  -k test_compile_with_exporter
```

Example generated `main.cpp`:

```cpp
#include <dlfcn.h>
#include <fstream>
#include <iostream>
#include <memory>
#include <torch/torch.h>
#include <vector>
#include <torch/csrc/inductor/aoti_torch/tensor_converter.h>
#include "package/data/aotinductor/Plus__default/Plus__default.h"
#include "package/data/aotinductor/Minus__default/Minus__default.h"

using torch::aot_inductor::AOTInductorModelPlus__default;
using torch::aot_inductor::AOTInductorModelMinus__default;
using torch::aot_inductor::ConstantHandle;
using torch::aot_inductor::ConstantMap;

int main(int argc, char* argv[]) {
    std::string device_str = "cpu";
    try {
        c10::Device device(device_str);
        // Load input tensors for model Plus__default
        std::vector<at::Tensor> input_tensors1;
        for (int j = 0; j < 2; ++j) {
            std::string filename = "Plus__default_input_" + std::to_string(j) + ".pt";
            std::ifstream in(filename, std::ios::binary);
            if (!in.is_open()) {
                std::cerr << "Failed to open file: " << filename << std::endl;
                return 1;
            }
            std::vector<char> buffer((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());
            torch::IValue ivalue = torch::pickle_load(buffer);
            input_tensors1.push_back(ivalue.toTensor().to(device));
        }

        // Load input tensors for model Minus__default
        std::vector<at::Tensor> input_tensors2;
        for (int j = 0; j < 2; ++j) {
            std::string filename = "Minus__default_input_" + std::to_string(j) + ".pt";
            std::ifstream in(filename, std::ios::binary);
            if (!in.is_open()) {
                std::cerr << "Failed to open file: " << filename << std::endl;
                return 1;
            }
            std::vector<char> buffer((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());
            torch::IValue ivalue = torch::pickle_load(buffer);
            input_tensors2.push_back(ivalue.toTensor().to(device));
        }

// Create array of input handles
        auto input_handles1 =
            torch::aot_inductor::unsafe_alloc_new_handles_from_tensors(input_tensors1);
        auto input_handles2 =
            torch::aot_inductor::unsafe_alloc_new_handles_from_tensors(input_tensors2);

// Create array for output handles
        AtenTensorHandle output_handle1;
        AtenTensorHandle output_handle2;

// Create and load models
        auto constants_map1 = std::make_shared<ConstantMap>();
        auto constants_array1 = std::make_shared<std::vector<ConstantHandle>>();
        auto model1 = AOTInductorModelPlus__default::Create(
            constants_map1, constants_array1, device_str,
            "package/data/aotinductor/Plus__default/");
        model1->load_constants();
        auto constants_map2 = std::make_shared<ConstantMap>();
        auto constants_array2 = std::make_shared<std::vector<ConstantHandle>>();
        auto model2 = AOTInductorModelMinus__default::Create(
            constants_map2, constants_array2, device_str,
            "package/data/aotinductor/Minus__default/");
        model2->load_constants();

// Run the models
        torch::aot_inductor::DeviceStreamType stream1 = nullptr;
        model1->run(&input_handles1[0], &output_handle1, stream1, nullptr);
        torch::aot_inductor::DeviceStreamType stream2 = nullptr;
        model2->run(&input_handles2[0], &output_handle2, stream2, nullptr);

// Convert output handles to tensors
        auto output_tensor1 =
            torch::aot_inductor::alloc_tensors_by_stealing_from_handles(&output_handle1, 1);
        auto output_tensor2 =
            torch::aot_inductor::alloc_tensors_by_stealing_from_handles(&output_handle2, 1);

// Validate outputs
        std::cout << "output_tensor1" << output_tensor1 << std::endl;
        std::cout << "output_tensor2" << output_tensor2 << std::endl;
        return 0;
    } catch (const std::exception &e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }
}

```

Rollback Plan:

Differential Revision: D78124705

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158139
Approved by: https://github.com/desertfire
2025-07-15 18:47:56 +00:00
46915b1361 Revert "Introduce AcceleratorAllocatorConfig as the common class (#149601)"
This reverts commit 1e8e9f745e43fa38bbfc7b67b30bc66c0e7ebbd6.

Reverted https://github.com/pytorch/pytorch/pull/149601 on behalf of https://github.com/huydhn due to See https://github.com/pytorch/pytorch/pull/149601#discussion_r2208325379 ([comment](https://github.com/pytorch/pytorch/pull/149601#issuecomment-3074965720))
2025-07-15 18:40:59 +00:00
8c3f206457 Fix AArch64 segfaults by disabling strict-aliasing in GridSamplerKernel for GCC 12 and above (#158117)
This PR disables the `strict-aliasing` GCC C++ optimization flag on all AArch64 CPUs for GCC versions 12 and above.

Pull Request #152825 upgraded the gcc version from 11 to 13 in manywheel, which caused several segmentation faults in unit tests (not visible in CI workflows because the jammy gcc version has not been updated yet).

We identified that the problem also exists in GCC 12, hence the `__GNUC__ >= 12` check.

Fixes #157626

This fixes these test failures when PyTorch is built with GCC 12 and above:
```
test_ops.py::TestCommonCPU::test_noncontiguous_samples_grid_sampler_2d_cpu_float32 Fatal Python error: Segmentation fault
test_ops.py::TestCommonCPU::test_dtypes_grid_sampler_2d_cpu Fatal Python error: Segmentation fault
test_ops.py::TestMathBitsCPU::test_neg_view_nn_functional_grid_sample_cpu_float64 free(): invalid next size (fast)
test_ops.py::TestCompositeComplianceCPU::test_backward_grid_sampler_2d_cpu_float32 Fatal Python error: Segmentation fault
test_ops.py::TestCommonCPU::test_dtypes_nn_functional_grid_sample_cpu Fatal Python error: Segmentation fault

```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158117
Approved by: https://github.com/malfet
2025-07-15 18:26:38 +00:00
705 changed files with 21402 additions and 21555 deletions

View File

@ -2,7 +2,7 @@ build --cxxopt=--std=c++17
build --copt=-I.
# Bazel does not support including its cc_library targets as system
# headers. We work around this for generated code
# (e.g. c10/macros/cmake_macros.h) by making the generated directory a
# (e.g. torch/headeronly/macros/cmake_macros.h) by making the generated directory a
# system include path.
build --copt=-isystem --copt bazel-out/k8-fastbuild/bin
build --copt=-isystem --copt bazel-out/darwin-fastbuild/bin

View File

@ -78,319 +78,45 @@ elif [[ "$image" == *linter* ]]; then
DOCKERFILE="linter/Dockerfile"
fi
_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
if [[ "$image" == *rocm* ]]; then
_UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6
_UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
fi
PY_HARDCODED_CONFIG_SCRIPT=$(python3 get_config.py --image "$image")
tag=$(echo $image | awk -F':' '{print $2}')
# It's annoying to rename jobs every time you want to rewrite a
# configuration, so we hardcode everything here rather than do it
# from scratch
case "$tag" in
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11)
CUDA_VERSION=12.4
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9)
CUDA_VERSION=12.6.3
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-py3-clang12-onnx)
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=12
VISION=yes
ONNX=yes
;;
pytorch-linux-jammy-py3.9-clang12)
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=12
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.11-clang12)
ANACONDA_PYTHON_VERSION=3.11
CLANG_VERSION=12
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.9-gcc9)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=9
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-rocm-n-1-py3)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
ROCM_VERSION=6.3
if [[ $? -eq 0 ]]; then
eval "$PY_HARDCODED_CONFIG_SCRIPT"
else
echo "[Fallback] Python script failed or no match — fallback to hardcoded shell case"
# Catch-all for builds that are not hardcoded.
VISION=yes
echo "image '$image' did not match an existing build configuration"
if [[ "$image" == *py* ]]; then
extract_version_from_image_name py ANACONDA_PYTHON_VERSION
fi
if [[ "$image" == *cuda* ]]; then
extract_version_from_image_name cuda CUDA_VERSION
extract_version_from_image_name cudnn CUDNN_VERSION
fi
if [[ "$image" == *rocm* ]]; then
extract_version_from_image_name rocm ROCM_VERSION
NINJA_VERSION=1.9.0
TRITON=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-noble-rocm-n-py3)
if [[ $tag =~ "jammy" ]]; then
ANACONDA_PYTHON_VERSION=3.10
else
ANACONDA_PYTHON_VERSION=3.12
# To ensure that any ROCm config will build using conda cmake
# and thus have LAPACK/MKL enabled
fi
GCC_VERSION=11
VISION=yes
ROCM_VERSION=6.4
NINJA_VERSION=1.9.0
TRITON=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-xpu-2025.0-py3)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
XPU_VERSION=2025.0
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-xpu-2025.1-py3)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
XPU_VERSION=2025.1
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
KATEX=yes
TRITON=yes
DOCS=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12)
ANACONDA_PYTHON_VERSION=3.9
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
CLANG_VERSION=12
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-py3-clang18-asan)
ANACONDA_PYTHON_VERSION=3.10
CLANG_VERSION=18
VISION=yes
;;
pytorch-linux-jammy-py3.9-gcc11)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
KATEX=yes
TRITON=yes
DOCS=yes
UNINSTALL_DILL=yes
;;
pytorch-linux-jammy-py3-clang12-executorch)
ANACONDA_PYTHON_VERSION=3.10
CLANG_VERSION=12
EXECUTORCH=yes
;;
pytorch-linux-jammy-py3.12-halide)
CUDA_VERSION=12.6
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
HALIDE=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.12-triton-cpu)
CUDA_VERSION=12.6
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
TRITON_CPU=yes
;;
pytorch-linux-jammy-linter)
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
# We will need to update mypy version eventually, but that's for another day. The task
# would be to upgrade mypy to 1.0.0 with Python 3.11
PYTHON_VERSION=3.9
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter)
PYTHON_VERSION=3.9
CUDA_VERSION=12.8.1
;;
pytorch-linux-jammy-aarch64-py3.10-gcc11)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
ACL=yes
VISION=yes
CONDA_CMAKE=yes
OPENBLAS=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes
;;
pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
ACL=yes
VISION=yes
CONDA_CMAKE=yes
OPENBLAS=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes
INDUCTOR_BENCHMARKS=yes
;;
*)
# Catch-all for builds that are not hardcoded.
VISION=yes
echo "image '$image' did not match an existing build configuration"
if [[ "$image" == *py* ]]; then
extract_version_from_image_name py ANACONDA_PYTHON_VERSION
fi
if [[ "$image" == *cuda* ]]; then
extract_version_from_image_name cuda CUDA_VERSION
extract_version_from_image_name cudnn CUDNN_VERSION
fi
if [[ "$image" == *rocm* ]]; then
extract_version_from_image_name rocm ROCM_VERSION
NINJA_VERSION=1.9.0
TRITON=yes
# To ensure that any ROCm config will build using conda cmake
# and thus have LAPACK/MKL enabled
fi
if [[ "$image" == *centos7* ]]; then
NINJA_VERSION=1.10.2
fi
if [[ "$image" == *gcc* ]]; then
extract_version_from_image_name gcc GCC_VERSION
fi
if [[ "$image" == *clang* ]]; then
extract_version_from_image_name clang CLANG_VERSION
fi
if [[ "$image" == *devtoolset* ]]; then
extract_version_from_image_name devtoolset DEVTOOLSET_VERSION
fi
if [[ "$image" == *glibc* ]]; then
extract_version_from_image_name glibc GLIBC_VERSION
fi
;;
if [[ "$image" == *centos7* ]]; then
NINJA_VERSION=1.10.2
fi
if [[ "$image" == *gcc* ]]; then
extract_version_from_image_name gcc GCC_VERSION
fi
if [[ "$image" == *clang* ]]; then
extract_version_from_image_name clang CLANG_VERSION
fi
if [[ "$image" == *devtoolset* ]]; then
extract_version_from_image_name devtoolset DEVTOOLSET_VERSION
fi
if [[ "$image" == *glibc* ]]; then
extract_version_from_image_name glibc GLIBC_VERSION
fi
;;
esac
tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')

View File

@ -1 +1 @@
ae848267bebc65c6181e8cc5e64a6357d2679260
11ec6354315768a85da41032535e3b7b99c5f706

View File

@ -4,12 +4,8 @@ set -ex
# Optionally install conda
if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
BASE_URL="https://repo.anaconda.com/miniconda"
CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh"
if [[ $(uname -m) == "aarch64" ]] || [[ "$BUILD_ENVIRONMENT" == *xpu* ]] || [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download" # @lint-ignore
CONDA_FILE="Miniforge3-Linux-$(uname -m).sh"
fi
BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download" # @lint-ignore
CONDA_FILE="Miniforge3-Linux-$(uname -m).sh"
MAJOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 1)
MINOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 2)
@ -21,7 +17,6 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
exit 1
;;
esac
mkdir -p /opt/conda
chown jenkins:jenkins /opt/conda

View File

@ -15,11 +15,35 @@ function install_timm() {
commit=$(get_pinned_commit timm)
pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
# Clean up
conda_run pip uninstall -y torch torchvision triton
}
function install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
python install.py --continue_on_fail
# TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
# is regressing speedup metric. This needs to be investigated further
pip install transformers==4.38.1
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}
# Pango is needed for weasyprint which is needed for doctr
conda_install pango
# Stable packages are ok here, just to satisfy TorchBench check
pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
install_torchbench
install_huggingface
install_timm
# Clean up
conda_run pip uninstall -y torch torchvision torchaudio triton

View File

@ -33,13 +33,22 @@ EOF
ROCM_VERSION="${ROCM_VERSION}.1"
fi
# Default url values
rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
# Special case for ROCM_VERSION == 7.0
if [[ $(ver "$ROCM_VERSION") -eq $(ver 7.0) ]]; then
rocm_baseurl="https://repo.radeon.com/rocm/apt/7.0_alpha2"
amdgpu_baseurl="https://repo.radeon.com/amdgpu/30.10_alpha2/ubuntu"
fi
# Add amdgpu repository
UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
# Add rocm repository
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
local rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
echo "deb [arch=amd64] ${rocm_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/rocm.list
apt-get update --allow-insecure-repositories
@ -73,30 +82,30 @@ EOF
done
# ROCm 6.3 had a regression where initializing static code objects had significant overhead
# CI no longer builds for ROCm 6.3, but
# ROCm 6.4 did not yet fix the regression, also HIP branch names are different
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.3) ]] && [[ $(ver $ROCM_VERSION) -lt $(ver 7.0) ]]; then
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.4) ]] && [[ $(ver $ROCM_VERSION) -lt $(ver 7.0) ]]; then
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then
HIP_BRANCH=release/rocm-rel-6.4
VER_STR=6.4
VER_PATCH=.1
CLR_HASH=606bc820b4b1f315d135da02a1f0b176ca50a92c # branch release/rocm-rel-6.4.1-statco-hotfix
elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
HIP_BRANCH=release/rocm-rel-6.4
VER_STR=6.4
elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then
HIP_BRANCH=rocm-6.3.x
VER_STR=6.3
CLR_HASH=600f5b0d2baed94d5121e2174a9de0851b040b0c # branch release/rocm-rel-6.4-statco-hotfix
fi
# clr build needs CppHeaderParser but can only find it using conda's python
python -m pip install CppHeaderParser
git clone https://github.com/ROCm/HIP -b $HIP_BRANCH
HIP_COMMON_DIR=$(readlink -f HIP)
git clone https://github.com/jeffdaily/clr -b release/rocm-rel-${VER_STR}${VER_PATCH}-statco-hotfix
git clone https://github.com/jeffdaily/clr
pushd clr
git checkout $CLR_HASH
popd
mkdir -p clr/build
pushd clr/build
# Need to point CMake to the correct python installation to find CppHeaderParser
cmake .. -DPython3_EXECUTABLE=/opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}/bin/python3 -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR
make -j
cp hipamd/lib/libamdhip64.so.${VER_STR}.* /opt/rocm/lib/libamdhip64.so.${VER_STR}.*
cp hipamd/lib/libamdhip64.so.6.4.* /opt/rocm/lib/libamdhip64.so.6.4.*
popd
rm -rf HIP clr
fi

.ci/docker/get_config.py (new file, 350 lines)
View File

@ -0,0 +1,350 @@
import argparse
import sys
from enum import Enum
import shlex
class HardwareType(Enum):
DEFAULT = "default"
ROCM = "rocm"
@staticmethod
def from_image_name(image_name: str) -> "HardwareType":
if "rocm" in image_name:
return HardwareType.ROCM
return HardwareType.DEFAULT
class HardcodedBaseConfig:
_UCX_UCC_CONFIGS: dict[HardwareType, dict[str, str]] = {
HardwareType.DEFAULT: {
"UCX_COMMIT": "7bb2722ff2187a0cad557ae4a6afa090569f83fb",
"UCC_COMMIT": "20eae37090a4ce1b32bcce6144ccad0b49943e0b",
},
HardwareType.ROCM: {
"UCX_COMMIT": "cc312eaa4655c0cc5c2bcd796db938f90563bcf6",
"UCC_COMMIT": "0c0fc21559835044ab107199e334f7157d6a0d3d",
},
}
def __init__(self, hardwareType: HardwareType) -> None:
commits = self.get_ucx_ucc_commits(hardwareType)
self.ucx_commit = commits["UCX_COMMIT"]
self.ucc_commit = commits["UCC_COMMIT"]
def _get_tag(self, image: str):
if ":" not in image:
print(f"echo 'Invalid image format (missing :): {image}'", file=sys.stderr)
return
tag = image.split(":")[1]
return tag
def get_all_configs(self):
_TAG_CONFIGS = {
"pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11": {
"CUDA_VERSION": "12.4",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "11",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11": {
"CUDA_VERSION": "12.8.1",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "11",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks": {
"CUDA_VERSION": "12.8.1",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks": {
"CUDA_VERSION": "12.8.1",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.12",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks": {
"CUDA_VERSION": "12.8.1",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.13",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9": {
"CUDA_VERSION": "12.6.3",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
},
"pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks": {
"CUDA_VERSION": "12.6",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks": {
"CUDA_VERSION": "12.6",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.12",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks": {
"CUDA_VERSION": "12.6",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.13",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9": {
"CUDA_VERSION": "12.8.1",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
},
"pytorch-linux-jammy-py3-clang12-onnx": {
"ANACONDA_PYTHON_VERSION": "3.9",
"CLANG_VERSION": "12",
"VISION": "yes",
"ONNX": "yes",
},
"pytorch-linux-jammy-py3.9-clang12": {
"ANACONDA_PYTHON_VERSION": "3.9",
"CLANG_VERSION": "12",
"VISION": "yes",
"TRITON": "yes",
},
"pytorch-linux-jammy-py3.11-clang12": {
"ANACONDA_PYTHON_VERSION": "3.11",
"CLANG_VERSION": "12",
"VISION": "yes",
"TRITON": "yes",
},
"pytorch-linux-jammy-py3.9-gcc9": {
"ANACONDA_PYTHON_VERSION": "3.9",
"GCC_VERSION": "9",
"VISION": "yes",
"TRITON": "yes",
},
"pytorch-linux-jammy-rocm-n-py3": {
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "11",
"VISION": "yes",
"ROCM_VERSION": "6.4",
"NINJA_VERSION": "1.9.0",
"TRITON": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-noble-rocm-n-py3": {
"ANACONDA_PYTHON_VERSION": "3.12",
"GCC_VERSION": "11",
"VISION": "yes",
"ROCM_VERSION": "6.4",
"NINJA_VERSION": "1.9.0",
"TRITON": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-noble-rocm-alpha-py3": {
"ANACONDA_PYTHON_VERSION": "3.12",
"GCC_VERSION": "11",
"VISION": "yes",
"ROCM_VERSION": "7.0",
"NINJA_VERSION": "1.9.0",
"TRITON": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"INDUCTOR_BENCHMARKS": "yes",
"PYTORCH_ROCM_ARCH": "gfx90a;gfx942;gfx950",
},
"pytorch-linux-jammy-xpu-2025.0-py3": {
"ANACONDA_PYTHON_VERSION": "3.9",
"GCC_VERSION": "11",
"VISION": "yes",
"XPU_VERSION": "2025.0",
"NINJA_VERSION": "1.9.0",
"TRITON": "yes",
},
"pytorch-linux-jammy-xpu-2025.1-py3": {
"ANACONDA_PYTHON_VERSION": "3.9",
"GCC_VERSION": "11",
"VISION": "yes",
"XPU_VERSION": "2025.1",
"NINJA_VERSION": "1.9.0",
"TRITON": "yes",
},
"pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks": {
"ANACONDA_PYTHON_VERSION": "3.9",
"GCC_VERSION": "11",
"VISION": "yes",
"KATEX": "yes",
"TRITON": "yes",
"DOCS": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12": {
"ANACONDA_PYTHON_VERSION": "3.9",
"CUDA_VERSION": "12.8.1",
"CUDNN_VERSION": "9",
"CLANG_VERSION": "12",
"VISION": "yes",
"TRITON": "yes",
},
"pytorch-linux-jammy-py3-clang18-asan": {
"ANACONDA_PYTHON_VERSION": "3.10",
"CLANG_VERSION": "18",
"VISION": "yes",
},
"pytorch-linux-jammy-py3.9-gcc11": {
"ANACONDA_PYTHON_VERSION": "3.9",
"GCC_VERSION": "11",
"VISION": "yes",
"KATEX": "yes",
"TRITON": "yes",
"DOCS": "yes",
"UNINSTALL_DILL": "yes",
},
"pytorch-linux-jammy-py3-clang12-executorch": {
"ANACONDA_PYTHON_VERSION": "3.10",
"CLANG_VERSION": "12",
"EXECUTORCH": "yes",
},
"pytorch-linux-jammy-py3.12-halide": {
"CUDA_VERSION": "12.6",
"ANACONDA_PYTHON_VERSION": "3.12",
"GCC_VERSION": "11",
"HALIDE": "yes",
"TRITON": "yes",
},
"pytorch-linux-jammy-py3.12-triton-cpu": {
"CUDA_VERSION": "12.6",
"ANACONDA_PYTHON_VERSION": "3.12",
"GCC_VERSION": "11",
"TRITON_CPU": "yes",
},
"pytorch-linux-jammy-linter": {
"PYTHON_VERSION": "3.9",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter": {
"PYTHON_VERSION": "3.9",
"CUDA_VERSION": "12.8.1",
},
"pytorch-linux-jammy-aarch64-py3.10-gcc11": {
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "11",
"ACL": "yes",
"VISION": "yes",
"CONDA_CMAKE": "yes",
"OPENBLAS": "yes",
"SKIP_LLVM_SRC_BUILD_INSTALL": "yes",
},
"pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks": {
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "11",
"ACL": "yes",
"VISION": "yes",
"CONDA_CMAKE": "yes",
"OPENBLAS": "yes",
"SKIP_LLVM_SRC_BUILD_INSTALL": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
}
return _TAG_CONFIGS
def get_config(self, image_name:str) -> dict:
tag = self._get_tag(image_name)
config_dict = self.get_all_configs()
if tag not in config_dict:
raise ValueError(f"Unknown tag: {tag}")
return config_dict[tag]
def get_ucx_ucc_commits(self, hw_type: HardwareType) -> dict[str, str]:
if hw_type not in self._UCX_UCC_CONFIGS:
raise ValueError(f"Unsupported hardware type: {hw_type}")
return self._UCX_UCC_CONFIGS[hw_type]
def main():
parser = argparse.ArgumentParser(
description="Return for a given image tag."
)
parser.add_argument(
"--image", required=True, help="Full image string (e.g., repo/name:tag)"
)
args = parser.parse_args()
try:
image_name = args.image
hw_type = HardwareType.from_image_name(image_name)
config_runner = HardcodedBaseConfig(hw_type)
config = config_runner.get_config(args.image)
for key, val in config.items():
print(f'export {key}={shlex.quote(val)}')
except Exception as e:
# Any error will signal fallback
print(f"# Fallback due to error: {e}", file=sys.stderr)
sys.exit(42)
if __name__ == "__main__":
main()
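A quick way to sanity-check the new helper locally is to import it directly; a minimal sketch, assuming the file is importable as `get_config` and using one of the tags hardcoded in `get_all_configs()`:

```python
# Minimal local check of get_config.py (assumes it is importable as `get_config`;
# the tag below is one of the entries hardcoded in get_all_configs()).
from get_config import HardcodedBaseConfig, HardwareType

image = "ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11"
hw = HardwareType.from_image_name(image)          # "rocm" in the name selects the ROCm UCX/UCC pins
cfg = HardcodedBaseConfig(hw).get_config(image)   # raises ValueError for an unknown tag

assert cfg["CUDA_VERSION"] == "12.8.1"
for key, value in cfg.items():                    # the CLI prints these same pairs as `export KEY=value`
    print(f"{key}={value}")
```

The CLI emits shell-ready `export KEY=value` lines (values are shlex-quoted), and exit code 42 signals that the caller should fall back, per the comment in `main()`.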

View File

@ -361,7 +361,6 @@ pwlf==2.2.1
#Pinned versions: 2.2.1
#test that import: test_sac_estimator.py
# To build PyTorch itself
pyyaml
pyzstd
@ -389,3 +388,9 @@ tlparse==0.3.30
cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x"
#Description: required for testing CUDAGraph::raw_cuda_graph(). See https://nvidia.github.io/cuda-python/cuda-bindings/latest/support.html for how this version was chosen. Note "Any fix in the latest bindings would be backported to the prior major version" means that only the newest version of cuda-bindings will get fixes. Depending on the latest version of 12.x is okay because all 12.y versions will be supported via "CUDA minor version compatibility". Pytorch builds against 13.z versions of cuda toolkit work with 12.x versions of cuda-bindings as well because newer drivers work with old toolkits.
#test that import: test_cuda.py
setuptools-git-versioning==2.1.0
scikit-build==0.18.1
pyre-extensions==0.0.32
tabulate==0.9.0
#Description: These package are needed to build FBGEMM and torchrec on PyTorch CI

View File

@ -4,7 +4,7 @@ sphinx==5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought is probably
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
# something related to Docker setup. We can investigate this later.
sphinxcontrib.katex==0.8.6
@ -59,3 +59,4 @@ sphinx-copybutton==0.5.0
sphinx-design==0.4.0
sphinxcontrib-mermaid==1.0.0
myst-parser==0.18.1
myst-nb

View File

@ -98,8 +98,9 @@ COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
# (optional) Install non-default Ninja version
ARG NINJA_VERSION

View File

@ -98,8 +98,9 @@ COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
ARG TRITON
ARG TRITON_CPU

View File

@ -97,8 +97,7 @@ if [[ -z "$PYTORCH_ROOT" ]]; then
exit 1
fi
pushd "$PYTORCH_ROOT"
retry pip install -q "setuptools>=70.1.0" packaging
retry pip install -qU cmake ninja
retry pip install -qUr requirements-build.txt
python setup.py clean
retry pip install -qr requirements.txt
case ${DESIRED_PYTHON} in

View File

@ -92,8 +92,7 @@ if [[ -z "$PYTORCH_ROOT" ]]; then
exit 1
fi
pushd "$PYTORCH_ROOT"
retry pip install -q "setuptools>=70.1.0" packaging
retry pip install -qU cmake ninja
retry pip install -qUr requirements-build.txt
python setup.py clean
retry pip install -qr requirements.txt
retry pip install -q numpy==2.0.1

View File

@ -306,6 +306,22 @@ else
fi
pip_install_whl "$(echo dist/*.whl)"
if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *vision* ]]; then
install_torchvision
fi
if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *audio* ]]; then
install_torchaudio
fi
if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *torchrec* || "${BUILD_ADDITIONAL_PACKAGES:-}" == *fbgemm* ]]; then
install_torchrec_and_fbgemm
fi
if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *torchao* ]]; then
install_torchao
fi
if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
echo "Checking that xpu is compiled"
pushd dist/

View File

@ -78,6 +78,34 @@ function pip_install_whl() {
fi
}
function pip_build_and_install() {
local build_target=$1
local wheel_dir=$2
local found_whl=0
for file in "${wheel_dir}"/*.whl
do
if [[ -f "${file}" ]]; then
found_whl=1
break
fi
done
# Build the wheel if it doesn't exist
if [ "${found_whl}" == "0" ]; then
python3 -m pip wheel \
--no-build-isolation \
--no-deps \
--no-use-pep517 \
-w "${wheel_dir}" \
"${build_target}"
fi
for file in "${wheel_dir}"/*.whl
do
pip_install_whl "${file}"
done
}
function pip_install() {
# retry 3 times
@ -124,14 +152,7 @@ function get_pinned_commit() {
function install_torchaudio() {
local commit
commit=$(get_pinned_commit audio)
if [[ "$1" == "cuda" ]]; then
# TODO: This is better to be passed as a parameter from _linux-test workflow
# so that it can be consistent with what is set in build
TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install --no-use-pep517 "git+https://github.com/pytorch/audio.git@${commit}"
else
pip_install --no-use-pep517 "git+https://github.com/pytorch/audio.git@${commit}"
fi
pip_build_and_install "git+https://github.com/pytorch/audio.git@${commit}" dist/audio
}
function install_torchtext() {
@ -139,8 +160,8 @@ function install_torchtext() {
local text_commit
data_commit=$(get_pinned_commit data)
text_commit=$(get_pinned_commit text)
pip_install --no-use-pep517 "git+https://github.com/pytorch/data.git@${data_commit}"
pip_install --no-use-pep517 "git+https://github.com/pytorch/text.git@${text_commit}"
pip_build_and_install "git+https://github.com/pytorch/data.git@${data_commit}" dist/data
pip_build_and_install "git+https://github.com/pytorch/text.git@${text_commit}" dist/text
}
function install_torchvision() {
@ -153,7 +174,14 @@ function install_torchvision() {
echo 'char* dlerror(void) { return "";}'|gcc -fpic -shared -o "${HOME}/dlerror.so" -x c -
LD_PRELOAD=${orig_preload}:${HOME}/dlerror.so
fi
pip_install --no-use-pep517 "git+https://github.com/pytorch/vision.git@${commit}"
if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
# Not sure if both are needed, but why not
export FORCE_CUDA=1
export WITH_CUDA=1
fi
pip_build_and_install "git+https://github.com/pytorch/vision.git@${commit}" dist/vision
if [ -n "${LD_PRELOAD}" ]; then
LD_PRELOAD=${orig_preload}
fi
@ -173,25 +201,48 @@ function install_torchrec_and_fbgemm() {
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]] ; then
# install torchrec first because it installs fbgemm nightly on top of rocm fbgemm
pip_install --no-use-pep517 "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
pip_uninstall fbgemm-gpu-nightly
pip_install tabulate # needed for newer fbgemm
pip_install patchelf # needed for rocm fbgemm
git clone --recursive https://github.com/pytorch/fbgemm
pushd fbgemm/fbgemm_gpu
git checkout "${fbgemm_commit}"
python setup.py install \
--package_variant=rocm \
-DHIP_ROOT_DIR="${ROCM_PATH}" \
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
popd
local wheel_dir=dist/fbgemm_gpu
local found_whl=0
for file in "${wheel_dir}"/*.whl
do
if [[ -f "${file}" ]]; then
found_whl=1
break
fi
done
# Build the wheel if it doesn't exist
if [ "${found_whl}" == "0" ]; then
git clone --recursive https://github.com/pytorch/fbgemm
pushd fbgemm/fbgemm_gpu
git checkout "${fbgemm_commit}"
python setup.py bdist_wheel \
--package_variant=rocm \
-DHIP_ROOT_DIR="${ROCM_PATH}" \
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
popd
# Save the wheel before cleaning up
mkdir -p dist/fbgemm_gpu
cp fbgemm/fbgemm_gpu/dist/*.whl dist/fbgemm_gpu
fi
for file in "${wheel_dir}"/*.whl
do
pip_install_whl "${file}"
done
rm -rf fbgemm
else
# See https://github.com/pytorch/pytorch/issues/106971
CUDA_PATH=/usr/local/cuda-12.1 pip_install --no-use-pep517 "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu"
pip_install --no-use-pep517 "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
fi
}
@ -207,34 +258,10 @@ function clone_pytorch_xla() {
fi
}
function checkout_install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
if [ "$1" ]; then
python install.py --continue_on_fail models "$@"
else
# Occasionally the installation may fail on one model but it is ok to continue
# to install and test other models
python install.py --continue_on_fail
fi
# TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
# is regressing speedup metric. This needs to be investigated further
pip install transformers==4.38.1
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}
function install_torchao() {
local commit
commit=$(get_pinned_commit torchao)
pip_install --no-use-pep517 "git+https://github.com/pytorch/ao.git@${commit}"
pip_build_and_install "git+https://github.com/pytorch/ao.git@${commit}" dist/ao
}
function print_sccache_stats() {

View File

@ -74,12 +74,13 @@ else
fi
# Environment initialization
retry pip install -qUr requirements-build.txt
if [[ "$(uname)" == Darwin ]]; then
# Install the testing dependencies
retry pip install -q future hypothesis ${NUMPY_PACKAGE} ${PROTOBUF_PACKAGE} pytest setuptools six typing_extensions pyyaml
retry pip install -q future hypothesis ${NUMPY_PACKAGE} ${PROTOBUF_PACKAGE} pytest
else
retry pip install -qr requirements.txt || true
retry pip install -q hypothesis protobuf pytest setuptools || true
retry pip install -q hypothesis protobuf pytest || true
numpy_ver=1.15
case "$(python --version 2>&1)" in
*2* | *3.5* | *3.6*)

View File

@ -289,6 +289,12 @@ elif [[ $TEST_CONFIG == 'nogpu_AVX512' ]]; then
export ATEN_CPU_CAPABILITY=avx2
fi
if [[ "${TEST_CONFIG}" == "legacy_nvidia_driver" ]]; then
# Make sure that CUDA can be initialized
(cd test && python -c "import torch; torch.rand(2, 2, device='cuda')")
export USE_LEGACY_DRIVER=1
fi
test_python_legacy_jit() {
time python test/run_test.py --include test_jit_legacy test_jit_fuser_legacy --verbose
assert_git_not_dirty
@ -1600,7 +1606,13 @@ if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-baze
fi
if [[ "${TEST_CONFIG}" == *numpy_2* ]]; then
# Install numpy-2.0.2 and compatible scipy & numba versions
python -mpip install --pre numpy==2.0.2 scipy==1.13.1 numba==0.60.0
# Force re-install of pandas to avoid error where pandas checks numpy version from initial install and fails upon import
TMP_PANDAS_VERSION=$(python -c "import pandas; print(pandas.__version__)" 2>/dev/null)
if [ -n "$TMP_PANDAS_VERSION" ]; then
python -m pip install --pre numpy==2.0.2 scipy==1.13.1 numba==0.60.0 pandas=="$TMP_PANDAS_VERSION" --force-reinstall
else
python -m pip install --pre numpy==2.0.2 scipy==1.13.1 numba==0.60.0
fi
python test/run_test.py --include dynamo/test_functions.py dynamo/test_unspec.py test_binary_ufuncs.py test_fake_tensor.py test_linalg.py test_numpy_interop.py test_tensor_creation_ops.py test_torch.py torch_np/test_basic.py
elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" != *perf_cpu_aarch64* ]]; then
test_linux_aarch64
@ -1654,49 +1666,37 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
id=$((SHARD_NUMBER-1))
test_dynamo_benchmark timm_models "$id"
elif [[ "${TEST_CONFIG}" == cachebench ]]; then
install_torchaudio cuda
install_torchaudio
install_torchvision
checkout_install_torchbench nanogpt BERT_pytorch resnet50 hf_T5 llama moco
PYTHONPATH=$(pwd)/torchbench test_cachebench
PYTHONPATH=/torchbench test_cachebench
elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
install_torchaudio cpu
install_torchaudio
install_torchvision
checkout_install_torchbench nanogpt
PYTHONPATH=$(pwd)/torchbench test_verify_cachebench
PYTHONPATH=/torchbench test_verify_cachebench
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
install_torchaudio cpu
else
install_torchaudio cuda
fi
install_torchaudio
install_torchvision
TORCH_CUDA_ARCH_LIST="8.0;8.6" install_torchao
install_torchao
id=$((SHARD_NUMBER-1))
# https://github.com/opencv/opencv-python/issues/885
pip_install opencv-python==4.8.0.74
if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
checkout_install_torchbench hf_Bert hf_Albert timm_vision_transformer
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
PYTHONPATH=/torchbench test_inductor_torchbench_smoketest_perf
elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then
checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_edgecnn \
llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \
functorch_maml_omniglot yolov3 mobilenet_v2 resnext50_32x4d densenet121 mnasnet1_0
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf
PYTHONPATH=/torchbench test_inductor_torchbench_cpu_smoketest_perf
elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then
checkout_install_torchbench
TORCHBENCHPATH=$(pwd)/torchbench test_torchbench_gcp_smoketest
TORCHBENCHPATH=/torchbench test_torchbench_gcp_smoketest
else
checkout_install_torchbench
# Do this after checkout_install_torchbench to ensure we clobber any
# nightlies that torchbench may pull in
if [[ "${TEST_CONFIG}" != *cpu* ]]; then
install_torchrec_and_fbgemm
fi
PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
fi
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchvision
PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
test_inductor_aoti
fi

View File

@ -148,14 +148,7 @@ if "%NVIDIA_GPU_EXISTS%" == "0" (
goto end
)
set BUILD_SPLIT_CUDA=
if exist "%install_root%\lib\torch_cuda_cu.lib" if exist "%install_root%\lib\torch_cuda_cpp.lib" set BUILD_SPLIT_CUDA=ON
if "%BUILD_SPLIT_CUDA%" == "ON" (
cl %PYTORCH_ROOT%\.ci\pytorch\test_example_code\check-torch-cuda.cpp torch_cpu.lib c10.lib torch_cuda_cu.lib torch_cuda_cpp.lib /EHsc /std:c++17 /link /INCLUDE:?warp_size@cuda@at@@YAHXZ /INCLUDE:?_torch_cuda_cu_linker_symbol_op_cuda@native@at@@YA?AVTensor@2@AEBV32@@Z
) else (
cl %PYTORCH_ROOT%\.ci\pytorch\test_example_code\check-torch-cuda.cpp torch_cpu.lib c10.lib torch_cuda.lib /EHsc /std:c++17 /link /INCLUDE:?warp_size@cuda@at@@YAHXZ
)
cl %PYTORCH_ROOT%\.ci\pytorch\test_example_code\check-torch-cuda.cpp torch_cpu.lib c10.lib torch_cuda.lib /EHsc /std:c++17 /link /INCLUDE:?warp_size@cuda@at@@YAHXZ
.\check-torch-cuda.exe
if ERRORLEVEL 1 exit /b 1

View File

@ -184,7 +184,8 @@ tmp_env_name="wheel_py$python_nodot"
conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS}
source activate "$tmp_env_name"
pip install "numpy=${NUMPY_PINNED_VERSION}" "pyyaml${PYYAML_PINNED_VERSION}" requests ninja "setuptools${SETUPTOOLS_PINNED_VERSION}" typing_extensions
retry pip install -r "${pytorch_rootdir}/requirements-build.txt"
pip install "numpy=${NUMPY_PINNED_VERSION}" "pyyaml${PYYAML_PINNED_VERSION}" requests ninja "setuptools${SETUPTOOLS_PINNED_VERSION}" typing-extensions
retry pip install -r "${pytorch_rootdir}/requirements.txt" || true
retry brew install libomp

View File

@ -126,7 +126,7 @@ runs:
shell: bash
continue-on-error: true
run: |
python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
python3 -m pip install psutil==5.9.8 nvidia-ml-py==11.525.84
python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

View File

@ -1 +1 @@
6c57850358f34c47802db216b0746e4e9d08a95a
00b0c91db92c51a11356249262577b9fa26c18c5

.github/ci_commit_pins/vllm.txt (new vendored file, 1 line)
View File

@ -0,0 +1 @@
29d1ffc5b4c763ef76aff9e3f617fa60dd292418

View File

@ -76,6 +76,7 @@
- .github/ci_commit_pins/audio.txt
- .github/ci_commit_pins/vision.txt
- .github/ci_commit_pins/torchdynamo.txt
- .github/ci_commit_pins/vllm.txt
- .ci/docker/ci_commit_pins/triton.txt
approved_by:
- pytorchbot

View File

@ -1,5 +1,6 @@
# This file is to cache other dependencies not specified elsewhere in:
# requirement.txt
# requirements.txt
# requirements-build.txt
# docs/requirements.txt
# docs/cpp/requirements.txt
# functorch/docs/requirements.txt

View File

@ -16,7 +16,7 @@ packaging==23.1
parameterized==0.8.1
pillow==10.3.0
protobuf==5.29.4
psutil==5.9.1
psutil==5.9.8
pygments==2.15.0
pytest-cpp==2.3.0
pytest-flakefinder==1.1.0

View File

@ -0,0 +1,43 @@
name: Get Changed Files
on:
workflow_call:
outputs:
changed-files:
description: "List of changed files (space-separated) or '*' if not in a PR"
value: ${{ jobs.get-changed-files.outputs.changed-files }}
jobs:
get-changed-files:
runs-on: ubuntu-latest
outputs:
changed-files: ${{ steps.get-files.outputs.changed-files }}
steps:
- name: Get changed files
id: get-files
env:
GH_TOKEN: ${{ github.token }}
run: |
# Check if we're in a pull request context
if [ "${{ github.event_name }}" = "pull_request" ] || [ "${{ github.event_name }}" = "pull_request_target" ]; then
echo "Running in PR context"
# Get the PR number from the github context
PR_NUMBER="${{ github.event.number }}"
# Use gh CLI to get changed files in the PR with explicit repo
CHANGED_FILES=$(gh pr view "$PR_NUMBER" --repo "${{ github.repository }}" --json files --jq '.files[].path' | tr '\n' ' ' | sed 's/ $//')
if [ -z "$CHANGED_FILES" ]; then
echo "No changed files found, setting to '*'"
CHANGED_FILES="*"
fi
echo "Changed files: $CHANGED_FILES"
echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
else
echo "Not in PR context, setting changed files to '*'"
echo "changed-files=*" >> "$GITHUB_OUTPUT"
fi

View File

@ -16,11 +16,6 @@ on:
type: boolean
default: true
description: If set, upload generated build artifacts.
build-with-debug:
required: false
type: boolean
default: false
description: If set, build in debug mode.
sync-tag:
required: false
type: string
@ -87,7 +82,6 @@ on:
required: false
type: number
default: 1
allow-reuse-old-whl:
description: |
If set, the build try to pull an old wheel from s3 that was built on a
@ -95,6 +89,13 @@ on:
required: false
type: boolean
default: true
build-additional-packages:
description: |
If set, the build job will also builds these packages and saves their
wheels as artifacts
required: false
type: string
default: ""
secrets:
HUGGING_FACE_HUB_TOKEN:
@ -106,7 +107,6 @@ on:
description: |
FB app token to write to scribe endpoint
outputs:
docker-image:
value: ${{ jobs.build.outputs.docker-image }}
@ -225,7 +225,7 @@ jobs:
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
run: |
mkdir -p ../../usage_logs
python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7
python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7
python3 -m tools.stats.monitor \
--log-interval "$MONITOR_LOG_INTERVAL" \
--data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" \
@ -247,8 +247,6 @@ jobs:
env:
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
# TODO duplicated
AWS_DEFAULT_REGION: us-east-1
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
# Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
@ -260,10 +258,10 @@ jobs:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
DOCKER_IMAGE_S390X: ${{ inputs.docker-image-name }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
DEBUG: ${{ inputs.build-with-debug && '1' || '0' }}
OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
BUILD_ADDITIONAL_PACKAGES: ${{ inputs.build-additional-packages }}
run: |
START_TIME=$(date +%s)
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
@ -295,7 +293,6 @@ jobs:
container_name=$(docker run \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e AWS_DEFAULT_REGION \
-e PR_NUMBER \
-e SHA1 \
-e BRANCH \
@ -310,6 +307,7 @@ jobs:
-e HUGGING_FACE_HUB_TOKEN \
-e SCRIBE_GRAPHQL_ACCESS_TOKEN \
-e USE_SPLIT_BUILD \
-e BUILD_ADDITIONAL_PACKAGES \
--memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
--memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
@ -323,6 +321,11 @@ jobs:
"${USED_IMAGE}" \
${DOCKER_SHELL_CMD}
)
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
docker exec -t "${container_name}" sh -c "python3 -m pip install -r requirements.txt"
fi
docker exec -t "${container_name}" sh -c '.ci/pytorch/build.sh'
END_TIME=$(date +%s)

View File

@ -164,6 +164,8 @@ jobs:
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
id: install-nvidia-driver
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
with:
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && matrix.runner != 'B200' }}
- name: Setup GPU_FLAG for docker run
@ -203,7 +205,7 @@ jobs:
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
run: |
python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

View File

@ -136,7 +136,7 @@ jobs:
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
run: |
"$VENV_PATH/bin/python3" -m pip install psutil==5.9.1 dataclasses_json==0.6.7
"$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_sajson==0.6.7
"$VENV_PATH/bin/python3" -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
@ -281,7 +281,7 @@ jobs:
continue-on-error: true
run: |
if [[ -n "$REINSTALL_BREW_MINICONDA" ]]; then
brew install miniconda
brew install --cask miniconda
fi
- name: Clean up disk space

View File

@ -132,7 +132,7 @@ jobs:
shell: bash
continue-on-error: true
run: |
python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7
python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7
python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

View File

@ -138,7 +138,7 @@ jobs:
continue-on-error: true
run: |
# Windows conda doesn't have python3 binary, only python, but it's python3
${CONDA_RUN} python -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
${CONDA_RUN} python -m pip install psutil==5.9.8 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
${CONDA_RUN} python -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

View File

@ -133,7 +133,7 @@ jobs:
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
run: |
python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

View File

@ -62,9 +62,9 @@ jobs:
pytorch-linux-jammy-py3.11-clang12,
pytorch-linux-jammy-py3.12-clang12,
pytorch-linux-jammy-py3.13-clang12,
pytorch-linux-jammy-rocm-n-1-py3,
pytorch-linux-jammy-rocm-n-py3,
pytorch-linux-noble-rocm-n-py3,
pytorch-linux-noble-rocm-alpha-py3,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12,
pytorch-linux-jammy-py3.9-gcc11,
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,

View File

@ -48,6 +48,7 @@ jobs:
{ config: "dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
{ config: "dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-test:

View File

@ -43,6 +43,7 @@ jobs:
{ config: "inductor_timm_perf_compare", shard: 2, num_shards: 2, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf_compare", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
test:

View File

@ -116,6 +116,7 @@ jobs:
{ config: "inductor_torchbench_perf_cpu_aarch64", shard: 15, num_shards: 15, runner: "linux.arm64.m7g.metal" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio torchao"
secrets: inherit

View File

@ -86,6 +86,11 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Use a bigger runner here because CUDA_ARCH 9.0 is only built for H100
# or newer GPUs, so it doesn't benefit much from existing compiler cache
# from trunk. Also use a memory-intensive runner here because memory is
# usually the bottleneck
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '9.0'
@ -114,6 +119,7 @@ jobs:
{ config: "inductor_torchbench_perf_cuda_h100", shard: 9, num_shards: 9, runner: "linux.aws.h100" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
test-periodically:

View File

@ -98,6 +98,7 @@ jobs:
{ config: "inductor_torchbench_perf_cpu_x86", shard: 4, num_shards: 4, runner: "linux.24xl.spr-metal" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio torchao"
secrets: inherit
linux-jammy-cpu-py3_9-gcc11-inductor-test-nightly-freezing:

View File

@ -86,6 +86,8 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Every bit to make perf run faster helps
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
@ -112,6 +114,7 @@ jobs:
{ config: "cachebench", shard: 2, num_shards: 2, runner: "linux.aws.a100" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
test-nightly:

View File

@ -58,6 +58,7 @@ jobs:
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-test:
@ -125,6 +126,7 @@ jobs:
{ include: [
{ config: "inductor_torchbench_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-test:
@ -159,6 +161,7 @@ jobs:
{ config: "cpu_inductor_freezing_avx2_timm", shard: 1, num_shards: 2, runner: "linux.10xlarge.avx2" },
{ config: "cpu_inductor_freezing_avx2_timm", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-test:
@ -195,6 +198,7 @@ jobs:
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
@ -240,6 +244,7 @@ jobs:
{ config: "dynamic_cpu_aot_inductor_amp_freezing_torchbench", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
{ config: "dynamic_cpu_aot_inductor_amp_freezing_torchbench", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
linux-jammy-cpu-py3_9-gcc11-inductor-test:

View File

@ -62,6 +62,7 @@ jobs:
{ config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
@ -94,6 +95,7 @@ jobs:
{ config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
{ config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.24xl.spr-metal" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
linux-jammy-cpu-py3_9-gcc11-inductor-test:

View File

@ -26,9 +26,30 @@ jobs:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
get-changed-files:
if: github.repository_owner == 'pytorch'
name: Get changed files
uses: ./.github/workflows/_get-changed-files.yml
lintrunner-clang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
needs: get-label-type
needs: [get-label-type, get-changed-files]
# Only run if there are changed files relevant to clangtidy / clangformat
if: |
github.repository_owner == 'pytorch' && (
needs.get-changed-files.outputs.changed-files == '*' ||
contains(needs.get-changed-files.outputs.changed-files, '.h') ||
contains(needs.get-changed-files.outputs.changed-files, '.cpp') ||
contains(needs.get-changed-files.outputs.changed-files, '.cc') ||
contains(needs.get-changed-files.outputs.changed-files, '.cxx') ||
contains(needs.get-changed-files.outputs.changed-files, '.hpp') ||
contains(needs.get-changed-files.outputs.changed-files, '.hxx') ||
contains(needs.get-changed-files.outputs.changed-files, '.cu') ||
contains(needs.get-changed-files.outputs.changed-files, '.cuh') ||
contains(needs.get-changed-files.outputs.changed-files, '.mm') ||
contains(needs.get-changed-files.outputs.changed-files, '.metal')
)
with:
timeout: 120
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
@ -39,13 +60,27 @@ jobs:
submodules: true
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
export ADDITIONAL_LINTRUNNER_ARGS="--take CLANGTIDY,CLANGFORMAT --all-files"
CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
if [ "$CHANGED_FILES" = "*" ]; then
export ADDITIONAL_LINTRUNNER_ARGS="--take CLANGTIDY,CLANGFORMAT --all-files"
else
export ADDITIONAL_LINTRUNNER_ARGS="--take CLANGTIDY,CLANGFORMAT $CHANGED_FILES"
fi
export CLANG=1
.github/scripts/lintrunner.sh
lintrunner-noclang:
# NOTE: mypy needs its own job because it depends on --all-files; without assessing all files it sometimes
# fails to find types when it should
lintrunner-mypy:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
needs: get-label-type
needs: [get-label-type, get-changed-files]
# Only run if there are changed files relevant to mypy
if: |
github.repository_owner == 'pytorch' && (
needs.get-changed-files.outputs.changed-files == '*' ||
contains(needs.get-changed-files.outputs.changed-files, '.py') ||
contains(needs.get-changed-files.outputs.changed-files, '.pyi')
)
with:
timeout: 120
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
@ -56,8 +91,30 @@ jobs:
submodules: true
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
export ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT --all-files"
.github/scripts/lintrunner.sh
CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
echo "Running mypy"
ADDITIONAL_LINTRUNNER_ARGS="--take MYPY --all-files" .github/scripts/lintrunner.sh
lintrunner-noclang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
needs: [get-label-type, get-changed-files]
with:
timeout: 120
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
docker-image: ci-image:pytorch-linux-jammy-linter
# NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
# to run git rev-parse HEAD~:.ci/docker when a new image is needed
fetch-depth: 0
submodules: true
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
echo "Running all other linters"
if [ "$CHANGED_FILES" = '*' ]; then
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY --all-files" .github/scripts/lintrunner.sh
else
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY ${CHANGED_FILES}" .github/scripts/lintrunner.sh
fi
quick-checks:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@ -260,6 +317,7 @@ jobs:
check-latest: false
cache: pip
cache-dependency-path: |
**/requirements-build.txt
**/requirements.txt
- name: Setup Min Python version
if: matrix.test_type != 'older_python_version'
@ -270,6 +328,7 @@ jobs:
check-latest: false
cache: pip
cache-dependency-path: |
**/requirements-build.txt
**/requirements.txt
- name: Install torch
if: matrix.test_type == 'with_torch'

View File

@ -83,6 +83,10 @@ jobs:
repo-owner: triton-lang
branch: main
pin-folder: .ci/docker/ci_commit_pins
- repo-name: vllm
repo-owner: vllm-project
branch: main
pin-folder: .github/ci_commit_pins
# Allow this to be triggered on either a schedule or on workflow_dispatch to allow for easier testing
if: github.repository_owner == 'pytorch' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
steps:

View File

@ -82,6 +82,36 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-sm89-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_4-py3_10-gcc11-build:
name: linux-jammy-cuda12.4-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.4-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11
test-matrix: |
{ include: [
{ config: "legacy_nvidia_driver", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "legacy_nvidia_driver", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "legacy_nvidia_driver", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "legacy_nvidia_driver", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "legacy_nvidia_driver", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_4-py3_10-gcc11-test:
name: linux-jammy-cuda12.4-py3.10-gcc11
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_4-py3_10-gcc11-build
- target-determination
with:
build-environment: linux-jammy-cuda12.4-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-build:
name: linux-jammy-cuda12.8-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml
@ -127,7 +157,6 @@ jobs:
{ config: "multigpu", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
{ config: "multigpu", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
]}
build-with-debug: false
secrets: inherit
linux-jammy-cuda12_8-py3_9-gcc9-test:
@ -148,7 +177,6 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9
build-with-debug: true
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] },

View File

@ -37,7 +37,7 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: "linux.12xlarge"
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '9.0'

View File

@ -500,7 +500,7 @@ include_patterns = [
'**/*.h',
]
exclude_patterns = [
'c10/macros/Macros.h',
'torch/headeronly/macros/Macros.h',
]
command = [
'python3',
@ -523,7 +523,7 @@ include_patterns = [
'**/*.h',
]
exclude_patterns = [
'c10/macros/Macros.h',
'torch/headeronly/macros/Macros.h',
]
command = [
'python3',
@ -1162,14 +1162,9 @@ exclude_patterns = [
# These files are all grandfathered in, feel free to remove from this list
# as necessary
# NOTE: remove the patterns in the order they are listed
'aten/**',
'aten/src/ATen/native/**',
'aten/src/ATen/native/q*/**',
'aten/src/ATen/native/[a-pA-P]*/**',
'aten/src/ATen/[a-mA-M]*/**',
'test/**',
'test/[a-hA-h]*/**',
'torch/distributed/tensor/**',
]
init_command = [
'python3',
@ -1605,7 +1600,10 @@ is_formatter = true
# the same line, merge conflicts should not arise in git or hg
[[linter]]
code = 'MERGE_CONFLICTLESS_CSV'
include_patterns = ['benchmarks/dynamo/ci_expected_accuracy/*.csv']
include_patterns = [
'benchmarks/dynamo/ci_expected_accuracy/*.csv',
'benchmarks/dynamo/pr_time_benchmarks/expected_results.csv',
]
command = [
'python3',
'tools/linter/adapters/no_merge_conflict_csv_linter.py',

View File

@ -1190,10 +1190,6 @@ if(APPLE)
append_cxx_flag_if_supported("-Wno-missing-braces" CMAKE_CXX_FLAGS)
endif()
if(USE_XPU)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_XPU")
endif()
if(EMSCRIPTEN)
string(
APPEND
@ -1245,6 +1241,7 @@ if(USE_MIMALLOC AND USE_MIMALLOC_ON_MKL)
endif()
# ---[ Main build
add_subdirectory(torch/headeronly) # headeronly headers
add_subdirectory(c10)
add_subdirectory(caffe2)

View File

@ -136,7 +136,7 @@ torch/profiler/ @sraikund16
test/functorch/test_aotdispatch.py @ezyang @Chillee
# Dataloader
torch/utils/data/ @divyanshk @ramanishsingh
torch/utils/data/ @divyanshk @ramanishsingh @scotts
# hipify
torch/utils/hipify/ @jeffdaily @jithunnair-amd

View File

@ -33,7 +33,7 @@ RUN case ${TARGETPLATFORM} in \
*) MINICONDA_ARCH=x86_64 ;; \
esac && \
curl -fsSL -v -o ~/miniconda.sh -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-${MINICONDA_ARCH}.sh"
COPY requirements.txt .
COPY requirements.txt requirements-build.txt .
# Manually invoke bash on miniconda script per https://github.com/conda/conda/issues/10431
RUN chmod +x ~/miniconda.sh && \
bash ~/miniconda.sh -b -p /opt/conda && \

View File

@ -294,14 +294,14 @@ Install PyTorch
```bash
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
python -m pip install -r requirements.txt
python -m pip install -r requirements-build.txt
python -m pip install --no-build-isolation -v -e .
```
**On macOS**
```bash
python -m pip install -r requirements.txt
python -m pip install -r requirements-build.txt
python -m pip install --no-build-isolation -v -e .
```
@ -520,7 +520,7 @@ on [our website](https://pytorch.org/get-started/previous-versions).
## Getting Started
Three-pointers to get you started:
Three pointers to get you started:
- [Tutorials: get you started with understanding and using PyTorch](https://pytorch.org/tutorials/)
- [Examples: easy to understand PyTorch code across all domains](https://github.com/pytorch/examples)
- [The API Reference](https://pytorch.org/docs/)

View File

@ -458,7 +458,7 @@ if(LAPACK_FOUND)
# would not need this at all), some of our libraries (magma in particular)
# backend to CPU BLAS/LAPACK implementations, and so it is very important
# we get the *right* implementation, because even if the symbols are the
# same, LAPACK implementions may have different calling conventions.
# same, LAPACK implementations may have different calling conventions.
# This caused https://github.com/pytorch/pytorch/issues/7353
#
# We do NOT do this on Linux, since we just rely on torch_cpu to

View File

@ -27,7 +27,7 @@ namespace {
These const variables define the fp32 precisions for different backends.
We have "generic", "cuda", "mkldnn" backends now and we can choose fp32
precision from "ieee", "tf32", "bf16" and "none". The "ieee" precision means
IEEE standard floating point format "tf32" and "bf16" means we are allowed to
IEEE standard floating point format, "tf32" and "bf16" means we are allowed to
use "tf32" or "bf16" as internal computation data types for fp32 computations.
And "none" means it is override-able by parent's node
@ -40,7 +40,7 @@ namespace {
*/
const std::map<std::string, std::vector<std::string>> _fp32_precisions = {
{"generic", {{"ieee", "tf32", "bf16", "none"}}},
{"mkldnn", {{"ieee", "bf16", "none"}}},
{"mkldnn", {{"ieee", "tf32", "bf16", "none"}}},
{"cuda", {{"ieee", "tf32", "none"}}}};
// Check whether the backend and op are legal
@ -76,7 +76,9 @@ void check_fp32_prec_backend_and_op(
C10_ALWAYS_INLINE void warn_deprecated_fp32_precision_api(){
TORCH_WARN_ONCE(
"This API is going to be deprecated, please see "
"Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' "
"or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, "
"torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see "
"https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices"
);
}
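For context, the migration the updated warning points users toward looks roughly like this on the Python side; a hedged sketch, with the `fp32_precision` attribute names taken from the warning text rather than verified against a specific release:

```python
# Sketch of the migration described by the deprecation warning above.
# Attribute names follow the warning text; availability depends on the PyTorch build.
import torch

# Legacy global toggles, slated for deprecation after PyTorch 2.9:
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# New per-backend, per-op controls:
torch.backends.cuda.matmul.fp32_precision = "tf32"
torch.backends.cudnn.conv.fp32_precision = "tf32"
```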
@ -368,6 +370,9 @@ Float32MatmulPrecision Context::float32MatmulPrecision() const {
invalid = invalid ||
(float32Precision("mkldnn", "matmul") == "bf16" &&
float32_matmul_precision != at::Float32MatmulPrecision::MEDIUM);
invalid = invalid ||
(float32Precision("mkldnn", "matmul") == "tf32" &&
float32_matmul_precision != at::Float32MatmulPrecision::HIGH);
TORCH_CHECK(
!invalid,
"PyTorch is checking the matmul precision without a specific backend name,",
@ -401,7 +406,7 @@ void Context::setFloat32MatmulPrecision(const std::string &s) {
} else if (s_ == "high") {
float32_matmul_precision = at::Float32MatmulPrecision::HIGH;
setFloat32Precision("cuda", "matmul", "tf32");
setFloat32Precision("mkldnn", "matmul", "ieee");
setFloat32Precision("mkldnn", "matmul", "tf32");
return true;
} else if (s_ == "medium") {
float32_matmul_precision = at::Float32MatmulPrecision::MEDIUM;
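
For context, a minimal Python sketch of the TF32 controls referenced by the new deprecation message above. The attribute names are taken from that message, and the mapping of `torch.set_float32_matmul_precision("high")` to the mkldnn tf32 setting from the Context.cpp change; this assumes a build where the new `fp32_precision` API is available.

```python
import torch

# Old-style toggles (slated for deprecation after PyTorch 2.9 per the warning above).
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# New-style per-backend, per-op knobs named in the deprecation message.
torch.backends.cuda.matmul.fp32_precision = "tf32"
torch.backends.cudnn.conv.fp32_precision = "tf32"

# The high-level switch now maps "high" to tf32 for the mkldnn matmul backend as well.
torch.set_float32_matmul_precision("high")
print(torch.get_float32_matmul_precision())  # -> "high"
```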


@ -69,37 +69,41 @@ DLDataType getDLDataType(const Tensor& t) {
case ScalarType::Float8_e4m3fn:
case ScalarType::Float8_e4m3fnuz:
case ScalarType::Float8_e8m0fnu:
TORCH_CHECK(false, "float8 types are not supported by dlpack");
TORCH_CHECK_BUFFER(false, "float8 types are not supported by dlpack");
break;
case ScalarType::Float4_e2m1fn_x2:
TORCH_CHECK(false, "float4 types are not supported by dlpack");
TORCH_CHECK_BUFFER(false, "float4 types are not supported by dlpack");
break;
case ScalarType::QInt8:
case ScalarType::QUInt8:
case ScalarType::QInt32:
case ScalarType::QUInt4x2:
case ScalarType::QUInt2x4:
TORCH_CHECK(false, "QUInt/QInt types are not supported by dlpack");
TORCH_CHECK_BUFFER(false, "QUInt/QInt types are not supported by dlpack");
break;
case ScalarType::Bits1x8:
case ScalarType::Bits2x4:
case ScalarType::Bits4x2:
case ScalarType::Bits8:
case ScalarType::Bits16:
TORCH_CHECK(false, "Bit types are not supported by dlpack");
TORCH_CHECK_BUFFER(false, "Bit types are not supported by dlpack");
break;
case ScalarType::Undefined:
TORCH_CHECK(false, "Undefined is not a valid ScalarType");
TORCH_CHECK_BUFFER(false, "Undefined is not a valid ScalarType");
case ScalarType::NumOptions:
TORCH_CHECK(false, "NumOptions is not a valid ScalarType");
TORCH_CHECK_BUFFER(false, "NumOptions is not a valid ScalarType");
}
return dtype;
}
static DLDevice getDLDevice(const Tensor& tensor, c10::DeviceIndex device_id) {
DLDevice torchDeviceToDLDevice(at::Device device) {
DLDevice ctx;
ctx.device_id = static_cast<int32_t>(static_cast<unsigned char>(device_id));
switch (tensor.device().type()) {
ctx.device_id = (device.is_cuda() || device.is_privateuseone())
? static_cast<int32_t>(static_cast<unsigned char>(device.index()))
: 0;
switch (device.type()) {
case DeviceType::CPU:
ctx.device_type = DLDeviceType::kDLCPU;
break;
@ -120,8 +124,7 @@ static DLDevice getDLDevice(const Tensor& tensor, c10::DeviceIndex device_id) {
break;
case DeviceType::XPU:
ctx.device_type = DLDeviceType::kDLOneAPI;
ctx.device_id =
at::detail::getXPUHooks().getGlobalIdxFromDevice(tensor.device());
ctx.device_id = at::detail::getXPUHooks().getGlobalIdxFromDevice(device);
break;
case DeviceType::MAIA:
ctx.device_type = DLDeviceType::kDLMAIA;
@ -130,44 +133,46 @@ static DLDevice getDLDevice(const Tensor& tensor, c10::DeviceIndex device_id) {
ctx.device_type = DLDeviceType::kDLExtDev;
break;
default:
TORCH_CHECK(false, "Cannot pack tensors on " + tensor.device().str());
TORCH_CHECK_BUFFER(false, "Cannot pack tensors on " + device.str());
}
return ctx;
}
static Device getATenDevice(const DLDevice& ctx, void* data) {
switch (ctx.device_type) {
static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* data = nullptr) {
switch (type) {
case DLDeviceType::kDLCPU:
return at::Device(DeviceType::CPU);
#ifndef USE_ROCM
// if we are compiled under HIP, we cannot do cuda
case DLDeviceType::kDLCUDA:
return at::Device(DeviceType::CUDA, static_cast<c10::DeviceIndex>(ctx.device_id));
return at::Device(DeviceType::CUDA, index);
#endif
case DLDeviceType::kDLOpenCL:
return at::Device(DeviceType::OPENCL, static_cast<c10::DeviceIndex>(ctx.device_id));
return at::Device(DeviceType::OPENCL, index);
case DLDeviceType::kDLROCM:
#ifdef USE_ROCM
// this looks funny, we need to return CUDA here to masquerade
return at::Device(DeviceType::CUDA, static_cast<c10::DeviceIndex>(ctx.device_id));
return at::Device(DeviceType::CUDA, index);
#else
return at::Device(DeviceType::HIP, static_cast<c10::DeviceIndex>(ctx.device_id));
return at::Device(DeviceType::HIP, index);
#endif
case DLDeviceType::kDLOneAPI:
TORCH_CHECK(data != nullptr, "Can't get ATen device for XPU without XPU data.");
return at::detail::getXPUHooks().getDeviceFromPtr(data);
case DLDeviceType::kDLMAIA:
return at::Device(DeviceType::MAIA, static_cast<c10::DeviceIndex>(ctx.device_id));
return at::Device(DeviceType::MAIA, index);
case DLDeviceType::kDLExtDev:
return at::Device(DeviceType::PrivateUse1, static_cast<c10::DeviceIndex>(ctx.device_id));
return at::Device(DeviceType::PrivateUse1, index);
default:
TORCH_CHECK(
false, "Unsupported device_type: ", std::to_string(ctx.device_type));
TORCH_CHECK_BUFFER(
false, "Unsupported device_type: ", std::to_string(type));
}
}
ScalarType toScalarType(const DLDataType& dtype) {
ScalarType stype = ScalarType::Undefined;
TORCH_CHECK(dtype.lanes == 1, "ATen does not support lanes != 1");
TORCH_CHECK_BUFFER(dtype.lanes == 1, "ATen does not support lanes != 1");
switch (dtype.code) {
case DLDataTypeCode::kDLUInt:
switch (dtype.bits) {
@ -184,7 +189,7 @@ ScalarType toScalarType(const DLDataType& dtype) {
stype = ScalarType::UInt64;
break;
default:
TORCH_CHECK(
TORCH_CHECK_BUFFER(
false, "Unsupported kUInt bits ", std::to_string(dtype.bits));
}
break;
@ -203,7 +208,7 @@ ScalarType toScalarType(const DLDataType& dtype) {
stype = ScalarType::Long;
break;
default:
TORCH_CHECK(
TORCH_CHECK_BUFFER(
false, "Unsupported kInt bits ", std::to_string(dtype.bits));
}
break;
@ -219,7 +224,7 @@ ScalarType toScalarType(const DLDataType& dtype) {
stype = ScalarType::Double;
break;
default:
TORCH_CHECK(
TORCH_CHECK_BUFFER(
false, "Unsupported kFloat bits ", std::to_string(dtype.bits));
}
break;
@ -229,7 +234,7 @@ ScalarType toScalarType(const DLDataType& dtype) {
stype = ScalarType::BFloat16;
break;
default:
TORCH_CHECK(
TORCH_CHECK_BUFFER(
false, "Unsupported kFloat bits ", std::to_string(dtype.bits));
}
break;
@ -245,7 +250,7 @@ ScalarType toScalarType(const DLDataType& dtype) {
stype = ScalarType::ComplexDouble;
break;
default:
TORCH_CHECK(
TORCH_CHECK_BUFFER(
false, "Unsupported kFloat bits ", std::to_string(dtype.bits));
}
break;
@ -255,12 +260,12 @@ ScalarType toScalarType(const DLDataType& dtype) {
stype = ScalarType::Bool;
break;
default:
TORCH_CHECK(
TORCH_CHECK_BUFFER(
false, "Unsupported kDLBool bits ", std::to_string(dtype.bits));
}
break;
default:
TORCH_CHECK(false, "Unsupported code ", std::to_string(dtype.code));
TORCH_CHECK_BUFFER(false, "Unsupported code ", std::to_string(dtype.code));
}
return stype;
}
@ -314,11 +319,7 @@ T* toDLPackImpl(const Tensor& src) {
atDLMTensor->tensor.manager_ctx = atDLMTensor;
atDLMTensor->tensor.deleter = &deleter<T>;
atDLMTensor->tensor.dl_tensor.data = view.data_ptr();
c10::DeviceIndex device_id = 0;
if (src.is_cuda() || src.is_privateuseone()) {
device_id = src.get_device();
}
atDLMTensor->tensor.dl_tensor.device = getDLDevice(src, device_id);
atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device());
atDLMTensor->tensor.dl_tensor.ndim = static_cast<int32_t>(src.dim());
atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src);
atDLMTensor->tensor.dl_tensor.shape = view.sizes().data();
@ -346,7 +347,7 @@ at::Tensor fromDLPackImpl(T* src, std::function<void(void*)> deleter) {
}
DLTensor& dl_tensor = src->dl_tensor;
Device device = getATenDevice(dl_tensor.device, dl_tensor.data);
Device device = getATenDevice(dl_tensor.device.device_type, dl_tensor.device.device_id, dl_tensor.data);
ScalarType stype = toScalarType(dl_tensor.dtype);
if (!dl_tensor.strides) {
@ -388,4 +389,35 @@ Tensor fromDLPackVersioned(DLManagedTensorVersioned* src, std::function<void(voi
return fromDLPackImpl<DLManagedTensorVersioned>(src, std::move(deleter));
}
Tensor maybeCopyTensor(
const Tensor& data,
std::optional<DLDevice> optional_dl_device,
std::optional<bool> copy) {
bool force_copy = copy.has_value() && *copy;
bool force_move = copy.has_value() && !*copy;
if (optional_dl_device.has_value()) {
auto device = at::getATenDevice(
optional_dl_device->device_type,
static_cast<c10::DeviceIndex>(optional_dl_device->device_id));
if (device != data.device()) {
TORCH_CHECK_VALUE(
!force_move,
"cannot move (i.e. copy=False) tensor from ",
data.device(),
" to ",
device,
" without copying.");
return data.to(device);
}
}
if (force_copy) {
return data.clone();
}
return data;
}
} // namespace at
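
As a quick illustration of the DLPack path these helpers serve, here is a hedged Python sketch of a zero-copy round trip. The `copy=` / `dl_device=` keywords that `maybeCopyTensor` backs are only mentioned in a comment because their Python-side availability depends on the binding shipped with this change.

```python
import torch
from torch.utils.dlpack import to_dlpack, from_dlpack

x = torch.arange(4, dtype=torch.float32)
y = from_dlpack(to_dlpack(x))   # zero-copy round trip through a DLPack capsule
y[0] = 42.0
print(x[0].item())              # 42.0 -- x and y share storage

# maybeCopyTensor() above backs the array-API style keywords, e.g.
# x.__dlpack__(copy=True) forcing a clone, or copy=False refusing a cross-device
# move; whether those keywords are exposed in Python here is an assumption.
```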


@ -4,7 +4,7 @@
#include <ATen/Tensor.h>
#include <ATen/dlpack.h>
// this convertor will:
// this converter will:
// 1) take a Tensor object and wrap it in the DLPack tensor
// 2) take a dlpack tensor and convert it to the ATen Tensor
@ -21,6 +21,16 @@ TORCH_API Tensor fromDLPackVersioned(
TORCH_API DLDataType getDLDataType(const Tensor& t);
TORCH_API DLDevice getDLContext(const Tensor& tensor, const int64_t& device_id);
// Copies the Tensor if there's a device mismatch or copy is forced.
// This should be used before actually creating the DLPack capsule.
TORCH_API Tensor maybeCopyTensor(
const Tensor& data,
std::optional<DLDevice> optional_dl_device,
std::optional<bool> copy);
// Converts the given at::Device into a DLDevice.
TORCH_API DLDevice torchDeviceToDLDevice(at::Device device);
// This trait class is used for retrieving different attributes, such as the
// PyCapsule names and conversion functions for both DLPack tensor classes:
// `DLManagedTensor` and `DLManagedTensorVersioned`.


@ -1,5 +1,6 @@
#pragma once
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/core/DeviceType.h>
#include <c10/macros/Macros.h>
@ -72,6 +73,27 @@ TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index);
// original device index that was active before the change.
TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index);
TORCH_API inline void emptyCache() {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->emptyCache();
}
TORCH_API inline at::CachingDeviceAllocator::DeviceStats getDeviceStats(
c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
return at::getDeviceAllocator(device_type)->getDeviceStats(device_index);
}
TORCH_API inline void resetAccumulatedStats(c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->resetAccumulatedStats(device_index);
}
TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
}
} // namespace at::accelerator
namespace at {
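
For orientation, the CUDA-flavoured Python calls below are the per-backend counterparts of the new device-generic helpers (`emptyCache`, `getDeviceStats`, `resetAccumulatedStats`, `resetPeakStats`). This is an illustrative sketch only, not a claim that these Python functions route through the new C++ entry points.

```python
import torch

if torch.cuda.is_available():
    x = torch.randn(1024, 1024, device="cuda")
    del x
    torch.cuda.empty_cache()                     # cf. at::accelerator::emptyCache()
    stats = torch.cuda.memory_stats()            # cf. ...::getDeviceStats(idx)
    torch.cuda.reset_peak_memory_stats()         # cf. ...::resetPeakStats(idx)
    torch.cuda.reset_accumulated_memory_stats()  # cf. ...::resetAccumulatedStats(idx)
    print(stats.get("allocated_bytes.all.current", 0))
```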


@ -233,8 +233,8 @@ Tensor FunctionalInverses::slice_Tensor_inverse(const Tensor& base, const Tensor
// NOLINTNEXTLINE(performance-unnecessary-value-param)
Tensor FunctionalInverses::split_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, int64_t mutated_view_idx, c10::SymInt split_size, int64_t dim) {
// It would be nice if this logic could be re-used from autograd's split_backward(), but I don't think it can.
// For functionalization, we have only have one of the tensors from the TensorList outputed by split(), and we want to layer i
// It would be nice if this logic could be reused from autograd's split_backward(), but I don't think it can.
// For functionalization, we have only have one of the tensors from the TensorList outputted by split(), and we want to layer i
// on top of the base tensor.
// For autograd, we have all of the tensors outputted by split() and we just want to stack them.
dim = at::maybe_wrap_dim(dim, base.dim());


@ -286,11 +286,11 @@ void FunctionalTensorWrapper::storage_resize_(const c10::SymInt& new_size) {
// storage resizing is severely limited: we only support resizing either to zero, or from zero bytes.
TORCH_CHECK(new_size == 0 || curr_storage_size == 0, "new_size: ", new_size, ". curr_storage_size: ", curr_storage_size);
// The "functionalization rule" for storage resizing is a giant no-op, mainly because we don't want
// resize_() calls to actualy emit any ops in the functional graph.
// resize_() calls to actually emit any ops in the functional graph.
// How does it work?
// Resizing up (old size == 0):
// We do nothing in this case.
// The expection is that for the user code to be valid, the next op that should run against the current tensor "x"
// The expectation is that for the user code to be valid, the next op that should run against the current tensor "x"
// will be a x.copy_(y) (or similar), that will fully overwrite the data of x.
// If there are any outstanding aliases of x, we expect them not to be used until after the copy_() call
// (otherwise the eager code would be invalid),
@ -327,7 +327,7 @@ void FunctionalTensorWrapper::maybe_replace_storage(const Tensor& other) {
// We're also no longer re-generate "b" fully from "a" anymore, since "a" refers to a slice of "b"'s data.
//
// This is probably fixable in theory, but:
// - the fix would likey complicated the functionalization logic quite a bit.
// - the fix would likely complicated the functionalization logic quite a bit.
// - the primary use case for resize_() today is resizing zero-sized tensors in out= variants of operators
// - resize_() also can give you weird results today if you try to resize_() a weirdly strided tensor.
//
@ -344,7 +344,7 @@ void FunctionalTensorWrapper::maybe_replace_storage(const Tensor& other) {
set_sizes_and_strides(value_.sizes(), value_.strides());
refresh_numel();
// (Technically we should be guaranteed that the tensor was already contiguous,
// since it's guaranteed not to have been a view. Doesnt hurt to run though)
// since it's guaranteed not to have been a view. Doesn't hurt to run though)
refresh_contiguous();
// Swapping out the storage of a tensor (aka from a resize_() call) will update the sizes and strides of the tensor,
// so we need to record the fact that metadata was mutated.
@ -819,7 +819,7 @@ void setFunctionalizationReapplyViewsTLS(bool reapply_views) {
// This function will "functionalize" it.
// That is, it will call the operator, but removing any intermediate views/mutations
// that are performed inside of it.
// This is useful for LTC/XLA, which would like to re-use some of our composite kernels
// This is useful for LTC/XLA, which would like to reuse some of our composite kernels
// from pytorch core but not have to worry about the view ops that they might call.
// e.g. at::block_diag
void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* stack) {


@ -218,7 +218,7 @@ static Tensor safeStack(TensorList tensors) {
// is possible for the backward function to return an undefined grad for some
// grad_input for each example. In that case, we return an undefined grad.
//
// It is theoretically posssible for *some* of the examples to produce an
// It is theoretically possible for *some* of the examples to produce an
// undefined grad (a kernel could peek at the gradient values and return an
// undefined tensor if it determines the gradient is full of zeros). We
// could handle this by treating the undefined grad as a zero-filled tensor


@ -140,7 +140,7 @@ struct TORCH_API VmapPhysicalView {
// mapping a physical tensor to a new logical tensor (BatchedTensor)
VmapPhysicalToLogicalMap getPhysicalToLogicalMap() const;
// Maps a logical shape to a physical shape by pre-pending the batch
// Maps a logical shape to a physical shape by prepending the batch
// sizes to the logical shape.
VmapDimVector getPhysicalShape(IntArrayRef logical_shape) const;


@ -299,7 +299,7 @@ MapAllocator::MapAllocator(WithFd, std::string_view filename, int fd, int flags,
::close(fd);
TORCH_CHECK(false, "unable to stretch file <", filename_, "> to the right size: ", c10::utils::str_error(last_err), " (", last_err, ")");
}
/* on macOS write returns with errno 45 (Opperation not supported) when used
/* on macOS write returns with errno 45 (Operation not supported) when used
* with a file descriptor obtained via shm_open
*/
#ifndef __APPLE__


@ -211,7 +211,7 @@ NestedTensorImpl::NestedTensorImpl(
}
// assume contiguous, `nested_strides` and `offsets`
// can be infered from `nested_sizes`
// can be inferred from `nested_sizes`
NestedTensorImpl::NestedTensorImpl(
const at::Tensor& buffer,
const at::Tensor& nested_sizes)


@ -32,7 +32,7 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl {
at::Tensor nested_strides,
at::Tensor storage_offsets);
// assume contiguous, `nested_strides` and `offsets`
// can be infered from `nested_sizes`
// can be inferred from `nested_sizes`
explicit NestedTensorImpl(
const at::Tensor& buffer,
const at::Tensor& nested_sizes);


@ -93,12 +93,12 @@ ident: identity for binary combination function sf. sf(ident, x) needs to return
x.
f: function for reduction over a chunk. f needs to be of signature scalar_t
f(int64_t partial_begin, int64_t partial_end, scalar_t identifiy)
f(int64_t partial_begin, int64_t partial_end, scalar_t identify)
sf: function to combine two partial results. sf needs to be of signature
scalar_t sf(scalar_t x, scalar_t y)
For example, you might have a tensor of 10000 entires and want to sum together
For example, you might have a tensor of 10000 entries and want to sum together
all the elements. Parallel_reduce with a grain_size of 2500 will then allocate
an intermediate result tensor with 4 elements. Then it will execute the function
"f" you provide and pass the beginning and end index of these chunks, so


@ -8,7 +8,28 @@ namespace at {
namespace {
template <typename scalar_t>
inline void fill_inplace(Tensor& self, const Scalar& value_scalar) {
auto value = value_scalar.to<scalar_t>();
scalar_t value{};
if constexpr (std::is_same_v<scalar_t, at::Half> ||
std::is_same_v<scalar_t, at::BFloat16> ||
std::is_same_v<scalar_t, at::Float8_e5m2> ||
std::is_same_v<scalar_t, at::Float8_e5m2fnuz> ||
std::is_same_v<scalar_t, at::Float8_e4m3fn> ||
std::is_same_v<scalar_t, at::Float8_e4m3fnuz> ||
std::is_same_v<scalar_t, at::Float8_e8m0fnu>) {
// relaxed float cast: allow inf similar to the torch.tensor constructor
//
// without this, we had the following divergence:
// torch.tensor(1123581321.0, dtype=torch.float16)
// => tensor(inf, dtype=torch.float16)
// torch.ops.aten.scalar_tensor.default(1123581321, dtype=torch.float16)
// => RuntimeError: value cannot be converted to type at::Half without overflow
value = static_cast<scalar_t>(value_scalar.to<double>());
} else {
value = value_scalar.to<scalar_t>();
}
scalar_t* dptr = static_cast<scalar_t*>(self.data_ptr());
*dptr = value;
}
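
The divergence described in the new comment can be reproduced from Python as below; with the relaxed cast both paths are expected to saturate to `inf`, while builds without this change raise on the direct op call.

```python
import torch

# Matches the example in the comment above.
print(torch.tensor(1123581321.0, dtype=torch.float16))
# => tensor(inf, dtype=torch.float16)

# Before this change the direct op call raised
# "value cannot be converted to type at::Half without overflow";
# with the relaxed cast it is expected to produce inf as well.
print(torch.ops.aten.scalar_tensor.default(1123581321, dtype=torch.float16))
```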


@ -252,7 +252,7 @@ inline Tensor applySelect(
// Note: `size >= -index` is not equivalent to `size > -1 - index` if index
// is INT64_MIN For std::numeric_limits<int64_t>::min() result of unary
// minus is undefined by the standard but in practice is equal to self. On
// the other hand, indexing wraping is valid for all negative int64_t
// the other hand, indexing wrapping is valid for all negative int64_t
// values, as x[INT64_MIN] is the same as x[INT64_MAX]
TORCH_CHECK_INDEX(
size.sym_gt(-1 - index)
@ -315,10 +315,17 @@ inline void recordTensorIndex(
const Tensor& tensor,
std::vector<Tensor>& outIndices,
int64_t* dim_ptr) {
// TODO: check scalarType
outIndices.resize(*dim_ptr + 1);
outIndices[*dim_ptr] = tensor;
(*dim_ptr)++;
if (outIndices.empty()) {
outIndices.resize(*dim_ptr + 1);
outIndices[*dim_ptr] = tensor;
} else {
outIndices.push_back(tensor);
}
if (tensor.scalar_type() == kByte || tensor.scalar_type() == kBool) {
*dim_ptr += tensor.dim();
} else {
*dim_ptr += 1;
}
}
inline c10::List<::std::optional<Tensor>> typeConvertIndices(
@ -458,13 +465,23 @@ inline Tensor handleDimInMultiDimIndexing(
original_tensor_device,
prev_dim_result_sizes);
(*dim_ptr)++;
if (!outIndices.empty()) {
outIndices.resize(outIndices.size() + 1);
}
return result;
} else if (index.is_ellipsis()) {
(*dim_ptr) += original_tensor.dim() - (*specified_dims_ptr);
auto ellipsis_ndims = original_tensor.dim() - *specified_dims_ptr;
(*dim_ptr) += ellipsis_ndims;
if (!outIndices.empty()) {
outIndices.resize(outIndices.size() + ellipsis_ndims);
}
return prev_dim_result;
} else if (index.is_none()) {
Tensor result = prev_dim_result.unsqueeze(*dim_ptr);
(*dim_ptr)++;
if (!outIndices.empty()) {
outIndices.resize(outIndices.size() + 1);
}
return result;
} else if (index.is_boolean()) {
Tensor result = prev_dim_result.unsqueeze(*dim_ptr);
@ -560,6 +577,10 @@ inline Tensor applySlicing(
inline Tensor dispatch_index(
const Tensor& self,
std::vector<Tensor>&& indices) {
// Remove trailing null elements from indices
while (!indices.empty() && !indices.back().defined()) {
indices.pop_back();
}
return self.index(impl::typeConvertIndices(self, std::move(indices)));
}
@ -567,6 +588,10 @@ inline Tensor dispatch_index_put_(
Tensor& self,
std::vector<Tensor>&& indices,
const Tensor& value) {
// Remove trailing null elements from indices
while (!indices.empty() && !indices.back().defined()) {
indices.pop_back();
}
return self.index_put_(
impl::typeConvertIndices(self, std::move(indices)), value);
}
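
A small Python sketch of the indexing semantics the updated `recordTensorIndex` accounts for: a boolean (or byte) index tensor consumes as many dimensions of the indexed tensor as the mask itself has.

```python
import torch

x = torch.arange(24).reshape(2, 3, 4)
mask = x[..., 0] > 5          # boolean mask of shape (2, 3)

# A bool/byte index consumes mask.dim() dimensions of x, which is what the
# updated C++ recordTensorIndex() now tracks when advancing the dim pointer.
print(x[mask].shape)          # torch.Size([4, 4])
print(x[mask, 1].shape)       # torch.Size([4]) -- mask covers dims 0-1, then dim 2 is indexed
```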


@ -208,7 +208,7 @@ bool TensorIteratorConfig::is_tensor_const(size_t idx) {
// same strides are increasing. If dimensions are non-increasing, we move on to the next input to break the tie.
//
// Instead of applying rule 4 for tie breaking, we could move on to the next tensor directly. This would result in possibly
// losing the correct permuation of the first tensor if there are permuted trivial dimensions, but could potentially
// losing the correct permutation of the first tensor if there are permuted trivial dimensions, but could potentially
// improve traversal order of the second tensor. We chose the former option to better propagate channels last layout
// for example for a tensor with the sizes N1H1
// These rules result in the intuitive behavior that in most cases recovers permutation of either the first argument (if all
@ -244,7 +244,7 @@ void TensorIteratorBase::reorder_dimensions() {
// initialize perm with n-1, n-2, ..., 1, 0
std::iota(perm_.rbegin(), perm_.rend(), 0);
// Reordering dimensions changes iteraton order
// Reordering dimensions changes iteration order
if (enforce_linear_iteration_) {
permute_dimensions(perm_);
return;


@ -388,7 +388,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
/// Return scalar value from original_tensor_base if it is defined. When
/// common_dtype is Half, casting scalar input to common_dtype might overflow.
/// If the scalar is aleady given in the type of Half, then return scalar
/// If the scalar is already given in the type of Half, then return scalar
/// value from tensor_base.
template <typename T>
T original_scalar_value(int64_t arg) {
@ -502,7 +502,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
/// kernels
bool can_use_32bit_indexing() const;
/// An "iteratable" object that recursively splits this iterator into
/// An "iterable" object that recursively splits this iterator into
/// sub-iterators that can use 32-bit indexing.
SplitUntil32Bit with_32bit_indexing() const;
@ -878,7 +878,7 @@ class TORCH_API TensorIteratorConfig final {
// Sets the enforce_linear_iteration_ flag, which is false by default.
// If true, iteration goes in the same order as a C-contiguous tensor
// is layed out in memory. i.e. last dimension iterates fastest.
// is laid out in memory. i.e. last dimension iterates fastest.
//
// This iteration order can be less efficient and may even prevent
// vectorization. So only use if the correctness of your kernel depends on it.


@ -78,7 +78,7 @@ inline bool areAnyOptionalTensorSubclassLike(
// NOTE: This function expects a scalar tensor of boolean dtype.
// Eg.
// Non-Composite Compliant Pattern : (t == 0).all().item<bool>()
// Composite Compliant Patter : is_salar_tensor_true((t == 0).all())
// Composite Compliant Pattern : is_salar_tensor_true((t == 0).all())
inline bool is_scalar_tensor_true(const Tensor& t) {
TORCH_INTERNAL_ASSERT(t.dim() == 0)
TORCH_INTERNAL_ASSERT(t.scalar_type() == kBool)


@ -378,9 +378,9 @@ inline static std::optional<ResultVec> computeStride_impl(
(TORCH_GUARD_OR_TRUE(sym_ne(oldshape[tensor_d - 1], 1)) &&
TORCH_GUARD_OR_TRUE(sym_ne(oldstride[tensor_d - 1], tensor_numel * chunk_base_stride)))) {
// We want to accumulate stuff in view_numel until view_numel == tensor_numel, if we do not
// know if that is satisfied we keep accumalating. For example if view_numel = 1 and tensor_numel = u1,
// know if that is satisfied we keep accumulating. For example if view_numel = 1 and tensor_numel = u1,
// we want to take that path, view_numel will become u0. Next iteration if u0==u1 we want to stop.
// Thats why we use TORCH_GUARD_OR_TRUE below.
// That's why we use TORCH_GUARD_OR_TRUE below.
// we use TORCH_GUARD_OR_FALSE and not TORCH_GUARD_OR_TRUE when comparing newshape[view_d] ==1 because
// if we know view_numel < tensor_numel is false, we want to stop. Unless we know for sure newshape[view_d]==1


@ -27,7 +27,7 @@
// ops (ops being called by other ops). After the intermediate op call
// finishes it's set back to the original `TracingState` object.
//
// The `TracingState` obect in TLS can also be read/written via its Python
// The `TracingState` object in TLS can also be read/written via its Python
// binding in `python_tracer.cpp`, and `get/setTracingState()` C++ APIs,
// which are also exposed as `TORCH_API`.
//


@ -95,7 +95,7 @@ namespace at {
m.impl("clone", torch::CppFunction::makeFallthrough());
m.impl("dot", torch::CppFunction::makeFallthrough());
m.impl("vdot", torch::CppFunction::makeFallthrough());
// The functions in the list below have a specific registeration in native_functions.yaml and
// The functions in the list below have a specific registration in native_functions.yaml and
// do not use the fallback.
// m.impl("mul.Tensor", torch::CppFunction::makeFallthrough());
// m.impl("add.Tensor", torch::CppFunction::makeFallthrough());


@ -377,7 +377,7 @@ Keep it simple for now by assuming only one such flag is
present in the argument list. If I ever need a function
with more than flag I'll figure out something else.
The policy is:
If the user has explicity specified a dtype, respect it.
If the user has explicitly specified a dtype, respect it.
Otherwise, set it to the autocast type.
********************************************************/


@ -2,7 +2,6 @@
#include <ATen/cuda/CUDAGraph.h>
#include <ATen/cuda/Exceptions.h>
#include <ATen/Functions.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAFunctions.h>
#include <cstddef>


@ -2,6 +2,7 @@
#include <ATen/Tensor.h>
#include <c10/core/Device.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/util/flat_hash_map.h>


@ -199,7 +199,7 @@ typedef struct {
* `byte_offset` field should be used to point to the beginning of the data.
*
* Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow,
* TVM, perhaps others) do not adhere to this 256 byte aligment requirement
* TVM, perhaps others) do not adhere to this 256 byte alignment requirement
* on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
* (after which this note will be updated); at the moment it is recommended
* to not rely on the data pointer being correctly aligned.


@ -1,6 +1,6 @@
#pragma once
#include <c10/core/Allocator.h>
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/core/DeviceType.h>
// Use of c10::hip namespace here makes hipification easier, because
@ -10,10 +10,10 @@ namespace c10::hip {
// Takes a valid HIPAllocator (of any sort) and turns it into
// an allocator pretending to be a CUDA allocator. See
// Note [Masquerading as CUDA]
class HIPAllocatorMasqueradingAsCUDA final : public Allocator {
Allocator* allocator_;
class HIPAllocatorMasqueradingAsCUDA final : public DeviceAllocator {
DeviceAllocator* allocator_;
public:
explicit HIPAllocatorMasqueradingAsCUDA(Allocator* allocator)
explicit HIPAllocatorMasqueradingAsCUDA(DeviceAllocator* allocator)
: allocator_(allocator) {}
DataPtr allocate(size_t size) override {
DataPtr r = allocator_->allocate(size);
@ -26,6 +26,24 @@ public:
void copy_data(void* dest, const void* src, std::size_t count) const final {
allocator_->copy_data(dest, src, count);
}
bool initialized() override {
return allocator_->initialized();
}
void emptyCache(MempoolId_t mempool_id = {0, 0}) {
allocator_->emptyCache(mempool_id);
}
void recordStream(const DataPtr& ptr, c10::Stream stream) {
allocator_->recordStream(ptr, stream);
}
CachingDeviceAllocator::DeviceStats getDeviceStats(c10::DeviceIndex device) {
return allocator_->getDeviceStats(device);
}
void resetAccumulatedStats(c10::DeviceIndex device) {
allocator_->resetAccumulatedStats(device);
}
void resetPeakStats(c10::DeviceIndex device) {
allocator_->resetPeakStats(device);
}
};
} // namespace c10::hip


@ -4,8 +4,9 @@
namespace c10 { namespace hip {
namespace HIPCachingAllocatorMasqueradingAsCUDA {
static HIPAllocatorMasqueradingAsCUDA allocator(HIPCachingAllocator::get());
Allocator* get() {
static HIPAllocatorMasqueradingAsCUDA allocator(HIPCachingAllocator::get());
return &allocator;
}
@ -13,5 +14,9 @@ void recordStreamMasqueradingAsCUDA(const DataPtr& ptr, HIPStreamMasqueradingAsC
HIPCachingAllocator::recordStream(ptr, stream.hip_stream());
}
// Register this HIP allocator as CUDA allocator to enable access through both
// c10::GetAllocator(kCUDA) and c10::getDeviceAllocator(kCUDA) APIs
REGISTER_ALLOCATOR(kCUDA, &allocator)
} // namespace HIPCachingAllocatorMasqueradingAsCUDA
}} // namespace c10::hip
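
Illustrative only: because the HIP caching allocator is now registered under `kCUDA`, the usual memory introspection keeps working on ROCm builds, where "cuda" devices are HIP devices masquerading as CUDA.

```python
import torch

if torch.cuda.is_available():
    t = torch.ones(1 << 20, device="cuda")
    # On ROCm builds these calls are served by the masquerading HIP allocator
    # reachable through the generic kCUDA allocator accessors registered above.
    print(torch.cuda.memory_allocated())
    print(torch.version.hip)   # non-None on ROCm builds, None on CUDA builds
```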


@ -2453,7 +2453,7 @@ TORCH_IMPL_FUNC(linalg_qr_out)(const Tensor& A,
// geqrf requires m x n workspace input that is modified in-place
// We try to use Q. If it doesn't fit, we try to use R
// If m > n and compute_q==false, it won't fit into Q or R, so we neet to create an auxiliary tensor
// If m > n and compute_q==false, it won't fit into Q or R, so we need to create an auxiliary tensor
Tensor QR;
if (compute_q && Q.size(-1) == n) {
QR = Q;
@ -4095,7 +4095,7 @@ Tensor linalg_vander_symint(
const auto n = N.value_or(shape.back());
TORCH_CHECK(n > 1, "N must be greater than 1.");
// Append cumprod of the oher 0...n-1 powers
// Append cumprod of the other 0...n-1 powers
shape.push_back(n - 1);
auto result = at::cumprod(x_.unsqueeze(-1).expand_symint(shape), -1);
// The row of ones


@ -202,7 +202,7 @@ void gemm(
float *c, int64_t ldc) {
internal::normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, &ldc);
#if AT_MKLDNN_ENABLED()
if (mkldnn_bf32_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) {
if (mkldnn_reduced_f32_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) {
return;
}
#endif


@ -54,7 +54,7 @@ bool ceil_mode) {
TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4),
"non-empty 3D or 4D (batch mode) tensor expected for input");
} else {
TORCH_CHECK(false, "Unsupport memory format. Supports only ChannelsLast, Contiguous");
TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
}
/* sizes */
@ -130,7 +130,7 @@ const Tensor& indices) {
TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4),
"non-empty 3D or 4D (batch mode) tensor expected for input");
} else {
TORCH_CHECK(false, "Unsupport memory format. Supports only ChannelsLast, Contiguous");
TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
}
/* sizes */


@ -63,7 +63,7 @@ void max_pool3d_with_indices_out_cpu_template(
TORCH_CHECK((input.ndimension() == 4 || input.ndimension() == 5),
"non-empty 4D or 5D (batch mode) tensor expected for input");
} else {
TORCH_CHECK(false, "Unsupport memory format. Supports only ChannelsLast3d, Contiguous");
TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast3d, Contiguous");
}
const int64_t nslices = input.size(-4);
@ -158,7 +158,7 @@ Tensor& max_pool3d_with_indices_backward_out_cpu_template(
TORCH_CHECK((input.ndimension() == 4 || input.ndimension() == 5),
"non-empty 4D or 5D (batch mode) tensor expected for input");
} else {
TORCH_CHECK(false, "Unsupport memory format. Supports only ChannelsLast3d, Contiguous");
TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast3d, Contiguous");
}
const int64_t nslices = input.size(-4);


@ -28,13 +28,13 @@ namespace at::native::templates {
// ==================================================== Random ========================================================
// The purpose of `update_from` and `update_to` is to find the closest valid int64_t number that can be used as actual `from`.
// The current implementation of `random_` uses uint64_t arithmetics and casts the result to the target dtype(scalar_t).
// The current implementation of `random_` uses uint64_t arithmetic and casts the result to the target dtype(scalar_t).
// This casting can result in generating numbers that happen to be greater or equal to `to` value. For instance:
//
// auto actual = torch::empty({3, 3}, torch::half);
// actual.random_(0, 65504);
//
// If random's uint64_t arithmetics produces 65503 as a random value after casting to torch::half it becomes 65504
// If random's uint64_t arithmetic produces 65503 as a random value after casting to torch::half it becomes 65504
// and violates the requirement that random value must be less than `to`. To resolve this issue `update_from` and `update_to`
// moves `from` to the right and `to` to the left to the next closest value that won't go outside [from, to) after casting to
// the target dtype. For `to` = 65504 it moves left for (1 << (log2(to) - 11 + 1)) = 32 and becomes 65472, which is previous


@ -86,7 +86,7 @@ namespace {
for (const auto d : c10::irange(out_D)) {
for (const auto h : c10::irange(out_H)) {
for (const auto w : c10::irange(out_W)) {
// get the corresponding input x, y, z co-ordinates from grid
// get the corresponding input x, y, z coordinates from grid
const scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW;
scalar_t ix = *grid_ptr_NDHW;
scalar_t iy = grid_ptr_NDHW[grid_sCoor];
@ -285,7 +285,7 @@ namespace {
for (const auto d : c10::irange(out_D)) {
for (const auto h : c10::irange(out_H)) {
for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NDHW += gGrid_sW /* grad_grid is contiguous */ ) {
// get the corresponding input x, y, z co-ordinates from grid
// get the corresponding input x, y, z coordinates from grid
const scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW;
scalar_t ix = *grid_ptr_NDHW;
scalar_t iy = grid_ptr_NDHW[grid_sCoor];
@ -496,7 +496,7 @@ static Tensor _grid_sampler_2d_cpu_quantized(
uint8_t* inp_ptr_N = inp_ptr + n * inp_sN;
for (const auto h : c10::irange(out_H)) {
for (const auto w : c10::irange(out_W)) {
// get the corresponding input x, y, z co-ordinates from grid
// get the corresponding input x, y, z coordinates from grid
float* grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
float x = *grid_ptr_NHW;
float y = grid_ptr_NHW[grid_sCoor];
@ -599,7 +599,7 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid,
const scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
for (const auto h : c10::irange(out_H)) {
for (const auto w : c10::irange(out_W)) {
// get the corresponding input x, y, z co-ordinates from grid
// get the corresponding input x, y, z coordinates from grid
const scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
scalar_t x = *grid_ptr_NHW;
scalar_t y = grid_ptr_NHW[grid_sCoor];
@ -771,7 +771,7 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output,
scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN;
for (const auto h : c10::irange(out_H)) {
for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) {
// get the corresponding input x, y co-ordinates from grid
// get the corresponding input x, y coordinates from grid
const scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
scalar_t x = *grid_ptr_NHW;
scalar_t y = grid_ptr_NHW[grid_sCoor];


@ -1068,7 +1068,7 @@ inline scalar_t calc_igammac(scalar_t a, scalar_t x) {
* result at the boundary
* - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
* Large Parameter (see DLMF 8.12.4 [igam1])
* - if x > 1.1 and x < a, using the substraction from the regularized lower
* - if x > 1.1 and x < a, using the subtraction from the regularized lower
* incomplete gamma
* - otherwise, calculate the series from [igam2] eq (5)
*/
@ -1148,7 +1148,7 @@ scalar_t calc_igamma(scalar_t a, scalar_t x) {
* result at the boundary
* - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
* Large Parameter (see DLMF 8.12.3 [igam1])
* - if x > 1 and x > a, using the substraction from the regularized upper
* - if x > 1 and x > a, using the subtraction from the regularized upper
* incomplete gamma
* - otherwise, calculate the series from [igam2] eq (4)
*/
@ -1730,7 +1730,7 @@ inline C10_HOST_DEVICE T calc_ndtri(T y0) {
with the usual checks for overflow etcetera.
Performance-wise, it seems to be substantially faster than either
the SLATEC DERFC function [or an erfcx function derived therefrom]
the SLATEC DERFC function [or an erfcx function derived there from]
or Cody's CALERF function (from netlib.org/specfun), while
retaining near machine precision in accuracy. */


@ -17,7 +17,7 @@ using max_pool2d_backward_fn = void(*)(const Tensor& grad_input, const Tensor& g
DECLARE_DISPATCH(max_pool2d_fn, max_pool2d_kernel)
DECLARE_DISPATCH(max_pool2d_backward_fn, max_pool2d_backward_kernel)
// averge pooling has same signature for forward and backward
// average pooling has same signature for forward and backward
using avg_pool2d_fn = void(*)(const Tensor& output, const Tensor& input, int64_t kW, int64_t kH,
int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, std::optional<int64_t> divisor_override);
using avg_pool2d_backward_fn = void(*)(const Tensor& output, const Tensor& input, int kW, int kH,
@ -26,7 +26,7 @@ using avg_pool2d_backward_fn = void(*)(const Tensor& output, const Tensor& input
DECLARE_DISPATCH(avg_pool2d_fn, avg_pool2d_kernel)
DECLARE_DISPATCH(avg_pool2d_backward_fn, avg_pool2d_backward_kernel)
// averge pooling has same signature for forward and backward
// average pooling has same signature for forward and backward
using avg_pool3d_fn = void(*)(const Tensor& output, const Tensor& input,
int64_t kW, int64_t kH, int64_t kD, int64_t dW, int64_t dH, int64_t dD,
int64_t padW, int64_t padH, int64_t padD, bool count_include_pad,


@ -480,7 +480,7 @@ REGISTER_ZVECTOR_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets
REGISTER_SVE256_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
// Currently some computation is being duplicated across forward and backward.
// TODO: Cache indices in forward pass to re-use in backward
// TODO: Cache indices in forward pass to reuse in backward
Tensor _segment_reduce_backward_kernel(
const Tensor& grad,
const Tensor& output,

View File

@ -475,7 +475,7 @@ static void build_index_op(
TensorIteratorBase& iter,
const at::native::AdvancedIndex& info,
const Tensor& result) {
// 'TensorIterator' needs to own the things comming from 'info', since
// 'TensorIterator' needs to own the things coming from 'info', since
// 'info' will be destroyed after the META function.
TensorIteratorConfig config;
// info.src is a restrided view of result


@ -35,7 +35,9 @@ inline std::tuple<bool, Tensor> canDispatchToMaskedFill(
auto self_device = self.device();
for (const std::optional<Tensor>& i : indices) {
if (!i.has_value() || !(*i).defined()) {
num_ind++;
if (!mask.defined()) {
num_ind++;
}
} else {
const Tensor& index = *i;
if ((index.scalar_type() != kByte && index.scalar_type() != kBool) ||


@ -67,7 +67,7 @@ namespace at::native {
namespace {
// dense_to_sparse_{csr,bsr,csc,bsc} common helpers
// Preparation fo the N-D dense -> sparse compressed conversion.
// Preparation for the N-D dense -> sparse compressed conversion.
// The N-D input is converted to 3-D (single batch dim) where we check that the
// product of batch dims is nonzero and for each batch the sparse matrix
// contained within has the same number of non-zero elements.


@ -1367,9 +1367,9 @@ void randperm_cpu(Tensor& result, int64_t n, CPUGeneratorImpl* generator) {
for (int64_t i = 0; i < n - 1; i++) {
// NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand)
int64_t z = generator->random() % (n - i);
scalar_t sav = r__data[i * r__stride_0];
scalar_t save = r__data[i * r__stride_0];
r__data[i * r__stride_0] = r__data[(z + i) * r__stride_0];
r__data[(z + i) * r__stride_0] = sav;
r__data[(z + i) * r__stride_0] = save;
}
return;
}


@ -80,7 +80,7 @@ static void two_pass_reduction(TensorIteratorBase& iter, loop2d_t loop) {
}
/// Chooses a dimension over which to parallelize. Prefers the outer-most
/// dimension thats larger than the number of available threads.
/// dimension that's larger than the number of available threads.
static int find_split_dim(TensorIteratorBase& iter) {
int num_threads = at::get_num_threads();
auto shape = iter.shape();


@ -247,7 +247,7 @@ TORCH_PRECOMPUTE_META_FUNC(cat)(const ITensorListRef& tensors, int64_t dim) {
// Checking names before the actual dimensions.
auto maybe_outnames = namedinference::compute_cat_outnames(materialized);
TORCH_CHECK(
TORCH_CHECK_VALUE(
!materialized.empty(),
"torch.cat(): expected a non-empty list of Tensors");
@ -274,7 +274,7 @@ TORCH_PRECOMPUTE_META_FUNC(cat)(const ITensorListRef& tensors, int64_t dim) {
// when computing the actual output dtype and the flags.
if (is_out_defined) {
// Check for type promotion, if the output tensor is defined.
TORCH_CHECK(
TORCH_CHECK_TYPE(
canCast(out_dtype, result.scalar_type()),
"torch.cat(): input types can't be cast to the desired output type ",
result.scalar_type());
@ -293,7 +293,7 @@ TORCH_PRECOMPUTE_META_FUNC(cat)(const ITensorListRef& tensors, int64_t dim) {
// are compatible, i.e. we can execute `cat` on them.
bool found_valid_tensor = valid < materialized.size();
if (found_valid_tensor) {
TORCH_CHECK(
TORCH_CHECK_INDEX(
dim <= materialized[valid].get().dim(),
"torch.cat(): dimension ",
dim,
@ -384,7 +384,7 @@ Tensor& set_storage_cpu_(
result.unsafeGetTensorImpl()->set_storage_offset(storage_offset);
at::OptionalIntArrayRef stride_opt =
stride.data() != nullptr ? at::OptionalIntArrayRef(stride) : std::nullopt;
// We can re-use this kernel for the meta device.
// We can reuse this kernel for the meta device.
// We just need to make sure we don't actually try to resize the (null)
// storage.
at::native::resize_impl_cpu_(
@ -505,7 +505,7 @@ Tensor& set_cpu_(Tensor& result) {
return result;
}
// We can't re-use the cpu kernel here because we don't want to use the cpu
// We can't reuse the cpu kernel here because we don't want to use the cpu
// allocator.
Tensor& set_meta_(Tensor& result) {
caffe2::TypeMeta dtype = result.dtype();
@ -1904,7 +1904,7 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) {
}
Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) {
// If self.size() > len(reps), reps is promoted to self.size() by pre-pending
// If self.size() > len(reps), reps is promoted to self.size() by prepending
// 1s to it to keep the same behaviour as `numpy.tile`.
// Thus for a tensor of shape (2, 3, 4, 5), a dims of (2, 2) is treated
// as (1, 1, 2, 2).
@ -2428,7 +2428,7 @@ Tensor index_select_sparse_cpu(
const auto dim_indices = indices[dim].contiguous();
// If nnz is smaller than size, then either indices[dim] or index gets
// sorted, then this is followed by a binary search to find interesections.
// sorted, then this is followed by a binary search to find intersections.
const auto get_selected_indices_small_nnz_large_size =
[&]() -> std::tuple<Tensor, Tensor> {
const auto grain_size = at::internal::GRAIN_SIZE;
@ -3934,7 +3934,7 @@ Tensor squeeze_qtensor(const Tensor& self, c10::OptionalIntArrayRef dims) {
quantizer->scalar_type());
}
// TODO: quantized Tensor support for SymInt needs to be added but basic
// building blocs are missing for now.
// building blocks are missing for now.
auto result = make_qtensor(
self,
C10_AS_INTARRAYREF_SLOW(sizes),
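
The `torch.cat()` checks earlier in this file now use typed `TORCH_CHECK_VALUE` / `TORCH_CHECK_TYPE` / `TORCH_CHECK_INDEX` macros, which conventionally surface in Python as `ValueError` / `TypeError` / `IndexError`. A hedged sketch of the expected user-visible effect:

```python
import torch

try:
    torch.cat([])
except ValueError as e:        # expected after this change (TORCH_CHECK_VALUE)
    print("ValueError:", e)
except RuntimeError as e:      # builds without this change raise a bare RuntimeError
    print("RuntimeError:", e)
```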


@ -14,6 +14,12 @@
namespace at::native { namespace {
// fixes segfaults for GCC >= 12 on some AArch64 cpus https://github.com/pytorch/pytorch/issues/157626
#if defined(__GNUC__) && __GNUC__ >= 12 && defined(__aarch64__)
#pragma GCC push_options
#pragma GCC optimize ("no-strict-aliasing")
#endif
/** NOTE [ Grid Sample CPU Kernels ]
*
* Implementation of vectorized grid sample CPU kernels is divided into three
@ -1014,6 +1020,10 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bicubic,
}
};
#if defined(__GNUC__) && __GNUC__ >= 12 && defined(__aarch64__)
#pragma GCC pop_options
#endif
// ~~~~~~~~~~~~~~~~~~ grid_sample_2d_grid_slice_iterator ~~~~~~~~~~~~~~~~~~~~~~
// Function to apply a vectorized function on a grid slice tensor (without batch
// dimension).

File diff suppressed because it is too large.


@ -70,31 +70,4 @@ void run_cudnn_SDP_bprop(
const Tensor& dropoutseed,
const Tensor& dropoutoffset);
void run_cudnn_SDP_bprop_nestedtensor(
int64_t b,
int64_t h_q,
int64_t h_k,
int64_t h_v,
int64_t s_q,
int64_t s_kv,
int64_t d_qk,
int64_t d_v,
float scaling_factor,
bool is_causal,
float dropout_probability,
const Tensor& cum_seqlen_q,
const Tensor& cum_seqlen_kv,
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
const Tensor& o,
const Tensor& dO,
const Tensor& softmaxstats,
Tensor& dQ,
Tensor& dK,
Tensor& dV,
const Tensor& dropoutseed,
const Tensor& dropoutoffset);
} // namespace at::native


@ -160,6 +160,10 @@ static bool mkldnn_conv_enabled_fpmath_mode_bf16(){
mkldnn_bf16_device_check();
}
static bool mkldnn_conv_enabled_fpmath_mode_tf32(){
return at::globalContext().float32Precision("mkldnn", "conv") == "tf32" &&
cpuinfo_has_x86_amx_fp16();
}
static inline at::MemoryFormat mkldnn_convolution_memory_format(int64_t dims, bool is_channels_last) {
auto memory_format = at::MemoryFormat::Contiguous;
@ -271,6 +275,10 @@ static Tensor _mkldnn_convolution(
input_t.scalar_type() == at::kFloat) {
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
if (mkldnn_conv_enabled_fpmath_mode_tf32() &&
input_t.scalar_type() == at::kFloat) {
op_attr.set_fpmath_mode(dnnl_fpmath_mode_tf32);
}
_mkldnn_convolution_out(
input_t,
weight_t,
@ -455,6 +463,9 @@ Tensor mkldnn_convolution_pointwise_binary(
if (mkldnn_conv_enabled_fpmath_mode_bf16() && input_t.scalar_type() ==at::kFloat){
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
if (mkldnn_conv_enabled_fpmath_mode_tf32() && input_t.scalar_type() ==at::kFloat){
op_attr.set_fpmath_mode(dnnl_fpmath_mode_tf32);
}
if (bias.defined()) {
const ideep::tensor b = itensor_from_tensor(bias);
@ -597,6 +608,10 @@ Tensor& mkldnn_convolution_pointwise_binary_(
input_t.scalar_type() == at::kFloat) {
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
if (mkldnn_conv_enabled_fpmath_mode_tf32() &&
input_t.scalar_type() == at::kFloat) {
op_attr.set_fpmath_mode(dnnl_fpmath_mode_tf32);
}
_mkldnn_convolution_out(
input_t,
weight_t,
@ -718,6 +733,9 @@ Tensor _mkldnn_convolution_transpose(
if (mkldnn_conv_enabled_fpmath_mode_bf16() && input_t.scalar_type() ==at::kFloat){
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
if (mkldnn_conv_enabled_fpmath_mode_tf32() && input_t.scalar_type() ==at::kFloat){
op_attr.set_fpmath_mode(dnnl_fpmath_mode_tf32);
}
if (bias.defined()) {
const ideep::tensor b = itensor_from_tensor(bias, /*from_const_data_ptr*/true);
@ -808,6 +826,10 @@ Tensor mkldnn_convolution_backward_input(
weight.scalar_type() == at::kFloat) {
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
if (mkldnn_conv_enabled_fpmath_mode_tf32() &&
weight.scalar_type() == at::kFloat) {
op_attr.set_fpmath_mode(dnnl_fpmath_mode_tf32);
}
ideep::convolution_backward_data::compute_v2(
grad_y,
w,
@ -828,6 +850,11 @@ Tensor mkldnn_convolution_backward_input(
TORCH_WARN_ONCE(
"Unexpected ideep version to support fpmath_mode_bf16, please update ideep version to align with pytorch main branch");
}
if (mkldnn_conv_enabled_fpmath_mode_tf32() &&
weight.scalar_type() == at::kFloat) {
TORCH_WARN_ONCE(
"Unexpected ideep version to support fpmath_mode_tf32, please update ideep version to align with pytorch main branch");
}
#endif
if (grad_output.is_mkldnn()) {
@ -858,6 +885,10 @@ std::tuple<Tensor, Tensor> mkldnn_convolution_backward_weights(
input.scalar_type() == at::kFloat) {
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
if (mkldnn_conv_enabled_fpmath_mode_tf32() &&
input.scalar_type() == at::kFloat) {
op_attr.set_fpmath_mode(dnnl_fpmath_mode_tf32);
}
if (bias_defined) {
ideep::convolution_backward_weights::compute_v2(
x,
@ -1011,6 +1042,10 @@ Tensor mkldnn_convolution_transpose_backward_input(
weight.scalar_type() == at::kFloat) {
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
if (mkldnn_conv_enabled_fpmath_mode_tf32() &&
weight.scalar_type() == at::kFloat) {
op_attr.set_fpmath_mode(dnnl_fpmath_mode_tf32);
}
ideep::convolution_transpose_backward_data::compute_v3(
grad_y,
w,
@ -1053,6 +1088,10 @@ std::tuple<Tensor,Tensor> mkldnn_convolution_transpose_backward_weights(
input.scalar_type() == at::kFloat) {
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
if (mkldnn_conv_enabled_fpmath_mode_tf32() &&
input.scalar_type() == at::kFloat) {
op_attr.set_fpmath_mode(dnnl_fpmath_mode_tf32);
}
if (bias_defined) {
ideep::convolution_transpose_backward_weights::compute_v3(
x,
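
A hedged sketch of how the new oneDNN TF32 paths would be enabled from Python. The `torch.backends.mkldnn.*.fp32_precision` attribute names are assumptions that follow the `backends.<backend>.<op>.fp32_precision` pattern used elsewhere in this diff, and the reduced-precision path only engages when the CPU (AMX) and ideep build support it.

```python
import torch

# Assumed attribute names (mirroring torch.backends.cudnn.conv.fp32_precision);
# availability depends on the build and on AMX support in the CPU.
torch.backends.mkldnn.conv.fp32_precision = "tf32"
torch.backends.mkldnn.matmul.fp32_precision = "tf32"

x = torch.randn(8, 3, 32, 32)
conv = torch.nn.Conv2d(3, 16, 3)
y = conv(x)   # fp32 math may be implicitly downconverted to tf32 inside oneDNN
```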


@ -73,6 +73,11 @@ static bool use_mkldnn_bf32_linear() {
mkldnn_bf16_device_check();
}
static bool use_mkldnn_tf32_linear() {
return at::globalContext().float32Precision("mkldnn", "matmul") == "tf32" &&
cpuinfo_has_x86_amx_fp16();
}
Tensor mkldnn_linear(
const Tensor& self,
const Tensor& weight_t, const std::optional<Tensor>& bias_opt) {
@ -259,6 +264,9 @@ Tensor mkldnn_linear_pointwise(
if (use_mkldnn_bf32_linear() && input_t.scalar_type() == at::kFloat){
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
if (use_mkldnn_tf32_linear() && input_t.scalar_type() == at::kFloat){
op_attr.set_fpmath_mode(dnnl_fpmath_mode_tf32);
}
if (mkldnn_bias.has_value()) {
ideep::inner_product_forward::compute</*reorder_src=*/false, /*reorder_weight=*/false>(
mkldnn_input,
@ -352,6 +360,10 @@ Tensor mkldnn_linear_pointwise_binary(
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
}
if (use_mkldnn_tf32_linear() && input_t.scalar_type() == at::kFloat){
op_attr.set_fpmath_mode(dnnl_fpmath_mode_tf32);
}
if (mkldnn_bias.has_value()) {
ideep::inner_product_forward::compute_binary</*reorder_src=*/false, /*reorder_weight=*/false>(
mkldnn_input,


@ -1,7 +1,8 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Config.h>
#include <ATen/Context.h>
#include <ATen/Dispatch.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/mkldnn/Matmul.h>
#if !AT_MKLDNN_ENABLED()
@ -53,7 +54,7 @@ bool mkldnn_fp16_gemm(
c10::Half *c, int64_t ldc) {
return false;
}
bool mkldnn_bf32_gemm(
bool mkldnn_reduced_f32_gemm(
TransposeType transa, TransposeType transb,
int64_t m, int64_t n, int64_t k,
float alpha,
@ -85,6 +86,13 @@ void mkldnn_matmul_i8i8i32(
TORCH_INTERNAL_ASSERT(false, __func__, ": ATen not compiled with MKLDNN support");
}
bool use_mkldnn_tf32_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result) {
return false;
}
} // namespace at::native
@ -107,6 +115,10 @@ static bool use_mkldnn_bf32_matmul() {
return use_mkldnn_bf16_matmul() && at::globalContext().float32Precision("mkldnn", "matmul") == "bf16";
}
static bool use_mkldnn_tf32_matmul() {
return cpuinfo_has_x86_amx_fp16() && at::globalContext().float32Precision("mkldnn", "matmul") == "tf32";
}
// returns an ideep::tensor
// - dims: shape e.g: {M,N}
// - idtype: ideep data type e.g: (f32, bf16, f16)
@ -144,7 +156,8 @@ mkldnn_gemm(
bool bf16_usable = std::is_same_v<scalar_t, c10::BFloat16> && use_mkldnn_bf16_matmul();
bool fp16_usable = std::is_same_v<scalar_t, c10::Half> && use_mkldnn_fp16_matmul();
bool bf32_usable = std::is_same_v<scalar_t, float> && use_mkldnn_bf32_matmul();
if ( !(bf16_usable || fp16_usable || bf32_usable) ||
bool tf32_usable = std::is_same_v<scalar_t, float> && use_mkldnn_tf32_matmul();
if ( !(bf16_usable || fp16_usable || bf32_usable || tf32_usable) ||
(m * n * k <= 16 * 16 * 16) || (alpha == 0.0f)) {
return false;
}
@ -155,6 +168,7 @@ mkldnn_gemm(
op_attr = ideep::attr_t::fuse_sum();
}
if (bf32_usable) op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16); // bf32 path
if (tf32_usable) op_attr.set_fpmath_mode(dnnl_fpmath_mode_tf32); // tf32 path
// NOTE: View as c-contiguous to avoid extra reordering in mkldnn
// Use identity: C = AB <=> C^T = B^T A^T
@ -281,7 +295,7 @@ bool mkldnn_fp16_gemm(
return mkldnn_gemm<c10::Half>(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}
bool mkldnn_bf32_gemm(
bool mkldnn_reduced_f32_gemm(
TransposeType transa, TransposeType transb,
int64_t m, int64_t n, int64_t k,
float alpha,
@ -339,6 +353,7 @@ void mkldnn_matmul(
auto mat2_unsqueezed = mat2.dim() == 1 ? mat2.unsqueeze(1) : mat2;
auto result_unsqueezed = result.dim() == 1 ? result.unsqueeze(1) : result;
bool bf32_usable = mat1.scalar_type() == at::kFloat && use_mkldnn_bf32_matmul();
bool tf32_usable = mat1.scalar_type() == at::kFloat && use_mkldnn_tf32_matmul();
ideep::attr_t op_attr;
// "addmm", "addbmm" "baddbmm" in pytorch allow bias to be 2-D or 3-D tensor
@ -346,6 +361,7 @@ void mkldnn_matmul(
// to address their differences, we use mkldnn post ops to perform a fused "add" after matrix multiplication is over
if (beta != 0.0f) op_attr = ideep::attr_t::fuse_sum();
if (bf32_usable) op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16); // bf32 path
if (tf32_usable) op_attr.set_fpmath_mode(dnnl_fpmath_mode_tf32); // tf32 path
// If alpha = 0, dose not need actually do gemm computation
if (alpha == 0)
return;
@ -412,70 +428,56 @@ static inline bool checksize(const Tensor& mat1, const Tensor& mat2){
}
}
bool use_mkldnn_bf16_matmul(
template <typename T>
bool use_mkldnn_typed_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result) {
bool dtype_check = false;
if constexpr (std::is_same_v<T, c10::BFloat16>) {
#if defined(__aarch64__)
if (mkldnn_bf16_device_check_arm()) {
//onednn fastmath mode can leverage bf16 HW even for the fp32 input, e.g. Arm Neoverse V1
//so, don't restrict the mkldnn_matmul only for bf16 inputs, allow it for float as well
return (
use_mkldnn_bf16_matmul() &&
(mat1.scalar_type() == mat2.scalar_type()) && (!result.defined() || (mat1.scalar_type() == result.scalar_type())) &&
((mat1.scalar_type() == kFloat) || (mat1.scalar_type() == kBFloat16)) &&
mat1.numel() != 0 &&
mat2.numel() != 0 &&
checksize(mat1, mat2));
} else
if (mkldnn_bf16_device_check_arm()) {
// onednn fastmath mode can leverage bf16 HW even for the fp32 input, e.g.
// Arm Neoverse V1 so, don't restrict the mkldnn_matmul only for bf16
// inputs, allow it for float as well
dtype_check = use_mkldnn_bf16_matmul() &&
((mat1.scalar_type() == kFloat) || (mat1.scalar_type() == kBFloat16));
}
#else
dtype_check = dtype_check && use_mkldnn_bf16_matmul() &&
(mat1.scalar_type() == kBFloat16);
#endif
{
return (
use_mkldnn_bf16_matmul() &&
mat1.scalar_type() == kBFloat16 &&
mat2.scalar_type() == kBFloat16 &&
(!result.defined() || result.scalar_type() == kBFloat16) &&
mat1.numel() != 0 &&
mat2.numel() != 0 &&
checksize(mat1, mat2));
} else if constexpr (std::is_same_v<T, c10::Half>) {
dtype_check = dtype_check && use_mkldnn_fp16_matmul() &&
(mat1.scalar_type() == kHalf);
} else if constexpr (std::is_same_v<T, float>) {
dtype_check = dtype_check &&
(use_mkldnn_bf32_matmul() || use_mkldnn_tf32_matmul()) &&
(mat1.scalar_type() == kFloat);
}
}
bool use_mkldnn_fp16_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result) {
return (
use_mkldnn_fp16_matmul() &&
mat1.scalar_type() == kHalf &&
mat2.scalar_type() == kHalf &&
(!result.defined() || result.scalar_type() == kHalf) &&
mat1.numel() != 0 &&
mat2.numel() != 0 &&
checksize(mat1, mat2));
}
bool use_mkldnn_bf32_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result) {
return (
use_mkldnn_bf32_matmul() &&
mat1.scalar_type() == kFloat &&
mat2.scalar_type() == kFloat &&
(!result.defined() || result.scalar_type() == kFloat) &&
mat1.numel() != 0 &&
mat2.numel() != 0 &&
checksize(mat1, mat2));
if (!dtype_check) {
return false;
}
bool size_check =
mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2);
dtype_check = (mat1.scalar_type() == mat2.scalar_type()) &&
(!result.defined() || result.scalar_type() == mat1.scalar_type());
return dtype_check && size_check;
}
bool use_mkldnn_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result) {
return (use_mkldnn_bf16_matmul(mat1, mat2, result) || use_mkldnn_fp16_matmul(mat1, mat2, result) || use_mkldnn_bf32_matmul(mat1, mat2, result));
auto mat1_type = mat1.scalar_type();
if (mat1_type != kBFloat16 || mat1_type != kHalf || mat1_type != kFloat) {
return false;
}
AT_DISPATCH_FLOATING_TYPES_AND2(
kBFloat16, kHalf, mat1.scalar_type(), "use_mkldnn_matmul", [&] {
return use_mkldnn_typed_matmul<scalar_t>(mat1, mat2, result);
});
return false;
}
static void _mkldnn_matmul_i8i8i32_with_primitive(
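
Tying this back to the Context.cpp change earlier in the diff, a small sketch of the user-visible effect on CPU float32 matmul; whether the reduced-precision path actually engages depends on AMX support and the oneDNN build.

```python
import torch

a = torch.randn(256, 256)
b = torch.randn(256, 256)

# Per the Context.cpp change, "high" now maps the mkldnn matmul backend to tf32
# (previously ieee), while "medium" keeps mapping it to bf16.
torch.set_float32_matmul_precision("high")
y_reduced = a @ b

torch.set_float32_matmul_precision("highest")   # strict ieee fp32
y_ieee = a @ b

print((y_reduced - y_ieee).abs().max())  # small nonzero difference if the reduced path kicked in
```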


@ -29,6 +29,11 @@ bool use_mkldnn_bf32_matmul(
const Tensor& mat2,
const Tensor& result_opt);
bool use_mkldnn_tf32_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result_opt);
// Try running mkldnn optimized gemm, or returns false if naive gemm would be faster
bool mkldnn_bf16_gemm(
TransposeType transa, TransposeType transb,
@ -62,7 +67,7 @@ oneDNN implicit reduced precision arithmetic feature
https://github.com/mgouicem/oneDNN/tree/mgouicem/rfcs/implicit_downconvert/rfcs/20210301-computation-datatype
to allow implicitly cast data type from FP32 to BF16 in onednn compute primitives
*/
bool mkldnn_bf32_gemm(
bool mkldnn_reduced_f32_gemm(
TransposeType transa, TransposeType transb,
int64_t m, int64_t n, int64_t k,
float alpha,

Some files were not shown because too many files have changed in this diff.