[ghstack-poisoned]
Xuehai Pan
2025-07-11 15:05:13 +08:00
229 changed files with 5089 additions and 1410 deletions

View File

@ -4,7 +4,7 @@ set -eux -o pipefail
GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
export TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0"
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
fi
SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"

View File

@ -52,6 +52,8 @@ fi
if [[ "$image" == *-jammy* ]]; then
UBUNTU_VERSION=22.04
elif [[ "$image" == *-noble* ]]; then
UBUNTU_VERSION=24.04
elif [[ "$image" == *ubuntu* ]]; then
extract_version_from_image_name ubuntu UBUNTU_VERSION
fi
@ -230,8 +232,12 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-rocm-n-py3)
ANACONDA_PYTHON_VERSION=3.10
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-noble-rocm-n-py3)
if [[ $tag =~ "jammy" ]]; then
ANACONDA_PYTHON_VERSION=3.10
else
ANACONDA_PYTHON_VERSION=3.12
fi
GCC_VERSION=11
VISION=yes
ROCM_VERSION=6.4
@ -322,6 +328,8 @@ case "$tag" in
GCC_VERSION=11
ACL=yes
VISION=yes
CONDA_CMAKE=yes
OPENBLAS=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes
@ -331,6 +339,8 @@ case "$tag" in
GCC_VERSION=11
ACL=yes
VISION=yes
CONDA_CMAKE=yes
OPENBLAS=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes
@ -417,6 +427,7 @@ docker build \
--build-arg "XPU_VERSION=${XPU_VERSION}" \
--build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
--build-arg "ACL=${ACL:-}" \
--build-arg "OPENBLAS=${OPENBLAS:-}" \
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
--build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
-f $(dirname ${DOCKERFILE})/Dockerfile \

View File

@ -23,6 +23,10 @@ conda_install() {
as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $*
}
conda_install_through_forge() {
as_jenkins conda install -c conda-forge -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $*
}
conda_run() {
as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION --no-capture-output $*
}

View File

@ -15,6 +15,9 @@ install_ubuntu() {
elif [[ "$UBUNTU_VERSION" == "22.04"* ]]; then
cmake3="cmake=3.22*"
maybe_libiomp_dev=""
elif [[ "$UBUNTU_VERSION" == "24.04"* ]]; then
cmake3="cmake=3.28*"
maybe_libiomp_dev=""
else
cmake3="cmake=3.5*"
maybe_libiomp_dev="libiomp-dev"

View File

@ -70,10 +70,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
fi
# Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
if [[ $(uname -m) == "aarch64" ]]; then
conda_install "openblas==0.3.29=*openmp*"
else
conda_install "mkl=2021.4.0 mkl-include=2021.4.0"
if [[ $(uname -m) != "aarch64" ]]; then
pip_install mkl==2024.2.0
pip_install mkl-static==2024.2.0
pip_install mkl-include==2024.2.0
fi
# Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
@ -87,6 +87,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
conda_run ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION})
fi
if [[ "$UBUNTU_VERSION" == "24.04"* ]] ; then
conda_install_through_forge libstdcxx-ng=14
fi
# Install some other packages, including those needed for Python test reporting
pip_install -r /opt/conda/requirements-ci.txt

View File

@ -4,8 +4,9 @@
set -ex
cd /
git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.29}" --depth 1 --shallow-submodules
git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules
OPENBLAS_CHECKOUT_DIR="OpenBLAS"
OPENBLAS_BUILD_FLAGS="
NUM_THREADS=128
USE_OPENMP=1
@ -13,9 +14,8 @@ NO_SHARED=0
DYNAMIC_ARCH=1
TARGET=ARMV8
CFLAGS=-O3
BUILD_BFLOAT16=1
"
OPENBLAS_CHECKOUT_DIR="OpenBLAS"
make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR}
make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR}

View File

@ -8,9 +8,11 @@ ver() {
install_ubuntu() {
apt-get update
if [[ $UBUNTU_VERSION == 20.04 ]]; then
# gpg-agent is not available by default on 20.04
apt-get install -y --no-install-recommends gpg-agent
# gpg-agent is not available by default
apt-get install -y --no-install-recommends gpg-agent
if [[ $(ver $UBUNTU_VERSION) -ge $(ver 22.04) ]]; then
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \
| sudo tee /etc/apt/preferences.d/rocm-pin-600
fi
apt-get install -y kmod
apt-get install -y wget
@ -85,13 +87,14 @@ EOF
VER_STR=6.3
fi
# clr build needs CppHeaderParser but can only find it using conda's python
/opt/conda/bin/python -m pip install CppHeaderParser
python -m pip install CppHeaderParser
git clone https://github.com/ROCm/HIP -b $HIP_BRANCH
HIP_COMMON_DIR=$(readlink -f HIP)
git clone https://github.com/jeffdaily/clr -b release/rocm-rel-${VER_STR}${VER_PATCH}-statco-hotfix
mkdir -p clr/build
pushd clr/build
cmake .. -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR
# Need to point CMake to the correct python installation to find CppHeaderParser
cmake .. -DPython3_EXECUTABLE=/opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}/bin/python3 -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR
make -j
cp hipamd/lib/libamdhip64.so.${VER_STR}.* /opt/rocm/lib/libamdhip64.so.${VER_STR}.*
popd

View File

@ -41,7 +41,7 @@ case ${image} in
GPU_IMAGE=arm64v8/almalinux:8
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1"
MANY_LINUX_VERSION="2_28_aarch64"
OPENBLAS_VERSION="v0.3.29"
OPENBLAS_VERSION="v0.3.30"
;;
manylinuxcxx11-abi-builder:cpu-cxx11-abi)
TARGET=final

View File

@ -16,6 +16,7 @@ click
#test that import:
coremltools==5.0b5 ; python_version < "3.12"
coremltools==8.3 ; python_version == "3.12"
#Description: Apple framework for ML integration
#Pinned versions: 5.0b5
#test that import:
@ -63,6 +64,7 @@ lark==0.12.0
#test that import:
librosa>=0.6.2 ; python_version < "3.11"
librosa==0.10.2 ; python_version == "3.12"
#Description: A python package for music and audio analysis
#Pinned versions: >=0.6.2
#test that import: test_spectral_ops.py
@ -111,6 +113,7 @@ ninja==1.11.1.3
numba==0.49.0 ; python_version < "3.9"
numba==0.55.2 ; python_version == "3.9"
numba==0.55.2 ; python_version == "3.10"
numba==0.60.0 ; python_version == "3.12"
#Description: Just-In-Time Compiler for Numerical Functions
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
#test that import: test_numba_integration.py
@ -360,10 +363,11 @@ pwlf==2.2.1
# To build PyTorch itself
astunparse
PyYAML
pyyaml
pyzstd
setuptools
six
wheel
scons==4.5.2 ; platform_machine == "aarch64"

View File

@ -5,7 +5,7 @@ sphinx==5.3.0
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought is probably
# something related to Docker setup. We can investigate this later
# something related to Docker setup. We can investigate this later.
sphinxcontrib.katex==0.8.6
#Description: This is used to generate PyTorch docs

View File

@ -147,6 +147,12 @@ RUN if [ -n "${ACL}" ]; then bash ./install_acl.sh; fi
RUN rm install_acl.sh
ENV INSTALLED_ACL ${ACL}
ARG OPENBLAS
COPY ./common/install_openblas.sh install_openblas.sh
RUN if [ -n "${OPENBLAS}" ]; then bash ./install_openblas.sh; fi
RUN rm install_openblas.sh
ENV INSTALLED_OPENBLAS ${OPENBLAS}
# Install ccache/sccache (do this last, so we get priority in PATH)
ARG SKIP_SCCACHE_INSTALL
COPY ./common/install_cache.sh install_cache.sh

View File

@ -104,7 +104,7 @@ if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr
fi
echo "Calling setup.py install at $(date)"
echo "Calling 'python -m pip install .' at $(date)"
if [[ $LIBTORCH_VARIANT = *"static"* ]]; then
STATIC_CMAKE_FLAG="-DTORCH_STATIC=1"
@ -120,7 +120,7 @@ fi
# TODO: Remove this flag once https://github.com/pytorch/pytorch/issues/55952 is closed
CFLAGS='-Wno-deprecated-declarations' \
BUILD_LIBTORCH_CPU_WITH_DEBUG=1 \
python setup.py install
python -m pip install --no-build-isolation -v .
mkdir -p libtorch/{lib,bin,include,share}

View File

@ -185,7 +185,7 @@ torchbench_setup_macos() {
}
pip_benchmark_deps() {
python -mpip install --no-input astunparse requests cython scikit-learn
python -mpip install --no-input requests cython scikit-learn six
}

View File

@ -436,11 +436,11 @@ test_inductor_aoti() {
python3 tools/amd_build/build_amd.py
fi
if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python setup.py develop)
BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python -m pip install --no-build-isolation -v -e .)
# TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
else
BUILD_COMMAND=(python setup.py develop)
BUILD_COMMAND=(python -m pip install --no-build-isolation -v -e .)
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
fi
@ -1579,7 +1579,7 @@ test_operator_benchmark() {
test_inductor_set_cpu_affinity
cd benchmarks/operator_benchmark/pt_extension
python setup.py install
python -m pip install .
cd "${TEST_DIR}"/benchmarks/operator_benchmark
$TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \

View File

@ -42,7 +42,7 @@ call choco upgrade -y cmake --no-progress --installargs 'ADD_CMAKE_TO_PATH=Syste
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
call pip install mkl-include==2021.4.0 mkl-devel==2021.4.0
call pip install mkl==2024.2.0 mkl-static==2024.2.0 mkl-include==2024.2.0
if errorlevel 1 goto fail
if not errorlevel 0 goto fail

View File

@ -61,8 +61,8 @@ You are now all set to start developing with PyTorch in a DevContainer environme
## Step 8: Build PyTorch
To build pytorch from source, simply run:
```
python setup.py develop
```bash
python -m pip install --no-build-isolation -v -e .
```
The process involves compiling thousands of files and can take a long time. Fortunately, the compiled objects are reused in later builds: when you modify some files, only the changed files need to be recompiled the next time.

View File

@ -1 +1 @@
70caf76066ef2c1054d6128b11769dc816a779e7
6c57850358f34c47802db216b0746e4e9d08a95a

View File

@ -6,7 +6,7 @@ set -euxo pipefail
cd llm-target-determinator
pip install -q -r requirements.txt
cd ../codellama
pip install -e .
pip install --no-build-isolation -v -e .
pip install numpy==1.26.0
# Run indexer

View File

@ -131,6 +131,9 @@ jobs:
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
instructions: |
Build is done inside the container; to start an interactive session, run:
docker exec -it $(docker container ps --format '{{.ID}}') bash
# [pytorch repo ref]
# Use a pytorch/pytorch reference instead of a reference to the local

View File

@ -88,6 +88,14 @@ jobs:
pkill "${PROCESS}" || true
done
- name: Clean up brew miniconda, if installed
continue-on-error: true
run: |
if brew list miniconda; then
brew uninstall miniconda
echo "REINSTALL_BREW_MINICONDA=1" >> "${GITHUB_ENV}"
fi
- name: Clean up leftover local python3 site-packages on MacOS pet runner
continue-on-error: true
run: |
@ -268,6 +276,14 @@ jobs:
workflow_attempt: ${{github.run_attempt}}
local_path: usage_log.txt
- name: Reinstall brew miniconda, if was installed
if: always()
continue-on-error: true
run: |
if [[ -n "$REINSTALL_BREW_MINICONDA" ]]; then
brew install miniconda
fi
- name: Clean up disk space
if: always()
continue-on-error: true

View File

@ -63,6 +63,7 @@ jobs:
pytorch-linux-jammy-py3.13-clang12,
pytorch-linux-jammy-rocm-n-1-py3,
pytorch-linux-jammy-rocm-n-py3,
pytorch-linux-noble-rocm-n-py3,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12,
pytorch-linux-jammy-py3.9-gcc11,
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,

View File

@ -36,15 +36,15 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-rocm-py3_10-build:
linux-noble-rocm-py3_12-build:
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: linux-jammy-rocm-py3.10-mi300
name: linux-noble-rocm-py3.12-mi300
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10-mi300
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
build-environment: linux-noble-rocm-py3.12-mi300
docker-image-name: ci-image:pytorch-linux-noble-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
@ -57,17 +57,17 @@ jobs:
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
linux-noble-rocm-py3_12-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10-mi300
name: linux-noble-rocm-py3.12-mi300
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- linux-noble-rocm-py3_12-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10-mi300
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
build-environment: linux-noble-rocm-py3.12-mi300
docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
secrets: inherit

View File

@ -231,7 +231,8 @@ include_patterns = [
'c10/**/*.cpp',
'c10/**/*.h',
'torch/*.h',
'torch/_inductor/codegen/aoti_runtime/interface.cpp',
'torch/_inductor/codegen/aoti_runtime/*.h',
'torch/_inductor/codegen/aoti_runtime/*.cpp',
'torch/csrc/*.h',
'torch/csrc/*.cpp',
'torch/csrc/**/*.h',
@ -1476,6 +1477,31 @@ init_command = [
'tomli==2.2.1 ; python_version < "3.11"',
]
[[linter]]
code = 'CMAKE_MINIMUM_REQUIRED'
command = [
'python3',
'tools/linter/adapters/cmake_minimum_required_linter.py',
'--',
'@{{PATHSFILE}}'
]
include_patterns = [
"**/pyproject.toml",
"**/CMakeLists.txt",
"**/CMakeLists.txt.in",
"**/*.cmake",
"**/*.cmake.in",
"**/*requirements*.txt",
"**/*requirements*.in",
]
init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'packaging==25.0',
'tomli==2.2.1 ; python_version < "3.11"',
]
[[linter]]
code = 'COPYRIGHT'
include_patterns = ['**']

View File

@ -88,20 +88,19 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
* If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below.
* When installing with `python setup.py develop` (in contrast to `python setup.py install`) Python runtime will use
* When installing with `python -m pip install -e .` (in contrast to `python -m pip install .`) Python runtime will use
the current local source-tree when importing `torch` package. (This is done by creating [`.egg-link`](https://wiki.python.org/moin/PythonPackagingTerminology#egg-link) file in `site-packages` folder)
This way you do not need to repeatedly install after modifying Python files (`.py`).
However, you would need to reinstall if you modify Python interface (`.pyi`, `.pyi.in`) or
non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...).
However, you would need to reinstall if you modify Python interface (`.pyi`, `.pyi.in`) or non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...).
One way to avoid running `python setup.py develop` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac,
One way to avoid running `python -m pip install -e .` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac,
is to create a symbolic link from the `build` folder to `torch/lib`, for example, by issuing the following:
```bash
pushd torch/lib; sh -c "ln -sf ../../build/lib/libtorch_cpu.* ."; popd
pushd torch/lib; sh -c "ln -sf ../../build/lib/libtorch_cpu.* ."; popd
```
Afterwards rebuilding a library (for example to rebuild `libtorch_cpu.so` issue `ninja torch_cpu` from `build` folder),
would be sufficient to make change visible in `torch` package.
Afterwards rebuilding a library (for example to rebuild `libtorch_cpu.so` issue `ninja torch_cpu` from `build` folder),
would be sufficient to make change visible in `torch` package.
To reinstall, first uninstall all existing PyTorch installs. You may need to run `pip
@ -115,9 +114,9 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
pip uninstall torch
```
Next run `python setup.py clean`. After that, you can install in `develop` mode again.
Next run `python setup.py clean`. After that, you can install in editable mode again.
* If you run into errors when running `python setup.py develop`, here are some debugging steps:
* If you run into errors when running `python -m pip install -e .`, here are some debugging steps:
1. Run `printf '#include <stdio.h>\nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure
your CMake works and can compile this simple Hello World program without errors.
2. Nuke your `build` directory. The `setup.py` script compiles binaries into the `build` folder and caches many
@ -130,13 +129,20 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
git clean -xdf
python setup.py clean
git submodule update --init --recursive
python setup.py develop
python -m pip install -r requirements.txt
python -m pip install --no-build-isolation -v -e .
```
4. The main step within `python setup.py develop` is running `make` from the `build` directory. If you want to
4. The main step within `python -m pip install -e .` is running `cmake --build build` from the `build` directory. If you want to
experiment with some environment variables, you can pass them into the command:
```bash
ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* python setup.py develop
ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* CMAKE_FRESH=1 python -m pip install --no-build-isolation -v -e .
```
5. Try installing PyTorch without build isolation by adding `--no-build-isolation` to the `pip install` command.
This will use the current environment's packages instead of creating a new isolated environment for the build.
```bash
python -m pip install --no-build-isolation -v -e .
```
* If you run into issue running `git submodule update --init --recursive`. Please try the following:
- If you encounter an error such as
@ -639,9 +645,9 @@ can be selected interactively with your mouse to zoom in on a particular part of
the program execution timeline. The `--native` command-line option tells
`py-spy` to record stack frame entries for PyTorch C++ code. To get line numbers
for C++ code it may be necessary to compile PyTorch in debug mode by prepending
your `setup.py develop` call to compile PyTorch with `DEBUG=1`. Depending on
your operating system it may also be necessary to run `py-spy` with root
privileges.
your `python -m pip install -e .` call to compile PyTorch with `DEBUG=1`.
Depending on your operating system it may also be necessary to run `py-spy` with
root privileges.
`py-spy` can also work in an `htop`-like "live profiling" mode and can be
tweaked to adjust the stack sampling rate, see the `py-spy` readme for more
@ -649,7 +655,7 @@ details.
## Managing multiple build trees
One downside to using `python setup.py develop` is that your development
One downside to using `python -m pip install -e .` is that your development
version of PyTorch will be installed globally on your account (e.g., if
you run `import torch` anywhere else, the development version will be
used).
@ -663,7 +669,7 @@ specific build of PyTorch. To set one up:
python -m venv pytorch-myfeature
source pytorch-myfeature/bin/activate # or `& .\pytorch-myfeature\Scripts\Activate.ps1` on Windows
# if you run python now, torch will NOT be installed
python setup.py develop
python -m pip install --no-build-isolation -v -e .
```
## C++ development tips
@ -701,7 +707,9 @@ variables `DEBUG`, `USE_DISTRIBUTED`, `USE_MKLDNN`, `USE_CUDA`, `USE_FLASH_ATTEN
For example:
```bash
DEBUG=1 USE_DISTRIBUTED=0 USE_MKLDNN=0 USE_CUDA=0 BUILD_TEST=0 USE_FBGEMM=0 USE_NNPACK=0 USE_QNNPACK=0 USE_XNNPACK=0 python setup.py develop
DEBUG=1 USE_DISTRIBUTED=0 USE_MKLDNN=0 USE_CUDA=0 BUILD_TEST=0 \
USE_FBGEMM=0 USE_NNPACK=0 USE_QNNPACK=0 USE_XNNPACK=0 \
python -m pip install --no-build-isolation -v -e .
```
For subsequent builds (i.e., when `build/CMakeCache.txt` exists), the build
@ -711,7 +719,7 @@ options.
### Code completion and IDE support
When using `python setup.py develop`, PyTorch will generate
When using `python -m pip install -e .`, PyTorch will generate
a `compile_commands.json` file that can be used by many editors
to provide command completion and error highlighting for PyTorch's
C++ code. You need to `pip install ninja` to generate accurate
@ -772,7 +780,7 @@ If not, you can define these variables on the command line before invoking `setu
export CMAKE_C_COMPILER_LAUNCHER=ccache
export CMAKE_CXX_COMPILER_LAUNCHER=ccache
export CMAKE_CUDA_COMPILER_LAUNCHER=ccache
python setup.py develop
python -m pip install --no-build-isolation -v -e .
```
#### Use a faster linker
@ -785,7 +793,7 @@ If you are editing a single file and rebuilding in a tight loop, the time spent
Starting with CMake 3.29, you can specify the linker type using the [`CMAKE_LINKER_TYPE`](https://cmake.org/cmake/help/latest/variable/CMAKE_LINKER_TYPE.html) variable. For example, with `mold` installed:
```sh
CMAKE_LINKER_TYPE=MOLD python setup.py develop
CMAKE_LINKER_TYPE=MOLD python -m pip install --no-build-isolation -v -e .
```
#### Use pre-compiled headers
@ -797,7 +805,7 @@ setting `USE_PRECOMPILED_HEADERS=1` either on first setup, or in the
`CMakeCache.txt` file.
```sh
USE_PRECOMPILED_HEADERS=1 python setup.py develop
USE_PRECOMPILED_HEADERS=1 python -m pip install --no-build-isolation -v -e .
```
This adds a build step where the compiler takes `<ATen/ATen.h>` and essentially
@ -820,7 +828,7 @@ A compiler-wrapper to fix this is provided in `tools/nvcc_fix_deps.py`. You can
this as a compiler launcher, similar to `ccache`
```bash
export CMAKE_CUDA_COMPILER_LAUNCHER="python;`pwd`/tools/nvcc_fix_deps.py;ccache"
python setup.py develop
python -m pip install --no-build-isolation -v -e .
```
### Rebuild few files with debug information
@ -1171,7 +1179,7 @@ build_with_asan()
CFLAGS="-fsanitize=address -fno-sanitize-recover=all -shared-libasan -pthread" \
CXX_FLAGS="-pthread" \
USE_CUDA=0 USE_OPENMP=0 USE_DISTRIBUTED=0 DEBUG=1 \
python setup.py develop
python -m pip install --no-build-isolation -v -e .
}
run_with_asan()

View File

@ -57,7 +57,7 @@ RUN --mount=type=cache,target=/opt/ccache \
export eval ${CMAKE_VARS} && \
TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
python setup.py install
python -m pip install --no-build-isolation -v .
FROM conda as conda-installs
ARG PYTHON_VERSION=3.11

View File

@ -228,6 +228,7 @@ If you want to disable Intel GPU support, export the environment variable `USE_X
Other potentially useful environment variables may be found in `setup.py`.
#### Get the PyTorch Source
```bash
git clone https://github.com/pytorch/pytorch
cd pytorch
@ -279,24 +280,29 @@ conda install -c conda-forge libuv=1.39
```
#### Install PyTorch
**On Linux**
If you're compiling for AMD ROCm then first run this command:
```bash
# Only run this if you're compiling for ROCm
python tools/amd_build/build_amd.py
```
Install PyTorch
```bash
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
python setup.py develop
python -m pip install -r requirements.txt
python -m pip install --no-build-isolation -v -e .
```
**On macOS**
```bash
python3 setup.py develop
python -m pip install -r requirements.txt
python -m pip install --no-build-isolation -v -e .
```
**On Windows**
@ -308,7 +314,7 @@ If you want to build legacy python code, please refer to [Building on legacy cod
In this mode PyTorch computations will run on your CPU, not your GPU.
```cmd
python setup.py develop
python -m pip install --no-build-isolation -v -e .
```
Note on OpenMP: The desired OpenMP implementation is Intel OpenMP (iomp). In order to link against iomp, you'll need to manually download the library and set up the building environment by tweaking `CMAKE_INCLUDE_PATH` and `LIB`. The instruction [here](https://github.com/pytorch/pytorch/blob/main/docs/source/notes/windows.rst#building-from-source) is an example for setting up both MKL and Intel OpenMP. Without these configurations for CMake, Microsoft Visual C OpenMP runtime (vcomp) will be used.
@ -329,7 +335,6 @@ Additional libraries such as
You can refer to the [build_pytorch.bat](https://github.com/pytorch/pytorch/blob/main/.ci/pytorch/win-test-helpers/build_pytorch.bat) script for some other environment variables configurations
```cmd
cmd
@ -349,8 +354,7 @@ for /f "usebackq tokens=*" %i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\
:: [Optional] If you want to override the CUDA host compiler
set CUDAHOSTCXX=C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\HostX64\x64\cl.exe
python setup.py develop
python -m pip install --no-build-isolation -v -e .
```
**Intel GPU builds**
@ -372,7 +376,7 @@ if defined CMAKE_PREFIX_PATH (
set "CMAKE_PREFIX_PATH=%CONDA_PREFIX%\Library"
)
python setup.py develop
python -m pip install --no-build-isolation -v -e .
```
##### Adjust Build Options (Optional)
@ -382,6 +386,7 @@ the following. For example, adjusting the pre-detected directories for CuDNN or
with such a step.
On Linux
```bash
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
CMAKE_ONLY=1 python setup.py build
@ -389,6 +394,7 @@ ccmake build # or cmake-gui build
```
On macOS
```bash
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ CMAKE_ONLY=1 python setup.py build

View File

@ -131,69 +131,25 @@ uint64_t CPUGeneratorImpl::seed() {
/**
* Sets the internal state of CPUGeneratorImpl. The new internal state
* must be a strided CPU byte tensor and of the same size as either
* CPUGeneratorImplStateLegacy (for legacy CPU generator state) or
* CPUGeneratorImplState (for new state).
*
* FIXME: Remove support of the legacy state in the future?
* must be a strided CPU byte tensor and of the same size as CPUGeneratorImplState.
*/
void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
using detail::CPUGeneratorImplState;
using detail::CPUGeneratorImplStateLegacy;
static_assert(std::is_standard_layout_v<CPUGeneratorImplStateLegacy>, "CPUGeneratorImplStateLegacy is not a PODType");
static_assert(std::is_standard_layout_v<CPUGeneratorImplState>, "CPUGeneratorImplState is not a PODType");
static const size_t size_legacy = sizeof(CPUGeneratorImplStateLegacy);
static const size_t size_current = sizeof(CPUGeneratorImplState);
static_assert(size_legacy != size_current, "CPUGeneratorImplStateLegacy and CPUGeneratorImplState can't be of the same size");
constexpr size_t size = sizeof(CPUGeneratorImplState);
detail::check_rng_state(new_state);
at::mt19937 engine;
auto float_normal_sample = std::optional<float>();
auto double_normal_sample = std::optional<double>();
// Construct the state of at::CPUGeneratorImpl based on input byte tensor size.
CPUGeneratorImplStateLegacy* legacy_pod{nullptr};
auto new_state_size = new_state.numel();
if (new_state_size == size_legacy) {
legacy_pod = (CPUGeneratorImplStateLegacy*)new_state.data();
// Note that in CPUGeneratorImplStateLegacy, we didn't have float version
// of normal sample and hence we leave the std::optional<float> as is
// Update next_double_normal_sample.
// Note that CPUGeneratorImplStateLegacy stores two uniform values (normal_x, normal_y)
// and a rho value (normal_rho). These three values were redundant and in the new
// DistributionsHelper.h, we store the actual extra normal sample, rather than three
// intermediate values.
if (legacy_pod->normal_is_valid) {
auto r = legacy_pod->normal_rho;
auto theta = 2.0 * c10::pi<double> * legacy_pod->normal_x;
// we return the sin version of the normal sample when in caching mode
double_normal_sample = std::optional<double>(r * ::sin(theta));
}
} else if (new_state_size == size_current) {
auto rng_state = (CPUGeneratorImplState*)new_state.data();
legacy_pod = &rng_state->legacy_pod;
// update next_float_normal_sample
if (rng_state->is_next_float_normal_sample_valid) {
float_normal_sample = std::optional<float>(rng_state->next_float_normal_sample);
}
// Update next_double_normal_sample.
// Note that in getRNGState, we now return the actual normal sample in normal_y
// and if it's valid in normal_is_valid. The redundant normal_x and normal_rho
// are squashed to 0.0.
if (legacy_pod->normal_is_valid) {
double_normal_sample = std::optional<double>(legacy_pod->normal_y);
}
} else {
TORCH_CHECK(false, "Expected either a CPUGeneratorImplStateLegacy of size ", size_legacy,
" or a CPUGeneratorImplState of size ", size_current,
" but found the input RNG state size to be ", new_state_size);
}
TORCH_CHECK(new_state_size == size, "Expected a CPUGeneratorImplState of size ", size,
" but found the input RNG state size to be ", new_state_size);
auto rng_state = new_state.data_ptr_impl<CPUGeneratorImplState>();
auto legacy_pod = &(rng_state->legacy_pod);
// construct engine_
// Note that CPUGeneratorImplStateLegacy stored a state array of 64 bit uints, whereas in our
// redefined mt19937, we have changed to a state array of 32 bit uints. Hence, we are
@ -207,8 +163,12 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
engine.set_data(rng_data);
TORCH_CHECK(engine.is_valid(), "Invalid mt19937 state");
this->engine_ = engine;
this->next_float_normal_sample_ = float_normal_sample;
this->next_double_normal_sample_ = double_normal_sample;
this->next_float_normal_sample_ = rng_state->is_next_float_normal_sample_valid
? std::optional<float>(rng_state->next_float_normal_sample)
: std::optional<float>();
this->next_double_normal_sample_ = legacy_pod->normal_is_valid
? std::optional<double>(legacy_pod->normal_y)
: std::optional<double>();
}
/**
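For orientation, the Python-level generator API goes through this code path. Below is a small hedged sketch of the round trip: the state tensor is the byte view of `CPUGeneratorImplState`, and with the legacy-size path removed only states of the current size are accepted.
```python
import torch

g = torch.Generator()            # CPU generator backed by CPUGeneratorImpl
state = g.get_state()            # strided CPU uint8 tensor holding the state bytes
assert state.dtype == torch.uint8

_ = torch.rand(3, generator=g)   # advance the engine
g.set_state(state)               # restore; the tensor size must match exactly
```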

View File

@ -431,7 +431,8 @@ class TORCH_API Context {
at::SDPBackend::flash_attention,
at::SDPBackend::efficient_attention,
at::SDPBackend::math,
at::SDPBackend::cudnn_attention};
at::SDPBackend::cudnn_attention,
at::SDPBackend::overrideable};
bool enabled_flashSDP = true;
bool enabled_mem_efficientSDP = true;
bool enabled_mathSDP = true;

View File

@ -26,9 +26,7 @@ inline void infer_size_impl(
std::optional<int64_t> infer_dim;
for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) {
if (TORCH_GUARD_OR_FALSE(sym_eq(shape[dim], -1))) {
if (infer_dim) {
throw std::runtime_error("only one dimension can be inferred");
}
TORCH_CHECK(!infer_dim, "only one dimension can be inferred");
infer_dim = dim;
} else {
// in case of unbacked shape[dim] we assume it's not -1 and add a runtime
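For reference, a small Python sketch of the user-visible behavior this check backs (the error text comes from the `TORCH_CHECK` above):
```python
import torch

x = torch.arange(12)
print(x.view(3, -1).shape)   # torch.Size([3, 4]); the single -1 is inferred

try:
    x.view(-1, -1)           # two inferred dimensions are rejected
except RuntimeError as e:
    print(e)                 # "only one dimension can be inferred"
```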

View File

@ -214,7 +214,7 @@ inline Tensor applySlice(
"step must be greater than zero");
// See NOTE [nested tensor size for indexing]
if (self_sizes.has_value()) {
if (self_sizes.has_value() && self_sizes.value().size() > 0) {
// Skip this optimization if we are tracing, as the trace may be polymorphic
// over the shape of the `self` tensor, and we still want to record
// the slice.
@ -223,7 +223,7 @@ inline Tensor applySlice(
: self.sym_size(dim);
if (!disable_slice_optimization &&
TORCH_STATICALLY_KNOWN_TRUE(start.sym_eq(0)) &&
TORCH_STATICALLY_KNOWN_TRUE(length.sym_eq(stop)) && step == 1) {
TORCH_STATICALLY_KNOWN_TRUE(length.sym_le(stop)) && step == 1) {
return self;
}
}

View File

@ -59,9 +59,7 @@ struct TORCH_API Generator {
explicit Generator(c10::intrusive_ptr<c10::GeneratorImpl> gen_impl)
: impl_(std::move(gen_impl)) {
if (impl_.get() == nullptr) {
throw std::runtime_error("GeneratorImpl with nullptr is not supported");
}
TORCH_CHECK(impl_, "GeneratorImpl with nullptr is not supported");
}
bool operator==(const Generator& rhs) const {

View File

@ -98,9 +98,7 @@ class TORCH_API TensorBase {
explicit TensorBase(
c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl)
: impl_(std::move(tensor_impl)) {
if (impl_.get() == nullptr) {
throw std::runtime_error("TensorImpl with nullptr is not supported");
}
TORCH_CHECK(impl_, "TensorImpl with nullptr is not supported");
}
TensorBase(const TensorBase&) = default;
TensorBase(TensorBase&&) noexcept = default;

View File

@ -68,11 +68,10 @@ Symbol InternedStrings::_symbol(const std::string& s) {
return it->second;
auto pos = s.find("::");
if (pos == std::string::npos) {
std::stringstream ss;
ss << "all symbols must have a namespace, <namespace>::<string>, but found: " << s;
throw std::runtime_error(ss.str());
}
TORCH_CHECK(
pos != std::string::npos,
"all symbols must have a namespace, <namespace>::<string>, but found: ",
s);
Symbol ns = _symbol("namespaces::" + s.substr(0, pos));
Symbol sym(sym_to_info_.size());
@ -121,12 +120,11 @@ std::string Symbol::domainString() const {
}
Symbol Symbol::fromDomainAndUnqualString(const std::string & d, const std::string & s) {
if (d.compare(0, domain_prefix().size(), domain_prefix()) != 0) {
std::ostringstream ss;
ss << "Symbol: domain string is expected to be prefixed with '"
<< domain_prefix() << "', e.g. 'org.pytorch.aten'";
throw std::runtime_error(ss.str());
}
TORCH_CHECK(
d.compare(0, domain_prefix().size(), domain_prefix()) == 0,
"Symbol: domain string is expected to be prefixed with '",
domain_prefix(),
"', e.g. 'org.pytorch.aten'");
std::string qualString = d.substr(domain_prefix().size()) + "::" + s;
return fromQualString(qualString);
}

View File

@ -406,8 +406,7 @@ size_t IValue::hash(const IValue& v) {
case Tag::Enum:
case Tag::Stream:
case Tag::Uninitialized:
throw std::runtime_error(
"unhashable type: '" + v.type()->repr_str() + "'");
TORCH_CHECK(false, "unhashable type: '" + v.type()->repr_str() + "'");
}
// the above switch should be exhaustive
TORCH_INTERNAL_ASSERT(false, "we should never reach here")

View File

@ -116,10 +116,9 @@ struct SingleElementType : public SharedType {
protected:
SingleElementType(TypePtr elem) : SharedType(Kind), elem(std::move(elem)) {
if (!this->elem) {
throw std::runtime_error(c10::str(
"Can not create ", typeKindToString(Kind), " with None type"));
}
TORCH_CHECK(
this->elem,
c10::str("Can not create ", typeKindToString(Kind), " with None type"));
}
private:
@ -416,16 +415,12 @@ struct TORCH_API SymbolicShape {
}
ShapeSymbol operator[](size_t i) const {
if (!dims_) {
throw std::runtime_error("Rank isn't fixed");
}
TORCH_CHECK(dims_, "Rank isn't fixed");
return (*dims_).at(i);
}
ShapeSymbol at(size_t i) const {
if (!dims_) {
throw std::runtime_error("Rank isn't fixed");
}
TORCH_CHECK(dims_, "Rank isn't fixed");
return (*dims_).at(i);
}
@ -520,9 +515,7 @@ struct VaryingShape {
}
const std::optional<T> &operator[](size_t i) const {
if (!dims_) {
throw std::runtime_error("Rank isn't fixed");
}
TORCH_CHECK(dims_, "Rank isn't fixed");
return (*dims_).at(i);
}
@ -957,9 +950,7 @@ struct TORCH_API DictType : public SharedType {
TypePtr createWithContained(
std::vector<TypePtr> contained_types) const override {
if (contained_types.size() != 2) {
throw std::runtime_error("Expected 2 contained types");
}
TORCH_CHECK(contained_types.size() == 2, "Expected 2 contained types");
return create(std::move(contained_types.at(0)), std::move(contained_types.at(1)));
}

View File

@ -826,9 +826,7 @@ TupleType::TupleType(
: NamedType(TypeKind::TupleType, std::move(name)),
elements_(std::move(elements)),
has_free_variables_(std::any_of(elements_.begin(), elements_.end(), [](const TypePtr& v) {
if (!v) {
throw std::runtime_error("Can not create tuple with None type");
}
TORCH_CHECK(v, "Can not create tuple with None type");
return v->hasFreeVariables();
})), schema_(std::move(schema)) {

View File

@ -163,6 +163,9 @@ class Vectorized<BFloat16> {
Vectorized<BFloat16> exp_u20() const {
return exp();
}
Vectorized<BFloat16> fexp_u20() const {
return exp();
}
Vectorized<BFloat16> fmod(const Vectorized<BFloat16>& q) const;
Vectorized<BFloat16> hypot(const Vectorized<BFloat16>& b) const;
Vectorized<BFloat16> i0() const;

View File

@ -249,6 +249,9 @@ class Vectorized<double> {
Vectorized<double> exp_u20() const {
return exp();
}
Vectorized<double> fexp_u20() const {
return exp();
}
Vectorized<double> fmod(const Vectorized<double>& q) const {USE_SLEEF(
{ return Vectorized<double>(Sleef_fmoddx_sve(values, q)); },
{

View File

@ -314,6 +314,9 @@ class Vectorized<float> {
Vectorized<float> exp_u20() const {
return exp();
}
Vectorized<float> fexp_u20() const {
return exp();
}
Vectorized<float> fmod(const Vectorized<float>& q) const {USE_SLEEF(
{ return Vectorized<float>(Sleef_fmodfx_sve(values, q)); },
{

View File

@ -308,6 +308,9 @@ class Vectorized<float> {
Vectorized<float> exp_u20() const {
return exp();
}
Vectorized<float> fexp_u20() const {
return exp();
}
DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(
fmod,
Sleef_fmodf4)

View File

@ -206,6 +206,10 @@ struct Vectorized16 {
return static_cast<const Derived*>(this)->map_with_vec_float_method(
&Vectorized<float>::exp_u20);
}
Derived fexp_u20() const {
return static_cast<const Derived*>(this)->map_with_vec_float_method(
&Vectorized<float>::exp_u20);
}
Derived fmod(const Derived& q) const {
// This function is questionable with a conversion, so we use map2
return map2(q, std::fmod);

View File

@ -488,6 +488,9 @@ class Vectorized16 {
Vectorized<T> expm1() const {
return map(Sleef_expm1f8_u10);
}
Vectorized<T> fexp_u20() const {
return exp();
}
Vectorized<T> exp_u20() const {
return exp();
}

View File

@ -198,6 +198,9 @@ class Vectorized<double> {
Vectorized<double> exp_u20() const {
return exp();
}
Vectorized<double> fexp_u20() const {
return exp();
}
Vectorized<double> fmod(const Vectorized<double>& q) const {
return Vectorized<double>(Sleef_fmodd4(values, q));
}

View File

@ -1,5 +1,4 @@
#pragma once
// DO NOT DEFINE STATIC DATA IN THIS HEADER!
// See Note [Do not compile initializers with AVX]
@ -256,6 +255,63 @@ class Vectorized<float> {
Vectorized<float> expm1() const {
return Vectorized<float>(Sleef_expm1f8_u10(values));
}
Vectorized<float> fexp_u20() const {
const __m256 vec_c0 = _mm256_set1_ps(0.00010703434948458272f);
const __m256 vec_c1 = _mm256_set1_ps(0.30354260500649682f);
const __m256 vec_c2 = _mm256_set1_ps(-0.22433836478672356);
const __m256 vec_c3 = _mm256_set1_ps(-0.079204240219773236);
const __m256 vec_exp_log2ef =
_mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e)
const __m256 vec_a = _mm256_set1_ps(std::pow(2, 23) / std::log2(2));
const __m256 vec_b = _mm256_set1_ps(std::pow(2, 23) * 127.f);
const __m256 vec_ln_flt_min =
_mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50));
const __m256 vec_ln_flt_max =
_mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218));
const __m256 vec_inf = _mm256_set1_ps(INFINITY);
const __m256 zero = _mm256_setzero_ps();
// exp(x) = 2**(x * log2(e))
// = 2**xi * 2**xf - TIP: we use the IEEE floating-point
// representation, splitting the result into the exponent and the
// mantissa
// 2**xf will be approximated by a polynomial of degree 3 computed with
// the Horner method
// compute the min/max for the mask
// Masks
__m256 mask_too_small =
_mm256_cmp_ps(values, vec_ln_flt_min, _CMP_LT_OS); // x < min
__m256 mask_too_large =
_mm256_cmp_ps(values, vec_ln_flt_max, _CMP_GT_OS); // x > max
// transformation with log2(e)
auto vec_src = _mm256_mul_ps(values, vec_exp_log2ef);
auto vec_fractional = _mm256_sub_ps(vec_src, _mm256_floor_ps(vec_src));
// compute polynomial using Horner Scheme
auto vec_res = _mm256_fmadd_ps(vec_fractional, vec_c3, vec_c2);
vec_res = _mm256_fmadd_ps(vec_fractional, vec_res, vec_c1);
vec_res = _mm256_fmadd_ps(vec_fractional, vec_res, vec_c0);
vec_src = _mm256_sub_ps(vec_src, vec_res);
// the trick is here, headache in perspective
auto tmp = _mm256_fmadd_ps(vec_a, vec_src, vec_b);
// headache bis
__m256i casted_integer = _mm256_cvttps_epi32(tmp);
// bitwise to float for the final transformation
auto result = _mm256_castsi256_ps(casted_integer);
// boundary condition
// Set to 0 where x < ln(FLT_MIN)
result = _mm256_blendv_ps(result, zero, mask_too_small);
// Set to +inf where x > ln(FLT_MAX)
result = _mm256_blendv_ps(result, vec_inf, mask_too_large);
// final interpretation to float
return result;
}
Vectorized<float> exp_u20() const {
// A faster version of exp with ULP=20
const __m256 vec_factorial_1 =
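To make the bit-level trick in `fexp_u20` above easier to follow, here is a rough scalar model in Python. It is only an illustration, assuming nothing beyond the constants visible in the kernel above, and is not the code that ships; results differ slightly from both the SIMD kernel and `math.exp`.
```python
import math
import struct

# Coefficients and bounds copied from the vectorized kernel above.
C0 = 0.00010703434948458272
C1 = 0.30354260500649682
C2 = -0.22433836478672356
C3 = -0.079204240219773236
LOG2E = math.log2(math.e)                      # 0x3fb8aa3b
LN_FLT_MIN = math.log(2.0 ** -126)             # 0xc2aeac50
LN_FLT_MAX = math.log(3.4028234663852886e38)   # 0x42b17218

def fexp_u20_scalar(x: float) -> float:
    """Scalar illustration of the reduced-precision fast exp."""
    if x < LN_FLT_MIN:
        return 0.0
    if x > LN_FLT_MAX:
        return math.inf
    t = x * LOG2E                          # exp(x) = 2**t
    frac = t - math.floor(t)               # fractional part of the exponent
    # Degree-3 Horner polynomial correcting 2**frac.
    poly = ((C3 * frac + C2) * frac + C1) * frac + C0
    t -= poly
    # Write t straight into the float's exponent/mantissa bits:
    # bits = 2**23 * t + 2**23 * 127, truncated, then reinterpreted as float.
    bits = int(2.0 ** 23 * t + 2.0 ** 23 * 127.0)
    return struct.unpack("f", struct.pack("I", bits))[0]

print(fexp_u20_scalar(1.0), math.exp(1.0))  # ~2.718 vs 2.71828...; accuracy traded for speed
```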

View File

@ -121,27 +121,52 @@ typename std::enable_if_t<
}
template <typename T>
typename std::enable_if_t<
std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
at::vec::Vectorized<
T>> inline convert_float_to_int8(at::vec::Vectorized<float> src) {
at::vec::Vectorized<T> inline convert_float_to_int8(
at::vec::Vectorized<float> src);
template <>
at::vec::Vectorized<int8_t> inline convert_float_to_int8(
at::vec::Vectorized<float> src) {
// Convert from float32 to int32 with truncation
__m256i x_values_int32 = _mm256_cvttps_epi32(src);
// Convert from int32 to int16 using signed saturation
__m256i xy_packed_v = _mm256_packs_epi32(x_values_int32, x_values_int32);
constexpr auto min_val = std::numeric_limits<T>::min();
constexpr auto max_val = std::numeric_limits<T>::max();
constexpr auto min_val = std::numeric_limits<int8_t>::min();
constexpr auto max_val = std::numeric_limits<int8_t>::max();
// Convert from int16 to uint8/int8 using unsigned saturation
__m256i xyzw_clamped_v =
pack_saturate_and_clamp<T>(xy_packed_v, xy_packed_v, min_val, max_val);
// Convert from int16 to int8 using unsigned saturation
__m256i xyzw_clamped_v = pack_saturate_and_clamp<int8_t>(
xy_packed_v, xy_packed_v, min_val, max_val);
__m256i permute_mask_v =
_mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
return _mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v);
}
template <>
at::vec::Vectorized<uint8_t> inline convert_float_to_int8(
at::vec::Vectorized<float> src) {
// The type of *_val should be int32_t to ensure correct clamping behavior.
constexpr auto min_val = std::numeric_limits<int32_t>::min();
constexpr auto max_val = std::numeric_limits<int32_t>::max();
__m256 float32_min_val = _mm256_set1_ps(float(min_val));
__m256 float32_max_val = _mm256_set1_ps(float(max_val));
__m256 float32_src = _mm256_max_ps(src, float32_min_val);
float32_src = _mm256_min_ps(float32_src, float32_max_val);
__m256i truncated_src = _mm256_cvttps_epi32(float32_src);
__m128i r1 = _mm256_castsi256_si128(truncated_src);
__m128i mask = _mm_setr_epi8(
0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m128i r1_shuffled = _mm_shuffle_epi8(r1, mask);
__m128i r2 = _mm256_extractf128_si256(truncated_src, 1);
__m128i r2_shuffled = _mm_shuffle_epi8(r2, mask);
__m128i result = _mm_unpacklo_epi32(r1_shuffled, r2_shuffled);
return _mm256_castsi128_si256(result);
}
template <typename T>
__FORCE_INLINE void QuantizeAvx2(
const float* src,

View File

@ -273,6 +273,9 @@ class Vectorized<double> {
Vectorized<double> C10_ALWAYS_INLINE exp_u20() const {
return exp();
}
Vectorized<double> C10_ALWAYS_INLINE fexp_u20() const {
return exp();
}
Vectorized<double> lgamma() const __ubsan_ignore_undefined__ {
return {Sleef_lgammad2_u10(_vec0), Sleef_lgammad2_u10(_vec1)};

View File

@ -352,6 +352,9 @@ class Vectorized<float> {
Vectorized<float> C10_ALWAYS_INLINE exp_u20() const {
return exp();
}
Vectorized<float> C10_ALWAYS_INLINE fexp_u20() const {
return exp();
}
Vectorized<float> C10_ALWAYS_INLINE log() const {
return {Sleef_logf4_u10(_vec0), Sleef_logf4_u10(_vec1)};

View File

@ -1023,6 +1023,9 @@ struct Vectorized<T, std::enable_if_t<is_zarch_implemented<T>()>> {
Vectorized<T> exp_u20() const {
return exp();
}
Vectorized<T> fexp_u20() const {
return exp();
}
Vectorized<T> log() const {
return mapSleef(Sleef_logf4_u10, Sleef_logd2_u10);

View File

@ -535,6 +535,9 @@ class Vectorized16 {
Vectorized<T> expm1() const {
return map(Sleef_expm1f16_u10);
}
Vectorized<T> fexp_u20() const {
return exp();
}
Vectorized<T> exp_u20() const {
return exp();
}

View File

@ -221,6 +221,9 @@ class Vectorized<double> {
Vectorized<double> exp_u20() const {
return exp();
}
Vectorized<double> fexp_u20() const {
return exp();
}
Vectorized<double> fmod(const Vectorized<double>& q) const {
return Vectorized<double>(Sleef_fmodd8(values, q));
}

View File

@ -310,6 +310,60 @@ class Vectorized<float> {
Vectorized<float> expm1() const {
return Vectorized<float>(Sleef_expm1f16_u10(values));
}
Vectorized<float> fexp_u20() const {
const __m512 vec_c0 = _mm512_set1_ps(0.00010703434948458272f);
const __m512 vec_c1 = _mm512_set1_ps(0.30354260500649682f);
const __m512 vec_c2 = _mm512_set1_ps(-0.22433836478672356);
const __m512 vec_c3 = _mm512_set1_ps(-0.079204240219773236);
const __m512 vec_exp_log2ef =
_mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e)
const __m512 vec_a = _mm512_set1_ps(std::pow(2, 23) / std::log2(2));
const __m512 vec_b = _mm512_set1_ps(std::pow(2, 23) * 127.f);
const __m512 vec_ln_flt_min =
_mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50));
const __m512 vec_ln_flt_max =
_mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));
__m512i vec_infinity = _mm512_set1_epi32(0x7F800000);
__m512i vec_zero = _mm512_setzero_epi32();
// Fast Exponential Computation on SIMD Architectures
// A. Cristiano I. Malossi, Yves Ineichen, Costas Bekas, and Alessandro Curioni
// exp(x) = 2**(x * log2(e))
// = 2**xi * 2**xf - TIP: we use the IEEE floating-point
// representation, splitting the result into the exponent and the
// mantissa
// 2**xf will be approximated by a polynomial of degree 3 computed with
// the Horner method
// mask for the boundary condition
auto min_mask = _mm512_cmp_ps_mask(values, vec_ln_flt_min, _CMP_LT_OS);
auto max_mask = _mm512_cmp_ps_mask(values, vec_ln_flt_max, _CMP_GT_OS);
// transformation with log2(e)
auto vec_src = _mm512_mul_ps(values, vec_exp_log2ef);
auto vec_fractional = _mm512_sub_ps(vec_src, _mm512_floor_ps(vec_src));
// compute polynomial using Horner Scheme, for superscalar processor
auto vec_res = _mm512_fmadd_ps(vec_fractional, vec_c3, vec_c2);
vec_res = _mm512_fmadd_ps(vec_fractional, vec_res, vec_c1);
vec_res = _mm512_fmadd_ps(vec_fractional, vec_res, vec_c0);
vec_src = _mm512_sub_ps(vec_src, vec_res);
// the trick is here, headache in perspective
auto tmp = _mm512_fmadd_ps(vec_a, vec_src, vec_b);
// headache bis - we lose precision with the cast but it "fits", which is
// fine after the later f32 -> f16 conversion
__m512i casted_integer = _mm512_cvttps_epi32(tmp);
// boundary condition, lower than the min -> 0
casted_integer = _mm512_mask_mov_epi32(casted_integer, min_mask, vec_zero);
// boundary condition, larger than the max -> +oo
casted_integer =
_mm512_mask_mov_epi32(casted_integer, max_mask, vec_infinity);
// final interpretation to float
return _mm512_castsi512_ps(casted_integer);
}
Vectorized<float> exp_u20() const {
// A faster version of exp with ULP=20
const __m512 vec_factorial_1 =

View File

@ -123,22 +123,24 @@ typename std::enable_if_t<
}
template <typename T>
typename std::enable_if_t<
std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
at::vec::Vectorized<
T>> inline convert_float_to_int8(at::vec::Vectorized<float> src) {
at::vec::Vectorized<T> inline convert_float_to_int8(
at::vec::Vectorized<float> src);
template <>
at::vec::Vectorized<int8_t> inline convert_float_to_int8(
at::vec::Vectorized<float> src) {
// Convert from float32 to int32 with truncation
__m512i x_values_int32 = _mm512_cvttps_epi32(src);
// Convert from int32 to int16 using signed saturation
__m512i xy_packed_v = _mm512_packs_epi32(x_values_int32, x_values_int32);
constexpr auto min_val = std::numeric_limits<T>::min();
constexpr auto max_val = std::numeric_limits<T>::max();
constexpr auto min_val = std::numeric_limits<int8_t>::min();
constexpr auto max_val = std::numeric_limits<int8_t>::max();
// Convert from int16 to uint8/int8 using unsigned saturation
__m512i xyzw_clamped_v =
pack_saturate_and_clamp<T>(xy_packed_v, xy_packed_v, min_val, max_val);
// Convert from int16 to int8 using unsigned saturation
__m512i xyzw_clamped_v = pack_saturate_and_clamp<int8_t>(
xy_packed_v, xy_packed_v, min_val, max_val);
__m512i permute_mask_v = _mm512_set_epi32(
0x0f,
0x0b,
@ -159,6 +161,21 @@ typename std::enable_if_t<
return _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v);
}
template <>
at::vec::Vectorized<uint8_t> inline convert_float_to_int8(
at::vec::Vectorized<float> src) {
// The type of *_val should be int32_t to ensure correct clamping behavior.
constexpr auto min_val = std::numeric_limits<int32_t>::min();
constexpr auto max_val = std::numeric_limits<int32_t>::max();
__m512 float32_min_val = _mm512_set1_ps(float(min_val));
__m512 float32_max_val = _mm512_set1_ps(float(max_val));
__m512 float32_src = _mm512_max_ps(src, float32_min_val);
float32_src = _mm512_min_ps(float32_src, float32_max_val);
__m512i int32_src_clamped = _mm512_cvttps_epi32(float32_src);
__m128i int8_src = _mm512_cvtepi32_epi8(int32_src_clamped);
return _mm512_castsi128_si512(int8_src);
}
template <typename T>
__FORCE_INLINE void QuantizeAvx512(
const float* src,

View File

@ -238,9 +238,6 @@ struct Vectorized {
Vectorized vector;
int_same_size_t<T> buffer[size()];
mask.store(buffer);
#if defined(__clang__) && __ARM_FEATURE_SVE
#pragma clang loop vectorize(disable)
#endif
for (const auto i : c10::irange(size())) {
if (buffer[i] & 0x01) {
vector[i] = b[i];
@ -547,6 +544,9 @@ struct Vectorized {
Vectorized<T> exp_u20() const {
return map(std::exp);
}
Vectorized<T> fexp_u20() const {
return map(std::exp);
}
Vectorized<T> frac() const {
return *this - this->trunc();
}

View File

@ -263,6 +263,7 @@ class VectorizedN {
VECTORIZEDN_DEFINE_UNARY_OP(exp2)
VECTORIZEDN_DEFINE_UNARY_OP(expm1)
VECTORIZEDN_DEFINE_UNARY_OP(exp_u20)
VECTORIZEDN_DEFINE_UNARY_OP(fexp_u20)
VECTORIZEDN_DEFINE_UNARY_OP(frac)
VECTORIZEDN_DEFINE_BINARY_OP(fmod)
VECTORIZEDN_DEFINE_UNARY_OP(log)

View File

@ -94,9 +94,10 @@ static std::vector<std::optional<Tensor>> batchIndices(
if (index.has_value() && index->sym_numel() != 0) {
const auto idx_bdim = indices_bdims[i];
indices_.emplace_back(maybePadToLogicalRank(moveBatchDimToFront(index.value(), idx_bdim), idx_bdim, maxLogicalRank));
if (index.value().dtype() == kBool && indices_bdims[i].has_value()) {
throw std::runtime_error("vmap: We do not support batching operators that can support dynamic shape. Attempting to batch over indexing with a boolean mask.");
}
TORCH_CHECK(
index.value().dtype() != kBool || !indices_bdims[i].has_value(),
"vmap: We do not support batching operators that can support ",
"dynamic shape. Attempting to batch over indexing with a boolean mask.");
} else {
indices_.push_back(index);
}

View File

@ -16,11 +16,14 @@ template<typename O, typename C>
static void _assert_match(const O& original, const C& compared, const std::string& name) {
if (compared) {
bool equal = (original == compared.value());
if (!equal) {
std::stringstream msg;
msg << "Tensor " << name << " mismatch! Expected: " << compared.value() << ", Got: " << original;
throw std::runtime_error(msg.str());
}
TORCH_CHECK(
equal,
"Tensor ",
name,
" mismatch! Expected: ",
compared.value(),
", Got: ",
original);
}
}

View File

@ -424,6 +424,14 @@ Tensor _dirichlet_grad_cpu(const Tensor& x, const Tensor& alpha, const Tensor& t
*/
Tensor _s_binomial_cpu(const Tensor& count, const Tensor& prob, std::optional<Generator> gen) {
TORCH_CHECK_VALUE(
at::isFloatingType(count.scalar_type()),
"binomial only supports floating-point dtypes for count, got: ",
count.scalar_type());
TORCH_CHECK_VALUE(
at::isFloatingType(prob.scalar_type()),
"binomial only supports floating-point dtypes for prob, got: ",
prob.scalar_type());
Tensor ret = at::zeros(count.sizes(), count.options());
auto iter = TensorIteratorConfig()
.add_output(ret)
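A hedged sketch of the user-facing effect in Python; the exact error type and text depend on how `TORCH_CHECK_VALUE` surfaces (typically as `ValueError`).
```python
import torch

count = torch.tensor([10.0, 20.0])   # floating point, as required
prob = torch.tensor([0.5, 0.25])
print(torch.binomial(count, prob))

try:
    torch.binomial(torch.tensor([10, 20]), prob)  # integer count is now rejected up front
except (ValueError, RuntimeError) as e:
    print(e)
```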

View File

@ -180,9 +180,7 @@ TORCH_IMPL_FUNC(triu_cpu)(const Tensor& self, int64_t k, const Tensor &result) {
}
Tensor trace_backward_symint(const Tensor& grad, c10::SymIntArrayRef sizes) {
if (sizes.size() != 2) {
throw std::runtime_error("expected matrix input");
}
TORCH_CHECK(sizes.size() == 2, "expected matrix input");
auto grad_input = at::zeros_symint(sizes[0] * sizes[1], grad.options());
auto indices = at::arange(0, grad_input.numel(), sizes[1] + 1, grad.options().dtype(at::kLong));
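For orientation, a rough Python mirror of this backward construction (the helper name is hypothetical; it simply scatters the incoming gradient onto the diagonal, as the C++ above does):
```python
import torch

def trace_backward_ref(grad, sizes):
    # Hypothetical reference: scatter `grad` onto the diagonal of a
    # zero matrix of the given 2-D shape.
    assert len(sizes) == 2, "expected matrix input"
    rows, cols = sizes
    grad_input = torch.zeros(rows * cols, dtype=grad.dtype)
    idx = torch.arange(0, grad_input.numel(), cols + 1)
    grad_input[idx] = grad
    return grad_input.view(rows, cols)

x = torch.eye(3, requires_grad=True)
x.trace().backward()
print(torch.allclose(x.grad, trace_backward_ref(torch.tensor(1.0), x.shape)))  # True
```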

View File

@ -62,7 +62,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
virtual std::optional<at::Tensor> bias() = 0;
virtual void set_bias(const std::optional<at::Tensor>& bias) {
throw std::runtime_error(
TORCH_CHECK(
false,
"set_bias is not implemented for this packed "
"parameter type");
}

View File

@ -96,7 +96,14 @@ inline void _exp_reduce_sum_fusion_kernel(
for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) {
auto tmp0 = vec::Vectorized<T1>::loadu(a + i);
auto tmp1 = tmp0 - vec_max;
auto tmp2 = tmp1.exp_u20();
Vectorized<T1> tmp2;
if constexpr (std::is_same_v<T1, float> &&
(std::is_same_v<T2, at::BFloat16> || std::is_same_v<T2, at::Half>))
{
tmp2 = tmp1.fexp_u20();
} else {
tmp2 = tmp1.exp_u20();
}
vec_tmp_sum += tmp2;
_store(out + i, tmp2);
}

View File

@ -48,12 +48,7 @@ __global__ void prepare_grouped_gemm_data(
int32_t start = tid == 0 ? 0 : offs[tid - 1];
delta = offs[tid] - start;
if (K < 0) {
if (!a_row_major && b_row_major) {
CUDA_KERNEL_ASSERT(delta >=0 && "expected offsets to be greater or equal 0\n");
} else {
// CUTLASS cannot handle delta=0 here.
CUDA_KERNEL_ASSERT(delta >0 && "expected offsets to be greater than 0\n");
}
CUDA_KERNEL_ASSERT(delta >=0 && "expected offsets to be greater or equal 0\n");
}
// TMA transfers require global memory tensor addresses to be

View File

@ -337,6 +337,7 @@ Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization,
#include <cmath>
#include <mkl_dfti.h>
#include <mkl_version.h>
#include <ATen/mkl/Exceptions.h>
#include <ATen/mkl/Descriptors.h>
#include <ATen/mkl/Limits.h>
@ -479,6 +480,19 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes,
const auto value_type = c10::toRealValueType(input.scalar_type());
out.resize_(batched_out_sizes, MemoryFormat::Contiguous);
// fix mkl issue
// https://github.com/pytorch/pytorch/issues/154477
#ifdef INTEL_MKL_VERSION
#if INTEL_MKL_VERSION > 20210400L
for (const auto& stride : input.strides()) {
if (stride == 0) {
input = input.clone(MemoryFormat::Contiguous);
break;
}
}
#endif
#endif
auto descriptor = _plan_mkl_fft(
input.strides(), out.strides(), signal_size, input.is_complex(),
out.is_complex(), normalization, forward, value_type);
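For context, a hedged sketch of the kind of zero-stride input this workaround targets (see pytorch/pytorch#154477); with the clone above, the MKL descriptor no longer sees a zero stride.
```python
import torch

x = torch.randn(8, 1).expand(8, 16)  # expanded view: stride (1, 0)
print(x.stride())
y = torch.fft.fft(x, dim=-1)         # previously could trip newer MKL versions
print(y.shape)
```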

View File

@ -79,14 +79,16 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) {
// 2. Math fallback
auto& ctx = at::globalContext();
// use overrideable linked to onednn as overrideable implementation
if (!ctx.userEnabledMathSDP() && !ctx.userEnabledOverrideableSDP()) {
if (!ctx.userEnabledMathSDP() && !ctx.userEnabledOverrideableSDP() &&
!ctx.userEnabledFlashSDP()) {
return sdp::SDPBackend::error;
}
// Get ideal kernel ordering
const std::array<sdp::SDPBackend, 2> priority_order{
const std::array<sdp::SDPBackend, 3> priority_order{
sdp::SDPBackend::overrideable,
sdp::SDPBackend::math,
sdp::SDPBackend::flash_attention,
};
// Because TORCH_CHECK checks if the condition is true, we negate debug so that
@ -105,6 +107,14 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) {
return sdp::SDPBackend::math;
}
break;
case sdp::SDPBackend::flash_attention:
if (ctx.userEnabledFlashSDP() &&
use_overrideable_xpu(kernel_params, print_debug)) {
TORCH_WARN(
"Flash Attention is not supported on XPU, falling back to overrideable kernel.");
return sdp::SDPBackend::overrideable;
}
break;
default:
TORCH_CHECK(false, "Invalid backend");
}
@ -141,7 +151,7 @@ int64_t _fused_sdp_choice_xpu(
TORCH_CHECK(
false,
"No viable backend for scaled_dot_product_attention was found. ",
"This is likely due to turning off both the math kernel and the fused kernels.");
"This is likely due to turning off both the math kernel and the overrideable kernels.");
}
return static_cast<int64_t>(backend);
}
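A hedged usage sketch of the new fallback behavior, assuming an XPU-enabled build with a visible device (backend names from `torch.nn.attention`):
```python
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

q = k = v = torch.randn(2, 4, 128, 64, device="xpu", dtype=torch.float16)

# Requesting only flash attention on XPU now warns and falls back to the
# overrideable (oneDNN) kernel instead of erroring out.
with sdpa_kernel([SDPBackend.FLASH_ATTENTION]):
    out = F.scaled_dot_product_attention(q, k, v)
```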

View File

@ -1,8 +0,0 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/TensorFactory.h>
#include <c10/core/ScalarType.h>
#include <unordered_map>
using namespace at::mps;

View File

@ -18,8 +18,6 @@
#include <ATen/native/Resize.h>
#include <ATen/native/TensorAdvancedIndexing.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/operations/Indexing.h>
#include <c10/core/QScheme.h>
#include <c10/util/SmallVector.h>
#include <c10/util/irange.h>
#include <fmt/format.h>

View File

@ -746,7 +746,7 @@ inline std::tuple<bool, Tensor, Tensor> NestedTensor_compute_size_stride(
}
else if (size_reshaped == -1) {
if (infer_index > -1) {
throw std::runtime_error("only one dimension can be inferred");
TORCH_CHECK(false, "only one dimension can be inferred");
}
else {
infer_index = idim;

View File

@ -19,7 +19,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
double /*output_scale*/,
int64_t /*output_zero_point*/,
at::Tensor& output) {
throw std::runtime_error(
TORCH_CHECK(
false,
"apply_out is not implemented for this packed "
"parameter type");
return output;
@ -30,7 +31,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
double /*output_scale*/,
int64_t /*output_zero_point*/,
at::Tensor& output) {
throw std::runtime_error(
TORCH_CHECK(
false,
"apply_relu_out is not implemented for this packed "
"parameter type");
return output;
@ -55,7 +57,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
at::Tensor input,
double input_scale,
int64_t input_zero_point) {
throw std::runtime_error(
TORCH_CHECK(
false,
"apply_with_input_q_dq_qweight_dq_output_fp32 is not implemented for this packed "
"parameter type");
return {};
@ -79,7 +82,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
at::Tensor input,
double input_scale,
int64_t input_zero_point) {
throw std::runtime_error(
TORCH_CHECK(
false,
"apply_with_input_q_dq_qweight_dq_relu_output_fp32 is not implemented for this packed "
"parameter type");
return {};
@ -96,7 +100,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
const at::Tensor& /* input */,
at::Tensor& output,
bool /* reduce_range */) {
throw std::runtime_error(
TORCH_CHECK(
false,
"apply_dynamic_out is not implemented for this packed "
"parameter type");
return output;
@ -105,7 +110,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
const at::Tensor& /* input */,
at::Tensor& output,
bool /* reduce_range */) {
throw std::runtime_error(
TORCH_CHECK(
false,
"apply_dynamic_relu_out is not implemented for this packed "
"parameter type");
return output;
@ -116,7 +122,8 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
virtual std::optional<at::Tensor> bias() = 0;
virtual void set_bias(std::optional<at::Tensor> /*bias*/) {
throw std::runtime_error(
TORCH_CHECK(
false,
"set_bias is not implemented for this packed "
"parameter type");
}

View File

@ -843,6 +843,11 @@ SDPBackend select_sdp_backend(sdp_params const& kernel_params) {
return SDPBackend::math;
}
break;
case SDPBackend::overrideable:
if (ctx.userEnabledOverrideableSDP()) {
TORCH_CHECK(false, "Invalid backend");
}
break;
default:
TORCH_CHECK(false, "Invalid backend");
}

View File

@ -8,12 +8,12 @@ It also provides mechanisms to compare PyTorch with other frameworks.
Make sure you're on a machine with CUDA, torchvision, and pytorch installed. Install in the following order:
```
# Install torchvision. It comes with the pytorch stable release binary
pip3 install torch torchvision
python -m pip install torch torchvision
# Install the latest pytorch master from source.
# It should supersede the installation from the release binary.
cd $PYTORCH_HOME
python setup.py build develop
python -m pip install --no-build-isolation -v -e .
# Check the pytorch installation version
python -c "import torch; print(torch.__version__)"

View File

@ -27,7 +27,7 @@ pull-deps: clone-deps
(cd ../../../torchbenchmark && git fetch && git checkout "$$(cat ../pytorch/.github/ci_commit_pins/torchbench.txt)" && git submodule update --init --recursive)
build-deps: clone-deps
uv pip install astunparse numpy scipy ninja pyyaml mkl mkl-include setuptools cmake \
uv pip install numpy scipy ninja pyyaml six mkl mkl-include setuptools wheel cmake \
typing-extensions requests protobuf numba cython scikit-learn librosa
(cd ../../../torchvision && uv pip install -e . --no-build-isolation)
(cd ../../../torchdata && uv pip install -e .)

View File

@ -210,7 +210,7 @@ mobilenet_v2,pass,0
mobilenet_v2_quantized_qat,pass,2
mobilenet_v2_quantized_qat,pass,3
@ -274,7 +274,7 @@ resnet50,pass,0
resnet50_quantized_qat,pass,2
resnet50_quantized_qat,pass,3


View File

@ -210,7 +210,7 @@ mobilenet_v2,pass,0
mobilenet_v2_quantized_qat,pass,2
mobilenet_v2_quantized_qat,pass,3
@ -274,7 +274,7 @@ resnet50,pass,0
resnet50_quantized_qat,pass,2
resnet50_quantized_qat,pass,3


View File

@ -210,7 +210,7 @@ mobilenet_v2,pass,0
mobilenet_v2_quantized_qat,pass,2
mobilenet_v2_quantized_qat,pass,3
@ -274,7 +274,7 @@ resnet50,pass,0
resnet50_quantized_qat,pass,2
resnet50_quantized_qat,pass,3


View File

@ -194,7 +194,7 @@ mobilenet_v2,pass,0
mobilenet_v2_quantized_qat,pass,2
mobilenet_v2_quantized_qat,pass,3
@ -258,7 +258,7 @@ resnet50,pass,0
resnet50_quantized_qat,pass,2
resnet50_quantized_qat,pass,3


View File

@ -210,7 +210,7 @@ mobilenet_v2,pass,0
mobilenet_v2_quantized_qat,pass,2
mobilenet_v2_quantized_qat,pass,3
@ -274,7 +274,7 @@ resnet50,pass,0
resnet50_quantized_qat,pass,2
resnet50_quantized_qat,pass,3


View File

@ -17,8 +17,8 @@ export DEBUG=0
export OMP_NUM_THREADS=10
# Compile pytorch with the base revision
git checkout master
python setup.py develop
git checkout main
python -m pip install --no-build-isolation -v -e .
# Install dependencies:
# Scipy is required by detr
@ -32,7 +32,7 @@ python functional_autograd_benchmark.py --output before.txt
# Compile pytorch with your change
popd
git checkout your_feature_branch
python setup.py develop
python -m pip install --no-build-isolation -v -e .
# Run the benchmark for the new version
pushd benchmarks/functional_autograd_benchmark

View File

@ -20,7 +20,7 @@ Key Features:
The instruction below installs a cpp\_extension for PyTorch and it is required to run the benchmark suite.
```bash
cd pt_extension
python setup.py install
python -m pip install .
```
## How to run the benchmarks:

View File

@ -11,7 +11,7 @@ export USE_MKL=1
CMAKE_ONLY=1 python setup.py build
ccmake build # or cmake-gui build
python setup.py install
python -m pip install --no-build-isolation -v .
cd benchmarks
echo "!! SPARSE SPMM TIME BENCHMARK!! " >> $OUTFILE
@ -28,7 +28,7 @@ echo "----- USE_MKL=0 ------" >> $OUTFILE
rm -rf build
export USE_MKL=0
python setup.py install
python -m pip install --no-build-isolation -v .
cd benchmarks
for dim0 in 1000 5000 10000; do

View File

@ -0,0 +1,233 @@
#include <c10/core/AllocatorConfig.h>
#include <c10/core/DeviceType.h>
#include <c10/util/env.h>
#include <c10/util/irange.h>
namespace c10::CachingAllocator {
namespace {
constexpr size_t kRoundUpPowerOfTwoIntervals = 16;
constexpr size_t kMB = 1024 * 1024ul;
constexpr size_t kRoundUpPowerOfTwoStart = 1 * kMB; // 1MB
constexpr size_t kRoundUpPowerOfTwoEnd = 64 * 1024ul * kMB; // 64GB
} // anonymous namespace
AcceleratorAllocatorConfig& AcceleratorAllocatorConfig::instance() {
static AcceleratorAllocatorConfig instance;
#define C10_ALLOCATOR_CONFIG_PARSE_ENV(env, deprecated) \
auto env##_name = c10::utils::get_env(#env); \
if (env##_name.has_value()) { \
if (deprecated) { \
TORCH_WARN_ONCE(#env " is deprecated, use PYTORCH_ALLOC_CONF instead"); \
} \
instance.parseArgs(env##_name.value()); \
return true; \
}
static bool env_flag [[maybe_unused]] = []() {
C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_ALLOC_CONF, false)
// Keep this for backwards compatibility
C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_CUDA_ALLOC_CONF, /*deprecated=*/true)
C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_HIP_ALLOC_CONF, /*deprecated=*/true)
return false;
}();
#undef C10_ALLOCATOR_CONFIG_PARSE_ENV
return instance;
}
AcceleratorAllocatorConfig::AcceleratorAllocatorConfig() {
roundup_power2_divisions_.assign(kRoundUpPowerOfTwoIntervals, 0);
}
size_t AcceleratorAllocatorConfig::roundup_power2_divisions(size_t size) {
size_t log_size = (63 - llvm::countLeadingZeros(size));
// Our intervals start at 1MB and end at 64GB
const size_t interval_start =
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoStart);
const size_t interval_end =
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoEnd);
TORCH_CHECK(
interval_end - interval_start == kRoundUpPowerOfTwoIntervals,
"kRoundUpPowerOfTwoIntervals mismatch");
size_t index =
(log_size > interval_start) ? (log_size - interval_start) : 0ul;
index = std::min(index, kRoundUpPowerOfTwoIntervals - 1);
return instance().roundup_power2_divisions_[index];
}
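The lookup above reduces a size to floor(log2(size)), offsets it by log2(1 MiB), and clamps it into the 16 available buckets, so for example a 32 MiB request lands in bucket 5. A small self-contained sketch of the same arithmetic (hypothetical helper name, assumes a 64-bit `size_t` and C++20 `<bit>` in place of `llvm::countLeadingZeros`):

```cpp
#include <algorithm>
#include <bit>
#include <cstddef>
#include <iostream>

// Mirrors the bucket selection above: floor(log2(size)) relative to 1 MiB,
// clamped to the 16 intervals between 1 MiB and 64 GiB.
size_t bucket_index(size_t size) {
  const size_t log_size = 63 - std::countl_zero(size);
  const size_t interval_start = 20;  // log2(1 MiB)
  const size_t index =
      (log_size > interval_start) ? (log_size - interval_start) : 0;
  return std::min(index, size_t{15});
}

int main() {
  std::cout << bucket_index(size_t{32} << 20) << '\n';  // 32 MiB -> bucket 5
  std::cout << bucket_index(512) << '\n';               // tiny alloc -> bucket 0
}
```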
size_t AcceleratorAllocatorConfig::parseMaxSplitSize(
const ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
constexpr size_t min_allowed_split_size_mb = kLargeBuffer / kMB;
constexpr size_t max_allowed_split_size_mb =
std::numeric_limits<size_t>::max() / kMB;
size_t val_env = tokenizer.toSizeT(++i);
TORCH_CHECK(
val_env >= min_allowed_split_size_mb,
"CachingAllocator option max_split_size_mb too small, must be >= ",
min_allowed_split_size_mb);
val_env = std::min(val_env, max_allowed_split_size_mb);
max_split_size_ = val_env * kMB;
return i;
}
size_t AcceleratorAllocatorConfig::parseMaxNonSplitRoundingSize(
const ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
constexpr size_t min_allowed_split_size_mb = kLargeBuffer / kMB;
constexpr size_t max_allowed_split_size_mb =
std::numeric_limits<size_t>::max() / kMB;
size_t val_env = tokenizer.toSizeT(++i);
TORCH_CHECK(
val_env >= min_allowed_split_size_mb,
"CachingAllocator option max_non_split_rounding_mb too small, must be >= ",
min_allowed_split_size_mb);
val_env = std::min(val_env, max_allowed_split_size_mb);
max_non_split_rounding_size_ = val_env * kMB;
return i;
}
size_t AcceleratorAllocatorConfig::parseGarbageCollectionThreshold(
const ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
double val_env = tokenizer.toDouble(++i);
TORCH_CHECK(
val_env > 0 && val_env < 1.0,
"garbage_collect_threshold is invalid, set it in (0.0, 1.0)");
garbage_collection_threshold_ = val_env;
return i;
}
size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
const ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
bool first_value = true;
if (tokenizer[++i] == "[") {
size_t last_index = 0;
// NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
while (++i < tokenizer.size() && tokenizer[i] != "]") {
size_t value_index = i;
tokenizer.checkToken(++i, ":");
size_t value = tokenizer.toSizeT(++i);
TORCH_CHECK(
value == 0 || llvm::isPowerOf2_64(value),
"For roundups, the divisions has to be power of 2 or 0 to disable roundup ");
if (tokenizer[value_index] == ">") {
std::fill(
std::next(
roundup_power2_divisions_.begin(),
static_cast<std::vector<size_t>::difference_type>(
last_index + 1)),
roundup_power2_divisions_.end(),
value);
} else {
size_t boundary = tokenizer.toSizeT(value_index);
TORCH_CHECK(
llvm::isPowerOf2_64(boundary),
"For roundups, the intervals have to be power of 2 ");
size_t index = 63 - llvm::countLeadingZeros(boundary);
index =
std::clamp(index, size_t{0}, roundup_power2_divisions_.size() - 1);
if (first_value) {
std::fill(
roundup_power2_divisions_.begin(),
std::next(
roundup_power2_divisions_.begin(),
static_cast<std::vector<size_t>::difference_type>(index)),
value);
first_value = false;
}
roundup_power2_divisions_[index] = value;
last_index = index;
}
if (tokenizer[i + 1] != "]") {
tokenizer.checkToken(++i, ",");
}
}
TORCH_INTERNAL_ASSERT(
i < tokenizer.size(),
"Expected closing bracket ']' in ConfigTokenizer but reached end of config");
} else { // Keep this for backwards compatibility
size_t value = tokenizer.toSizeT(i);
TORCH_CHECK(
llvm::isPowerOf2_64(value),
"For roundups, the divisions has to be power of 2 ");
std::fill(
roundup_power2_divisions_.begin(),
roundup_power2_divisions_.end(),
value);
}
return i;
}
size_t AcceleratorAllocatorConfig::parseExpandableSegments(
const ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
use_expandable_segments_ = tokenizer.toBool(++i);
return i;
}
size_t AcceleratorAllocatorConfig::parsePinnedUseBackgroundThreads(
const ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
pinned_use_background_threads_ = tokenizer.toBool(++i);
return i;
}
void AcceleratorAllocatorConfig::parseArgs(const std::string& env) {
// The following options are reset to their default values each time, unless
// explicitly set.
max_split_size_ = std::numeric_limits<size_t>::max();
roundup_power2_divisions_.assign(kRoundUpPowerOfTwoIntervals, 0);
garbage_collection_threshold_ = 0;
{
std::lock_guard<std::mutex> lock(last_allocator_settings_mutex_);
last_allocator_settings_ = env;
}
ConfigTokenizer tokenizer(env);
for (size_t i = 0; i < tokenizer.size(); i++) {
const auto& key = tokenizer[i];
if (key == "max_split_size_mb") {
i = parseMaxSplitSize(tokenizer, i);
} else if (key == "max_non_split_rounding_mb") {
i = parseMaxNonSplitRoundingSize(tokenizer, i);
} else if (key == "garbage_collection_threshold") {
i = parseGarbageCollectionThreshold(tokenizer, i);
} else if (key == "roundup_power2_divisions") {
i = parseRoundUpPower2Divisions(tokenizer, i);
} else if (key == "expandable_segments") {
i = parseExpandableSegments(tokenizer, i);
} else if (key == "pinned_use_background_threads") {
i = parsePinnedUseBackgroundThreads(tokenizer, i);
} else {
i = tokenizer.skipKey(i);
}
if (i + 1 < tokenizer.size()) {
tokenizer.checkToken(++i, ",");
}
}
}
} // namespace c10::CachingAllocator

c10/core/AllocatorConfig.h (new file, 337 lines)
View File

@ -0,0 +1,337 @@
#pragma once
#include <c10/core/DeviceType.h>
#include <c10/util/Exception.h>
#include <c10/util/llvmMathExtras.h>
#include <atomic>
#include <mutex>
#include <string>
#include <vector>
namespace c10::CachingAllocator {
// "large" allocations may be packed in 20 MiB blocks
const size_t kLargeBuffer = 20971520;
// A utility class for tokenizing allocator configuration strings into discrete
// parts. For example, the config string:
// "key1:val1,key2:[val2,val3]"
// is tokenized into:
// "key1", ":", "val1", ",", "key2", ":", "[", "val2", ",", "val3", "]",
//
// Tokens include keys, values, and special characters (':', ',', '[', ']').
// Whitespace is ignored.
class ConfigTokenizer {
public:
explicit ConfigTokenizer(const std::string& env) {
std::string buffer;
for (char ch : env) {
if (ch == ',' || ch == ':' || ch == '[' || ch == ']') {
if (!buffer.empty()) {
config_.emplace_back(std::move(buffer));
buffer.clear();
}
config_.emplace_back(1, ch);
} else if (!std::isspace(static_cast<unsigned char>(ch))) {
buffer += ch;
}
}
if (!buffer.empty()) {
config_.emplace_back(std::move(buffer));
}
}
const std::string& operator[](size_t i) const {
TORCH_INTERNAL_ASSERT(
i < config_.size(), "Index out of bounds in ConfigTokenizer");
return config_[i];
}
size_t size() const {
return config_.size();
}
bool checkToken(size_t i, const std::string& token) const {
checkIndex(i);
return config_[i] == token;
}
size_t toSizeT(size_t i) const {
checkIndex(i);
return std::stoull(config_[i]);
}
double toDouble(size_t i) const {
checkIndex(i);
return std::stod(config_[i]);
}
bool toBool(size_t i) const {
checkIndex(i);
const auto& token = config_[i];
if (token == "True") {
return true;
} else if (token == "False") {
return false;
} else {
TORCH_CHECK(
false,
"Expected 'True' or 'False' at index ",
i,
" in ConfigTokenizer but got '",
token,
"'");
}
}
// Skips the current token group and returns the index of the value token.
// Assumes the current index `i` points to a key name in a key-value pair.
size_t skipKey(size_t i) const {
// Expect a colon after the key
checkToken(++i, ":");
++i; // Move to the value
checkIndex(i);
if (config_[i] != "[") {
// Value is a single token (not a list) -> return its index
return i;
}
// Skip tokens inside the list until matching ']'
// NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
while (++i < config_.size() && config_[i] != "]") {
}
TORCH_INTERNAL_ASSERT(
i < config_.size(),
"Expected closing bracket ']' in ConfigTokenizer but reached end of config");
return i; // Return the index of the closing ']'
}
private:
void checkIndex(size_t i) const {
TORCH_INTERNAL_ASSERT(
i < config_.size(), "Index out of bounds in ConfigTokenizer");
}
std::vector<std::string> config_;
};
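To make the tokenization concrete, here is a minimal usage sketch of the class above; it assumes the header is included as `<c10/core/AllocatorConfig.h>`, the same way the companion `.cpp` file does, and is not part of the header itself.

```cpp
#include <c10/core/AllocatorConfig.h>
#include <iostream>

int main() {
  using c10::CachingAllocator::ConfigTokenizer;
  // Whitespace is dropped; keys, values and ':', ',', '[', ']' each become one token.
  ConfigTokenizer tok("key1:val1, key2:[val2,val3]");
  for (size_t i = 0; i < tok.size(); ++i) {
    std::cout << tok[i] << ' ';
  }
  std::cout << '\n';  // key1 : val1 , key2 : [ val2 , val3 ]
}
```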
/**
* Note [AcceleratorAllocatorConfig design]
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* This class configures memory allocation for both device and host memory. A
* single `AcceleratorAllocatorConfig` instance is shared across all accelerator
* backends, such as CUDA and XPU, under the assumption that relevant
* environment variables apply uniformly to all accelerators. Device-specific
* configuration extensions are supported via hooks (see
* `registerDeviceConfigParserHook`).
*
* Recommended design:
* - Place common configurations in `AcceleratorAllocatorConfig`.
* - Extend backend-specific configurations in corresponding device-specific
* classes, such as `CUDAAllocatorConfig`, etc.
*
* Scope:
* - Configuration options must be environment-variable driven.
*
* Naming Convention:
* - Public API names in `AcceleratorAllocatorConfig` should be device-generic.
* - Members prefixed with `pinned_` are specific to the host/pinned allocator.
* - Environment variable names should be generic across backends.
* - Comma-separated key-value pairs in the format: `key:value`. Use square
* brackets `[]` for list values Example: `key1:123, key2:[val1,val2]`
*
* Environment Variables:
* - The primary environment variable for configuration is `PYTORCH_ALLOC_CONF`.
* - For backward compatibility, `PYTORCH_CUDA_ALLOC_CONF` is also supported
* with lower priority.
*/
class C10_API AcceleratorAllocatorConfig {
public:
static AcceleratorAllocatorConfig& instance();
C10_DISABLE_COPY_AND_ASSIGN(AcceleratorAllocatorConfig);
AcceleratorAllocatorConfig(AcceleratorAllocatorConfig&&) = delete;
AcceleratorAllocatorConfig& operator=(AcceleratorAllocatorConfig&&) = delete;
~AcceleratorAllocatorConfig() = default;
/* Device allocator settings */
// Returns the maximum block size (in MB) that is allowed to be split. The
// default is unlimited (all blocks can be split).
static size_t max_split_size() {
return instance().max_split_size_;
}
// Returns the maximum block size (in MB) that is allowed to be rounded up
// without requiring splitting when searching for a free block. The default is
// 20 MiB.
static size_t max_non_split_rounding_size() {
return instance().max_non_split_rounding_size_;
}
// Return the number of divisions used when rounding up allocation sizes (in
// MB) to the nearest power-of-2 boundary.
static size_t roundup_power2_divisions(size_t size);
// Returns the vector of division factors used for rounding up allocation
// sizes. These divisions apply to size intervals between 1MB and 64GB.
static std::vector<size_t> roundup_power2_divisions() {
return instance().roundup_power2_divisions_;
}
// Returns the threshold that triggers garbage collection when the ratio of
// used memory to maximum allowed memory exceeds this value. The default is 0,
// meaning no garbage collection is triggered. The value should be in the
// range (0.0, 1.0).
static double garbage_collection_threshold() {
return instance().garbage_collection_threshold_;
}
// Returns whether the expandable segment feature is enabled. This allows the
// allocator to start with one segment that grows as needed, rather than
// creating a new segment for each allocation. Default is false (expandable
// segments disabled).
static bool use_expandable_segments() {
return instance().use_expandable_segments_;
}
/* Host allocator settings */
// Returns whether the pinned host allocator uses background threads for
// processing events. This is useful for improving performance in scenarios
// where many small allocations are made. Default is false (background threads
// disabled).
static bool pinned_use_background_threads() {
return instance().pinned_use_background_threads_;
}
/* Settings for both device and host allocator */
// Returns the current allocator settings as a string. This string is useful
// to expand device-specific allocator configurations
static std::string last_allocator_settings() {
std::lock_guard<std::mutex> lock(instance().last_allocator_settings_mutex_);
return instance().last_allocator_settings_;
}
// Parses the environment variable `env` to update the allocator settings.
// If the environment variable is not set, it does nothing.
// The configuration string should be a comma-separated list of key-value
// pairs, where each key is a configuration option and the value is the
// corresponding setting. For example:
// "max_split_size_mb:100,max_non_split_rounding_mb:20,garbage_collection_threshold:0.5,roundup_power2_divisions:[64:8,256:4,1024:4,>:1],expandable_segments:true,pinned_use_background_threads:true"
void parseArgs(const std::string& env);
// Registers a device-specific configuration parser hook. This allows
// backends to parse additional device-specific configuration options from the
// environment variable. The hook should be a function that takes a string
// (the environment variable value) and parses it to set device-specific
// configuration options.
// The hook will be called when the environment variable is parsed.
// If a hook is already registered, it will be replaced with the new one.
void registerDeviceConfigParserHook(
std::function<void(const std::string&)> hook) {
device_config_parser_hook_ = std::move(hook);
}
// Calls the registered device-specific configuration parser hook with the
// provided environment string. This allows backends to parse additional
// device-specific configuration options from the environment variable.
// If no hook is registered, this function does nothing.
void callDeviceConfigParserHook(const std::string& env) const {
if (device_config_parser_hook_) {
device_config_parser_hook_(env);
}
}
private:
AcceleratorAllocatorConfig();
/* Internal functions for device allocator */
// Parse `max_split_size_mb` from environment variable.
size_t parseMaxSplitSize(const ConfigTokenizer& tokenizer, size_t i);
// Parse `max_non_split_rounding_mb` from environment variable.
size_t parseMaxNonSplitRoundingSize(
const ConfigTokenizer& tokenizer,
size_t i);
// Parse `garbage_collection_threshold` from environment variable.
size_t parseGarbageCollectionThreshold(
const ConfigTokenizer& tokenizer,
size_t i);
// Parse `roundup_power2_divisions` from environment variable.
size_t parseRoundUpPower2Divisions(
const ConfigTokenizer& tokenizer,
size_t i);
// Parse `expandable_segments` from environment variable.
size_t parseExpandableSegments(const ConfigTokenizer& tokenizer, size_t i);
/* Internal functions for host allocator */
// Parse `pinned_use_background_threads` from environment variable.
size_t parsePinnedUseBackgroundThreads(
const ConfigTokenizer& tokenizer,
size_t i);
/* The following members are specifically used for the device allocator. */
// The maximum block size that is allowed to be split.
std::atomic<size_t> max_split_size_{std::numeric_limits<size_t>::max()};
// The maximum allowable extra size of a memory block without requiring
// splitting when searching for a free block.
std::atomic<size_t> max_non_split_rounding_size_{kLargeBuffer};
// Used to store how memory allocations of different sizes should be rounded
// up to the nearest power of 2 divisions.
std::vector<size_t> roundup_power2_divisions_;
// The threshold that triggers garbage collection when the ratio of used
// memory to maximum allowed memory exceeds this value.
std::atomic<double> garbage_collection_threshold_{0};
// A flag to enable expandable segments feature.
std::atomic<bool> use_expandable_segments_{false};
/* The following members are specifically used for the host allocator. */
// A flag to enable background thread for processing events.
std::atomic<bool> pinned_use_background_threads_{false};
/* The following members are used for both device and host allocator. */
// Record the last allocator config environment setting.
std::mutex last_allocator_settings_mutex_;
std::string last_allocator_settings_;
// Optional hook for parsing additional device-specific allocator settings.
// This allows backends (e.g., CUDA, XPU) to register a custom parser for
// their own environment configuration extensions.
std::function<void(const std::string&)> device_config_parser_hook_{nullptr};
};
C10_API inline void setAllocatorSettings(const std::string& env) {
AcceleratorAllocatorConfig::instance().parseArgs(env);
AcceleratorAllocatorConfig::instance().callDeviceConfigParserHook(env);
}
C10_API inline std::string getAllocatorSettings() {
return AcceleratorAllocatorConfig::instance().last_allocator_settings();
}
struct DeviceConfigParserHookRegistry {
explicit DeviceConfigParserHookRegistry(
std::function<void(const std::string&)> hook) {
AcceleratorAllocatorConfig::instance().registerDeviceConfigParserHook(
std::move(hook));
}
};
#define REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK(hook) \
namespace { \
static at::CachingAllocator::DeviceConfigParserHookRegistry \
g_device_config_parse_hook_registry_instance(hook); \
}
} // namespace c10::CachingAllocator
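A short usage sketch of the public surface declared above, following the comma-separated `key:value` format from the design note; the values are illustrative only, and the program assumes it is linked against c10 with this header on the include path:

```cpp
#include <c10/core/AllocatorConfig.h>
#include <iostream>

int main() {
  using namespace c10::CachingAllocator;
  // Same format accepted via the PYTORCH_ALLOC_CONF environment variable.
  setAllocatorSettings(
      "max_split_size_mb:256,"
      "garbage_collection_threshold:0.6,"
      "roundup_power2_divisions:[256:4,>:1]");

  std::cout << AcceleratorAllocatorConfig::max_split_size() << '\n';               // 256 MiB, in bytes
  std::cout << AcceleratorAllocatorConfig::garbage_collection_threshold() << '\n'; // 0.6
  std::cout << getAllocatorSettings() << '\n';                                     // echoes the string above
}
```

The `allocator_config_test.cpp` added later in this diff exercises the same knobs more exhaustively, including a device-specific parser hook registered through `REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK`.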

View File

@ -1,30 +1,27 @@
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/driver_api.h>
#include <c10/util/CallOnce.h>
#include <c10/util/Exception.h>
#include <c10/util/Logging.h>
#include <cuda_runtime.h>
#include <dlfcn.h>
namespace c10::cuda {
namespace {
void* get_symbol(const char* name, int version);
DriverAPI create_driver_api() {
void* handle_0 = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_NOLOAD);
TORCH_CHECK(handle_0, "Can't open libcuda.so.1: ", dlerror());
void* handle_1 = DriverAPI::get_nvml_handle();
DriverAPI r{};
#define LOOKUP_LIBCUDA_ENTRY(name) \
r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \
TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name, ": ", dlerror())
C10_LIBCUDA_DRIVER_API(LOOKUP_LIBCUDA_ENTRY)
#undef LOOKUP_LIBCUDA_ENTRY
#define LOOKUP_LIBCUDA_ENTRY(name) \
r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \
dlerror();
C10_LIBCUDA_DRIVER_API_12030(LOOKUP_LIBCUDA_ENTRY)
#undef LOOKUP_LIBCUDA_ENTRY
#define LOOKUP_LIBCUDA_ENTRY_WITH_VERSION(name, version) \
r.name##_ = reinterpret_cast<decltype(&name)>(get_symbol(#name, version)); \
TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name)
C10_LIBCUDA_DRIVER_API(LOOKUP_LIBCUDA_ENTRY_WITH_VERSION)
#undef LOOKUP_LIBCUDA_ENTRY_WITH_VERSION
if (handle_1) {
#define LOOKUP_NVML_ENTRY(name) \
@ -35,6 +32,32 @@ DriverAPI create_driver_api() {
}
return r;
}
void* get_symbol(const char* name, int version) {
void* out = nullptr;
cudaDriverEntryPointQueryResult qres{};
// CUDA 12.5+ supports version-based lookup
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12050)
if (auto st = cudaGetDriverEntryPointByVersion(
name, &out, version, cudaEnableDefault, &qres);
st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
return out;
}
#endif
// Fall back to the old API and try the symbol lookup again.
if (auto st = cudaGetDriverEntryPoint(name, &out, cudaEnableDefault, &qres);
st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
return out;
}
// If the symbol cannot be resolved, report and return nullptr;
// the caller is responsible for checking the pointer.
LOG(INFO) << "Failed to resolve symbol " << name;
return nullptr;
}
} // namespace
void* DriverAPI::get_nvml_handle() {

View File

@ -20,30 +20,24 @@
} \
} while (0)
#define C10_LIBCUDA_DRIVER_API(_) \
_(cuDeviceGetAttribute) \
_(cuMemAddressReserve) \
_(cuMemRelease) \
_(cuMemMap) \
_(cuMemAddressFree) \
_(cuMemSetAccess) \
_(cuMemUnmap) \
_(cuMemCreate) \
_(cuMemGetAllocationGranularity) \
_(cuMemExportToShareableHandle) \
_(cuMemImportFromShareableHandle) \
_(cuMemsetD32Async) \
_(cuStreamWriteValue32) \
_(cuGetErrorString)
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12030)
#define C10_LIBCUDA_DRIVER_API_12030(_) \
_(cuMulticastAddDevice) \
_(cuMulticastBindMem) \
_(cuMulticastCreate)
#else
#define C10_LIBCUDA_DRIVER_API_12030(_)
#endif
#define C10_LIBCUDA_DRIVER_API(_) \
_(cuDeviceGetAttribute, 12000) \
_(cuMemAddressReserve, 12000) \
_(cuMemRelease, 12000) \
_(cuMemMap, 12000) \
_(cuMemAddressFree, 12000) \
_(cuMemSetAccess, 12000) \
_(cuMemUnmap, 12000) \
_(cuMemCreate, 12000) \
_(cuMemGetAllocationGranularity, 12000) \
_(cuMemExportToShareableHandle, 12000) \
_(cuMemImportFromShareableHandle, 12000) \
_(cuMemsetD32Async, 12000) \
_(cuStreamWriteValue32, 12000) \
_(cuGetErrorString, 12000) \
_(cuMulticastAddDevice, 12030) \
_(cuMulticastBindMem, 12030) \
_(cuMulticastCreate, 12030)
#define C10_NVML_DRIVER_API(_) \
_(nvmlInit_v2) \
@ -56,11 +50,13 @@
namespace c10::cuda {
struct DriverAPI {
#define CREATE_MEMBER_VERSIONED(name, version) decltype(&name) name##_;
#define CREATE_MEMBER(name) decltype(&name) name##_;
C10_LIBCUDA_DRIVER_API(CREATE_MEMBER)
C10_LIBCUDA_DRIVER_API_12030(CREATE_MEMBER)
C10_LIBCUDA_DRIVER_API(CREATE_MEMBER_VERSIONED)
C10_NVML_DRIVER_API(CREATE_MEMBER)
#undef CREATE_MEMBER_VERSIONED
#undef CREATE_MEMBER
static DriverAPI* get();
static void* get_nvml_handle();
};
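The table above now pairs each driver symbol with the minimum CUDA version passed to the versioned lookup. For readers unfamiliar with the X-macro pattern, here is a tiny self-contained sketch (made-up names, no CUDA required) of how a `(name, version)` table expands into struct members; the real macro declares `decltype(&name)` function pointers rather than plain ints:

```cpp
#include <cstdio>

// A made-up two-entry table in the same (name, version) shape as
// C10_LIBCUDA_DRIVER_API above.
#define MY_DRIVER_API(_) \
  _(foo, 12000)          \
  _(bar, 12030)

struct Api {
#define CREATE_MEMBER_VERSIONED(name, version) int name##_ = version;
  MY_DRIVER_API(CREATE_MEMBER_VERSIONED)
#undef CREATE_MEMBER_VERSIONED
};

int main() {
  Api api;
  std::printf("%d %d\n", api.foo_, api.bar_);  // 12000 12030
}
```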

View File

@ -63,7 +63,6 @@ def define_c10_ovrsource(name, is_mobile):
"core/impl/*.h",
]),
reexport_all_header_dependencies = False,
# tests = C10_CPU_TEST_TARGETS,
visibility = [
"//xplat/caffe2/c10:c10_ovrsource",
],
@ -84,25 +83,6 @@ def define_c10_ovrsource(name, is_mobile):
)
def define_ovrsource_targets():
# C10_CPU_TEST_FILES = native.glob([
# "test/core/*.cpp",
# "test/util/*.cpp",
# ])
# C10_GPU_TEST_FILES = native.glob([
# "cuda/test/**/*.cpp",
# ])
# C10_CPU_TEST_TARGETS = [
# ":" + paths.basename(test)[:-len(".cpp")] + "_ovrsource"
# for test in C10_CPU_TEST_FILES
# ]
# C10_GPU_TEST_TARGETS = [
# ":" + paths.basename(test)[:-len(".cpp")] + "_ovrsource"
# for test in C10_GPU_TEST_FILES
# ]
common_c10_cmake_defines = [
("#cmakedefine C10_BUILD_SHARED_LIBS", ""),
("#cmakedefine C10_USE_NUMA", ""),
@ -207,7 +187,6 @@ def define_ovrsource_targets():
"cuda/impl/*.h",
]),
reexport_all_header_dependencies = False,
# tests = C10_GPU_TEST_TARGETS,
visibility = ["PUBLIC"],
deps = [
"//third-party/cuda:libcuda",
@ -217,64 +196,3 @@ def define_ovrsource_targets():
":c10_ovrsource",
],
)
# [
# oxx_test(
# name = paths.basename(test)[:-len(".cpp")] + "_ovrsource",
# srcs = [test],
# compatible_with = cpu_supported_platforms,
# compiler_flags = select({
# "DEFAULT": [],
# "ovr_config//compiler:cl": [
# "/w",
# ],
# "ovr_config//compiler:clang": [
# "-Wno-error",
# "-Wno-self-assign-overloaded",
# "-Wno-self-move",
# "-Wno-shadow",
# "-Wno-undef",
# "-Wno-unused-function",
# "-Wno-unused-variable",
# ],
# }),
# framework = "gtest",
# oncall = "ovrsource_pytorch",
# raw_headers = native.glob([
# "test/**/*.h",
# ]),
# deps = [
# ":c10_ovrsource",
# ],
# )
# for test in C10_CPU_TEST_FILES
# ]
# [
# oxx_test(
# name = paths.basename(test)[:-len(".cpp")] + "_ovrsource",
# srcs = [test],
# compatible_with = cuda_supported_platforms,
# compiler_flags = select({
# "DEFAULT": [],
# "ovr_config//compiler:cl": [
# "/w",
# ],
# "ovr_config//compiler:clang": [
# "-Wno-error",
# ],
# }),
# framework = "gtest",
# oncall = "ovrsource_pytorch",
# raw_headers = native.glob([
# "test/**/*.h",
# ]),
# runtime_shared_libraries = [
# "//third-party/cuda:cudart",
# ],
# deps = [
# ":c10_cuda_ovrsource",
# ],
# )
# for test in C10_GPU_TEST_FILES
# ]

View File

@ -0,0 +1,123 @@
#include <c10/core/AllocatorConfig.h>
#include <gtest/gtest.h>
using namespace c10::CachingAllocator;
constexpr size_t kMB = 1024 * 1024ul;
struct ExtendedAllocatorConfig {
static ExtendedAllocatorConfig& instance() {
static ExtendedAllocatorConfig instance;
return instance;
}
// Returns the device-specific option value in bytes.
static size_t device_specific_option() {
return instance().device_specific_option_;
}
void parseArgs(const std::string& env) {
// Parse device-specific options from the environment variable
ConfigTokenizer tokenizer(env);
for (size_t i = 0; i < tokenizer.size(); i++) {
const auto& key = tokenizer[i];
if (key == "device_specific_option_mb") {
tokenizer.checkToken(++i, ":");
device_specific_option_ = tokenizer.toSizeT(++i) * kMB;
} else {
i = tokenizer.skipKey(i);
}
if (i + 1 < tokenizer.size()) {
tokenizer.checkToken(++i, ",");
}
}
}
private:
// Device-specific option, e.g., memory limit for a specific device.
std::atomic<size_t> device_specific_option_{0};
};
REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK([](const std::string& env) {
ExtendedAllocatorConfig::instance().parseArgs(env);
})
TEST(AllocatorConfigTest, allocator_config_test) {
std::string env =
"max_split_size_mb:40,"
"max_non_split_rounding_mb:30,"
"garbage_collection_threshold:0.5,"
"roundup_power2_divisions:[64:8,128:2,256:4,512:2,1024:4,>:1],"
"expandable_segments:True,"
"pinned_use_background_threads:True,"
"device_specific_option_mb:64";
c10::CachingAllocator::setAllocatorSettings(env);
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
EXPECT_EQ(AcceleratorAllocatorConfig::max_split_size(), 40 * kMB);
EXPECT_EQ(
AcceleratorAllocatorConfig::max_non_split_rounding_size(), 30 * kMB);
EXPECT_EQ(AcceleratorAllocatorConfig::garbage_collection_threshold(), 0.5);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(32 * kMB), 8);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(64 * kMB), 8);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(128 * kMB), 2);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(256 * kMB), 4);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(512 * kMB), 2);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 4);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 1);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(4096 * kMB), 1);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(8192 * kMB), 1);
EXPECT_EQ(AcceleratorAllocatorConfig::use_expandable_segments(), true);
EXPECT_EQ(AcceleratorAllocatorConfig::pinned_use_background_threads(), true);
EXPECT_EQ(ExtendedAllocatorConfig::device_specific_option(), 64 * kMB);
env =
"max_split_size_mb:20,"
"max_non_split_rounding_mb:40,"
"garbage_collection_threshold:0.8";
c10::CachingAllocator::setAllocatorSettings(env);
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
EXPECT_EQ(AcceleratorAllocatorConfig::max_split_size(), 20 * kMB);
EXPECT_EQ(
AcceleratorAllocatorConfig::max_non_split_rounding_size(), 40 * kMB);
EXPECT_EQ(AcceleratorAllocatorConfig::garbage_collection_threshold(), 0.8);
// roundup_power2_divisions knob array syntax
env = "roundup_power2_divisions:[128:8,256:16,512:1,2048:8,>:2]";
c10::CachingAllocator::setAllocatorSettings(env);
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(64 * kMB), 8);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(128 * kMB), 8);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(256 * kMB), 16);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(512 * kMB), 1);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 0);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 8);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(4096 * kMB), 2);
// roundup_power2_divisions single value syntax for backward compatibility
env = "roundup_power2_divisions:4";
c10::CachingAllocator::setAllocatorSettings(env);
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(64 * kMB), 4);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(256 * kMB), 4);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 4);
env = "expandable_segments:False,";
c10::CachingAllocator::setAllocatorSettings(env);
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
EXPECT_EQ(AcceleratorAllocatorConfig::use_expandable_segments(), false);
env = "pinned_use_background_threads:False";
c10::CachingAllocator::setAllocatorSettings(env);
EXPECT_EQ(c10::CachingAllocator::getAllocatorSettings(), env);
EXPECT_EQ(AcceleratorAllocatorConfig::pinned_use_background_threads(), false);
}

View File

@ -4,6 +4,7 @@
// 1 bit for the sign, 8 bits for the exponent and 7 bits for the mantissa.
#include <c10/macros/Macros.h>
#include <c10/util/bit_cast.h>
#include <cmath>
#include <cstdint>
#include <cstring>
@ -67,13 +68,7 @@ inline C10_HOST_DEVICE uint16_t round_to_nearest_even(float src) {
#endif
return UINT16_C(0x7FC0);
} else {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
union {
uint32_t U32; // NOLINT(facebook-hte-BadMemberName)
float F32; // NOLINT(facebook-hte-BadMemberName)
};
F32 = src;
const uint32_t U32 = c10::bit_cast<uint32_t>(src);
uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF);
return static_cast<uint16_t>((U32 + rounding_bias) >> 16);
}
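The hunk above swaps union-based type punning for `c10::bit_cast`. A standalone sketch of the same round-to-nearest-even float-to-bfloat16 conversion, written against C++20 `std::bit_cast` instead of the c10 helper (not part of this diff):

```cpp
#include <bit>
#include <cstdint>
#include <cstdio>

// Keep the upper 16 bits of the float; the bias is 0x7FFF plus the lowest
// kept bit, so exact halfway cases round to the even bfloat16 value.
uint16_t float_to_bf16_rne(float src) {
  const uint32_t u32 = std::bit_cast<uint32_t>(src);
  const uint32_t rounding_bias = ((u32 >> 16) & 1u) + 0x7FFFu;
  return static_cast<uint16_t>((u32 + rounding_bias) >> 16);
}

int main() {
  std::printf("0x%04x\n", float_to_bf16_rne(1.0f));        // 0x3f80
  std::printf("0x%04x\n", float_to_bf16_rne(1.00390625f)); // tie -> 0x3f80 (even)
}
```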

View File

@ -3,6 +3,8 @@
#include <cstring>
#include <type_traits>
#include <c10/macros/Macros.h>
#if __has_include(<bit>) && (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L)
#include <bit>
#define C10_HAVE_STD_BIT_CAST 1
@ -23,7 +25,7 @@ using std::bit_cast;
// See https://en.cppreference.com/w/cpp/numeric/bit_cast for more
// information as well as the source of our implementations.
template <class To, class From>
std::enable_if_t<
C10_HOST_DEVICE std::enable_if_t<
sizeof(To) == sizeof(From) && std::is_trivially_copyable_v<From> &&
std::is_trivially_copyable_v<To>,
To>

View File

@ -58,6 +58,9 @@ def define_targets(rules):
name = "bit_cast",
hdrs = ["bit_cast.h"],
visibility = ["//:__subpackages__"],
deps = [
"//c10/macros",
],
)
rules.cc_library(

View File

@ -0,0 +1,18 @@
```{eval-rst}
.. role:: hidden
:class: hidden-section
```
```{eval-rst}
.. automodule:: torch.distributed._dist2
:members:
:undoc-members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: torch.distributed.ProcessGroup
:members:
:undoc-members:
:show-inheritance:
```

View File

@ -224,6 +224,10 @@ inconsistent 'UUID' assignment across ranks, and to prevent races during initial
.. autofunction:: is_torchelastic_launched
```
```{eval-rst}
.. autofunction:: get_default_backend_for_device
```
______________________________________________________________________
Currently three initialization methods are supported:
@ -1471,3 +1475,9 @@ If you are running single node training, it may be convenient to interactively b
```{eval-rst}
.. py:module:: torch.distributed.checkpoint.state_dict
```
```{toctree}
:hidden:
distributed._dist2
```

View File

@ -8,16 +8,14 @@ higher-level API to automatically differentiate models split across several
machines.
```{warning}
APIs in the RPC package are stable. There are multiple ongoing work items
to improve performance and error handling, which will ship in future releases.
APIs in the RPC package are stable and in maintenance mode.
```
```{warning}
CUDA support was introduced in PyTorch 1.9 and is still a **beta** feature.
CUDA support is a **beta** feature.
Not all features of the RPC package are yet compatible with CUDA support and
thus their use is discouraged. These unsupported features include: RRefs,
JIT compatibility, dist autograd and dist optimizer, and profiling. These
shortcomings will be addressed in future releases.
JIT compatibility, dist autograd and dist optimizer, and profiling.
```
```{note}
@ -102,13 +100,6 @@ device lists on source and destination workers do not match. In such cases,
applications can always explicitly move the input tensors to CPU on the caller
and move it to the desired devices on the callee if necessary.
```{warning}
TorchScript support in RPC is a prototype feature and subject to change. Since
v1.5.0, ``torch.distributed.rpc`` supports calling TorchScript functions as
RPC target functions, and this will help improve parallelism on the callee
side as executing TorchScript functions does not require GIL.
```
```{eval-rst}
.. autofunction:: rpc_sync
.. autofunction:: rpc_async
@ -159,9 +150,7 @@ multiple different transports (TCP, of course, but also shared memory, NVLink,
InfiniBand, ...) and can automatically detect their availability and negotiate
the best transport to use for each pipe.
The TensorPipe backend has been introduced in PyTorch v1.6 and is being actively
developed. At the moment, it only supports CPU tensors, with GPU support coming
soon. It comes with a TCP-based transport, just like Gloo. It is also able to
The TensorPipe backend comes with a TCP-based transport, just like Gloo. It is also able to
automatically chunk and multiplex large tensors over multiple sockets and
threads in order to achieve very high bandwidths. The agent will be able to pick
the best transport on its own, with no intervention required.
@ -301,6 +290,4 @@ to use [the profiler](https://pytorch.org/docs/stable/autograd.html#profiler) to
- [Getting started with Distributed RPC Framework](https://pytorch.org/tutorials/intermediate/rpc_tutorial.html)
- [Implementing a Parameter Server using Distributed RPC Framework](https://pytorch.org/tutorials/intermediate/rpc_param_server_tutorial.html)
- [Combining Distributed DataParallel with Distributed RPC Framework](https://pytorch.org/tutorials/advanced/rpc_ddp_tutorial.html) (covers **RemoteModule** as well)
- [Profiling RPC-based Workloads](https://pytorch.org/tutorials/recipes/distributed_rpc_profiling.html)
- [Implementing batch RPC processing](https://pytorch.org/tutorials/intermediate/rpc_async_execution.html)
- [Distributed Pipeline Parallel](https://pytorch.org/tutorials/intermediate/dist_pipeline_parallel_tutorial.html)

View File

@ -9,13 +9,13 @@ requires = [
# 77.0.0: min version for SPDX expression support for project.license
"setuptools>=62.3.0,<80.0",
"wheel",
"astunparse",
"cmake>=3.27",
"ninja",
"numpy",
"packaging",
"pyyaml",
"requests",
"six", # dependency chain: NNPACK -> PeachPy -> six
"typing-extensions>=4.10.0",
]
build-backend = "setuptools.build_meta"

View File

@ -1,5 +1,4 @@
# Python dependencies required for development
astunparse
build[uv] # for building sdist and wheel
cmake>=3.27
expecttest>=0.3.0
@ -18,6 +17,8 @@ pyyaml
requests
# setuptools develop deprecated on 80.0
setuptools>=62.3.0,<80.0
six # dependency chain: NNPACK -> PeachPy -> six
sympy>=1.13.3
types-dataclasses
typing-extensions>=4.13.2
wheel

View File

@ -15,4 +15,4 @@ pip install --no-use-pep517 -e "$tp2_dir/onnx"
# Install caffe2 and pytorch
pip install -r "$top_dir/caffe2/requirements.txt"
pip install -r "$top_dir/requirements.txt"
python setup.py develop
python -m pip install --no-build-isolation -v -e .

View File

@ -35,4 +35,4 @@ _pip_install -b "$BUILD_DIR/onnx" "file://$tp2_dir/onnx#egg=onnx"
# Install caffe2 and pytorch
pip install -r "$top_dir/caffe2/requirements.txt"
pip install -r "$top_dir/requirements.txt"
python setup.py install
python -m pip install --no-build-isolation -v .

View File

@ -263,6 +263,7 @@ import json
import shutil
import subprocess
import sysconfig
import textwrap
import time
from collections import defaultdict
from pathlib import Path
@ -601,7 +602,7 @@ def build_deps() -> None:
report(
'Finished running cmake. Run "ccmake build" or '
'"cmake-gui build" to adjust build options and '
'"python setup.py install" to build.'
'"python -m pip install --no-build-isolation -v ." to build.'
)
sys.exit()
@ -1207,24 +1208,25 @@ def configure_extension_build() -> tuple[
# post run, warnings, printed at the end to make them more visible
build_update_message = """
It is no longer necessary to use the 'build' or 'rebuild' targets
It is no longer necessary to use the 'build' or 'rebuild' targets
To install:
$ python setup.py install
To develop locally:
$ python setup.py develop
To force cmake to re-generate native build files (off by default):
$ CMAKE_FRESH=1 python setup.py develop
"""
To install:
$ python -m pip install --no-build-isolation -v .
To develop locally:
$ python -m pip install --no-build-isolation -v -e .
To force cmake to re-generate native build files (off by default):
$ CMAKE_FRESH=1 python -m pip install --no-build-isolation -v -e .
""".strip()
def print_box(msg: str) -> None:
lines = msg.split("\n")
size = max(len(l) + 1 for l in lines)
print("-" * (size + 2))
for l in lines:
print("|{}{}|".format(l, " " * (size - len(l))))
print("-" * (size + 2))
msg = textwrap.dedent(msg).strip()
lines = ["", *msg.split("\n"), ""]
max_width = max(len(l) for l in lines)
print("+" + "-" * (max_width + 4) + "+", file=sys.stderr, flush=True)
for line in lines:
print(f"| {line:<{max_width}s} |", file=sys.stderr, flush=True)
print("+" + "-" * (max_width + 4) + "+", file=sys.stderr, flush=True)
def main() -> None:
@ -1308,7 +1310,9 @@ def main() -> None:
"include/**/*.hpp",
"include/*.cuh",
"include/**/*.cuh",
"csrc/inductor/aoti_runtime/model.h",
"_inductor/codegen/*.h",
"_inductor/codegen/aoti_runtime/*.h",
"_inductor/codegen/aoti_runtime/*.cpp",
"_inductor/script.ld",
"_export/serde/*.yaml",

View File

@ -36,7 +36,7 @@ The following commands assume you are in PyTorch root.
```bash
# ... Build PyTorch from source, e.g.
python setup.py develop
python -m pip install --no-build-isolation -v -e .
# (re)build just the binary
ninja -C build bin/test_jit
# run tests

View File

@ -4,8 +4,8 @@ This folder contains a self-contained example of a PyTorch out-of-tree backend l
## How to use
Install as standalone with `python setup.py develop` (or install) from this folder.
You can run test via `python {PYTORCH_ROOT_PATH}/test/test_openreg.py`.
Install as standalone with `python -m pip install -e .` (or `python -m pip install .`)
from this folder. You can run test via `python {PYTORCH_ROOT_PATH}/test/test_openreg.py`.
## Design principles

View File

@ -1570,5 +1570,54 @@ class TestFullyShardForceSumReduction(FSDPTest):
self.assertRegex(logs, all_reduce_sum_re)
class TestFullyShardReduceOpWorldSize1(FSDPTest):
@property
def world_size(self) -> int:
return 1
def test_size1_reduceop(self):
from torch.distributed.distributed_c10d import ReduceOp
model = nn.Linear(1024, 1025)
ref_model = copy.deepcopy(model).to(device_type)
ref_optim = torch.optim.Adam(ref_model.parameters())
fully_shard(
model,
mesh=init_device_mesh(device_type.type, (1,)),
reshard_after_forward=False,
)
optim = torch.optim.Adam(model.parameters())
inp = torch.randn(1025, 1024, device=device_type.type)
for _ in range(3):
ref_optim.zero_grad()
ref_loss = ref_model(inp).sum()
ref_loss.backward()
for param in ref_model.parameters():
dist.all_reduce(param.grad, op=dist.ReduceOp.SUM)
ref_optim.step()
optim.zero_grad()
loss = model(inp).sum()
loss.backward()
optim.step()
self.assertEqual(loss, ref_loss)
self.assertEqual(
model.bias.grad._local_tensor,
ref_model.bias.grad,
)
state = model._get_fsdp_state()
fsdp_param_group = state._fsdp_param_group
group = fsdp_param_group.mesh_info.shard_process_group
(
_,
_,
_,
all_reduce_op,
) = _get_gradient_divide_factors(group, None, torch.float32)
self.assertEqual(all_reduce_op, ReduceOp.SUM)
if __name__ == "__main__":
run_tests()

View File

@ -554,21 +554,6 @@ class TestNew2dParallelTraining(DTensorTestBase):
p2 = p2.redistribute(p2.device_mesh, [Replicate()]).to_local()
self.assertTrue(torch.allclose(p1, p2), f"{p1} vs {p2}")
@with_comms
@skip_if_lt_x_gpu(4)
def test_raise_invalid_tp_composition(self):
with self.assertRaisesRegex(
RuntimeError, r"Found TP device_mesh on the \d dimension of its parent mesh"
):
mesh_2d = init_device_mesh(
self.device_type, (2, self.world_size // 2), mesh_dim_names=("tp", "dp")
)
parallelize_plan = {
"net1": ColwiseParallel(),
"net2": RowwiseParallel(),
}
parallelize_module(SimpleModel().cuda(), mesh_2d["tp"], parallelize_plan)
@with_comms
@skip_if_lt_x_gpu(4)
def test_2d_fsdp_state_enable_extension(self):

View File

@ -3182,7 +3182,7 @@ class NcclRegistrationTest(MultiProcessTestCase):
# Use NCCL memory allocator
# enable symmetric memory usage in NCCL
pool = torch.cuda.MemPool(backend.mem_allocator, symm_mem=True)
pool = torch.cuda.MemPool(backend.mem_allocator, symmetric=True)
# allocate memory with ncclMemAlloc
# note: symmetric kernels are not available for dtypes like torch.int64

Some files were not shown because too many files have changed in this diff.