Fix for https://github.com/pytorch/pytorch/issues/122871. There are two cases where we emit pointwise cat:
- fusing into a pointwise use
- horizontally fusing copy_ kernels
The regression I looked into previously was due to being overly aggressive in the latter case. I've updated the logic there so that we only emit the horizontal fusion in the case that we would have to emit separate copy_ kernels anyway.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125772
Approved by: https://github.com/Chillee
This PR is part of a series of PRs to significantly speed up torch.onnx.export for models with many nodes (e.g. LLM). See #121422 for more analysis.
- As part of torch.onnx.export, a reverse look-up is made in env. This is done for each node, and this look-up costs time proportional to the graph size, which incurs an overall O(N^2) time complexity.
- A pragmatic solution is simply to keep a separate data structure that makes this de facto constant time. So, this introduces a set containing all the values of env (see the sketch after this list). Open to other ideas. Ideally `exist_in_env` wouldn't be needed at all, but I'm not sure how that can be done while preserving current behavior exactly.
- Resolves (4) in #121422.
- This code change and the choice of py::set looks a bit more natural on top of #123063, where the env is changed from a std::unordered_map to a py::dict.
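A minimal sketch of the idea (class and method names are illustrative, not the actual exporter code): keep a set of `env`'s values in sync with the dict so the reverse look-up is O(1).
```
class Env:
    """Toy stand-in for the exporter's env mapping (node -> exported value)."""

    def __init__(self):
        self._env = {}          # node -> value
        self._values = set()    # every value currently stored in _env

    def __setitem__(self, node, value):
        self._env[node] = value
        self._values.add(value)

    def exist_in_env(self, value):
        # previously an O(N) scan over self._env.values(); now O(1)
        return value in self._values
```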
Partially fixes #121422
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124909
Approved by: https://github.com/srikris-sridhar, https://github.com/justinchuby
When dispatching a fake tensor op we cache the result with `(op, args)` as the key. There are some args (such as one with a dynamic output shape) where the output can't be cached. Instead of validating the args every time we compute the cache key, only validate the args when we first see a new cache key.
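A minimal sketch of that caching policy (helper names are illustrative, not the actual FakeTensor cache code):
```
_cache = {}
_BYPASS = object()  # sentinel meaning "this key can never be cached"

def dispatch_with_cache(op, args, can_cache, run_op):
    key = (op, args)  # args assumed hashable here for illustration
    if key not in _cache:
        # validation runs only the first time we see this key
        _cache[key] = run_op(op, args) if can_cache(op, args) else _BYPASS
    cached = _cache[key]
    return run_op(op, args) if cached is _BYPASS else cached
```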
18.3% FakeTensor perf win on the microbenchmark (21.7% cumulative)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124223
Approved by: https://github.com/oulgen, https://github.com/masnesral
ghstack dependencies: #122911
Summary: This change introduces a new flag to perform a "warm start" test from the benchmark harness. The idea is to test a model twice: first with a fresh inductor cache (i.e., a "cold start"), and then a second run in a fresh process with the cache available (i.e., a "warm start"). We can later add this mode to CI runs to collect compile times for warm start.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125353
Approved by: https://github.com/eellison, https://github.com/desertfire
Move the GHA artifact download to before any XML parsing is done for upload-test-stats.
Do not download GHA artifacts during XML parsing, since they were already uploaded to S3 in the step above and will be downloaded when all the artifacts are downloaded from S3.
The previous method resulted in duplicates if you ran the script again.
TODO: write a deduper so we don't have to worry about this at all.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125609
Approved by: https://github.com/huydhn
Summary: This fix does three things:
1. When we add inputs from the partitioner to the top-level graph module, we insert them in the partitioner's order, which is not guaranteed to be the same as the original graph inputs. This PR fixes that.
2. When we replace autograd ops with a HOP, we create new submodules and access their outputs via getitem calls. As a result, previous node names associated with getitem get updated, making the graph different from the produced graph signature. So I just update the graph signature accordingly.
3. We run the runtime_assertion pass before the autograd HOP pass because otherwise the constraints won't be populated correctly.
Differential Revision: [D57130314](https://our.internmc.facebook.com/intern/diff/D57130314)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125793
Approved by: https://github.com/zhxchen17
Fixes #69031, #42793
This PR fixes the bug introduced in #54981 where parameters used within a `no_sync` scope are not respected when `find_unused_parameters` is set to `True`. The `local_used_map_` and `numGradHooksTriggeredMap_` variables should be updated regardless of the `no_sync` state.
Tested and verified with fairseq2 and wav2vec2 ASR finetuning recipe. All gradients are correctly synced across workers as expected after applying this fix.
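A usage sketch of the scenario this fixes (assumes an initialized process group): gradients produced inside `no_sync()` accumulate locally, and with this change parameters touched only inside that scope are still tracked correctly when `find_unused_parameters=True`.
```
import torch
from torch.nn.parallel import DistributedDataParallel as DDP

def train_micro_batches(ddp_model: DDP, micro_batches, loss_fn, optimizer):
    # accumulate over all but the last micro-batch without gradient sync
    with ddp_model.no_sync():
        for x, y in micro_batches[:-1]:
            loss_fn(ddp_model(x), y).backward()
    x, y = micro_batches[-1]
    loss_fn(ddp_model(x), y).backward()  # gradient sync happens here
    optimizer.step()
    optimizer.zero_grad()
```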
Co-authored-by: Kaushik Ram Sadagopan <kaushikram2811@gmail.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124193
Approved by: https://github.com/rohan-varma
Gets rid of the following warning:
```
/Users/shenke/workspace/pytorch/test/test_mps.py:9229: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
if base.storage().data_ptr() != other.storage().data_ptr():
```
(noticed while looking at https://github.com/pytorch/pytorch/issues/96153#issuecomment-2101876484 )
Respective change to view ops was landed back in 2022, see https://github.com/pytorch/pytorch/pull/91414
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125838
Approved by: https://github.com/albanD
**Summary**
Per the discussion in https://github.com/pytorch/pytorch/pull/123444, the `decomposed quant/dequant` patterns changed after https://github.com/pytorch/pytorch/pull/123445. We can move the optimization of `decomposed quant/dequant` from inductor decomposition into the lowering phase to avoid those changes. In this way, we can:
- Avoid the pattern matcher failure introduced in https://github.com/pytorch/pytorch/pull/123445
- Make the quantization pattern clearer in the pattern matcher phase, since the `quant/dequant` nodes have not been decomposed.
**Changes in this PR**
- Move optimization of `decomposed quant/dequant` from inductor decomposition into lowering phase.
- Corresponding changes in the quantization pattern matcher to ensure nothing is BC-breaking.
**TestPlan**
```
python -u -m pytest -s -v test/inductor/test_mkldnn_pattern_matcher.py -k test_q
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124041
Approved by: https://github.com/peterbell10, https://github.com/jgong5
Summary:
It seems like the required functions are not available due to the `_MSC_VER` guard. Does anyone have more context on why this functionality has been disabled for Windows?
I'm also unsure how this currently compiles in OSS land on Windows, as there doesn't seem to be any preprocessor protection around `scaled_gemm` getting pulled in.
Test Plan:
Fix compilation errors like this
```
C:\open\fbsource\xplat\caffe2\aten\src\ATen\cuda\tunable\TunableGemm.h(74): error C2039: 'scaled_gemm': is not a member of 'at::cuda::blas'
C:\open\fbsource\xplat\caffe2\aten\src\ATen\cuda\CUDABlas.h(19): note: see declaration of 'at::cuda::blas'
C:\open\fbsource\xplat\caffe2\aten\src\ATen\cuda\tunable\TunableGemm.h(74): note: the template instantiation context (the oldest one first) is
C:\open\fbsource\xplat\caffe2\aten\src\ATen\cuda\tunable\TunableGemm.h(71): note: while compiling class template 'at::cuda::tunable::DefaultScaledGemmOp'
Action failed: fbsource//xplat/caffe2:ATen_cuda_lib_ovrsource (cxx_compile aten/src/ATen/native/cuda/Blas.cpp)
```
Differential Revision: D57087985
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125792
Approved by: https://github.com/malfet, https://github.com/eqy
Summary:
The macros that build `c10::Error` compute the stack trace at the point of throwing, which is then returned as part of the `what()`. If `what()` is never called, which is the case for most exceptions (since logging is throttled), the cost of computing the stack trace was wasted.
By far, the most expensive part of computing the stack trace is its symbolization; just unwinding the stack and collecting the instruction addresses is comparatively cheap. We can thus defer the symbolization to first invocation of `what()`.
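A Python sketch of the pattern (the actual change lives in c10's C++ `Error` class): capture the cheap raw frame information eagerly, and format it only on the first `what()` call.
```
import sys

class LazyTraceError(Exception):
    def __init__(self, msg):
        super().__init__(msg)
        # cheap part: record (code object, line number) pairs for each frame
        self._raw_frames = []
        frame = sys._getframe(1)
        while frame is not None:
            self._raw_frames.append((frame.f_code, frame.f_lineno))
            frame = frame.f_back
        self._symbolized = None

    def what(self):
        # expensive formatting happens at most once, and only if requested
        if self._symbolized is None:
            self._symbolized = "\n".join(
                f"  {code.co_filename}:{lineno} in {code.co_name}"
                for code, lineno in self._raw_frames
            )
        return f"{self}\n{self._symbolized}"
```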
Test Plan:
Added unit tests exercising the lazy nature of `what()`.
Ran an adfinder canary: https://www.internalfb.com/intern/ads/canary/460118801509424346
We can see that the cost of symbolization is obliterated (meaning that `what()` is virtually never called, as expected):
{F1496627896}
Differential Revision: D57128632
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125787
Approved by: https://github.com/huydhn
Enables LRScheduler to handle tensor LRs.
Note on test changes:
For the test modifications I just removed itertools.product and created two loops. This allows us to create a new set of optim_inputs on each iteration to prevent mutations on the tensor LR carrying over across iterations. Nothing else in those tests was modified.
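A usage sketch of what this enables (assumes a torch build containing this change): a scalar tensor LR now flows through an LRScheduler the same way a float LR does.
```
import torch

model = torch.nn.Linear(4, 4)
opt = torch.optim.SGD(model.parameters(), lr=torch.tensor(0.1))
sched = torch.optim.lr_scheduler.StepLR(opt, step_size=10, gamma=0.5)

for _ in range(20):
    opt.zero_grad()
    model(torch.randn(2, 4)).sum().backward()
    opt.step()
    sched.step()

print(opt.param_groups[0]["lr"])  # still a tensor, decayed by gamma every 10 steps
```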
Pull Request resolved: https://github.com/pytorch/pytorch/pull/123753
Approved by: https://github.com/janeyx99
ghstack dependencies: #123751, #123752
This PR fixes `torch.backends.xeon.run_cpu` behavior when it is launched from `torchrun` with the `--nproc-per-node` parameter.
As a CPU launcher, `run_cpu` binds cores to each instance it launches using `numactl`, assigning cores to each instance evenly.
However, if we use `torchrun` with `--nproc-per-node` to start multiple `run_cpu` processes, each `run_cpu` process assumes it can use all the CPU cores, which causes the processes to compete for CPU cores and results in poor performance.
This PR recognizes the environment variables `LOCAL_WORLD_SIZE` and `LOCAL_RANK` set by `torchrun`, then uses this information to further shard the cores bound to each instance. With this PR, when launched by `torchrun --nproc-per-node ...`, different CPU cores will be bound to different workers, which maximizes CPU utilization and application performance.
The specific use case this PR enables is using TorchServe with DeepSpeed tensor parallelism. In this case, TorchServe runs `torchrun --nproc-per-node <tp_size>` to start the tensor parallel workers it needs. When running TorchServe on a multi-socket CPU server with DeepSpeed tensor parallelism, we need this PR to achieve the best performance.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/123711
Approved by: https://github.com/jingxu10, https://github.com/ezyang
This PR adds the autotuning infrastructure for CPU. It generalizes and extends `BenchmarkRequest` with CPU support and a C++ module loader. A `do_bench_cpu` util function is added for benchmarking functions on CPU; it runs warmups and returns the median over multiple trials.
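A sketch of what such a helper does (parameter names and defaults are illustrative, not the exact Inductor API): warm up, time several trials, return the median.
```
import time
import statistics

def bench_cpu_median_ms(fn, warmup=5, trials=20):
    for _ in range(warmup):
        fn()
    times_ms = []
    for _ in range(trials):
        start = time.perf_counter()
        fn()
        times_ms.append((time.perf_counter() - start) * 1e3)
    return statistics.median(times_ms)
```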
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125159
Approved by: https://github.com/jansel
Summary:
github issue: https://github.com/pytorch/pytorch/issues/73828
Whenever we transition from RECORD_AND_SAVE to WARMUP in the profiler schedule, we instantiate a new backend profiler which wipes out the last cycle's information. This makes using the `repeat` parameter less useful in the schedule as you only get contents of the last cycle/repeat. In this diff, we save the accumulated Function Events before setting the new ones and then merge the two EventLists after post processing/cleaning is done. This diff only fixes Function Events so that we can get statistics over each cycle within a schedule. A follow up should be made to accumulate the chrome tracings as well if it is requested.
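A sketch of the scenario being fixed (assumes a build containing this change): with `repeat > 1` in the schedule, function events from earlier cycles should now be retained instead of being wiped at each WARMUP transition.
```
import torch
from torch.profiler import ProfilerActivity, profile, schedule

sched = schedule(wait=1, warmup=1, active=2, repeat=2)
with profile(activities=[ProfilerActivity.CPU], schedule=sched) as prof:
    for _ in range(8):  # enough steps to cover two full cycles
        torch.randn(128, 128) @ torch.randn(128, 128)
        prof.step()

# key_averages() should now reflect events from both cycles, not just the last one
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=5))
```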
Test Plan: Added functional python tests in test_profiler.py that test different schedules and their FunctionEvent counts
Differential Revision: D56956245
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125510
Approved by: https://github.com/aaronenyeshi
```
python test/test_fx.py -k test_public_api_surface
```
was failing with a complaint about infinite recursion. Fixed that and then marked the two API changes from #123681 as private (for `get_example_value`) and backward compatible (for `insert_deferred_runtime_asserts`).
Fixes #104012
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125706
Approved by: https://github.com/BoyuanFeng
## Problem this PR resolves
Today, most of distributed tests are arranged like this:
```
def test_allreduce(self):
pg = self._create_process_group_nccl(store, self.opts())
pg.allreduce(tensor)
...
```
Thus, we are paying PG creation time **per test**. That's bad. But why were we doing that? Is there a constraint?
If we look deeper, we would find that most of our test cases inherit from `torch.testing._internal.common_distributed.MultiProcessTestCase`. From the name, nothing seems wrong, and probably fits distributed well. But a "problem" exists in its `setUp()` and `tearDown()` methods, which basically do the following:
```
def setUp(self):
self._spawn_processes()
def tearDown(self):
for p in self.processes:
p.terminate()
```
Since `setUp` and `tearDown` are **"test-scope fixtures"**, meaning they are called per test, each test will have brand new processes. Of course we'd have to recreate ProcessGroup every time.
## How we are fixing it
First, obviously, we need to put a PG's lifetime into a longer scope. Python `unittest` provides such a helper, called **"class-scope fixtures."** It is embodied by a `setUpClass` method and a `tearDownClass` method (note the name difference), which are called only once for all tests in the same test class. Therefore, we would do:
```
@classmethod
def setUpClass(cls):
    dist.init_process_group(...)

@classmethod
def tearDownClass(cls):
    dist.destroy_process_group()
```
**In this PR, we create a new test template for distributed: `MultiProcContinousTest`, to hold this class-scope fixture.**
Second, we'd need to avoid per-test process spawn and terminate. That's easy, we can either:
1. launch the whole test file with `torchrun --nproc-per-node=...` or
2. use `mp.spawn()` under `if __name__ == "__main__":`.
Point is, launch the processes only once.
## Result
We moved the "positive tests" from test_c10d_nccl.py to test_c10d_ops_nccl.py.
Before this PR:
```
$ python test_c10d_nccl.py -k ProcessGroupNCCLTest
Ran 24 tests in 174.457s
```
After this PR:
```
$ torchrun --nproc-per-node 2 test_c10d_ops_nccl.py
or
$ python test_c10d_ops_nccl.py
Ran 24 tests in 16.247s
```
10X speedup.
## Limitation
For tests intended to test destroy or abort of PGs, we'd need to go back to the old style. So it would make sense to divide our tests into two classes: one for positive tests where we would reuse the PGs, and the other one for abort/destroy and negative tests like watchdog timeout.
## Next step
Migrate the tests of distributed that would fit with this test style!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125648
Approved by: https://github.com/wconstab
I am building PyTorch with the Intel oneAPI 2024.0 compiler and without cuSparseLt, and encountered various type errors of the following forms:
```
[ 63%] Building CUDA object caffe2/CMakeFiles/torch_cuda.dir/__/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu.o
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/ComputeSparseTile.h(87): error: no operator "=" matches these operands
operand types are: cutlass::uint2b_t = int
detected during:
instantiation of "at::native::Indices4x4 at::native::LargestValuesGreedy<Op>::operator()(Tile4x4Accessor) [with Op=at::native::IdentityOp, Tile4x4Accessor=at::native::KernelTypes<cutlass::half_t>::Tile4x4Accessor]"
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredPack.h(349): here
instantiation of "void at::native::KernelTypes<Element_>::sparse_semi_structured_tile_kernel(at::native::KernelTypes<Element_>::Params, MetadataStore, Algorithm) [with Element_=cutlass::half_t, Algorithm=at::native::LargestValuesGreedy<at::native::IdentityOp>, MetadataStore=at::native::MetadataCutlass]"
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu(201): here
instantiation of "void at::native::sparse_semi_structured_tile_kernel<KT,Metadata,Algorithm>(KT::Params, Metadata, Algorithm) [with KT=at::native::KernelTypes<cutlass::half_t>, Metadata=at::native::MetadataCutlass, Algorithm=at::native::LargestValuesGreedy<at::native::IdentityOp>]"
(177): here
instantiation of "void at::native::named_algorithms(T) [with T=lambda [](auto, const std::__cxx11::string &)->auto]"
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu(265): here
instantiation of "std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor> at::native::sparse_semi_structured_tile_typed<Element,MetadataFormat>(at::Tensor, std::__cxx11::string) [with Element=cutlass::half_t, MetadataFormat=at::native::MetadataCutlass]"
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu(293): here
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/ComputeSparseTile.h(88): error: no operator "=" matches these operands
operand types are: cutlass::uint2b_t = int
detected during:
instantiation of "at::native::Indices4x4 at::native::LargestValuesGreedy<Op>::operator()(Tile4x4Accessor) [with Op=at::native::IdentityOp, Tile4x4Accessor=at::native::KernelTypes<cutlass::half_t>::Tile4x4Accessor]"
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredPack.h(349): here
instantiation of "void at::native::KernelTypes<Element_>::sparse_semi_structured_tile_kernel(at::native::KernelTypes<Element_>::Params, MetadataStore, Algorithm) [with Element_=cutlass::half_t, Algorithm=at::native::LargestValuesGreedy<at::native::IdentityOp>, MetadataStore=at::native::MetadataCutlass]"
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu(201): here
instantiation of "void at::native::sparse_semi_structured_tile_kernel<KT,Metadata,Algorithm>(KT::Params, Metadata, Algorithm) [with KT=at::native::KernelTypes<cutlass::half_t>, Metadata=at::native::MetadataCutlass, Algorithm=at::native::LargestValuesGreedy<at::native::IdentityOp>]"
(177): here
instantiation of "void at::native::named_algorithms(T) [with T=lambda [](auto, const std::__cxx11::string &)->auto]"
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu(265): here
instantiation of "std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor> at::native::sparse_semi_structured_tile_typed<Element,MetadataFormat>(at::Tensor, std::__cxx11::string) [with Element=cutlass::half_t, MetadataFormat=at::native::MetadataCutlass]"
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu(293): here
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredPack.h(238): error: function "lambda [](cutlass::uint2b_t, cutlass::uint2b_t)->void" cannot be called with the given argument list
argument types are: (int, int)
object type is: lambda [](cutlass::uint2b_t, cutlass::uint2b_t)->void
detected during:
instantiation of "at::native::KernelTypes<Element_>::Tile4x4Packed at::native::KernelTypes<Element_>::pack_4x4(at::native::Indices4x4, at::native::KernelTypes<Element_>::Tile4x4Accessor, uint32_t &, int, __nv_bool) [with Element_=cutlass::half_t]"
(354): here
instantiation of "void at::native::KernelTypes<Element_>::sparse_semi_structured_tile_kernel(at::native::KernelTypes<Element_>::Params, MetadataStore, Algorithm) [with Element_=cutlass::half_t, Algorithm=at::native::LargestValuesGreedy<at::native::IdentityOp>, MetadataStore=at::native::MetadataCutlass]"
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu(201): here
instantiation of "void at::native::sparse_semi_structured_tile_kernel<KT,Metadata,Algorithm>(KT::Params, Metadata, Algorithm) [with KT=at::native::KernelTypes<cutlass::half_t>, Metadata=at::native::MetadataCutlass, Algorithm=at::native::LargestValuesGreedy<at::native::IdentityOp>]"
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/ComputeSparseTile.h(177): here
instantiation of "void at::native::named_algorithms(T) [with T=lambda [](auto, const std::__cxx11::string &)->auto]"
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu(265): here
instantiation of "std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor> at::native::sparse_semi_structured_tile_typed<Element,MetadataFormat>(at::Tensor, std::__cxx11::string) [with Element=cutlass::half_t, MetadataFormat=at::native::MetadataCutlass]"
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu(293): here
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredPack.h(241): error: function "lambda [](cutlass::uint2b_t, cutlass::uint2b_t)->void" cannot be called with the given argument list
argument types are: (int, int)
object type is: lambda [](cutlass::uint2b_t, cutlass::uint2b_t)->void
detected during:
instantiation of "at::native::KernelTypes<Element_>::Tile4x4Packed at::native::KernelTypes<Element_>::pack_4x4(at::native::Indices4x4, at::native::KernelTypes<Element_>::Tile4x4Accessor, uint32_t &, int, __nv_bool) [with Element_=cutlass::half_t]"
(354): here
instantiation of "void at::native::KernelTypes<Element_>::sparse_semi_structured_tile_kernel(at::native::KernelTypes<Element_>::Params, MetadataStore, Algorithm) [with Element_=cutlass::half_t, Algorithm=at::native::LargestValuesGreedy<at::native::IdentityOp>, MetadataStore=at::native::MetadataCutlass]"
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu(201): here
instantiation of "void at::native::sparse_semi_structured_tile_kernel<KT,Metadata,Algorithm>(KT::Params, Metadata, Algorithm) [with KT=at::native::KernelTypes<cutlass::half_t>, Metadata=at::native::MetadataCutlass, Algorithm=at::native::LargestValuesGreedy<at::native::IdentityOp>]"
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/ComputeSparseTile.h(177): here
instantiation of "void at::native::named_algorithms(T) [with T=lambda [](auto, const std::__cxx11::string &)->auto]"
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu(265): here
instantiation of "std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor> at::native::sparse_semi_structured_tile_typed<Element,MetadataFormat>(at::Tensor, std::__cxx11::string) [with Element=cutlass::half_t, MetadataFormat=at::native::MetadataCutlass]"
/tmp/pytorch/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu(293): here
```
The casts added by this PR get the build working again for me.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124777
Approved by: https://github.com/jcaip
- `FakeContext` hides all fields other than `ctx.saved_tensors`; this makes dynamo error out when autograd.Function.backward uses other attrs on ctx, and it also doesn't allow falling back to eager.
- If we remove it, we still can't fall back to eager: node variables are already freed (ctx.saved_tensors throws)
- However, we can fall back to "pseudo-eager" by using a duck-typed ctx and routing ctx.saved_tensors to the lifted tensors
- Dynamo tries to inline external_utils.call_backward, and treats BackwardCFunction as an AutogradFunctionContextVariable (only used up until we create the fake context: FakeBackwardCFunction)
- we call_function backward from the forward class AutogradFunctionVariable, and we still pass in the fake context as a UserDefinedObjectVariable (can later use AutogradFunctionContextVariable + HOO graph speculate)
Fixes #125489, #124827
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125661
Approved by: https://github.com/jansel
Summary:
Update the Kineto submodule in PyTorch. The following diffs are included:
- Removed CUPTI overhead track in AMD traces
- Delay logging for CUDA stream wait event until the end
- Changed chrome trace units to milliseconds, while the data will be in ns
- Refactored roctracer to include metadata and improved names.
- Lowered Kineto Stage log level, reducing noisy output
- Changed relative time of ts to quarterly interval for distributed trace alignment
- Fixed Non-risky deprecated use of 0/NULL
- Removed hardcoding of /opt/rocm
- Handling cuLaunchKernelEx better
- Fixed Non-risky missing field initializers and unused variables.
Test Plan: CI and this is running internally.
Differential Revision: D57011897
Pulled By: aaronenyeshi
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125621
Approved by: https://github.com/sraikund16
As per title.
This ensures that the methods defined in _tensor.py do exist in all the places where we assume they do.
BC-Breaking: This is bc-breaking as the user cannot subclass this private class anymore.
You should replace any use of _TensorBase with Tensor.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125558
Approved by: https://github.com/ezyang
Summary:
block traverse mode:
Assumption:
the culprits form a block (start_idx, end_idx) in the topologically sorted graph,
and the error will go away if the graph pattern is broken
Reviewed By: junhanh
Differential Revision: D56799587
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125613
Approved by: https://github.com/jfix71
In case the `dfs_iter_find_cycle` function receives duplicated node entries in the `all_user_nodes` argument, it will still process each one of them. This commit changes the `all_user_nodes` list into a set, so each element is unique, resulting in a shorter execution time of the `propose_partitions` function.
Fixes #125584
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125585
Approved by: https://github.com/Skylion007
Summary:
By default, some inferred dynamic shapes guards/constraints that are not expressible with the current dynamic shapes language will lead to specialization to the concrete input values provided. If disable_forced_specializations is set to True, we will not specialize, and will not perform runtime checks on such produced guards. Instead, we allow the user to specify arbitrary shapes, and fail during runtime if the inputs are invalid. Constraints expressible with the language (e.g. ranges, linear derived dims) will still be enforced, and behavior for all other guards remains the same.
Cases where we typically specialize are reshapes:
```
x: [4, 6] # [s0, s1]
x = x.reshape([x.shape[0] - 1, -1])
# this emits a guard Mod(s0*s1, s0-1) = 0, we specialize on s0=4, s1=6
x: [4, 6], y: [24] # [s0, s1], [s2]
x = x.reshape([-1]) + y
# this emits a guard s0*s1 = s2, we specialize on s0=4, s1=6, s2=24
```
For now only applicable for non-strict mode (need to figure out how to pass this flag into dynamo's call of produce_guards).
Test Plan: Added test case that checks compilation, runtime, and suggested fixes behavior.
Differential Revision: D56361177
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124949
Approved by: https://github.com/avikchaudhuri
Now that efficient attention is supported in dtensor, we can modify the transformer test to use dtensor in SDPA and get rid of the manual num_head adjustments.
Caveat: Efficient attention is supported only with bf16/fp32 (not fp64) and has other constraints. If any of the constraints are not satisfied, the SDPA would fall back to the math decomposed attention, which will break as it does not fully work with dtensor (it creates a `torch.Tensor` mask in the middle). I considered adding some checks like in P1202254918 but that needs to be added everywhere this Transformer is used. Is it necessary if the current CI machines can run efficient attention?
Test files containing this Transformer:
- `test/distributed/tensor/parallel/test_tp_examples.py`
- `test/distributed/_composable/fsdp/test_fully_shard_training.py`
- `test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122997
Approved by: https://github.com/XilunWu
ghstack dependencies: #122995, #122996
Previously, the new tensors out of the "new factory" ops all became replicated.
With this PR, if the new tensor has the same shape as the old tensor **and** the shape can be evenly sharded, then the old spec is inherited and preferred.
To accommodate this when the old tensor has sharded placements, the input args for local computation (size, stride) need to be adjusted.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122995
Approved by: https://github.com/wanchaol
@wanchaol was seeing the loss eventually become NaN when compiling individual transformer blocks in torchtitan - with this patch I no longer see the NaN loss.
The problem is the following:
(1) It is possible to have graph inputs to a compiled region that are AsyncCollectiveTensors. In particular: when we compile individual transformer blocks in the llama model, the first layer (embedding layer) is run in eager mode, and it outputs an AsyncCollectiveTensor that is fed to the first transformer block
(2) ideally, we would like that AsyncCollectiveTensor graph input to desugar into a `wait_tensor()` op that shows up at the beginning of the graph.
(3) the way this is supposed to happen is: AOTAutograd traces through the `__torch_dispatch__` of AsyncCollectiveTensor, tracing out a `wait_tensor()` call before dispatching to any of the other ops in the function we are tracing
(4) however: `trigger_wait()` was getting called in a way where we would ignore its output (and return `self.elem` directly), which would cause the `wait_tensor` ops to get DCE'd.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125677
Approved by: https://github.com/wanchaol, https://github.com/yifuwang
ghstack dependencies: #125676
This is the first PR in a series where I try to organize our runtime wrappers a bit: specifically, I'd like to separate wrappers into objects that have (up to) 2 methods:
- A **pre-compile** function, which takes in flat_fn and flat_args (inputs to the compiler) and wraps/modifies them
- A **post-compile** function, which takes in a compiled_fn and runtime args and wraps the compiled function.
Extra metadata necessary to run the compile functions can be stored on the attributes of the class. This way, when we think about caching, the set of attributes on the class should be the exact set of metadata that we need to serialize and save in the cache (along with common data, like fw_metadata)
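A minimal sketch of that wrapper shape (a hypothetical wrapper, not the actual AOTAutograd classes): any metadata the runtime step needs is stored on the instance, which is exactly what a cache would have to serialize.
```
class ArgCountWrapper:
    """Toy wrapper: record metadata at pre-compile time, check it at runtime."""

    def pre_compile(self, flat_fn, flat_args):
        # metadata needed later is stored on self
        self.num_args = len(flat_args)

        def wrapped_fn(*args):
            return flat_fn(*args)

        return wrapped_fn, flat_args

    def post_compile(self, compiled_fn):
        expected = self.num_args

        def runtime_wrapper(*runtime_args):
            assert len(runtime_args) == expected, "unexpected number of runtime args"
            return compiled_fn(*runtime_args)

        return runtime_wrapper
```
Usage would be: call `pre_compile` on the compiler inputs, compile the returned function, then wrap the compiled artifact with `post_compile`.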
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125595
Approved by: https://github.com/bdhirsh
# Motivation
As discussed in [#124479](https://github.com/pytorch/pytorch/pull/124479), `torch.amp.autocast` can NOT be completely equivalent to `torch.cuda.amp.autocast` and `torch.cpu.amp.autocast`, since `torch.amp.autocast` does NOT have a default `dtype` for CPU (`torch.bfloat16` by default) and CUDA (`torch.float16` by default) respectively. We would like `torch.amp.autocast` to be more generic to help developers/customers write device-agnostic code, because there are not enough reasons to add a device-specific autocast `torch.xxx.amp.autocast` for each device backend.
# Solution
When `None` is passed to `dtype`, we should use `torch.get_autocast_dtype` to get the related dtype for each backend. Meanwhile, `torch.get_autocast_dtype` is necessary to be supported in JIT path for BC.
# Additional Context
With this PR, `torch.amp.autocast(device_type='cuda')` is equivalent to `torch.cuda.amp.autocast`.
Add two new UTs to cover this change in eager and jit path respectively.
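A usage sketch of the device-agnostic behavior this enables: with `dtype` left unset, each backend picks up its own default autocast dtype (bfloat16 on CPU, float16 on CUDA).
```
import torch

def forward_autocast(model, x, device_type):
    # dtype left as None -> torch.get_autocast_dtype(device_type) is used per backend
    with torch.amp.autocast(device_type=device_type):
        return model(x)

model = torch.nn.Linear(8, 8)
out = forward_autocast(model, torch.randn(2, 8), "cpu")
print(out.dtype)  # torch.bfloat16 on CPU; float16 if run with device_type="cuda"
```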
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125103
Approved by: https://github.com/albanD, https://github.com/jgong5, https://github.com/gujinghui
Summary:
The macros that build `c10::Error` compute the stack trace at the point of throwing, which is then returned as part of the `what()`. If `what()` is never called, which is the case for most exceptions (since logging is throttled), the cost of computing the stack trace was wasted.
By far, the most expensive part of computing the stack trace is its symbolization; just unwinding the stack and collecting the instruction addresses is comparatively cheap. We can thus defer the symbolization to first invocation of `what()`.
Test Plan:
Added unit tests exercising the lazy nature of `what()`.
Ran an adfinder canary: https://www.internalfb.com/intern/ads/canary/460118801509424346
We can see that the cost of symbolization is obliterated (meaning that `what()` is virtually never called, as expected):
{F1496627896}
Reviewed By: ezyang
Differential Revision: D56586844
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125682
Approved by: https://github.com/ezyang
As titled: for meta tensor ops, we should avoid calling the RNGTracker,
which could potentially alter the current RNG state. Meta tensor ops
should be no-ops, and it is the post-`to_empty` init that should actually
alter the RNG state.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125693
Approved by: https://github.com/XilunWu
Fixes #125272
## About
(This is a re-spin of PR #106617)
Kineto introduced a new profiler to read performance counters from NVIDIA GPUs (CUPTI Range Profiler API), added in PR [75616](https://github.com/pytorch/pytorch/pull/75616). Support for the range profiler mode was disabled as we had to link with an NV PerfWorks library (`libnvperf_host.so`). This PR adds that link.
The change includes-
* Updates cmake build files to find `libnvperf_host.so` and set `CUDA_nvperf_host_LIBRARY`
* WIP use the above cmake variable in kineto, will update this PR after kineto PR has landed
See https://github.com/pytorch/kineto/pull/724
## Example usage of CUPTI profiler
The code snippet below shows how to configure the PyTorch profiler in CUPTI Profiler mode. Any code included in the profiling window will be profiled by CUPTI/Kineto. Note how the `_ExperimentalConfig` struct is used to configure profiler metrics.
```
with torch.profiler.profile(
activities=[torch.profiler.ProfilerActivity.CUDA],
record_shapes=True,
on_trace_ready=trace_handler,
experimental_config=torch.profiler._ExperimentalConfig(
profiler_metrics=[
"kineto__tensor_core_insts",
"dram__bytes_read.sum",
"dram__bytes_write.sum"],
profiler_measure_per_kernel=False),
) as prof:
res = train_batch(modeldef)
prof.step()
```
For a full example see this [xor.py](https://gist.github.com/briancoutinho/b1ec7919d8ea2bf1f019b4f4cd50ea80) gist.
### Details of how to configure the CUPTI profiler
The `_ExperimentalConfig` structure can be used to pass metrics to the profiler:
```
profiler_metrics : a list of CUPTI profiler metrics used
to measure GPU performance events. Any metric supported by CUPTI can be used, see:
https://docs.nvidia.com/cupti/r_main.html#r_profiler
There are two special alias metrics `kineto__tensor_core_insts` and `kineto__cuda_core_flops` for FLOPS counting.
profiler_measure_per_kernel (bool) : whether to profile metrics per kernel
or for the entire measurement duration.
```
## Testing
Built from source with kineto [PR](https://github.com/pytorch/kineto/pull/724)
```
$> USE_CUDA=1 python setup.py install
-- CUDA_cupti_LIBRARY = /public/apps/cuda/11.6/extras/CUPTI/lib64/libcupti.so
-- CUDA_nvperf_host_LIBRARY = /public/apps/cuda/11.6/extras/CUPTI/lib64/libnvperf_host.so
```
Then run the example [xor.py](https://gist.github.com/briancoutinho/b1ec7919d8ea2bf1f019b4f4cd50ea80). This only works on V100+ GPUs. Debug logging can be enabled as follows:
```
>$ export KINETO_LOG_LEVEL=1
>$ python xor.py
INFO:2023-02-11 19:11:37 1683060:1683060 CuptiActivityProfiler.cpp:167] CUDA versions. CUPTI: 16; Runtime: 11060; Driver: 11040
Log file: /tmp/libkineto_activities_1683060.json
Trace start time: 2023-02-11 19:11:47 Trace duration: 500ms
Warmup duration: 0s
Max GPU buffer size: 128MB
Enabled activities: cuda_profiler_range
Cupti Profiler metrics : kineto__tensor_core_insts, dram__bytes_read.sum, dram__bytes_write.sum
Cupti Profiler measure per kernel : 0
Cupti Profiler max ranges : 10
INFO:2023-02-11 19:11:37 1683060:1683060 CuptiActivityProfiler.cpp:638] Enabling GPU tracing
INFO:2023-02-11 19:11:37 1683060:1683060 CuptiActivityProfiler.cpp:567] Running child profiler CuptiRangeProfiler for 500 ms
INFO:2023-02-11 19:11:37 1683060:1683060 CuptiRangeProfiler.cpp:104] Configuring 3 CUPTI metrics
INFO:2023-02-11 19:11:37 1683060:1683060 CuptiRangeProfiler.cpp:109] sm__inst_executed_pipe_tensor.sum
INFO:2023-02-11 19:11:37 1683060:1683060 CuptiRangeProfiler.cpp:109] dram__bytes_read.sum
INFO:2023-02-11 19:11:37 1683060:1683060 CuptiRangeProfiler.cpp:109] dram__bytes_write.sum
INFO:2023-02-11 19:11:37 1683060:1683060 CuptiActivityProfiler.cpp:575] Running child profiler CuptiRangeProfiler for 500 ms
INFO:2023-02-11 19:11:37 1683060:1683060 CuptiActivityProfiler.cpp:672] Tracing starting in 9s
INFO:2023-02-11 19:11:37 1683060:1683060 CuptiActivityProfiler.cpp:677] Tracing will end in 10s
STAGE:2023-02-11 19:11:37 1683060:1683060 ActivityProfilerController.cpp:310] Completed Stage: Warm Up
INFO:2023-02-11 19:11:37 1683060:1683060 CuptiActivityProfiler.cpp:693] Starting child profiler session
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125685
Approved by: https://github.com/sraikund16
Fixes #125526 and [pytorch/builder#1811](https://github.com/pytorch/builder/issues/1811).
Adopt `syntax=docker/dockerfile:1`, which has been stable since 2018 and is still best practice to declare in 2024.
- Syntax features dependent upon the [syntax directive version are documented here](https://hub.docker.com/r/docker/dockerfile).
- While you can set a fixed minor version, [Docker officially advises to only pin the major version](https://docs.docker.com/build/dockerfile/frontend/#stable-channel):
```
We recommend using docker/dockerfile:1, which always points to the latest stable release of the version 1 syntax, and receives both "minor" and "patch" updates for the version 1 release cycle.
BuildKit automatically checks for updates of the syntax when performing a build, making sure you are using the most current version.
```
**Support for building with Docker prior to v23 (released in Feb 2023)**
NOTE: 18.06 may not be the accurate minimum version for using docker/dockerfile:1; according to the [DockerHub tag history](https://hub.docker.com/layers/docker/dockerfile/1.0/images/sha256-92f5351b2fca8f7e2f452aa9aec1c34213cdd2702ca92414eee6466fab21814a?context=explore), 1.0 of the syntax seems to be from Dec 2018, which is probably why docker/dockerfile:experimental was paired with it in this file.
Personally, I'd favor only supporting builds with Docker v23. This is only relevant for someone building this Dockerfile locally, the user could still extend the already built and published image from a registry on older versions of Docker without any concern for this directive which only applies to building this Dockerfile, not images that extend it.
However, if you're reluctant, you may want to refer others to [this Docker docs page](https://docs.docker.com/build/buildkit/#getting-started), where they should only need to set the env var `DOCKER_BUILDKIT=1`; presumably the requirement for experimental was dropped with syntax=docker/dockerfile:1 in releases of Docker since Dec 2018. Affected users can often quite easily install a newer version of Docker on their OS, per Docker's official guidance (usually by including an additional repo in the package manager).
**Reference links**
Since one of these was already included in the inline note (now a broken link), I've included relevant links mentioned above. You could alternatively rely on git blame with a commit message referencing the links or this PR for more information.
Feel free to remove any of the reference links, they're mostly only relevant to maintainers to be aware of (which this PR itself has detailed adequately above).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125632
Approved by: https://github.com/malfet
This PR seeks to increase observability of save/load requests. This is accomplished with two main changes:
1. The creation of save_id and load_id:
- a save_id and load_id are added to the filesystem writer. `save_id` is re-generated on every save call, and `load_id` is re-generated on every load call.
- both these IDs are stored in a new `StorageMeta` class and saved as part of Metadata. (`load_id` is None when we save, and only set during load)
2. A new mechanism is implemented in the save path which gives the SavePlanner a chance to inspect the `storage_meta` object. The mechanism mirrors the same metadata exchange in the load path. In the load path, `storage_meta` is added to `metadata` such that the LoadPlanner can also access `storage_meta` before we begin loading.
*If users now wish to access the checkpoint_id in the SavePlanner, they simply need to access the value in `storage_meta` from the `set_up_planner` call.*
*Additionally, users now have a generic way of passing data to the SavePlanner from the StorageWriter at the start of the save path, similar to the load path*
This PR has been tested for backwards compatibility -- meaning any checkpoints saved before this PR can continue being loaded after this PR.
One major consideration is that there is limited forwards compatibility. If a checkpoint is generated _past_ this PR, there is no support for loading it using older torch versions. This brings up a fairly important point: since we expect the metadata object (which is saved to the disk) to continue evolving, and we want to support forwards compatibility, we explore patching `pickle` so we can at least add new members to `metadata` and maintain fwd compat.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124772
Approved by: https://github.com/fegin
Fixes [internal error](https://fb.workplace.com/groups/1075192433118967/permalink/1416709435633930/).
The issue is that the asserting nodes added in the `insert_deferred_runtime_assertion` pass do not contain metadata that the ExportedProgram requires the graph to have. One solution is to retrace the entire module; another is to manually add back this metadata.
This diff implements the latter solution (manually add back the metadata) through hooking into fx.graph's `create_node` function, and adding export-specific metadata for every node that is created. The reason I did this is so that the `insert_deferred_runtime_assertion` does not have to know about what metadata export wants.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125414
Approved by: https://github.com/zhxchen17, https://github.com/BoyuanFeng
Summary: capture_pre_autograd_graph is deprecated and torch.export won't be able to provide timely fixes for this API. To reduce some confusion around this, we should explicitly give users clear warnings.
Test Plan: eyes
Reviewed By: tarun292
Differential Revision: D56955202
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125602
Approved by: https://github.com/angelayi
By defining `CASE_ISSIGNED` macros that just return `std::numeric_limits<dtype>::is_signed` for the types where it makes sense, and explicitly coding the cases where it does not.
Remove the `default:` case from the switch to avoid regressions like the one reported in https://github.com/pytorch/pytorch/issues/125124, as [`-Wswitch-enum`](https://clang.llvm.org/docs/DiagnosticsReference.html#wswitch-enum) in combination with `-Werror` will raise an error in case of a missing entry, for example:
```
/Users/nshulga/git/pytorch/pytorch/c10/core/ScalarType.h:518:11: warning: enumeration value 'QInt32' not handled in switch [-Wswitch]
switch (t) {
^
/Users/nshulga/git/pytorch/pytorch/c10/core/ScalarType.h:518:11: note: add missing switch cases
switch (t) {
^
1 warning generated.
```
Fixes https://github.com/pytorch/pytorch/issues/125124
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125637
Approved by: https://github.com/albanD
This resolves a bug in eager where, if an old state dict is loaded (without the capturable flag) but the original dict had the capturable flag, then state_steps would be on CUDA but we would take the non-capturable path. We now fall back to eager if capturable=False.
Current design doc and discussion: https://docs.google.com/document/d/1DmmbiaSp16CDZtGw1qzXKHFTY_0gqc0xpnBdviXq0vk/edit#heading=h.871u7bvwz7ze
Note on the actual fallback logic: there was an issue with torchscript originally not handling `*args, **kwargs` properly. After rectifying that by using `functools.wraps`, there was an additional bug with scoping, which required the single-tensor implementation to be in the global scope at the time the fallback closure was created. I pass the single-tensor function into the `_disable_dynamo_if_unsupported` decorator to work around this bug.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/123619
Approved by: https://github.com/janeyx99
Fixes https://github.com/pytorch/pytorch/issues/125109, which is a regression introduced by https://github.com/pytorch/builder/pull/1467 that adds a dynamic dependency on mkl, which, if installed in the user dir, is placed into `sysconfig.get_config_var("userbase") / "Library" / "bin"`.
Fix this by adding the `userbase` folder to the DLL search path.
Testing before this fix:
```
Python 3.12.3 (tags/v3.12.3:f6650f9, Apr 9 2024, 14:05:25) [MSC v.1938 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Users\Administrator\AppData\Roaming\Python\Python312\site-packages\torch\__init__.py", line 141, in <module>
raise err
OSError: [WinError 126] The specified module could not be found. Error loading "C:\Users\Administrator\AppData\Roaming\Python\Python312\site-packages\torch\lib\shm.dll" or one of its dependencies.
>>> exit()
```
After:
```
c:\Program Files\Python312>python
Python 3.12.3 (tags/v3.12.3:f6650f9, Apr 9 2024, 14:05:25) [MSC v.1938 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
>>> exit()
```
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125684
Approved by: https://github.com/malfet
We save and restore the DynamicLayerStack during frame eval, but since the fx graph has no way to express a try/finally we just assume the restore will happen. If we throw an exception between the push and the pop to the stack, then we're left in a state that affects following operations poorly. Make sure that if it's in a bad state we restore it after frame eval.
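A minimal sketch of the save/restore pattern (function and helper names are hypothetical): snapshot the stack before evaluating the frame and restore it even if evaluation raises.
```
def eval_frame_with_stack_guard(eval_frame, frame, get_stack, set_stack):
    saved = get_stack()
    try:
        return eval_frame(frame)
    finally:
        # if the traced code pushed onto the stack and then raised, undo the damage
        if get_stack() != saved:
            set_stack(saved)
```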
Repro:
before:
```
$ rm test/dynamo_skips/TestSparseCPU.test_log1p_cpu_uint8
$ rm test/dynamo_expected_failures/FuncTorchHigherOrderOpTests.test_vmap_free_tensor
$ PYTORCH_TEST_WITH_DYNAMO=1 pytest test/jit/test_sparse.py test/dynamo/test_dynamic_shapes.py test/inductor/test_torchinductor_dynamic_shapes.py test/test_sparse.py -k 'test_log1p_cpu_uint8'
============= 1 passed, 8588 deselected in 9.75s =============
$ PYTORCH_TEST_WITH_DYNAMO=1 pytest test/jit/test_sparse.py test/dynamo/test_dynamic_shapes.py test/inductor/test_torchinductor_dynamic_shapes.py test/test_sparse.py -k
'test_vmap_free_tensor_dynamic_shapes or test_log1p_cpu_uint8'
================== short test summary info ===================
FAILED [0.0632s] test/test_sparse.py::TestSparseCPU::test_log1p_cpu_uint8 - AssertionError: "only Tensors of floating point dtype can require gradients"
does not match "You are attempting to call Tensor.requires_grad_() (or perhaps using torch.autograd.functional.* APIs) inside of a function ...
======= 1 failed, 1 skipped, 8587 deselected in 10.99s =======
```
(Note that adding test_vmap_free_tensor_dynamic_shapes causes test_vmap_free_tensor_dynamic_shapes to fail)
after:
```
$ rm test/dynamo_skips/TestSparseCPU.test_log1p_cpu_uint8
$ rm test/dynamo_expected_failures/FuncTorchHigherOrderOpTests.test_vmap_free_tensor
$ PYTORCH_TEST_WITH_DYNAMO=1 pytest test/jit/test_sparse.py test/dynamo/test_dynamic_shapes.py test/inductor/test_torchinductor_dynamic_shapes.py test/test_sparse.py -k 'test_log1p_cpu_uint8'
============= 1 passed, 8588 deselected in 9.89s =============
$ PYTORCH_TEST_WITH_DYNAMO=1 pytest test/jit/test_sparse.py test/dynamo/test_dynamic_shapes.py test/inductor/test_torchinductor_dynamic_shapes.py test/test_sparse.py -k
'test_vmap_free_tensor_dynamic_shapes or test_log1p_cpu_uint8'
======= 1 passed, 1 skipped, 8587 deselected in 11.34s =======
```
(test_vmap_free_tensor_dynamic_shapes passes either way)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122074
Approved by: https://github.com/oulgen
Summary:
Right now DCP only flattens a mapping (e.g., a dict) if that mapping has tensor objects. This behavior is odd, as users may save different non-tensor objects on different ranks. Without flattening the mappings, we may lose these non-tensor objects. One use case is the dataloader state_dict.
We may also want to do so for a list/tuple. But this will cause extra pickles. So we don't do this for now.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125335
Approved by: https://github.com/LucasLLC, https://github.com/wz337
ghstack dependencies: #125333, #125501, #125334
This fixes a logic regression introduced by https://github.com/pytorch/pytorch/pull/123247 where
```python
if self.use_device and self.use_device != _get_privateuse1_backend_name():
```
was replaced with
```python
VALID_DEVICE_OPTIONS = ["cuda", "xpu", "privateuseone"]
if self.use_device not in VALID_DEVICE_OPTIONS:
```
That triggers a warning every time the code is invoked with `self.use_device` set to None.
This change also skips all the checks, which are useless if `use_device` is None to begin with.
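A minimal sketch of the intended check (not necessarily the exact code that landed): skip device validation entirely when `use_device` is None, so no spurious warning fires.
```
VALID_DEVICE_OPTIONS = ["cuda", "xpu", "privateuseone"]

def should_warn_about_device(use_device):
    # None means "no device requested": skip all device checks, as before the regression
    return use_device is not None and use_device not in VALID_DEVICE_OPTIONS
```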
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125654
Approved by: https://github.com/aaronenyeshi
Looking at the unrelated Windows timeout failure on https://github.com/pytorch/pytorch/pull/125199, it looks like we don't have a timeout value set for C++ tests atm. In this case, a C++ test on Windows timed out after 2+ hours.
```
2024-05-02T23:35:34.0639067Z Running cpp/c10_TypeList_test 1/1 ... [2024-05-02 23:35:34.059021]
2024-05-02T23:35:34.0641108Z Executing ['pytest', 'C:\\actions-runner\\_work\\pytorch\\pytorch\\build\\win_tmp\\build\\torch\\test\\c10_TypeList_test.exe', '-m', 'not serial', '-v', '-vv', '-rfEX', '-n', '2', '--junit-xml-reruns', 'test-reports\\python-pytest\\test\\run_test\\test\\run_test-c898ddeff8f33cbf.xml', '-x', '--reruns=2'] ... [2024-05-02 23:35:34.062137]
2024-05-03T02:45:33.7862004Z Process SpawnPoolWorker-2:
2024-05-03T02:45:33.7927201Z Traceback (most recent call last):
2024-05-03T02:45:33.7928032Z File "C:\Jenkins\Miniconda3\lib\multiprocessing\process.py", line 315, in _bootstrap
2024-05-03T02:45:33.7928722Z self.run()
2024-05-03T02:45:33.7929722Z File "C:\Jenkins\Miniconda3\lib\multiprocessing\process.py", line 108, in run
2024-05-03T02:45:33.7931639Z self._target(*self._args, **self._kwargs)
2024-05-03T02:45:33.7932435Z File "C:\Jenkins\Miniconda3\lib\multiprocessing\pool.py", line 114, in worker
2024-05-03T02:45:33.7933338Z task = get()
2024-05-03T02:45:33.7933946Z File "C:\Jenkins\Miniconda3\lib\multiprocessing\queues.py", line 365, in get
2024-05-03T02:45:33.7935219Z res = self._reader.recv_bytes()
2024-05-03T02:45:33.7935897Z File "C:\Jenkins\Miniconda3\lib\multiprocessing\connection.py", line 221, in recv_bytes
2024-05-03T02:45:33.7936609Z buf = self._recv_bytes(maxlength)
2024-05-03T02:45:33.7937302Z File "C:\Jenkins\Miniconda3\lib\multiprocessing\connection.py", line 310, in _recv_bytes
2024-05-03T02:45:33.7938316Z waitres = _winapi.WaitForMultipleObjects(
2024-05-03T02:45:33.7938766Z KeyboardInterrupt
```
Retrying was working, but it was already too late to finish the job. I'm setting the same default `THRESHOLD * 3` timeout value here for C++ tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125517
Approved by: https://github.com/clee2000
1. Fix the incorrect tests about lazy init for a PrivateUse1 backend named foo
2. Refactor the tests and make them more flexible
3. Disable the two tests temporarily
- test_open_device_faketensor
- test_open_device_scalar_type_fallback
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125572
Approved by: https://github.com/albanD
The scheduler searches for fusion opportunities by looking for common memory accesses. Two memory accesses are considered common not only when the buffer names match; more is required:
- index formula matches
- var_ranges matches
In this PR, I want to log all the fusion failures due to mismatched index formulas or var_ranges. I also want to further categorize the failures. Right now I found the following failure categories:
- rand_seed: the index for rand seed access is an integer, and different accesses use different integer offsets
- different numel: this happens for the cat operation
- broadcast: e.g. kernel A writes a buffer which is broadcast and read by kernel B
- different loop orders: the major category we want inductor to be able to fuse
- different offset: happens when a concatenated linear layer is used to project Q/K/V and the result is then split. Each split will point to the same buffer with a different offset.
- unknown
My hope is to make sure that, for the models I tested, there are no fusion failures falling into the unknown category, so all the failures are well understood and categorized. Right now that's true for BertForMaskedLM ( https://gist.github.com/shunting314/6dc2c903629d342fa63ba731a171adc2 ), DistillGPT2 ( https://gist.github.com/shunting314/145176f2e850103c7fad4ad72f0e200e ) and llm.c ( https://gist.github.com/shunting314/cfc64a326312a889ba55f79bd47b2082 )
For BertForMaskedLM, we found 82 instances of fusion failures and the majority of them are due to different loop orders! Studying the log a bit more can help us figure out where all these loop order mismatches come from in real models.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124986
Approved by: https://github.com/eellison, https://github.com/jansel
Check that they are not used by running the following
```
% grep -h "AT_FORALL_SCALAR_TYPES_AND" . -R|grep -v #define|cut -d\( -f1|sort|uniq
AT_FORALL_SCALAR_TYPES_AND3
AT_FORALL_SCALAR_TYPES_AND3
AT_FORALL_SCALAR_TYPES_AND
AT_FORALL_SCALAR_TYPES_AND2
AT_FORALL_SCALAR_TYPES_AND3
AT_FORALL_SCALAR_TYPES_AND7
AT_FORALL_SCALAR_TYPES_AND2
AT_FORALL_SCALAR_TYPES_AND3
AT_FORALL_SCALAR_TYPES_AND7
AT_FORALL_SCALAR_TYPES_AND2
AT_FORALL_SCALAR_TYPES_AND3
AT_FORALL_SCALAR_TYPES_AND7
// AT_FORALL_SCALAR_TYPES / AT_FORALL_SCALAR_TYPES_AND macros below, which are
AT_FORALL_SCALAR_TYPES_AND
AT_FORALL_SCALAR_TYPES_AND2
AT_FORALL_SCALAR_TYPES_AND3
AT_FORALL_SCALAR_TYPES_AND7
using at::Half; // for AT_FORALL_SCALAR_TYPES_AND3
```
or by checking online using https://github.com/search?type=code&q=AT_FORALL_SCALAR_TYPES_AND4+repo%3Apytorch%2Fpytorch
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125607
Approved by: https://github.com/albanD
This PR makes libtorch behave the same as PyTorch when loading optimizer state from archive. With PyTorch, options of parameter groups are loaded from the archive, which is missing currently in libtorch.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125215
Approved by: https://github.com/janeyx99
Fixes#117850
This PR:
* Adds the class name in the repro command
* Fixes the path to the test file for python 3.8 jobs (apparently `inspect.getfile(class_type)` returns a relative path in this older python version)
Before (in python 3.8):
```sh
PYTORCH_TEST_WITH_DYNAMO=1 python test_autograd.py -k test_foo
```
After:
```sh
PYTORCH_TEST_WITH_DYNAMO=1 python test/test_autograd.py -k TestAutograd.test_foo
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125498
Approved by: https://github.com/huydhn, https://github.com/janeyx99
This diff implements a remote caching strategy (Memcache for internal and Redis for external) for caching the mapping from an Inductor FX graph to the Inductor-generated wrapper file.
It uses the same idea as the autotuning result cache that is currently live.
This will land turned off; before turning it on by default, I will do more testing, including looking at the dynamic shape guards added by inductor.
Differential Revision: [D56441624](https://our.internmc.facebook.com/intern/diff/D56441624/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124669
Approved by: https://github.com/jansel, https://github.com/eellison
Saw a warning like this:
```
/opt/conda/lib/python3.10/site-packages/torch/utils/hooks.py:86: UserWarning: backward hook functools.partial(<function _pre_backward_hook at 0x7f9a3940fac0>, FullyShardedDataParallel(
....
), <torch.distributed.fsdp.flat_param.FlatParamHandle object at 0x7f25202a9720>) on tensor will not be serialized. If this is expected, you can decorate the function with @torch.utils.hooks.unserializable_hook to suppress this warning
```
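The warning's own suggestion is the decorator below; a minimal sketch of it on an unrelated plain tensor hook (not the FSDP hook from the message):
```python
import torch
from torch.utils.hooks import unserializable_hook

# Marking a hook as intentionally non-serializable suppresses the
# "will not be serialized" warning when the tensor it is attached to is saved.
@unserializable_hook
def log_grad(grad):
    print(grad.norm())

t = torch.randn(3, requires_grad=True)
t.register_hook(log_grad)
```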
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125464
Approved by: https://github.com/ezyang
This factors out a repeated pattern of creating a library and fetching a function from source.
Typical use case:
```cpp
static MetalShaderLibrary lib(SHADER_SOURCE);
...
id<MTLComputePipelineState> cplState = lib.getPipelineStateForFunc("kernel_name");
```
- Make it possible to use with templated sources
- Add `scalarToMetalTypeString(const Tensor&)` variant to avoid repeated `scalarToMetalTypeString(t.scalar_type())` calls in the code
I.e. it makes no functional changes, but reduces MPS codebase size by 365 lines
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125550
Approved by: https://github.com/kulinseth
Summary:
This shim exports symbols on Windows, which can lead to symbol clashes at link time in the following scenario:
1. A DLL imports libtorch
2. A binary imports libtorch, and also depends on the DLL in (1)
Under that scenario, the symbols exported from `shim.h` can clash at link time.
Given that AOTInductor only works for PyTorch2, and PyTorch2 doesn't currently work for Windows, we can work around this problem by simply removing the symbol exports on Windows. In the long term, this will need to be figured out when Windows support is added & tested for PyTorch2.
Differential Revision: D56936696
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125472
Approved by: https://github.com/desertfire
This allows `associative_scan` to take an arbitrary pytree of tensors,
which is flattened to their leaves before calling the `associative_scan`
higher order operator.
I also add support in inductor to generate code for scanning over sequences
of tensors.
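A minimal sketch of the flattening step described above, using torch's pytree utilities directly:
```python
import torch
from torch.utils import _pytree as pytree

# An arbitrary pytree of tensors is reduced to its leaves (plus a spec)
# before the higher-order op runs, and can be rebuilt afterwards.
xs = {"a": torch.randn(4), "b": (torch.randn(4), torch.randn(4))}
leaves, spec = pytree.tree_flatten(xs)        # [tensor, tensor, tensor], TreeSpec
rebuilt = pytree.tree_unflatten(leaves, spec)
```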
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122137
Approved by: https://github.com/lezcano, https://github.com/Chillee
ghstack dependencies: #119430
Triton updated the interface for `triton.compile` 5162346487
The `target` argument to compile needs to be wrapped in a `GPUTarget` object. Without proper wrapping, we hit an assert in `compile`. If that assert is removed, Triton attempts to read device info from Torch while inside a torch thread, which hits an in bad fork assert. This change is required for compatibility with latest commits in Triton. The implementation is backwards compatible, so existing versions of Triton that work now continue to work.
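For reference, the wrapping amounts to something along these lines; the exact import path and argument order here are an assumption, not taken from this PR:
```python
# Hypothetical sketch of wrapping the compile target for newer Triton versions.
from triton.backends.compiler import GPUTarget

target = GPUTarget("cuda", 80, 32)  # backend, compute capability, warp size
# triton.compile(..., target=target)
```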
Re-submitting this after https://github.com/pytorch/pytorch/pull/125241 was reverted due to an unrelated CI issue.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125553
Approved by: https://github.com/huydhn
This diff makes sure that a custom exception is thrown when no valid
choices remain during autotuning. This allows to gracefully fall back
to a default choice, even if that default choice has not been passed to
autotune_select_algorithm.
Additionally, this diff handles RuntimeErrors during autotuning gracefully, e.g. the corresponding choice is ignored but it does not lead to the compilation failure of the entire model if a problematic choice is encountered during autotuning.
( An error is being logged, though).
Test Plan:
CI
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124928
Approved by: https://github.com/int3
ghstack dependencies: #125406
This PR completely removes the Inductor IR for legacy functional collectives:
- Removed the `CollectiveKernel` hiearchy and `Wait`, as well as the corresponding lowerings. These IRs are target (i.e. Python) specific and don't model node dependencies propoerly (e.g. they rely on `never_reuse_buffers` for correct behavior). They've been superceded by `ir._CollectiveKernel`.
- Removed `InPlaceHint` and the scheduler logic for handling it. `InPlaceHint` is a codegen-time buffer reuse mechanism controlled by the IR's codegen. It's a bit hacky and overlaps with the default buffer reuse mechanism. Removing it since it is only used by legacy functional collectives.
- Removed `OutputBuffer` and `MultiOutputNoSizeAssert` which are designed for and only used by legacy functional collectives.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124992
Approved by: https://github.com/Chillee, https://github.com/wanchaol
Differential Revision: D56347560
More details in this pytorch issue: https://github.com/pytorch/pytorch/issues/124468
It seems there is a race in the ProcessGroupNCCL shutdown logic. The code is quite simple:
```python
for i in range(100):
    dist.all_to_all_single(tensor_out, tensor_in)
dist.destroy_process_group()
```
What can happen is this:
1. dist.destroy_process_group() calls into shutdown() and then calls into abort: b2f6cfd9c0/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp (L1095)
2. It'll call ncclCommAbort (not graceful afaict), and also set the ncclAsyncErr_ = ncclSystemError; b2f6cfd9c0/torch/csrc/distributed/c10d/NCCLUtils.hpp (L388).
3. The ncclWatchdog thread may not have woken up while all of this shutdown happens, and in shutdown we're not waiting for the watchdog thread
4. ProcessGroupNCCL dtor is called. It'll wait for the watchdog thread to join
5. The watchdog will check the work's isCompleted(), which then calls checkAndSetException(). Because ncclAsyncErr_ was set to ncclSystemError, it'll error out and make you think it's a NCCL error.
So we can mitigate this issue by checking if the comm was aborted during work.isCompleted/isStarted
Some more longer term discussion in the issue.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124466
Approved by: https://github.com/shuqiangzhang, https://github.com/yoyoyocmu, https://github.com/kwen2501
Summary:
We remove the assertion for target_func being cat.
The reason is that we have multiple flavors of concat, such as
cat/cat.default/cat_slice/cat_slice_cat/...
The assertion here causes repeated false positives.
Test Plan: Removing assertion code only.
Differential Revision: D56971387
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125540
Approved by: https://github.com/hl475
While there are some similarities, they are also quite different (one
handles NumPy numbers while the other handles ints). I am also going to
add a wrap_symfloat soon, which will have even more different behavior.
So split these out for clarity.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125483
Approved by: https://github.com/lezcano
ghstack dependencies: #125395, #125419
Enable nonzero workspace and Cutlass StreamK for Inductor Cutlass GEMM ops.
This is a simpler rewrite of my original version of #119005 using @peterbell10 's workspace allocation mechanism from #117992
Test Plan:
- Additional unit test in test_cutlass_backend.py which specifically tests StreamK GEMM with workspace requirement
- CI
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125406
Approved by: https://github.com/jansel
**Context**
We are interested in supporting the case where HSDP reduce-scatters but does not all-reduce in a microbatch backward. This saves communication while still saving memory. Only on the last microbatch do we need to both reduce-scatter and all-reduce. This is not implemented yet and will hopefully come in a future PR.
There is one notable part of doing this. On the last microbatch, we need to perform an accumulation step after reduce-scatter and before all-reduce. If not, then the preceding microbatch's gradients will not be contributed across the replica group. (In other words, we cannot simply accumulate _after_ all-reduce.)
Consider 32 GPUs with 4-way replication and 8-way sharding and 2 microbatches, and focus on global rank 0.
- After the first microbatch, rank 0 will have its shard of $\frac{1}{8} \sum_{i \in S(0)} g_i^{(1)}$, where we define $S(0) = \{0, 1, \dots, 7\}$ to be the ranks in its shard group and we define the $(1)$ superscript to denote the first microbatch.
- Upon the second microbatch, rank 0 after its reduce-scatter will additionally have its shard of $\frac{1}{8} \sum_{i \in S(0)} g_i^{(2)}$. If we only all-reduce this, then this second microbatch's gradients become $\frac{1}{32} \sum_{i=0, 1, \dots, 31} g_i^{(2)}$, so in total, rank 0 has $\frac{1}{8} \sum_{i \in S(0)} g_i^{(1)} + \frac{1}{32} \sum_{i=0, 1, \dots, 31} g_i^{(2)}$, which is wrong.
- Importantly, we must accumulate $\frac{1}{8} \sum_{i \in S(0)} g_i^{(1)} + \frac{1}{8} \sum_{i \in S(0)} g_i^{(2)} = \frac{1}{8}\sum_{i \in S(0)} (g_i^{(1)} + g_i^{(2)})$ first before all-reducing to get $\frac{1}{32} \sum_{i=0, 1, \dots, 31} (g_i^{(1)} + g_i^{(2)})$.
Now, note how under this approach, we want a factor of $\frac{1}{8}$ only (i.e. reciprocal of the shard group size), not $\frac{1}{32}$, for the first microbatch's gradients.
- For bf16/fp32, since we use `ReduceOp.AVG` and we only reduce-scatter on the first microbatch, we correctly have a factor of $\frac{1}{8}$ on the first microbatch.
- For fp16, since we precompute the gradient divide factors at init time assuming always reducing over both shard and replica groups, we incorrectly have a factor of $\frac{1}{32}$ on the first microbatch, deviating from the bf16/fp32 case.
We can address this issue by matching the bf16/fp32 vs. fp16 semantics by computing the divide factors at runtime based on which process groups were passed into the reduction function (`foreach_reduce`).
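A toy numeric check of the ordering argument, tracking only the divide factors (shard group of 8, replica group of 4; the cross-rank sums are left out for brevity):
```python
shard, replica = 8, 4
g1, g2 = 1.0, 1.0  # stand-ins for the two microbatches' gradients

only_all_reduce_second = g1 / shard + (g2 / shard) / replica      # mixes 1/8 and 1/32
accumulate_then_all_reduce = (g1 / shard + g2 / shard) / replica  # uniformly 1/32
print(only_all_reduce_second, accumulate_then_all_reduce)         # 0.15625 0.0625
```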
**Additional Notes**
How to implement the HSDP reduce-scatter but no all-reduce is not entirely clear yet. (What is the cleanest way to do this?) We need to store the partial reduce-scatter output and check for it upon the next backward. We should also be sure to error if the set of parameters receiving gradients changes, in which case we cannot support this easily. Anyway, we will implement this in a follow-up.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125484
Approved by: https://github.com/wanchaol
ghstack dependencies: #125431, #125479
This does a few things that were originally a few PRs but I am on a new machine and don't have ghstack.
If it is too problematic to review, I can re-split, just let me know.
This does:
- Cleanup context manager use in test_flop_counter
- Remove need for mod argument in FlopCounterMode, warning about it
- Re-implement a Module tracker from scratch using global forward Module hooks and multi_grad_hook (we cannot use global backward Module hooks because they don't look for nested Tensors and they're custom-Function based instead of multi_grad_hook based).
- Update FlopCounterMode to use the new ModuleTracker (see the sketch below). The entire existing test suite passes as-is (the only changes there are new tests and the refactoring mentioned above)
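A minimal sketch of querying the new module tracker, assuming it is exposed as `torch.utils.module_tracker.ModuleTracker` (the exact location may differ):
```python
import torch
from torch.utils.module_tracker import ModuleTracker

tracker = ModuleTracker()

class Inner(torch.nn.Module):
    def forward(self, x):
        # `parents` reports which modules are currently executing
        print(tracker.parents)
        return x.relu()

model = torch.nn.Sequential(Inner(), torch.nn.Linear(4, 4))
with tracker:
    model(torch.randn(2, 4))
```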
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125352
Approved by: https://github.com/mikaylagawarecki
You can trigger ciflow tags on main branch commits, so we should be more conservative when checking whether a workflow is for a PR or on the main branch.
get_pr_number checks for the PR number based on the PR_NUMBER env var or a tag of the form `ciflow/<workflow>/<pr number>`.
If we fail to find something like this, then assume it is on the main branch.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125485
Approved by: https://github.com/huydhn
A re-land of #124239.
This PR fakifies ScriptObject inputs and attributes in export non-strict mode by default.
The basic idea is to only fakify the script object during tracing (i.e. aot_export). After we get the traced graph module, eagerly executing, serializing, or running more passes will use the real script objects. This is essentially treating the script object as a constant tensor.
Concretely, we:
1. fakify all the script object inputs and module attributes (gathered by constant_attrs),
2. patch the module's attributes with the fakified script objects,
3. right after aot_export, remove the patching (to avoid changing the original module), then modify the exported graph module's attributes to the real script objects.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125490
Approved by: https://github.com/angelayi
Summary:
We found that some dumps are missing when the monitoring thread times out.
This is likely because multiple PGs could still dump the same records
at the same time, so we should allow only PG0 to actually dump.
Test Plan:
unit test
python test/run_test.py --cpp --verbose -i cpp/ProcessGroupNCCLErrorsTest
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125356
Approved by: https://github.com/c-p-i-o
Triton updated the interface for `triton.compile` 5162346487
The `target` argument to compile needs to be wrapped in a `GPUTarget` object. Without proper wrapping, we hit an assert in `compile`. If that assert is removed, Triton attempts to read device info from Torch while inside a torch thread, which hits an in bad fork assert. This change is required for compatibility with latest commits in Triton. The implementation is backwards compatible, so existing versions of Triton that work now continue to work.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125241
Approved by: https://github.com/jansel
This provides utilities for creating and querying properties on
sympy.Symbol. I want to use this refactor to get a better handle on how
the 's' prefix is being used in Inductor. To start, I only do
symbolic_shapes code because that's what I'm familiar with.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125395
Approved by: https://github.com/Skylion007
This PR shows a simple utility to broadcast the parameters across replicas for HSDP:
```python
replicate_group = mesh.get_group("replicate")
for param in model.parameters():
    # E.g. for mesh [[0, 1, 2, 3], [4, 5, 6, 7]] sharding on dim-1 and
    # replicating on dim-0, broadcast with sources 0, 1, 2, 3
    src_rank = dist.get_process_group_ranks(replicate_group)[0]
    torch.distributed.broadcast(
        param.to_local(), src=src_rank, group=replicate_group
    )
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125431
Approved by: https://github.com/weifengpy, https://github.com/wanchaol
Greetings!
Fixes #125403
Please assist me with the testing, as it is possible for my reproducer to miss the error in the code. Several (at least two) threads should enter the same part of the code at the same time to check that the file lock is actually working.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125404
Approved by: https://github.com/ezyang
Fixes cutlass_utils.get_max_alignment(), which so far was not checking the alignment properly. Basically,
the method assumed that the passed layout is contiguous and row-major, which does not have to be true.
Test Plan:
CI - test_cutlass_backend.py to prevent regressions
Added unit test
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124930
Approved by: https://github.com/int3
ghstack dependencies: #124929
- Implement a very straightforward Metal copy of CPU int4mm kernel
- Implement int8mm kernel by constructing a graph consisting of upcast, transpose and mm
- Add `isCapturing`, `isCaptureEnabled`, `startCapture` and `stopCapture` methods to `MPSProfile` which can be used to help one debug/profile Metal kernels by wrapping the calls with the following
```cpp
if (getMPSProfiler().isCaptureEnabled()) {
getMPSProfiler().startCapture(__func__, mpsStream);
}
...
if (getMPSProfiler().isCapturing()) {
getMPSProfiler().stopCapture(mpsStream);
}
```
that, if invoked with the `MTL_CAPTURE_ENABLED` environment variable set to one, will produce .gputrace files in the current working directory, which can later be loaded and used to debug or profile the kernel
<img width="1093" alt="image" src="https://github.com/pytorch/pytorch/assets/2453524/a2bf27e8-df8a-442c-a525-1df67b8a376a">
- Added `test_int4mm` to TestLinalgMPS, which is mostly copy-n-paste of the test from `test_linalg`
TODOs:
- Add weight pack
- Perf-tune both kernels
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125163
Approved by: https://github.com/mikekgfb
Some toy example:
<img width="998" alt="Screenshot 2024-04-17 at 2 00 05 PM" src="https://github.com/pytorch/pytorch/assets/31054793/b5665a63-beb0-4ca1-92c6-c57a052812fd">
We define `FullyShardedDataParallel._unshard(async_op: bool = False)` that can be used to prefetch all-gathers. The user should make sure:
1. Run lazy init before the first `_unshard` call of training. For example, this can hackily be done via `root_module.check_is_root()` on the root FSDP module `root_module`.
2. Call `root_module._wait_unshard_streams_on_current_stream()` before the first `_unshard` call of the current iteration (just need to call it once after last optimizer step and before first `_unshard` of this iteration).
Differential Revision: [D56262876](https://our.internmc.facebook.com/intern/diff/D56262876)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124304
Approved by: https://github.com/wanchaol
- Original `test_grad_scaling_autocast_fused_optimizers` does not work since there is no "fused" in `optim_inputs`
- We should use different `grad_scaler`s; they should not share one `scale`. No issue is exposed here because the default `_growth_interval` is 2000, so the scale will not grow, and no inf is found, so it will not be reduced. The one in `test_cuda.py` should also have this issue.
- I set a manual seed for reproducibility in case there is any numerical failure.
- I use a tensor tracker here because we failed this UT in the dynamo case; the generated C++ code is not exactly the same for the fused and non-fused kernels.
- I make it check both `cuda` and `cpu`.
- I found an SGD numerical issue with `clang` and fixed it by using `fmadd` instead of `add/mul` in the fused SGD vec kernel.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124904
Approved by: https://github.com/jgong5, https://github.com/janeyx99
Add `PyTorchFileWriter.write_record_metadata(record_name, num_bytes)` that
- writes the zipfile header/end of central directory metadata for an entry*
- reserves `num_bytes` in the zipfile for the payload.
*Since the payload is not provided, the CRC32 computation is skipped and 0s are written in the corresponding entry of the zipfile header
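A minimal sketch of the new method, assuming the writer is reached via `torch._C.PyTorchFileWriter` (the file and record names here are made up):
```python
import torch

writer = torch._C.PyTorchFileWriter("reserved.zip")
# Reserve 1 KiB for a payload to be filled in later; since no payload is
# provided, the CRC32 field for this entry is left as 0.
writer.write_record_metadata("data/0", 1024)
writer.write_end_of_file()
```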
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125184
Approved by: https://github.com/albanD
Summary:
Fixing the implementation of `_flatten_dynamic_shapes()` to follow how `_process_dynamic_shapes()` does it. The previous implementation would misinterpret some nested dynamic shapes specs, causing it to miss some shape specs, for example with nested inputs/constant input tuples:
```python
inputs = (
    (2, 1),
    (
        torch.randn(2, 1),
        torch.randn(2, 2),
        torch.randn(2, 3),
    )
)
dynamic_shapes = (
    (None, None),
    (
        None,
        None,
        None,
    )
)
```
This would get interpreted as 2 shape specs for the 2d and 3d tensors. Fixing so this doesn't happen.
Test Plan: Existing export tests
Differential Revision: D56894923
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125415
Approved by: https://github.com/angelayi
Summary: Remove the check to make sure all GPU labels are enumerated when CUDA is available. There are some systems where CUDA is available but we do not print any GPU labels (because GPU is not available).
Test Plan: Test in regression with ciflow/periodic label
Differential Revision: D56906893
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125430
Approved by: https://github.com/izaitsevfb
Summary:
The chunk/split ops on the weights/constants are folded in an FX pass, and each output tensor has the same storage size as the original tensor (which is 3x its actual size in the case of chunk(3)). However, the backend calculates the memory size on device from the tensor shape/stride/dtype. This causes a mismatch when copying weights/constants to the device, as the allocated memory on device is always smaller than the size of the weights/constants, and results in a runtime error when loading the weight/constant (T172125529).
This diff fixes the issue by cloning the tensors after const folding so that the tensors has correct storage size.
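The storage-size mismatch is easy to see in isolation; cloning is what gives each chunk a right-sized storage:
```python
import torch

w = torch.randn(48, 64)                      # float32, 48 * 64 * 4 = 12288 bytes
a, b, c = w.chunk(3)                         # views sharing the full storage
print(a.untyped_storage().nbytes())          # 12288: 3x the chunk's actual data
print(a.clone().untyped_storage().nbytes())  # 4096: just the chunk itself
```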
Test Plan:
Before this change: (18432 = 48 * 64 * 2 * 3)
```
RuntimeError: Failed to load constant getitem_idx0 split (remaining=18432) at fbcode/caffe2/torch/fb/acc_runtime/afg/afg_bindings.cpp:3422: Request failed because an invalid parameter
```
```
buck2 run mode/opt //caffe2/torch/fb/acc_runtime/afg/tests:test_operators-artemis -- -r test_mem_size_mismatch
```
```
Ran 1 test in 7.048s
OK
```
Reviewed By: jfix71
Differential Revision: D56663931
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125199
Approved by: https://github.com/jfix71
`cumsum` and `cumprod` were (are?) buggy for MPS: c8d2a55273/aten/src/ATen/native/mps/operations/UnaryOps.mm (L435-L436)
A workaround casts the input to int32 prior to performing the op to prevent overflow for certain numeric types.
It turns out this issue also affects boolean types:
```python
import torch
print(torch.ones(128, dtype=torch.bool, device="mps").cumsum(0)[-1])
# tensor(-128, device='mps:0')
```
In this PR I'm adding logic to also cast bool dtypes to int32 prior to `cumsum` and `cumprod`, although output is guaranteed not to overflow for the latter with bools. I'm also adding a test to prevent regressions.
Fixes #96614, #106112, #109166
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125318
Approved by: https://github.com/malfet
`torch.utils.benchmark.Compare` is not directly exposed in the torch.utils.benchmark documentation.
I think this is a valuable resource to add, since it can help people embrace the torch benchmark way of doing things and help people build documentation around it.
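For context, a small self-contained example of the utility being documented:
```python
import torch
from torch.utils import benchmark

results = []
for n in (128, 1024):
    timer = benchmark.Timer(
        stmt="x @ x",
        globals={"x": torch.randn(n, n)},
        label="matmul",
        sub_label=f"{n}x{n}",
        description="float32",
    )
    results.append(timer.blocked_autorange(min_run_time=0.2))

benchmark.Compare(results).print()  # renders a grouped comparison table
```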
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125009
Approved by: https://github.com/mikaylagawarecki
~Users may have custom use cases for the `strict` parameter in load. In my mind, if we automatically call `state_dict` and `load_state_dict` in save/load, we need to support the same functionality in `nn.Modules`.~
It turns out this is actually not related to nn.Module's strict param. Since `state_dict` is called inside `dcp.load`, it's actually impossible to create a model such that the following would raise an error:
```
state_dict = module.state_dict()
module.load_state_dict(state_dict, strict=True)
```
The issue is actually just when there are elements in `state_dict` which do not exist in the checkpoint. This PR adds the ability to configure this behavior through the DefaultLoadPlanner (see tests).
Concretely, if module has extra attributes not present in the checkpoint, we will only raise an error if `DefaultLoadPlanner.allow_partial_load==False`
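A rough sketch of the knob described above, assuming `allow_partial_load` is exposed as a constructor argument on DefaultLoadPlanner (the checkpoint path is made up):
```python
import torch
import torch.distributed.checkpoint as dcp
from torch.distributed.checkpoint.default_planner import DefaultLoadPlanner

model = torch.nn.Linear(4, 4)
dcp.save(model.state_dict(), checkpoint_id="/tmp/ckpt")

# With allow_partial_load=True, state_dict keys that are missing from the
# checkpoint are tolerated instead of raising an error.
state_dict = model.state_dict()
dcp.load(
    state_dict,
    checkpoint_id="/tmp/ckpt",
    planner=DefaultLoadPlanner(allow_partial_load=True),
)
```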
Pull Request resolved: https://github.com/pytorch/pytorch/pull/123869
Approved by: https://github.com/fegin
Summary:
## `-Wmissing-prototypes`
In ATen-Vulkan, we often define functions in `.cpp` files without declaring them in `.h` files or hiding them in an anonymous namespace.
Example: [`Packing.cpp`'s channel_image_repacking()](f1f142c44f/aten/src/ATen/native/vulkan/impl/Packing.cpp (L299-L348))
On Mac, this results in a `-Wmissing-prototypes` warning, which is disabled in this change.
## `-Wshadow`
In `Adapter.cpp`, we shadow a variable called `properties`, which we fix in this change as opposed to disabling the warning.
Test Plan: CI
Differential Revision: D56850324
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125361
Approved by: https://github.com/SS-JIA
From my test with an Ads production workload, I found that sometimes kernel_file is None and grid is a tuple. It will crash since ExecutionTraceObserver expects strings for both kernel_file and grid. This PR makes sure kernel_file and grid are always passed down as strings. We still need to find the root cause of why kernel_file is None.
Unit test:
buck test @mode/dev-nosan caffe2/test:profiler -- test_execution_trace_with_pt2
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125384
Approved by: https://github.com/davidberard98, https://github.com/sraikund16
Summary:
Torchscript modules do not support forward hooks and thus can't work with the flop_counter context manager's hierarchical output (enabled by passing a module to FlopCounterMode on construction).
Currently any module that includes a script module causes an exception to be thrown, so this adds a try/except to skip script modules when registering forward hooks.
Test Plan: CI Signals
Differential Revision: D56850661
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125346
Approved by: https://github.com/842974287
Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):
* #124944
* #124939
* __->__ #122965
Differential Revision: [D55493240](https://our.internmc.facebook.com/intern/diff/D55493240/)
*This PR is now ready for merge and is not an RFC*
Major choices are:
- the introduction of the AsyncStager protocol
- removing `executor` from the params
- leaving async as a separate method (for now)
This proposal seeks to add extension points to dcp.async_save, allowing users to:
- Specify a specific staging method when calling async_save
- Allow a vehicle for also making the staging method async, to allow for cases where we may want to overlap with the training loop (e.g., overlap the d2h copy and only synchronize at the optim.step)
- Potentially specify the execution method for doing async_save in parallel. For example some users may prefer a subprocess over a thread to avoid GIL issues.
A totally reasonable alternative to this entire proposal is to expect users who want this level of customization
to write their own custom async save methods. Here's an example which addresses the issues mentioned
in PR comments.
```python
def custom_async_save(...):
    # this step accomplishes staging and includes the usual 'planning' calls (issue 1)
    buffered_writer = CpuBufferedWriter()  # this is stateful, contains a copy of state_dict
    dcp.save(state_dict, storage_writer=buffered_writer)
    final_storage_writer = FileSystemWriter()
    mp.spawn(  # issue2 is gone, do whatever you want here
        dcp.save,  # or some custom sub-process method which calls dcp.save under the hood
        buffered_writer.state_dict,  # lot's of way's to do this, not really the most important part
        checkpoint_id=checkpoint_id,
        storage_writer=storage_writer,
        planner=planner,
        process_group=process_group,  # this actually wouldn't work, but again not the pt.
    )
    # leaving out the rest of the details for managing your extra special subprocess.
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122965
Approved by: https://github.com/daulet-askarov
A common complaint when working with data-dependent code in PyTorch is that it's hard to tell how far you are from the finish line: every time a GuardOnDataDependentSymNode error is hit, you have to somehow fix or workaround it to see the next one.
This PR adds a new mode `torch._functorch.config.fake_tensor_propagate_real_tensors` which modifies fake tensors to also propagate real tensors. This means that when we try to guard on a data-dependent SymNode, we can actually produce a real result. We also produce a warning which you should consult to figure out what the crux points are.
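Turning the mode on is just a config flip:
```python
import torch._functorch.config as functorch_config

# Fake tensors will also carry real values, so data-dependent guards are
# evaluated (with a warning logged) instead of raising GuardOnDataDependentSymNode.
functorch_config.fake_tensor_propagate_real_tensors = True
```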
I ran this on vision_maskrcnn. In the baseline (without this mode), the model has 27 graph breaks, resulting in 40 graphs. With this mode on, the model has only 11 graph breaks, resulting in 15 graphs (the remaining graph breaks are due to missing functionality for item() on float tensor and some other Dynamo missing features.) You get a list of things that would have errored like this:
```
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Max(1, u1) < 2) -> True
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Eq(Max(1, u1), 1)) -> True
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Eq(Max(1, u1), 1)) -> True
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Ne(Max(1, u1), 1)) -> False
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Max(1, u0) < 2) -> True
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Eq(Max(1, u0), 1)) -> True
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Eq(Max(1, u0), 1)) -> True
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Ne(Max(1, u0), 1)) -> False
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Max(1, u1) < 2) -> True
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Eq(Max(1, u1), 1)) -> True
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Eq(Max(1, u1), 1)) -> True
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Ne(Max(1, u1), 1)) -> False
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Max(1, u0) < 2) -> True
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Eq(Max(1, u0), 1)) -> True
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Eq(Max(1, u0), 1)) -> True
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Ne(Max(1, u0), 1)) -> False
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Max(1, u1) < 2) -> False
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Eq(Max(1, u1), 1)) -> False
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Ne(Max(1, u1), 1)) -> True
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Max(1, u0) < 2) -> False
WARNING:torch.fx.experimental.symbolic_shapes:propagate_real_tensors evaluate_expr(Eq(Max(1, u0), 1)) -> False
```
Potential later follow ups:
* Improve the warning messages (in particular, should provide user frames)
* GC real tensors when they are no longer needed by tracing. Right now, this will use A LOT of memory, as much as if your GC were broken and every intermediate tensor were kept live
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125115
Approved by: https://github.com/IvanKobzarev
Minor refactoring:
Remove unused "fused epilogue node" arguments from some method Kernel call signatures.
Test Plan:
Covered by current tests in test_cutlass_backend.py - no functional change.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124929
Approved by: https://github.com/eellison
Fixes #121965
This PR hopes to add support for complex numbers in the scatter/gather related kernels. For brevity, I will only include `complex<float>` for now, as `complex<double>`, for example, will be more complicated.
C++ unit tests are currently passing alongside tests in `test_scatter_gather_ops.py`. Python test suites also seem to be passing.
Please keep the following in mind:
1) I think this is my first time using PyTorch.
2) This is my first contribution to PyTorch.
Environment:
3080 & WSL 2. `nvcc` is at 12.4.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124809
Approved by: https://github.com/mikaylagawarecki
In #123319, we guard some behavior behind the `assume_aligned_inputs` config option. If we set this to `False`, then the behavior added in #123319 becomes the default behavior. See the referenced PR for more details about the behavior affected.
Side effects:
* It's possible that this will hurt performance in some scenarios. For example, if an unaligned input is used in a matmul, it might be better to perform the clone to align it first.
* This will occasionally cause recompiles. Specifically: the check we perform (`(storage_offset * get_dtype_size(dtype)) % ALIGNMENT == 0`) can be guarded on if the storage_offset becomes dynamic. storage_offset becomes dynamic during automatic_dynamic_shapes after a shape or stride changes. Previously, this was increasing graph breaks in cpu inductor torchbench tests (but is fixed by more carefully guarding checks on alignment, so that we don't run them and generate guards unless actually needed).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124336
Approved by: https://github.com/eellison
* Fixes https://github.com/pytorch/pytorch/issues/124886
* Kind of similar to https://github.com/pytorch/pytorch/pull/109393
I think what happens is `exit` and `exit /b` propagate the errorlevel correctly, but `exit /b` only exits the currently running batch script and not the entire cmd.exe (or whatever program is running the batch script), so `exit /b` exits with errorlevel 1, but the parent cmd exits with 0, and bash sees cmd's 0
I think `goto fail` and `exit` are the same thing when the batch script is run from a bash script so either would work in this case? But the `goto fail` method might be better if someone happens to run the script on cmdline
I assumed that anywhere anyone was exiting after checking the error code, they did want to exit completely, and I'm pretty sure that being inside a parenthesis counts as being a different script, so I changed everything to goto fail just in case, this might be too aggressive?
Logs after this change for a build failure on cuda:
https://github.com/pytorch/pytorch/actions/runs/8912185834/job/24475087535?pr=125306
```
2 errors detected in the compilation of "C:/actions-runner/_work/pytorch/pytorch/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu".
AdaptiveMaxPooling3d.cu
[7599/8420] Linking CXX shared library bin\torch_cpu.dll
ninja: build stopped: subcommand failed.
-- Building version 2.4.0a0+git3171c11
cmake -GNinja -DBUILD_ENVIRONMENT=win-vs2019-cuda11.8-py3 -DBUILD_PYTHON=True -DBUILD_TEST=True -DBUILD_TYPE=release -DBUILD_WHEEL=1 -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_COMPILER=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8/bin/nvcc.exe -DCMAKE_CUDA_COMPILER_LAUNCHER=C:/actions-runner/_work/pytorch/pytorch/build/win_tmp/bin/randomtemp.exe;C:/actions-runner/_work/pytorch/pytorch/build/win_tmp\bin\sccache.exe -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_GENERATOR=Ninja -DCMAKE_INSTALL_PREFIX=C:\actions-runner\_work\pytorch\pytorch\torch -DCMAKE_PREFIX_PATH=C:\Jenkins\Miniconda3\Lib\site-packages -DCUDA_NVCC_EXECUTABLE=C:/actions-runner/_work/pytorch/pytorch/build/win_tmp/bin/nvcc.bat -DCUDNN_LIBRARY=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\lib\x64 -DNUMPY_INCLUDE_DIR=C:\Jenkins\Miniconda3\lib\site-packages\numpy\core\include -DPYTHON_EXECUTABLE=C:\Jenkins\Miniconda3\python.exe -DPYTHON_INCLUDE_DIR=C:\Jenkins\Miniconda3\Include -DPYTHON_LIBRARY=C:\Jenkins\Miniconda3/libs/python39.lib -DTORCH_BUILD_VERSION=2.4.0a0+git3171c11 -DTORCH_CUDA_ARCH_LIST=8.6 -DUSE_CUDA=1 -DUSE_NUMPY=True C:\actions-runner\_work\pytorch\pytorch
cmake --build . --target install --config Release -- -j 8
(base) C:\actions-runner\_work\pytorch\pytorch>if errorlevel 1 goto fail
(base) C:\actions-runner\_work\pytorch\pytorch>exit /b 1
Error: Process completed with exit code 1.
```
vs original
https://github.com/pytorch/pytorch/actions/runs/8910674030/job/24470387612
```
2 errors detected in the compilation of "C:/actions-runner/_work/pytorch/pytorch/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu".
AdaptiveMaxPooling3d.cu
[7604/8420] Linking CXX shared library bin\torch_cpu.dll
ninja: build stopped: subcommand failed.
-- Building version 2.4.0a0+gite09f98c
cmake -GNinja -DBUILD_ENVIRONMENT=win-vs2019-cuda11.8-py3 -DBUILD_PYTHON=True -DBUILD_TEST=True -DBUILD_TYPE=release -DBUILD_WHEEL=1 -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_COMPILER=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8/bin/nvcc.exe -DCMAKE_CUDA_COMPILER_LAUNCHER=C:/actions-runner/_work/pytorch/pytorch/build/win_tmp/bin/randomtemp.exe;C:/actions-runner/_work/pytorch/pytorch/build/win_tmp\bin\sccache.exe -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_GENERATOR=Ninja -DCMAKE_INSTALL_PREFIX=C:\actions-runner\_work\pytorch\pytorch\torch -DCMAKE_PREFIX_PATH=C:\Jenkins\Miniconda3\Lib\site-packages -DCUDA_NVCC_EXECUTABLE=C:/actions-runner/_work/pytorch/pytorch/build/win_tmp/bin/nvcc.bat -DCUDNN_LIBRARY=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\lib\x64 -DNUMPY_INCLUDE_DIR=C:\Jenkins\Miniconda3\lib\site-packages\numpy\core\include -DPYTHON_EXECUTABLE=C:\Jenkins\Miniconda3\python.exe -DPYTHON_INCLUDE_DIR=C:\Jenkins\Miniconda3\Include -DPYTHON_LIBRARY=C:\Jenkins\Miniconda3/libs/python39.lib -DTORCH_BUILD_VERSION=2.4.0a0+gite09f98c -DTORCH_CUDA_ARCH_LIST=8.6 -DUSE_CUDA=1 -DUSE_NUMPY=True C:\actions-runner\_work\pytorch\pytorch
cmake --build . --target install --config Release -- -j 8
(base) C:\actions-runner\_work\pytorch\pytorch>if errorlevel 1 exit /b
+ assert_git_not_dirty
+ [[ win-vs2019-cuda11.8-py3 != *rocm* ]]
+ [[ win-vs2019-cuda11.8-py3 != *xla* ]]
++ git status --porcelain
++ grep -v '?? third_party'
++ true
+ git_status=
+ [[ -n '' ]]
+ echo 'BUILD PASSED'
BUILD PASSED
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125306
Approved by: https://github.com/ZainRizvi, https://github.com/huydhn, https://github.com/atalman
To fix data-dependent errors we want to recommend that people use `torch._check*` APIs. The `constrain_as*` APIs should be fully subsumed by them, and in the future we should kill them entirely.
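A small illustration of the recommended direction (the concrete bound here is made up):
```python
import torch

def f(x):
    u = x.item()
    # These replace the constrain_as_size / constrain_as_value style calls.
    torch._check_is_size(u)
    torch._check(u <= 128)
    return torch.zeros(u)

f(torch.tensor(5))
```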
Differential Revision: D56774333
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125253
Approved by: https://github.com/ezyang
In the given test case, we have a ModuleList of 3 modules (`norm.0`, `norm.1`, `norm.2`) which share the same `weight` and `bias` tensors. However, when we trace, they all end up pointing to one state dict name (e.g. `norm.2`).
```
graph():
%p_norms_0_weight : [num_users=0] = placeholder[target=p_norms_0_weight]
%p_norms_0_bias : [num_users=0] = placeholder[target=p_norms_0_bias]
%p_norms_1_weight : [num_users=0] = placeholder[target=p_norms_1_weight]
%p_norms_1_bias : [num_users=0] = placeholder[target=p_norms_1_bias]
%p_norms_2_weight : [num_users=3] = placeholder[target=p_norms_2_weight]
%p_norms_2_bias : [num_users=3] = placeholder[target=p_norms_2_bias]
%input_ : [num_users=1] = placeholder[target=input_]
%native_layer_norm : [num_users=1] = call_function[target=torch.ops.aten.native_layer_norm.default](args = (%input_, [2, 2, 3], %p_norms_2_weight, %p_norms_2_bias, 1e-05), kwargs = {})
%getitem : [num_users=1] = call_function[target=operator.getitem](args = (%native_layer_norm, 0), kwargs = {})
%native_layer_norm_1 : [num_users=1] = call_function[target=torch.ops.aten.native_layer_norm.default](args = (%getitem, [2, 2, 3], %p_norms_2_weight, %p_norms_2_bias, 1e-05), kwargs = {})
%getitem_3 : [num_users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_1, 0), kwargs = {})
%native_layer_norm_2 : [num_users=1] = call_function[target=torch.ops.aten.native_layer_norm.default](args = (%getitem_3, [2, 2, 3], %p_norms_2_weight, %p_norms_2_bias, 1e-05), kwargs = {})
%getitem_6 : [num_users=1] = call_function[target=operator.getitem](args = (%native_layer_norm_2, 0), kwargs = {})
return (getitem_6,)
```
This causes an error in the unflattener where after constructing the submodules for `norm.0`, it will have the graph pointing to `norm.2.weight` and `norm.2.bias`:
```
graph():
%p_norms_2_bias : [num_users=1] = placeholder[target=p_norms_2_bias]
%p_norms_2_weight : [num_users=1] = placeholder[target=p_norms_2_weight]
%input_ : [num_users=1] = placeholder[target=input_]
%native_layer_norm : [num_users=1] = call_function[target=torch.ops.aten.native_layer_norm.default](args = (%input_, [2, 2, 3], %p_norms_2_weight, %p_norms_2_bias, 1e-05), kwargs = {})
%getitem : [num_users=1] = call_function[target=operator.getitem](args = (%native_layer_norm, 0), kwargs = {})
return getitem
```
Since the attributes are not within the same scope of the graph (`norm.0` vs. `norm.2`), they will not be added to the subgraph, causing an error.
So this PR handles the duplicate state dict attributes by modifying the `inputs_to_state` dict to map from node names to a list of possible state dict target names.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125192
Approved by: https://github.com/zhxchen17
The first append does not have a leading space, which incorrectly merges it into any previous argument, like `-allow-unsupported-compiler` in my case, resulting in a silly error: `unrecognized command-line option '-allow-unsupported-compiler-DLIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS'`
full log:
```
python setup.py develop
Building wheel torch-2.4.0a0+git75fa54a
-- Building version 2.4.0a0+git75fa54a
cmake3 -GNinja -DBUILD_PYTHON=True -DBUILD_TEST=True -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/code/pytorch/torch -DCMAKE_PREFIX_PATH=/code/pytorch/.venv/lib/python3.12/site-packages;/code/spack/opt/spack/linux-fedora40-zen2/gcc-14.0.1/gcc-13.2.0-noa2f4oqalxzqvsebhuntndewgt4gq4h:/code/spack/opt/spack/linux-fedora40-zen2/gcc-14.0.1/zstd-1.5.6-z3guwm4l5rmmsv4g4wvkej3ri3bppeja:/code/spack/opt/spack/linux-fedora40-zen2/gcc-14.0.1/zlib-ng-2.1.6-kwi4ljobodjgv5eetnga4bow6crdlacl:/code/spack/opt/spack/linux-fedora40-zen2/gcc-14.0.1/mpc-1.3.1-nuwa2snyzm265lsupa2dkmxxyhiqcv7e:/code/spack/opt/spack/linux-fedora40-zen2/gcc-14.0.1/mpfr-4.2.1-wepuwobwttxbtz3nguimxa2mlljjozsi:/code/spack/opt/spack/linux-fedora40-zen2/gcc-14.0.1/gmp-6.2.1-ashy6kiitonxv2f365f4q3beggzf3646:/code/spack/opt/spack/linux-fedora40-zen2/gcc-14.0.1/gcc-runtime-14.0.1-wmogkqrzn7t57dogaake2hmhjbod27gs -DNUMPY_INCLUDE_DIR=/code/pytorch/.venv/lib64/python3.12/site-packages/numpy/core/include -DPYTHON_EXECUTABLE=/code/pytorch/.venv/bin/python -DPYTHON_INCLUDE_DIR=/usr/include/python3.12 -DPYTHON_LIBRARY=/usr/lib64/libpython3.12.so.1.0 -DTORCH_BUILD_VERSION=2.4.0a0+git75fa54a -DUSE_NUMPY=True /code/pytorch
-- /usr/lib64/ccache/c++ /code/pytorch/torch/abi-check.cpp -o /code/pytorch/build/abi-check
-- Determined _GLIBCXX_USE_CXX11_ABI=1
-- Current compiler supports avx2 extension. Will build perfkernels.
-- Current compiler supports avx512f extension. Will build fbgemm.
-- The CUDA compiler identification is NVIDIA 12.4.131
-- Detecting CUDA compiler ABI info
-- Detecting CUDA compiler ABI info - failed
-- Check for working CUDA compiler: /usr/local/cuda-12/bin/nvcc
-- Check for working CUDA compiler: /usr/local/cuda-12/bin/nvcc - broken
CMake Error at /usr/share/cmake/Modules/CMakeTestCUDACompiler.cmake:59 (message):
The CUDA compiler
"/usr/local/cuda-12/bin/nvcc"
is not able to compile a simple test program.
It fails with the following output:
Change Dir: '/code/pytorch/build/CMakeFiles/CMakeScratch/TryCompile-mSGoFl'
Run Build Command(s): /code/pytorch/.venv/bin/ninja -v cmTC_ee207
[1/2] /usr/local/cuda-12/bin/nvcc -forward-unknown-to-host-compiler -allow-unsupported-compiler-DLIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS -D_GLIBCXX_USE_CXX11_ABI=1 -Xfatbin -compress-all "--generate-code=arch=compute_52,code=[compute_52,sm_52]" -MD -MT CMakeFiles/cmTC_ee207.dir/main.cu.o -MF CMakeFiles/cmTC_ee207.dir/main.cu.o.d -x cu -c /code/pytorch/build/CMakeFiles/CMakeScratch/TryCompile-mSGoFl/main.cu -o CMakeFiles/cmTC_ee207.dir/main.cu.o
FAILED: CMakeFiles/cmTC_ee207.dir/main.cu.o
/usr/local/cuda-12/bin/nvcc -forward-unknown-to-host-compiler -allow-unsupported-compiler-DLIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS -D_GLIBCXX_USE_CXX11_ABI=1 -Xfatbin -compress-all "--generate-code=arch=compute_52,code=[compute_52,sm_52]" -MD -MT CMakeFiles/cmTC_ee207.dir/main.cu.o -MF CMakeFiles/cmTC_ee207.dir/main.cu.o.d -x cu -c /code/pytorch/build/CMakeFiles/CMakeScratch/TryCompile-mSGoFl/main.cu -o CMakeFiles/cmTC_ee207.dir/main.cu.o
gcc: error: unrecognized command-line option '-allow-unsupported-compiler-DLIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS'
ninja: build stopped: subcommand failed.
CMake will not be able to correctly generate this project.
Call Stack (most recent call first):
cmake/public/cuda.cmake:47 (enable_language)
cmake/Dependencies.cmake:44 (include)
CMakeLists.txt:758 (include)
-- Configuring incomplete, errors occurred!
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125294
Approved by: https://github.com/albanD
We encountered some model accuracy failures, as the tolerance is critical. In general, we align with CUDA practice. This PR adjusts the tolerance for Torchbench models in training mode on Intel GPU devices to align with CUDA.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125213
Approved by: https://github.com/desertfire
Summary: In the recent weeks, we have encountered bugs in both the normal synchronous trace and on-demand tracing. This diff on its own does sanity checking to make sure the profiler does not have spans that extend past the boundaries that we expect. It also checks some basic properties of the tracings we expect to see. Right now the sanity tests check some basic properties to make sure that the tracings are not completely broken. Requests/suggestions for other properties are welcome.
Test Plan: Run the tests in OSS and Buck
Reviewed By: aaronenyeshi
Differential Revision: D56374298
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124773
Approved by: https://github.com/aaronenyeshi
Fixes #121965
This PR hopes to add support for complex numbers in the scatter/gather related kernels. For brevity, I will only include `complex<float>` for now, as `complex<double>`, for example, will be more complicated.
C++ unit tests are currently passing alongside tests in `test_scatter_gather_ops.py`. Python test suites also seem to be passing.
Please keep the following in mind:
1) I think this is my first time using PyTorch.
2) This is my first contribution to PyTorch.
Environment:
3080 & WSL 2. `nvcc` is at 12.4.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124809
Approved by: https://github.com/eqy, https://github.com/mikaylagawarecki
As apparently `vshlq_u32` is faster than `vcvt_f32_f16`.
Refactor the NEON `tinygemm_kernel` to rely on `load_as_float32x4` and `load_as_float32x4x2` and implement them for float16 (using vcvt), bfloat16 (using left shift) and plain float32 (not using anything).
As a result, stories110M runs at 60 tokens/sec with f16, but at 66 tokens/sec with bf16 and 75 tokens/sec with f32, though higher bandwidth demand starts to favor reduced floating-point types as the model size gets bigger.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125290
Approved by: https://github.com/mikekgfb
Fix the patch failure: we should patch the function where it is used, not where it is defined.
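A generic, self-contained illustration of that rule (not the actual timer code):
```python
import math
from unittest import mock

def area(r):
    return math.pi * r * r

# area() looks pi up through the math module at call time, so patching
# "math.pi" works here. Had this module done `from math import pi`, we would
# need to patch "<this_module>.pi" instead; that is, patch where the name is used.
with mock.patch("math.pi", 3.0):
    assert area(2) == 12.0
```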
Failure info:
```bash
root@cambricon-PowerEdge-C4140:/workspace# python file_based_timer_test.py -k test_expired_timers
/opt/conda/lib/python3.10/site-packages/torch/_custom_ops.py:253: DeprecationWarning: torch.library.impl_abstract was renamed to torch.library.register_fake. Please use that instead; we will remove torch.library.impl_abstract in a future version of PyTorch.
return torch.library.impl_abstract(qualname, func, _stacklevel=2)
E
======================================================================
ERROR: test_expired_timers (__main__.FileTimerServerTest)
tests that a single expired timer on a process should terminate
----------------------------------------------------------------------
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 2757, in wrapper
method(*args, **kwargs)
File "/opt/conda/lib/python3.10/unittest/mock.py", line 1376, in patched
with self.decoration_helper(patched,
File "/opt/conda/lib/python3.10/contextlib.py", line 135, in __enter__
return next(self.gen)
File "/opt/conda/lib/python3.10/unittest/mock.py", line 1358, in decoration_helper
arg = exit_stack.enter_context(patching)
File "/opt/conda/lib/python3.10/contextlib.py", line 492, in enter_context
result = _cm_type.__enter__(cm)
File "/opt/conda/lib/python3.10/unittest/mock.py", line 1447, in __enter__
original, local = self.get_original()
File "/opt/conda/lib/python3.10/unittest/mock.py", line 1420, in get_original
raise AttributeError(
AttributeError: <module 'torch.distributed.elastic.timer' from '/opt/conda/lib/python3.10/site-packages/torch/distributed/elastic/timer/__init__.py'> does not have the attribute 'log_debug_info_for_expired_timers'
To execute this test, run the following from the base repo dir:
python file_based_timer_test.py -k test_expired_timers
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
----------------------------------------------------------------------
Ran 1 test in 0.792s
FAILED (errors=1)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125144
Approved by: https://github.com/gag1jain
On macOS 14.4, the system Python is configured to point to a non-existent include dir
```
% /usr/bin/python3 -c "import sysconfig;print(sysconfig.get_path('include'))"
/Library/Python/3.9/include
```
Work around the issue by composing the path to the include folder from the `stdlib` config, which points to
```
% /usr/bin/python3 -c "import sysconfig;print(sysconfig.get_path('stdlib'))"
/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125285
Approved by: https://github.com/kit1980
While studying some tlparse, I noticed that CompilationMetrics was reporting that there was no error for frames that have no nodes. I'm pretty sure we don't actually install a frame in this situation. has_guarded_code will tell us if that's the case, because it says if the GuardedCode object is None or not.
Actually, while working on this, I was wondering if we can ever trigger the "skip this frame entirely, do not trace it ever again" codepath; as best as I could tell, it's impossible for this to happen by the time we get to the compilation metrics block.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125279
Approved by: https://github.com/yanboliang
Summary: There's a shortcoming in the FX graph cache tests in that they don't fully clear all inductor in-memory caches when testing the cache-hit path: We were previously accessing the FX graph cache correctly, but when loading the source object using the PyCodeCache.load_by_key_path() method, _that_ path was serving entries out of memory. To better mimic what happens during warm start (i.e., a new process), we should clear all in-memory caches.
Test Plan: updated the unit tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125280
Approved by: https://github.com/eellison
1. This PR removes the logic for saving and removing the pre-backward hook handles (which is registered via `register_multi_grad_hook(mode="any")`).
2. This PR removes the logic for _trying_ to guard against mistargeted prefetches that relies on querying if the engine will execute the module output tensors' `grad_fn`s. (See https://github.com/pytorch/pytorch/pull/118118 for original motivation.)
For 1, the logic was error prone since it relied on `set_is_last_backward(False)` being set correctly or else pre-backward hooks could be de-registered too early. We would prefer to match the hook lifetimes with that of the autograd graph. This solves a bug with a 1f1b interleaved schedule.
If we directly remove the manual saving/removing hook handle logic, then we have a ref cycle where the tensors' `grad_fn`s are passed to the hook function. We decide to simply remove this `grad_fn` logic since (1) it cannot perfectly prevent mistargeted prefetches and (2) it introduces undesired complexity. In the future, we may prefer a different mechanism to override the prefetching for more complex/dynamic use cases.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125269
Approved by: https://github.com/weifengpy
ghstack dependencies: #125190, #125191
Summary: Discovered breakages by enabling codecache by default and doing a CI run. I'll commit these fixes first and eventually enabling caching by default will (hopefully) be a one-liner.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125258
Approved by: https://github.com/eellison
Fix https://github.com/pytorch/pytorch/issues/124900.
When we reconstruct `ContextWrappingVariables`s, we only reconstruct the context class, not the object. Normally, contexts are active (via `with ctx:`) and we initialize the context object in the resume function. But for the case of inactive contexts (contexts declared ahead of time before the `with` block), we do not reconstruct them properly in the optimized bytecode or resume function. So this PR adds initialization for inactive contexts in the resume function.
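For reference, the "inactive context" pattern looks like this (a minimal sketch, not the exact repro from the issue):
```python
import torch

ctx = torch.no_grad()            # declared ahead of time, i.e. inactive

@torch.compile(backend="eager")
def f(x):
    with ctx:                    # only entered inside the compiled function
        return x + 1

f(torch.randn(3))
```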
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125203
Approved by: https://github.com/jansel
Summary: When CapabilityBasedPartitioner creates the fused subgraph as the call_module node, it didn't populate the node.meta["val"] field.
Test Plan: OSS CI
Differential Revision: D56789259
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125261
Approved by: https://github.com/zhxchen17
Summary: Due to the compatibility issue, we hard coded the passes to do the pattern optimization. Here, we revisit the method since it has been a while since the changes went into production packages. We instead read from the config to decide whether we do the specific pattern optimization, which makes follow-up pattern additions easier.
Differential Revision: D56659934
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125136
Approved by: https://github.com/jackiexu1992
Fixes two build problems on ROCM 6.1 + Ubuntu 22.04
### Inconsistency value of CMAKE_PREFIX_PATH between `.ci/pytorch/build.sh` and Build Instructions
Currently `CMAKE_PREFIX_PATH` points to the conda base environment (commonly `/opt/conda`). However, the conda environment used in CI should be `/opt/conda/envs/py_<VERSION>`, which is supplied by `$CONDA_PREFIX`.
This divergence may cause libstdc++ version conflicts because the base conda environment may ship a different libstdc++ than the `py_<VERSION>` environment and/or the system default. One notable issue: on our internal CI system, this script failed to build the AOTriton library on Ubuntu 22.04 due to libstdc++ version conflicts between the HIP compiler and the conda base environment.
This PR fixes this and makes sure the CI script follows the official build instructions.
### Incorrect `tinfo` was linked on Ubuntu 22.04 due to flaws in parsing of `os-release`
The code that parses /etc/os-release was incorrect, and the distribution name was parsed as `PRETTY_Ubuntu` instead of `Ubuntu`. Due to this flaw, `libtinfo` was not linked into the binary, so the cpp unit tests failed to build because of missing symbols from `libtinfo`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/118216
Approved by: https://github.com/jeffdaily, https://github.com/jithunnair-amd, https://github.com/pruthvistony, https://github.com/malfet, https://github.com/atalman
This doesn't introduce any new behavior, but sets up a basic cache key generation mechanism that I can test. From here I will:
- Add checks on the ops in an input FXGraph to make sure they are safe to cache. We'll be conservative in the first version here.
- Add serialization for FX graphs
- Save these FX graphs to disk in the cache
- Support graphs with more complicated ops like higher order ops and specialized nn modules
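For illustration, a minimal sketch (not the actual implementation) of the cache-key idea described above: hash together a stable representation of the FX graph, the metadata of the example inputs, and the relevant config. `fx_graph_cache_key` is a hypothetical name.
```python
import hashlib
import pickle

def fx_graph_cache_key(gm, example_inputs, config_dict):
    hasher = hashlib.sha256()
    hasher.update(gm.code.encode())  # printed code of the GraphModule
    for t in example_inputs:
        hasher.update(str((t.shape, t.stride(), t.dtype, t.device)).encode())
    hasher.update(pickle.dumps(sorted(config_dict.items())))
    return hasher.hexdigest()
```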
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124642
Approved by: https://github.com/aorenste
Summary: This commit fixes the pattern matching for conv-bn
during QAT fusion where both weight and bias are quantized per
channel. Previously this failed because weights and biases used
the same example kwargs for their scales and zero points,
causing these qparams to be tied during pattern matching.
Test Plan:
python test/test_quantization.py TestQuantizePT2EQAT_ConvBn1d.test_qat_conv_bn_per_channel_weight_bias
python test/test_quantization.py TestQuantizePT2EQAT_ConvBn2d.test_qat_conv_bn_per_channel_weight_bias
Reviewers: jerryzh168, angelayi
Subscribers: jerryzh168, angelayi, supriyar
Differential Revision: [D56740694](https://our.internmc.facebook.com/intern/diff/D56740694)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125208
Approved by: https://github.com/angelayi
Summary:
Add exclusion list to minimizer:
1. Some operations cannot be lowered when constructing subgraphs; this usually happens when they are isolated from their operation group.
2. Exclude them from the search strategies used for automation.
Reviewed By: jimone1
Differential Revision: D56327289
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124504
Approved by: https://github.com/jfix71
This PR fakifies ScriptObject inputs and attributes in export non-strict mode by default.
The basic idea is to `only fakify the script object during tracing (i.e. aot_export)`. After we get the traced graph module, eagerly executing, serializing, or running more passes will use the real script objects. This essentially treats the script object as a constant tensor.
Concretely, we
1. fakify all the script object inputs, and module attributes (gathered by constant_attrs).
2. patch the module's attributes with fakified script object
3. right after aot_export, remove the patching (to avoid changing the original module) then modify the exported graph module's attribute to real script object.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124239
Approved by: https://github.com/zou3519
Fixes #124528
Going over the options for our MapAllocator and what they do, I don't think any of the others need to be piped up to `torch.load`
4f29103749/aten/src/ATen/MapAllocator.h (L8-L16)
~However, I wonder if this `MmapVisibility(Enum)` is a good way to represent "or-ing" together of `mmap` flags if we want to extend it in the future. I looked over the flags for [`mmap(2)`](https://man7.org/linux/man-pages/man2/mmap.2.html), and could not immediately see how most of them would be useful for `torch.load` (would maybe `MAP_LOCKED` (like `mlock`) or `MAP_HUGE` ever be worthwhile?)~
Using the flags provided by the python `mmap` library so that we can extend the allowed flags and pipe them down to the cpp `mmap` call if there is a need for other flags in the future
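A hedged sketch of what this enables: the flags come from Python's own `mmap` module and can be OR-ed together like any other mmap flags. The setter name below, `torch.serialization.set_default_mmap_options`, is how I understand the entry point to look; treat it as an assumption rather than a confirmed API.
```python
import mmap
import torch

torch.save(torch.randn(4), "tensor.pt")

# Assumed entry point: pipe a Python mmap flag down to the C++ mmap call.
torch.serialization.set_default_mmap_options(mmap.MAP_PRIVATE)  # could be MAP_SHARED, etc.
t = torch.load("tensor.pt", mmap=True)
```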
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124889
Approved by: https://github.com/albanD
This PR fixes an issue presented when calling `aten.alias(int)` raises a TypeError.
```python
import torch
import torch.autograd.forward_ad as fwAD
def f(x):
    return 4312491 * x

device = "cpu"

with torch._subclasses.fake_tensor.FakeTensorMode():
    with fwAD.dual_level():
        x = torch.randn(3, device=device)
        y = torch.ones_like(x)
        dual = fwAD.make_dual(x, y)
        f(dual)
```
The test case above illustrates this bug.
1) `4312491` turns into a tensor that is a wrapped number
2) Forward mode AD calls `aten::alias` internally
3) The wrapped number (`4312491`) becomes a python integer
4) `aten.alias(int)` raises a `TypeError`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124774
Approved by: https://github.com/albanD, https://github.com/zou3519
This PR introduces a new way of building `dynamic_shapes` for export. The idea is to build up a mapping from input tensors to the dynamic shapes that should be assigned to their corresponding fake tensors.
This mapping is automatically converted to the current form of `dynamic_shapes`, which must exactly match the structure of inputs. We do this by using pytree utils.
With the current `dynamic_shapes`, we had to be careful about user-defined classes that are registered with pytree, since such classes are not necessarily polymorphic containers; they may be fine containing tensors, but not dynamic shapes. Thus we had decided to allow input instances of such classes to be associated with dynamic shapes in flattened form. This decision needs to be mirrored in this PR as well. To make it easier to keep these code paths in sync, we refactor the current recursive procedure for associating inputs with dynamic shapes to use the same pytree utils. This needs minor fixes to a few tests where `dynamic_shapes` were not exactly matching the structure of inputs.
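For reference, a small illustration (not from this PR) of the current form, where the spec must mirror the input structure exactly; the mapping-based API described above is meant to remove the need to hand-build this nesting.
```python
import torch
from torch.export import Dim, export

class M(torch.nn.Module):
    def forward(self, x, d):
        return x + d["y"]

batch = Dim("batch")
x, y = torch.randn(4, 8), torch.randn(4, 8)
# The spec is nested exactly like the inputs: one entry for `x`, a dict entry for `d`.
ep = export(M(), (x, {"y": y}), dynamic_shapes=({0: batch}, {"y": {0: batch}}))
print(ep)
```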
Differential Revision: D56551992
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124898
Approved by: https://github.com/zhxchen17
Part of a multi-PR work to improve #59168
Meant to complete:
- Write native kernels for AvgPool3d
- Write native kernels for MaxPool3d
- Write native kernels for AdaptiveAvgPool3d
- Write native kernels for AdaptiveMaxPool3d
Pull Request resolved: https://github.com/pytorch/pytorch/pull/116305
Approved by: https://github.com/ezyang
I was seeing that, for a reduction kernel and a given block size on AMDGPU, the 16-byte vectorization bandwidth per thread was not fully leveraged, while this was not a problem on NVGPU. It appeared that each thread got less data to process because a whole row was processed by more threads, and the number of elements each thread got was not enough to saturate full vectorization. On AMDGPU, a warp has 64 lanes compared to 32 on the NV side, so I'm tuning down the default number of warps (8 for NV) for AMD. I'm seeing a 10% speedup on an internal benchmark.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125084
Approved by: https://github.com/shunting314
For microbatching use cases (e.g. PP), we may use fp32 reduce-scatter (i.e. `MixedPrecisionPolicy(reduce_dtype=torch.float32)`), where we want to accumulate the unsharded gradients in fp32 across microbatches until reduce-scattering in fp32 upon the last microbatch.
Note that the `unsharded_param` is in bf16, so we must save the fp32 accumulated gradient to an attribute different from `.grad`. Moreover, saving a new attribute on the `torch.Tensor` leads to some annoying type checking issues (where the attribute may not be defined), so this PR prefers to save the attribute on the `FSDPParam` class instead.
One could argue that this behavior should be configurable, but since I think for large-scale training, everyone is leaning toward fp32 accumulation across microbatches, let us avoid adding another argument for now.
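A hedged sketch of the configuration this targets, assuming the default process group and device mesh are already initialized (e.g. under torchrun); this is illustrative, not taken from the PR:
```python
import torch
from torch.distributed._composable.fsdp import fully_shard, MixedPrecisionPolicy

# bf16 unsharded params/grads with fp32 reduce-scatter: gradients accumulated
# across microbatches stay in fp32 until the reduce-scatter on the last one.
model = torch.nn.Linear(16, 16)
fully_shard(
    model,
    mp_policy=MixedPrecisionPolicy(
        param_dtype=torch.bfloat16,
        reduce_dtype=torch.float32,
    ),
)
```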
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125191
Approved by: https://github.com/weifengpy
ghstack dependencies: #125190
The unit test for fp32 `param_dtype` and bf16 `reduce_dtype` was disabled. This PR debugs the issue and identifies the root cause as numeric differences between NCCL bf16 all-reduce vs. bf16 reduce-scatter. We address this by having the baseline use reduce-scatter -> all-gather to implement all-reduce.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125190
Approved by: https://github.com/weifengpy, https://github.com/wanchaol
By using `Py_NewRef`
Also, wrap `THPDtype_to_real`/`THPDtype_to_complex` calls with `HANDLE_TH_ERRORS`
Add regression tests for the above issues: call to_complex on integral dtypes, which raises an exception, and check that the reference count is preserved across repeated to_complex/to_real calls to detect whether a leak is happening.
Replace
```cpp
auto dtype = (PyObject*)torch::getTHPDtype(current_dtype);
Py_INCREF(dtype);
return dtype;
```
with a more compact/streamlined equivalent
```cpp
return Py_NewRef(torch::getTHPDtype(current_dtype));
```
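As a side note, a hedged sketch of the refcount-style regression test described above (not the exact test added by this PR):
```python
import sys
import torch

# Repeated to_complex() calls should not change the refcount of the returned
# dtype object if no leak occurs.
before = sys.getrefcount(torch.complex64)
for _ in range(1000):
    torch.float32.to_complex()
assert sys.getrefcount(torch.complex64) == before, "to_complex() leaked a reference"

# to_complex() on an integral dtype should raise rather than crash
# (exception type not asserted here).
try:
    torch.int32.to_complex()
except Exception:
    pass
```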
Fixes https://github.com/pytorch/pytorch/issues/124868
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125154
Approved by: https://github.com/Skylion007, https://github.com/albanD
yolo
Also
* Ensure that at least 1 test always gets run (`//` does truncation which results in 0 if you have too few tests discovered)
* Don't run test removal on slow tests - I'm not touching that yet
I am avoiding everything other than pull + trunk workflows, so this is not done on Windows CUDA, which runs on periodic
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125049
Approved by: https://github.com/huydhn, https://github.com/ZainRizvi
When we do cudagraph warmup, we record which outputs are in the cudagraph pool, so subsequently when we invoke a cudagraph and need to reclaim its memory we can free the prior run's outputs and make them error on access.
In warmup, we detect this by ignoring outputs which are an alias of an input that is not a prior output. We did this by checking data pointer. In very rare situations, a data pointer of a non cudagraph input might get reallocated to a cudagraph pool and causes us to ignore it.
This was happening with gpt-fast error with gemma 2 when coordinate_descent_tuning was set to False.
This PR updates the check so that we detect aliasing with non-cudagraph inputs by looking at the storage pointer.
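For illustration (unrelated to cudagraphs specifically), a view can report a different data_ptr() than the tensor it aliases while still sharing the same underlying storage, which is why comparing storages is the more reliable aliasing signal:
```python
import torch

x = torch.arange(8)
v = x[2:]                       # a view that starts at a different address
assert v.data_ptr() != x.data_ptr()
assert v.untyped_storage().data_ptr() == x.untyped_storage().data_ptr()
```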
Unrelated: saw very weird behavior where an output had the same data pointer as a supposedly live input but not the same cdata 🤔 I would think that is not possible.
```
out[0]._cdata in [ref()._cdata for ref in non_cudagraph_inps_storage_refs]  # False
out[0].data_ptr() in [ref().data_ptr() for ref in non_cudagraph_inps_storage_refs]  # True
```
Differential Revision: [D56607721](https://our.internmc.facebook.com/intern/diff/D56607721)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124981
Approved by: https://github.com/ezyang
This PR uses str for reduce_op directly instead of the c10d enum. Since
our functional collective already uses str, there's no reason that we
need the c10d enum anymore, as that requires a conversion.
Also, the str hash + eq performance is significantly faster than that of
the c10d type, so this somewhat reduces the CPU overhead too.
Some local cpu benchmarks on `1000000` hash operations:
```
Hash performance for string type: 0.039897 seconds
Hash performance for integer type: 0.304665 seconds
```
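A hedged sketch of the kind of micro-benchmark behind these numbers (the exact harness isn't shown in the PR; `bench_hash` is a stand-in name, and results will vary by machine):
```python
import time

def bench_hash(value, n=1_000_000):
    start = time.perf_counter()
    for _ in range(n):
        hash(value)
    return time.perf_counter() - start

# Compare a str reduce op against whatever object was previously used as the key.
print("Hash performance for string type: %.6f seconds" % bench_hash("avg"))
```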
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125172
Approved by: https://github.com/awgu, https://github.com/XilunWu, https://github.com/tianyu-l
# Summary
This is part one of adding backwards support to FlexAttention.
This PR focuses on the eager implementation and wiring up enough of the templated_attention_backward(name change soon 😉) to get through aot_eager.
Notably this does not actually wire up the triton template just yet in order to make this PR easier to review. That will be the next follow up PR.
#### Structure
We pass both the forward and backward graphs to the backwards HOP since both need to be inlined into the calculation for backwards:
- the forward graph is needed in order to re-compute the scores
- the joint graph is needed in order to construct the correct gradients post softmax_grad calc
### Attached AOT Graph
https://gist.github.com/drisspg/ce4c041f8df8a5a7983c5174705cf2b5
Pull Request resolved: https://github.com/pytorch/pytorch/pull/123902
Approved by: https://github.com/Chillee
Summary:
Fixes https://github.com/pytorch/pytorch/issues/122842
Currently, calling ep.module() on an ExportedProgram leads to a GraphModule with a default forward signature (e.g. arg_0, arg_1, ...). This leads to original placeholder names disappearing for retracing/re-exporting.
Fixing this issue by creating a forward_arg_names field (will take renaming suggestions for this), that stores the positional & keyword arg names that are used. These names aren't present in the call_spec currently stored, and requires a major version bump for the ExportedProgram schema.
Test Plan: Tests exist for export, but names are now changed from generic (e.g. arg_0, arg_1) to follow user inputs (e.g. x, y)
Differential Revision: D56484994
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124765
Approved by: https://github.com/zhxchen17
If there's an exception during collection it can result in the profiler never being stopped properly. As a result all subsequent tests that use profiling will also fail - even if they pass in isolation.
I'm hoping this fixes the flakiness in #124253, #124220, #82720, #119346, #119364, #119490, #119526, #119537 (and the currently closed #82864).
Before:
```
(py312) $ PYTORCH_TEST_WITH_DYNAMO=1 pytest test/profiler/test_profiler.py
===================================================================================================================== FAILURES =====================================================================================================================
============================================================================================================= short test summary info ==============================================================================================================
FAILED test/profiler/test_profiler.py::TestExecutionTrace::test_execution_trace_with_kineto - AssertionError: Element counts were not equal:
FAILED test/profiler/test_profiler.py::TestExperimentalUtils::test_profiler_conv2d_bias_followed_by_batchnorm2d_pattern - RuntimeError: Can't disable Kineto profiler when it's not running
FAILED test/profiler/test_profiler.py::TestExperimentalUtils::test_profiler_extra_cuda_copy_pattern - RuntimeError: Can't disable Kineto profiler when it's not running
FAILED test/profiler/test_profiler.py::TestExperimentalUtils::test_profiler_extra_cuda_copy_pattern_benchmark - AttributeError: 'NoneType' object has no attribute 'profiler'
FAILED test/profiler/test_profiler.py::TestExperimentalUtils::test_profiler_fp32_matmul_pattern - AttributeError: 'NoneType' object has no attribute 'profiler'
FAILED test/profiler/test_profiler.py::TestExperimentalUtils::test_profiler_matmul_dim_fp16_pattern - RuntimeError: Can't disable Kineto profiler when it's not running
FAILED test/profiler/test_profiler.py::TestProfiler::test_kineto_multigpu - torch._dynamo.exc.InternalTorchDynamoError: 'NoneType' object has no attribute 'events'
FAILED test/profiler/test_profiler.py::TestProfiler::test_oom_tracing - AssertionError: RuntimeError not raised
FAILED test/profiler/test_profiler.py::TestProfiler::test_source_multithreaded_basic_work_in_main_thread_False - RuntimeError: Can't disable Kineto profiler when it's not running
FAILED test/profiler/test_profiler.py::TestProfiler::test_source_multithreaded_close_in_scope_work_in_main_thread_False - RuntimeError: Can't disable Kineto profiler when it's not running
FAILED test/profiler/test_profiler.py::TestProfiler::test_source_multithreaded_complex_work_in_main_thread_False - RuntimeError: Can't disable Kineto profiler when it's not running
FAILED test/profiler/test_profiler.py::TestProfiler::test_source_multithreaded_multiple_preexisting_work_in_main_thread_False - RuntimeError: Can't disable Kineto profiler when it's not running
FAILED test/profiler/test_profiler.py::TestProfiler::test_source_multithreaded_open_in_scope_work_in_main_thread_False - RuntimeError: Can't disable Kineto profiler when it's not running
FAILED test/profiler/test_profiler.py::TestTorchTidyProfiler::test_optimizer_parameters_sgd - RuntimeError: Can't disable Kineto profiler when it's not running
FAILED test/profiler/test_profiler.py::TestTorchTidyProfiler::test_refcounts - RuntimeError: Can't disable Kineto profiler when it's not running
FAILED test/profiler/test_profiler.py::TestTorchTidyProfiler::test_sparse_tensors - RuntimeError: Can't disable Kineto profiler when it's not running
==================================================================================================== 16 failed, 26 passed, 53 skipped in 25.51s ====================================================================================================
```
After:
```
(py312) $ PYTORCH_TEST_WITH_DYNAMO=1 pytest test/profiler/test_profiler.py
===================================================================================================================== FAILURES =====================================================================================================================
============================================================================================================= short test summary info ==============================================================================================================
FAILED test/profiler/test_profiler.py::TestExecutionTrace::test_execution_trace_with_kineto - AssertionError: Element counts were not equal:
FAILED test/profiler/test_profiler.py::TestExperimentalUtils::test_profiler_extra_cuda_copy_pattern - RuntimeError: !stack.empty() INTERNAL ASSERT FAILED at "/data/users/aorenste/pytorch/torch/csrc/autograd/profiler_python.cpp":969...
FAILED test/profiler/test_profiler.py::TestExperimentalUtils::test_profiler_extra_cuda_copy_pattern_benchmark - AttributeError: 'NoneType' object has no attribute 'profiler'
FAILED test/profiler/test_profiler.py::TestExperimentalUtils::test_profiler_fp32_matmul_pattern - AttributeError: 'NoneType' object has no attribute 'profiler'
FAILED test/profiler/test_profiler.py::TestProfiler::test_kineto_multigpu - torch._dynamo.exc.InternalTorchDynamoError: 'NoneType' object has no attribute 'events'
FAILED test/profiler/test_profiler.py::TestProfiler::test_oom_tracing - AssertionError: RuntimeError not raised
FAILED test/profiler/test_profiler.py::TestTorchTidyProfiler::test_optimizer_parameters_sgd - RuntimeError: !stack.empty() INTERNAL ASSERT FAILED at "/data/users/aorenste/pytorch/torch/csrc/autograd/profiler_python.cpp":969, please...
==================================================================================================== 7 failed, 35 passed, 53 skipped in 31.51s =====================================================================================================
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125131
Approved by: https://github.com/Skylion007, https://github.com/aaronenyeshi
Fixes #100152
1. Fix the incorrect tests for lazy init of the PrivateUse1 backend named foo
2. Fix the broken backend meta registry mechanism when compiling with clang++ (compiling with g++ works fine); this was introduced by a static variable in an inline function
3. Refactor the tests and make them more flexible
4. Disable the two tests temporarily
- test_open_device_storage_pin_memory
- test_compile_autograd_function_aliasing
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124712
Approved by: https://github.com/albanD, https://github.com/malfet
Summary:
fixes two issues:
- when exporting with debug=True, the list of error-causing modules and a dependency path to them is not printed correctly: a missing newline after the path puts the module name for the next error on the wrong line, which makes the output a confusing mess to read
- when a pickled object directly references more than one mocked module, the error message incorrectly repeats the same information, claiming the referenced attribute is present in several different libraries. This happens because the if condition references the last module name seen while walking the pickle ops rather than the module name from the enclosing `for module_name in all_dependencies:` block. As a result, one error prints as O(all_dependencies) errors, all with different module names but the same attribute name
Differential Revision: D56578035
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124943
Approved by: https://github.com/JonAmazon, https://github.com/houseroad
This PR renames the `FSDP` class to `FSDPModule`. This is a BC breaking change. The rationale is that `FSDPModule` is more descriptive since `fully_shard` is a module-level API (applied to a `module` arg), so the `FSDP` class will always correspond to a module.
Also, users commonly import `FullyShardedDataParallel` as `FSDP`, so this can help avoid some name conflict in some cases.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124955
Approved by: https://github.com/wanchaol, https://github.com/wconstab
ghstack dependencies: #124651, #124741, #124767, #124768, #124780, #124787
This completely subsumes https://github.com/pytorch/pytorch/pull/120816
This makes use of the unbacked binding machinery to teach Inductor how to generate deferred runtime asserts directly. There is some back story about why I did it this way, let me explain.
Previously, our strategy for generating runtime asserts was that Dynamo would insert them into the FX graph after finishing tracing, and we would attempt to code generate them based on the FX graph. This is a good strategy for export, where we immediately export the graph. However, this strategy was afflicted by problems in eager, where we reuse the same ShapeEnv as before. In particular, on subsequent graph passes, we would immediately turn all of these assertions into noops, because when we evaluated their expressions, we would see that because we had a deferred runtime assert in the ShapeEnv, we know "oh, of course this expression is True" already. Oops!
So, with this PR, we take the attitude that as long as the ShapeEnv sticks around, the ShapeEnv's list of deferred runtime asserts is the source of truth, and we don't put anything in the graph. So we just need to decide when to actually generate asserts, and the place I picked was Inductor lowering, since we already have an AssertScalar buffer concept, and so I just need to insert them at this point. AssertScalar also uses raw sympy.Expr rather than SymInt/Bool, so it is easier to prevent unrestricted simplification at this point.
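A hedged illustration (not from this PR) of where a deferred runtime assert comes from: `u` below is an unbacked SymInt, so the check on it cannot be decided at compile time and is recorded in the ShapeEnv to be asserted at runtime instead.
```python
import torch
torch._dynamo.config.capture_scalar_outputs = True

@torch.compile(fullgraph=True)
def f(x):
    u = x.sum().to(torch.int64).item()  # data-dependent scalar -> unbacked SymInt
    torch._check(u >= 0)                # recorded as a deferred runtime assert
    return torch.zeros(u)

print(f(torch.ones(5)))
```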
There are a few things jumbled together in this PR. I can split them if you want, but some of the changes are before I changed my strategy, but they're useful changes anyway.
**torch/_dynamo/output_graph.py** and **torch/_inductor/lowering.py** - Here, we stop putting deferred runtime asserts in the graph. I also have to make sure we don't DCE unused symbol arguments; we're going to get some goofy graph arguments this way, will be good to restore that optimization eventually. We also just disable codegen for `_assert_scalar` entirely; we assume that ShapeEnv will be good enough to capture all of these.
**torch/_inductor/codegen/wrapper.py** and **torch/_inductor/ir.py** - Add a way to codegen sizevars without forcing simplification
**torch/_inductor/graph.py** - The main logic. Our strategy is to interpose in the same place we are testing that unbacked SymInts are properly showing up in lowered code. The logic is directly analogous to the logic in the existing insert deferred runtime asserts FX pass, but it's simpler because sympy expressions can be directly stored on inductor IR nodes.
**torch/fx/experimental/symbolic_shapes.py** - For extra safety, we have a way of freezing runtime asserts, so that if you try to add more we error. This prevents us from adding runtime asserts after we've done lowering. There's a funny interaction with backwards which there's a comment for in graph.py
**torch/fx/passes/runtime_assert.py** - This is not really needed in this PR, but I rewrote the runtime assert logic to use unbacked_bindings rather than inferring it by looking for unbacked SymInts. Now, keypaths are translated into FX node accessors. Unfortunately, I couldn't delete the old inference code, because you still need it to find backed SymInts from arguments (as this pass may be used on graphs which don't explicitly bind all their shape variables as arguments). There are some new tests exercising this.
TODO: I think we need to generate asserts for replacements too. This is a preexisting problem that the old FX pass had too.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124874
Approved by: https://github.com/jansel
ghstack dependencies: #124864
I want to generate runtime assert nodes during lowering, which means
that I need a finalized list of asserts by the time I start lowering.
This means this runtime assert introduced in
https://github.com/pytorch/pytorch/pull/113839 must go. Fortunately,
this runtime assert was never exercisable, apparently, and the test
still "passes" without it. I replace it with a compile time test. We
can revisit if this assert fails in practice.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124864
Approved by: https://github.com/jansel
Summary: When I was debugging an issue, this silent error made debugging harder. It is better to error out earlier with a more descriptive error message.
Test Plan: None
Differential Revision: D56312433
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124411
Approved by: https://github.com/zhxchen17
Earlier, globals of inlined functions from other files were not handled correctly: we were not tracking mutations on them, and they were colliding with same-named globals in the parent function, etc. This PR overrides LOAD/STORE_GLOBAL for the inline tx and tracks mutations on them separately.
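A hedged sketch of the pattern this fixes: a function inlined from another module mutates that module's own global, which must not collide with a same-named global in the caller's module. The helper module is constructed programmatically here only to keep the example self-contained; `helpers`, `bump`, and `counter` are made-up names.
```python
import sys
import types
import torch

helpers = types.ModuleType("helpers")
exec(
    "counter = 0\n"
    "def bump():\n"
    "    global counter\n"
    "    counter += 1\n"
    "    return counter\n",
    helpers.__dict__,
)
sys.modules["helpers"] = helpers

counter = 100  # same name, different module; must stay untouched

@torch.compile(backend="eager")
def fn(x):
    return x + helpers.bump() + counter  # reads this module's `counter`

out = fn(torch.ones(3))
assert helpers.counter == 1 and counter == 100
assert out[0].item() == 102
```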
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125002
Approved by: https://github.com/jansel
ghstack dependencies: #125097, #125107
**Summary**
Fix issue: https://github.com/pytorch/pytorch/issues/124286
The TorchBenchmark includes a method called `run_n_iterations` which runs model multiple times.
43f4e71daa/benchmarks/dynamo/common.py (L2272-L2276)
https://github.com/pytorch/pytorch/pull/123399 enables tracing into a `UserDefinedObjectVariable` that's an instance method, so the model is traced into an FX graph multiple times within `run_n_iterations`. Then, in Inductor, module-level `Conv-BN folding` fuses the same Conv-BN module multiple times in this case, which leads to accuracy failures. This PR addresses the issue by ensuring that each Conv-BN module is fused only once.
**TestPlan**
```
python -u -m pytest -s -v test/inductor/test_inductor_freezing.py -k test_folded_conv_bn_with_module_sharing
python -u -m pytest -s -v test/inductor/test_inductor_freezing.py -k test_folded_conv_functional_bn_with_module_sharing
python -u -m pytest -s -v test/inductor/test_inductor_freezing.py -k test_conv_bn_with_multi_bn_share_conv
python -u -m pytest -s -v test/inductor/test_inductor_freezing.py -k test_conv_functional_bn_with_multi_bn_share_conv
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124808
Approved by: https://github.com/jansel, https://github.com/jgong5
Summary:
LLVM-15 has a warning `-Wunused-but-set-variable` which we treat as an error because it's so often diagnostic of a code issue. Unused variables can compromise readability or, worse, performance.
This diff either (a) removes an unused variable and, possibly, its associated code, or (b) qualifies the variable with `[[maybe_unused]]`, mostly in cases where the variable _is_ used, but, e.g., in an `assert` statement that isn't present in production code.
- If you approve of this diff, please use the "Accept & Ship" button :-)
Test Plan: Sandcastle
Reviewed By: palmje
Differential Revision: D56587751
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125041
Approved by: https://github.com/Skylion007
Summary:
The LLVM warning `-Wmissing-field-initializers` has found one or more structs in this diff's files which were missing field initializers.
This can be unintended such as:
```
my_struct s1 = {0}; // Initializes *only* the first field to zero; others to default values
my_struct s2 = {}; // Initializes *all* fields to default values (often zero)
```
or it may be because only some of the members of a struct are initialized, perhaps because the items were added to the struct but not every instance of it was updated.
To fix the problem, I've either used `{}` to initialize all fields to default or added appropriate default initializations to the missing fields.
Test Plan: Sandcastle
Reviewed By: palmje
Differential Revision: D56614179
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125047
Approved by: https://github.com/Skylion007
Summary:
This is a forward Hotfix for T186742340.
Some recent changes in Pytorch / Inductor ( D56458606) led to aten.addmm operators being inserted twice into the list of choices to select from during autotuning. This appears to have triggered a test failure in fbcode.
This fix prevents the aten operators being added twice to the list of choices for autotuning.
Test Plan:
* Pytorch CI
* CUDA_LAUNCH_BLOCKING=1 buck2 test 'fbcode//mode/opt' fbcode//accelerators/pytorch/lib/pt2_utils/tests:compile_pt2_test -- --exact 'accelerators/pytorch/lib/pt2_utils/tests:compile_pt2_test - test_compile_pt2 (accelerators.pytorch.lib.pt2_utils.tests.compile_pt2_test.TestCompilePT2)'
Differential Revision: D56642879
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125065
Approved by: https://github.com/eellison
This PR removes the legacy impls of c10d_functional ops which are now irrelevant. For backward compatibility purpose, c10d_functional ops now call into _c10d_functional ops.
We also changed c10d_functional ops to be CompositeExplicitAutograd, so that when traced, only _c10d_functional ops appear in the graph. After this, we'll be able to remove the Inductor IR for the legacy functional collectives.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124979
Approved by: https://github.com/wanchaol
This PR continues to clean clang-tidy warnings in torch/csrc/distributed/c10d, following #124701. In addition, libfmt dependency is added in CMake code to enable using it in the headers. The libfmt has to be added as private dependency to torch_cuda and torch_hip because they include torch/csrc/distributed/c10d/Utils.hpp which uses libfmt.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124987
Approved by: https://github.com/malfet
Summary:
This makes barrier and rank operations linear instead of quadratic with the number of workers. This drastically improves performance for rendezvous when running with over 1000 hosts.
This uses 2 approaches for different areas:
* local rank assignment: each worker does 1 set and 1 get; local ranks are assigned on the rank 0 host in an O(n) operation, which keeps the total number of store operations linear in the number of workers.
* exit_barrier: use a counter and a final flag so each worker has to do max 1 set, 1 get and 1 add.
At 4000 hosts we see torchelastic be able to run in as little as 10 seconds down from 373 seconds.
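A hedged sketch of the counter-plus-flag exit barrier described above (not the torchelastic implementation; `exit_barrier` and the key names are made up): each worker does at most one add, one set, and one wait, so total store traffic stays linear in the number of workers.
```python
import datetime
import torch.distributed as dist

def exit_barrier(store: dist.Store, world_size: int, key: str = "exit_barrier"):
    arrived = store.add(f"{key}/count", 1)   # atomic increment, returns the new value
    if arrived == world_size:
        store.set(f"{key}/done", "1")        # the last worker releases everyone
    store.wait([f"{key}/done"], datetime.timedelta(seconds=300))
```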
Test Plan:
This is testing using many small tests running on a remote cluster.
{D56549942}
```
torchx run --scheduler mast -- --image=torchelastic_benchmark --j=4000x1
```
Differential Revision: D56605193
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124982
Approved by: https://github.com/kiukchung, https://github.com/kurman
- sets it as a fake stack trace as we don't have a generic comment feature
- when verbose is disabled, still adds a contextmanager and flag checks. the alternative is to use MACROS, but that wouldn't be usable with TORCH_LOGS
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124954
Approved by: https://github.com/jansel
## Description
Framework overhead is found to be big for the onednn qconv op (used for quantization with PT2E X86Inductor backend). This PR reduces the integration overhead by modifying the implementation of qconv.
## performance results
Running quantized Resnet50 on an Intel(R) Xeon(R) Platinum 8490H machine
Before
```
Average latency: 8.378 ms.
------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls
------------------------- ------------ ------------ ------------ ------------ ------------ ------------
onednn::qconv2d_pointwise 86.54% 6.954ms 87.42% 7.025ms 132.547us 53
```
After
```
Average latency: 6.255 ms.
------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls
------------------------- ------------ ------------ ------------ ------------ ------------ ------------
onednn::qconv2d_pointwise 85.05% 6.381ms 85.98% 6.451ms 121.717us 53
```
Test script:
```python
import torch
import torchvision
import time
import copy
import numpy as np
from torch._export import capture_pre_autograd_graph
from torch.ao.quantization.quantize_pt2e import (
prepare_pt2e,
convert_pt2e,
)
import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer
torch._inductor.config.cpp.enable_kernel_profile=True
torch._inductor.config.profiler_mark_wrapper_call = True
torch._inductor.config.freezing = True
torch._inductor.config.cpp_wrapper = True
def bench_model(model, inputs):
    times = []
    with torch.no_grad():
        for _ in range(5):  # warm-up
            output = model(inputs)
        for _ in range(20):
            start_time = time.time()
            output = model(inputs)
            end_time = time.time()
            times.append(end_time - start_time)
    print('Average latency: %0.3f ms.' % (np.median(times) * 1000.0))
    with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CPU]) as p:
        out_ipex = model(inputs)
    print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1))

def pt2e_ptq(m, example_inputs):
    m = m.eval()
    exported_model = capture_pre_autograd_graph(m, example_inputs)
    quantizer = X86InductorQuantizer()
    quantizer.set_global(xiq.get_default_x86_inductor_quantization_config())
    prepared_model = prepare_pt2e(exported_model, quantizer)
    _ = prepared_model(*example_inputs)
    converted_model = convert_pt2e(prepared_model)
    torch.ao.quantization.move_exported_model_to_eval(converted_model)
    with torch.no_grad():
        optimized_model = torch.compile(converted_model)
        _ = optimized_model(*example_inputs)
        _ = optimized_model(*example_inputs)
        bench_model(optimized_model, *example_inputs)
    return optimized_model

if __name__ == "__main__":
    data = torch.randn(16, 3, 224, 224)
    model_fp = torchvision.models.resnet50(weights=torchvision.models.ResNet50_Weights.DEFAULT)
    pt2e_ptq(copy.deepcopy(model_fp), (data,))
```
Differential Revision: [D56288440](https://our.internmc.facebook.com/intern/diff/D56288440)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/123240
Approved by: https://github.com/leslie-fang-intel, https://github.com/jgong5, https://github.com/jerryzh168
Summary:
This diff fixes a bug in PyTorch where creating a tensor from a list of booleans threw an error.
This fix resolves that issue. All credit goes to swolchok for identifying the root cause of the issue and suggesting this fix.
Test Plan: Running our model end to end works as expected and no error occurs.
Differential Revision: D55990810
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124899
Approved by: https://github.com/zhxchen17
**Summary**
This PR is attempt to land an experimental feature designed in #103686 . `local_map` is designed to allow users to apply to `DTensor` objects a function that was written to apply to `torch.Tensor`.
As a function, `local_map` takes in 2 required arguments (`func` and `out_placements`) and 3 optional arguments (`device_mesh`, `in_placements`, `redistribute_inputs`). `func` is the function to be applied to each local shard of input `DTensor`. `out_placements` is the sharding specification of output `DTensor`.
`local_map` returns a new function that does the following:
1. Infer `device_mesh` and `in_placements` from the `DTensor` input if they're not provided. If `device_mesh` is provided, it must be identical to the device mesh of every `DTensor` input. If `in_placements` is provided, it serves as the required sharding specification of the corresponding `DTensor` input before feeding its local shard into `func`. If it differs from the `DTensor`'s sharding specification, an exception is raised when `redistribute_inputs=False`; otherwise the input is redistributed to the required sharding.
2. Call `func` with the arguments passed in along with `device_mesh` except `DTensor`s. For `DTensor`, pass in its local shard. This `func` may include collectives.
3. For each output of `func` that has a valid (i.e. not `None`) sharding specification in `out_placements`, construct a new `DTensor` from the output and the specification, and use this `DTensor` as the output. A usage sketch follows below.
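A hedged usage sketch: the import path for `local_map` is my assumption about where the experimental DTensor API lives, and the structure of the placement arguments follows my reading of the description above. Run under torchrun with 2 ranks.
```python
import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed._tensor import distribute_tensor, Shard, Replicate
from torch.distributed._tensor.experimental import local_map  # assumed import path

def local_mm(a, b):
    # written against plain torch.Tensor; operates on each local shard
    return torch.mm(a, b)

mesh = init_device_mesh("cpu", (2,))
A = distribute_tensor(torch.randn(8, 8), mesh, [Shard(0)])
B = distribute_tensor(torch.randn(8, 8), mesh, [Replicate()])

sharded_mm = local_map(
    local_mm,
    out_placements=[Shard(0)],
    in_placements=([Shard(0)], [Replicate()]),
    device_mesh=mesh,
    redistribute_inputs=True,
)
C = sharded_mm(A, B)  # a DTensor sharded on dim 0
```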
Pull Request resolved: https://github.com/pytorch/pytorch/pull/123676
Approved by: https://github.com/wanchaol